Ciro Santilli OurBigBook.com  Sponsor 中国独裁统治 China Dictatorship 新疆改造中心、六四事件、法轮功、郝海东、709大抓捕、2015巴拿马文件 邓家贵、低端人口、西藏骚乱
cia-2010-covert-communication-websites/cdx-post.sh
#!/usr/bin/env bash
# Post-process the output of cdx.sh to enrich IDs even further, and reconstruct domain names that are easier to inspect on the Web Archive.
#######################################
# Extract domain names from a cdx.sh output file.
#
# A line of the input is kept when it contains one of:
#   - NAME)/NAME.swf                where NAME is the host component right
#                                   before the ")" of the URL key
#   - )/FILE.jar                    a .jar in the first path segment
#   - A,B,C)/cgi-bin/FILE.cgi       a .cgi directly under /cgi-bin/ on a host
#                                   with at least three components
# For each kept line, everything from the first ")" onwards is dropped and
# the first two comma-separated fields are printed swapped and dot-joined
# (e.g. "com,example" -> "example.com"). Runs of identical adjacent domains
# are counted, and only domains whose run length is exactly 1 are emitted.
#
# Arguments: $1 - path to the cdx.sh output file
# Outputs:   the resulting domains on stdout, and also in "$1.post"
# Returns:   2 if no input file was given; otherwise the status of tee
#######################################
cdx_post() {
  if (( $# < 1 )); then
    printf 'usage: %s CDX_FILE\n' "${0##*/}" >&2
    return 2
  fi

  local -r input=$1
  # The dot before "jar" is escaped so that only a real ".jar" extension
  # matches (an unescaped dot would also accept paths such as "/foo/jar").
  local -r pattern='([^,)]+)\)\/\1\.swf|\)/[^/]+\.jar|([^,)]+),([^,)]+),([^,)]+)\)/cgi-bin/[^/]+\.cgi'

  grep -P -e "$pattern" -- "$input" \
    | sed -r 's/\).*//' \
    | awk -F, '{ printf("%s.%s\n", $2, $1) }' \
    | uniq -c \
    | awk '$1 == 1{ print $2 }' \
    | tee -- "$input.post"  # quoted: the path may contain spaces or globs
}

cdx_post "$@"
#while IFS= read -r domain; do
#  echo $i $domain
#  curl --connect-timeout 5 "https://web.archive.org/cdx/search/cdx?url=$domain&matchType=domain&filter=urlkey:.*\.(cgi|jar|swf)&to=20140101000000&limit=5" | tee -a "$out"
#  if [ "${PIPESTATUS[0]}" -ne 0 ]; then
#    echo $domain >> "$out_err"
#  fi
#  echo "$i" > "$nfile"
#  i=$((i+1))
#done <"1"