Ciro Santilli OurBigBook.com  Sponsor 中国独裁统治 China Dictatorship 新疆改造中心、六四事件、法轮功、郝海东、709大抓捕、2015巴拿马文件 邓家贵、低端人口、西藏骚乱
cia-2010-covert-communication-websites/cdx-tor.sh
#!/usr/bin/env bash

# Absolute path of the directory holding this script; used to find cdx-post.sh.
script_dir=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)

# Option parsing: consume leading flags and stop at the first other argument.
#
#   -d  Do CDX on domains only without any filters to count how many archives
#       there are for a given domain, and thus remove anything with too many
#       hits as an unlikely CIA website.
domain_only=false
while (( $# > 0 )); do
  if [[ $1 == -d ]]; then
    domain_only=true
    shift
  else
    break
  fi
done

# Input: domain list, one per line.
domains=$1

# First start this number of tor instances with:
# ```
# tor-army 100
# ```
# https://stackoverflow.com/questions/14321214/how-to-run-multiple-tor-processes-at-once-with-different-exit-ips/76749983#76749983
ntor=${2:-100}
ndigits=2

if [[ ! -r "$domains" ]]; then
  printf 'error: cannot read domain list: %s\n' "$domains" >&2
  exit 1
fi

outdir=$domains.cdx
#rm -rf "$outdir"
mkdir -p "$outdir"

# Workers are numbered 1..ntor, and worker N reads the chunk file named N
# zero-padded to $ndigits digits. Chunks are therefore numbered from 1, and
# their count is capped both by the number of workers and by the largest
# number that fits in $ndigits digits.
max_chunks=$((10 ** ndigits - 1))
nchunks=$((ntor < max_chunks ? ntor : max_chunks))
if ((nchunks < 1)); then
  nchunks=1
fi

# Round up so that at most $nchunks chunks are produced, and never hand
# `split -l` a zero (it rejects it) when the list is empty or very short.
nlines=$(wc -l < "$domains")
lines_per_chunk=$(((nlines + nchunks - 1) / nchunks))
if ((lines_per_chunk < 1)); then
  lines_per_chunk=1
fi

split -l "$lines_per_chunk" --numeric-suffixes=1 --suffix-length "$ndigits" "$domains" "$outdir/"
# Everything below uses paths relative to the output directory.
cd "$outdir" || exit 1

#######################################
# Worker: query the Wayback Machine CDX API for every domain of one chunk
# file, sending all requests through one tor SOCKS port.
#
# The body is a subshell, so none of the variables set here leak out.
#
# Globals read:
#   ndigits      zero-padding width used in chunk file names
#   domain_only  "true" to query the bare domain without any filters
# Arguments:
#   $1  worker number; selects the chunk file ($1 zero-padded to $ndigits
#       digits) and the tor port (9050 + $1)
# Files, relative to the current directory (NN = padded worker number):
#   NN          input chunk, one domain per line
#   nfileNN     count of domains already handled; read on start to resume
#   outNN       response bodies of HTTP 200 answers
#   errNN       domain appended once per failed attempt
#   err_403_NN  domains answered with HTTP 403
#   logNN       domain and raw response for attempts where the command failed
# Outputs:
#   one progress line per attempt on stdout
#######################################
dowork() (
  i=$(printf "%0${ndigits}d" "$1")
  # A worker without a chunk file has nothing to do.
  [ -r "$i" ] || exit 0
  nfile=nfile$i
  if [ -r "$nfile" ]; then
    j=$(cat "$nfile")
  else
    j=0
  fi
  out=out$i
  out_err=err$i
  out_err_403=err_403_$i
  out_err_log=log$i
  port=$((9050 + $1))
  # PID column of the netstat line(s) whose local address matches our port,
  # with the "/program" part stripped. Empty when nothing is found.
  pid="$(netstat -nlp 2>/dev/null | awk '$4~":'"$port"'"{ gsub(/\/.*/,"",$7); print $7 }')"

  url_filters=''
  if [ "$domain_only" != true ]; then
    url_filters='&matchType=domain&filter=urlkey:.*\.(cgi|jar|swf|js)&to=20140101000000&limit=10'
  fi

  # Skip the first $j lines: they were handled by a previous run.
  tail -n "+$((j + 1))" "$i" | while IFS= read -r domain; do
    retry=0
    while :; do
      err=false
      url="https://web.archive.org/cdx/search/cdx?url=${domain}${url_filters}"
      # The leading \n in -w guarantees that the status code ends up on a line
      # of its own even when the body does not end with a newline.
      if response="$(torsocks -P "$port" curl -s --connect-timeout 10 -w '\n%{http_code}' "$url" 2>&1)"; then
        http_code=$(tail -n1 <<< "$response")
        content=$(sed '$ d' <<< "$response")
        echo "$i $j $domain $http_code"
        if [[ "$http_code" == 200 ]]; then
          if [ -n "$content" ]; then
            printf '%s\n' "$content" >> "$out"
          fi
        elif [[ "$http_code" == 403 ]]; then
          echo "$domain" >> "$out_err_403"
          # These appear to never recover. And OMG they are interesting, why is access denied???
          break
        else
          echo "$domain" >> "$out_err"
          err=true
        fi
      else
        echo "$i $j $domain err"
        echo "$domain" >> "$out_err"
        {
          echo "$domain"
          printf '%s\n' "$response"
        } >> "$out_err_log"
        err=true
      fi
      if ! $err; then
        break
      fi
      # Signal the tor instance before retrying. Skipped when the PID lookup
      # found nothing, since `kill` without a PID is only a usage error.
      if [ -n "$pid" ]; then
        # shellcheck disable=SC2086 # may hold several PIDs, one per line
        kill -HUP $pid
      fi
      if [ "$retry" -eq 10 ]; then
        break
      fi
      echo "$i $j $domain retry=$retry pid=$pid new ip: $(torsocks -P "$port" curl -s http://checkip.amazonaws.com)"
      #sleep 1
      retry=$((retry + 1))
    done
    # Record the number of domains handled so far, so that a resumed run
    # starts at the next domain instead of repeating this one.
    j=$((j + 1))
    echo "$j" > "$nfile"
  done
)
# Kill the whole process group (background workers and their children) when
# the script is interrupted, terminated or exits. This has to be installed
# before the workers are started, otherwise it is not in effect while they run.
trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT

# One background worker per tor instance, numbered 1..ntor.
for ((i = 1; i <= ntor; i++)); do
  dowork "$i" &
done
wait

# Merge the per-worker results. The pattern requires a digit after "out" so
# that a merged "out" left over from a previous run is not used as input.
cat out[0-9]* > out
"${script_dir}/cdx-post.sh" out