cia-2010-covert-communication-websites/download-websites.sh
#!/usr/bin/env bash
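# Exit on error, treat unset variables as errors, and trace each command.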
set -eux
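# Pull the archive URLs out of hits.json, keep only Wayback Machine captures,
# and strip the leading "https://web.archive.org/web/YYYYMMDDHHMMSS/" (43
# characters) so that only the originally archived URL remains.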
jq -r '.[].archive' < ../../media/cia-2010-covert-communication-websites/hits.json |
grep web.archive.org |
cut -c 44- |
while read -r u; do
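  # Reduce the URL to a bare domain: drop the scheme and any trailing slash.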
domain="${u#http*://}"
domain="${domain%/}"
  f=20030101000000
  t=20140101000000
  # Some of these domains also hosted large legitimate websites during parts
  # of the default time range; narrow the window for them to speed things up
  # and reduce noise.
  case "$domain" in
    azerinews.org) t=20120101000000 ;;
    bcenews.com) f=20050101000000 ;;
    globaltourist.net) t=20120401000000 ;;
    inside-vc.com) f=20050101000000 ;;
    inews-today.com) t=20130601000000 ;;
    www.cultura-digital.net) f=20060101000000 ;;
    www.headlines2day.com) t=20120101000000 ;;
  esac
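  # Skip domains that already have a local mirror. -s (--all-timestamps) asks
  # wayback_machine_downloader for every snapshot in the window rather than
  # just the latest version of each file; it saves under websites/<domain> by
  # default, which is what the directory check relies on.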
  if ! [ -d "websites/$domain" ]; then
    wayback_machine_downloader -s -f "$f" -t "$t" "$u"
  fi
done