├── .gitignore
├── output
    └── README.md
├── badger-swarm-screencast.gif
├── .editorconfig
├── .github
    └── workflows
    │   └── lint.yml
├── settings.ini.sample
├── LICENSE
├── README.md
├── ARCHITECTURE.md
├── stats.py
└── main.sh


/.gitignore:
--------------------------------------------------------------------------------
1 | *.sw?
2 | output/*
3 | settings.ini
4 | 


--------------------------------------------------------------------------------
/output/README.md:
--------------------------------------------------------------------------------
1 | Temporary files and scan results go here.
2 | 


--------------------------------------------------------------------------------
/badger-swarm-screencast.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EFForg/badger-swarm/HEAD/badger-swarm-screencast.gif


--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
 1 | # EditorConfig is awesome: http://EditorConfig.org
 2 | 
 3 | # top-most EditorConfig file
 4 | root = true
 5 | 
 6 | # Unix-style newlines with a newline ending every file
 7 | [*]
 8 | end_of_line = lf
 9 | insert_final_newline = true
10 | charset = utf-8
11 | 
12 | [*.sh]
13 | indent_style = space
14 | indent_size = 2
15 | 


--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
 1 | on: [push, pull_request, workflow_dispatch]
 2 | 
 3 | name: CI
 4 | 
 5 | jobs:
 6 |   lint:
 7 |     runs-on: ubuntu-latest
 8 | 
 9 |     steps:
10 |       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
11 |       - uses: actions/checkout@v6
12 | 
13 |       - name: ShellCheck
14 |         uses: ludeeus/action-shellcheck@2.0.0
15 |         with:
16 |           scandir: .
17 | 


--------------------------------------------------------------------------------
/settings.ini.sample:
--------------------------------------------------------------------------------
 1 | [Digital Ocean settings]
 2 | # please uncomment and edit the name prefix for Droplets
 3 | #droplet_name_prefix=YOUR_NAME-badger-scanner-
 4 | # do_ssh_key is required to run this script. see `doctl compute ssh-key`
 5 | do_ssh_key=
 6 | # see `doctl compute region list`
 7 | do_region=sfo1
 8 | # see `doctl compute size list`
 9 | do_size=s-2vcpu-4gb
10 | # see `doctl compute image list-distribution`
11 | do_image=ubuntu-24-04-x64
12 | 
13 | [Badger Sett settings]
14 | browser=chrome
15 | num_crawlers=2
16 | num_sites=50
17 | pb_branch=master
18 | exclude_suffixes=.mil,.mil.??,.gov,.gov.??
19 | # for specifying custom Tranco lists
20 | #sitelist=output/tranco_XXXXX.csv
21 | 
22 | [local repository paths]
23 | # Badger Sett
24 | bs_repo_dir=../badger-sett
25 | # Privacy Badger
26 | pb_repo_dir=../privacybadger
27 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Electronic Frontier Foundation
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Badger Swarm
 2 | 
 3 | Runs distributed [Badger Sett](https://github.com/EFForg/badger-sett) scans on Digital Ocean. Yes, a group of badgers is called a _cete_, but "swarm" just sounds better.
 4 | 
 5 | ![Badger Swarm demo recording](badger-swarm-screencast.gif)
 6 | 
 7 | Badger Swarm converts a Badger Sett scan of X sites into N Badger Sett scans of X/N sites. This makes medium scans complete as quickly as small scans, and large scans complete in a reasonable amount of time.
 8 | 
 9 | For more information, visit our [Introducing Badger Swarm: New Project Helps Privacy Badger Block Ever More Trackers](https://www.eff.org/deeplinks/2023/10/privacy-badger-learns-block-ever-more-trackers) blog post.
10 | 
11 | 
12 | ## Architecture
13 | 
14 | See [ARCHITECTURE.md](ARCHITECTURE.md).
15 | 
16 | 
17 | ## Setup
18 | 
19 | 1. Check out this repository
20 | 2. [Install `doctl`](https://github.com/digitalocean/doctl#installing-doctl)
21 | 3. [Authenticate `doctl`](https://github.com/digitalocean/doctl#authenticating-with-digitalocean) with DigitalOcean
22 | 4. Copy `settings.ini.sample` to `settings.ini`
23 | 5. Review the settings. At minimum, specify your Digital Ocean SSH key (see `doctl compute ssh-key`). For Droplet sizes and hourly prices, see `doctl compute size list`.
24 | 6. To automatically merge results on completion, check out Badger Sett and [Privacy Badger](https://github.com/EFForg/privacybadger) (at the same directory level as this repository) and then [set up and activate a virtual environment](https://snarky.ca/a-quick-and-dirty-guide-on-how-to-install-packages-for-python/) for Badger Sett.
25 | 7. Run `./main.sh` to initiate a new run.
26 | 
27 | Once you are told the run is resumable, you can stop the script with <kbd>Ctrl</kbd>-<kbd>C</kbd> and then later resume the in-progress run with `./main.sh -r`.
28 | 
29 | 
30 | ## Helpful Bash scripting links
31 | - https://google.github.io/styleguide/shellguide.html
32 | - https://mywiki.wooledge.org/
33 | - https://tldp.org/LDP/abs/html/index.html
34 | - https://www.shellcheck.net/
35 | 


--------------------------------------------------------------------------------
/ARCHITECTURE.md:
--------------------------------------------------------------------------------
  1 | # Architecture
  2 | 
  3 | Once a run is confirmed, scans get initialized in parallel. Each scan instance receives their portion of the site list.
  4 | 
  5 | ```mermaid
  6 | stateDiagram-v2
  7 | 
  8 | [*] --> ConfirmRun
  9 | 
 10 | state fork1 <<fork>>
 11 | ConfirmRun --> fork1
 12 | fork1 --> BadgerInit1
 13 | fork1 --> BadgerInit2
 14 | fork1 --> BadgerInitN
 15 | 
 16 | state InitScans {
 17 |     cr1: CreateDroplet
 18 |     cr2: CreateDroplet
 19 |     cr3: CreateDroplet
 20 |     dep1: InstallDependencies
 21 |     dep2: InstallDependencies
 22 |     dep3: InstallDependencies
 23 |     sta1: StartScan
 24 |     sta2: StartScan
 25 |     sta3: StartScan
 26 | 
 27 |     state BadgerInit1 {
 28 |         [*] --> cr1
 29 |         cr1 --> dep1
 30 |         dep1 --> UploadSiteList1
 31 |         UploadSiteList1 --> sta1
 32 |         sta1 --> [*]
 33 |     }
 34 |     --
 35 |     state BadgerInit2 {
 36 |         [*] --> cr2
 37 |         cr2 --> dep2
 38 |         dep2 --> UploadSiteList2
 39 |         UploadSiteList2 --> sta2
 40 |         sta2 --> [*]
 41 |     }
 42 |     --
 43 |     state BadgerInitN {
 44 |         [*] --> cr3
 45 |         cr3 --> dep3
 46 |         dep3 --> UploadSiteListN
 47 |         UploadSiteListN --> sta3
 48 |         sta3 --> [*]
 49 |     }
 50 | }
 51 | 
 52 | state join1 <<join>>
 53 | BadgerInit1 --> join1
 54 | BadgerInit2 --> join1
 55 | BadgerInitN --> join1
 56 | 
 57 | join1 --> [*]
 58 | ```
 59 | 
 60 | The run is now resumable. Scans are checked for progress and status (errored/stalled/complete) in parallel.
 61 | 
 62 | - If a scan fails, its instance is deleted and the scan gets reinitialized.
 63 | - When a scan fails to progress long enough, it is considered stalled. Stalled scans get restarted, which mostly means they get to keep going after skipping the site they got stuck on.
 64 | - When a scan finishes, the results are extracted and the instance is deleted.
 65 | 
 66 | This continues until all scans finish.
 67 | 
 68 | ```mermaid
 69 | stateDiagram-v2
 70 | 
 71 | [*] --> ManageScans
 72 | 
 73 | state fork2 <<fork>>
 74 | ManageScans --> fork2
 75 | fork2 --> CheckBadgerScan1
 76 | fork2 --> CheckBadgerScan2
 77 | fork2 --> CheckBadgerScanN
 78 | 
 79 | state PollInProgressScansForStatus {
 80 |     chk1: CheckForFailure
 81 |     cr1: CreateDroplet
 82 |     dep1: InstallDependencies
 83 |     go1: StartScan
 84 |     pro1: ExtractProgress
 85 |     ter1: CheckForTermination
 86 |     fin1: ExtractResults
 87 |     fai1: ExtractErrorLog
 88 |     del1: DeleteDroplet
 89 |     sta1: CheckForStall
 90 |     res1: RestartScan
 91 |     ddd2: ...
 92 |     ddd3: ...
 93 | 
 94 |     state CheckBadgerScan1 {
 95 |         [*] --> chk1
 96 | 
 97 |         state scan1_failed <<choice>>
 98 |         chk1 --> scan1_failed
 99 |         scan1_failed --> cr1 : Scan previously failed
100 |         scan1_failed --> pro1 : No error log found
101 | 
102 |         cr1 --> dep1
103 |         dep1 --> UploadSiteList1
104 |         UploadSiteList1 --> go1
105 |         go1 --> [*]
106 | 
107 |         pro1 --> ter1
108 | 
109 |         state scan1_term <<choice>>
110 |         ter1 --> scan1_term
111 |         scan1_term --> fin1 : Scan finished
112 |         scan1_term --> fai1: Scan failed
113 |         scan1_term --> sta1 : Scan is still running
114 | 
115 |         fin1 --> del1
116 |         fai1 --> del1
117 | 
118 |         del1 --> [*]
119 | 
120 |         state scan1_stall <<choice>>
121 |         sta1 --> scan1_stall
122 |         scan1_stall --> res1: Progress file is stale
123 |         scan1_stall --> [*] : Progress was updated recently
124 | 
125 |         res1 --> [*]
126 |     }
127 |     --
128 |     state CheckBadgerScan2 {
129 |         [*] --> ddd2
130 |         ddd2 --> [*]
131 |     }
132 |     --
133 |     state CheckBadgerScanN {
134 |         [*] --> ddd3
135 |         ddd3 --> [*]
136 |     }
137 | }
138 | 
139 | state join2 <<join>>
140 | CheckBadgerScan1 --> join2
141 | CheckBadgerScan2 --> join2
142 | CheckBadgerScanN --> join2
143 | 
144 | state all_finished <<choice>>
145 | join2 --> all_finished
146 | all_finished --> PrintProgress : One or more scan results missing
147 | all_finished --> MergeResults : All scans completed successfully
148 | 
149 | PrintProgress --> ManageScans
150 | 
151 | MergeResults --> [*]
152 | ```
153 | 
154 | On completion scan results are merged by Privacy Badger as if each result was manually imported on the Manage Data tab on Privacy Badger's options page.
155 | 


--------------------------------------------------------------------------------
/stats.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | # pylint: disable=too-many-locals
  4 | 
  5 | import fnmatch
  6 | import os
  7 | import pathlib
  8 | import re
  9 | import sys
 10 | 
 11 | from datetime import datetime
 12 | 
 13 | def get_log_stats(path):
 14 |     """Parses Badger Sett log files to extract scan statistics."""
 15 | 
 16 |     num_restarts = 0
 17 |     num_links = 0
 18 |     num_links_failed = 0
 19 |     num_site_timeouts = 0
 20 |     num_ext_timeouts = 0
 21 | 
 22 |     with path.open() as f:
 23 |         for line in f:
 24 |             if "Clicking on " in line:
 25 |                 num_links += 1
 26 |             elif "Failed to visit link" in line:
 27 |                 num_links_failed += 1
 28 |             elif "Timed out loading skin/" in line or "Timed out loading extension page" in line:
 29 |                 num_ext_timeouts += 1
 30 |             elif "Timed out loading " in line:
 31 |                 num_site_timeouts += 1
 32 |             elif "Restarting browser" in line:
 33 |                 num_restarts += 1
 34 |             elif "Starting new crawl" in line:
 35 |                 #print(line, end='')
 36 |                 start_time = datetime.strptime(line[:23], '%Y-%m-%d %H:%M:%S,%f')
 37 |             elif "Finished scan" in line:
 38 |                 #print(line, end='')
 39 |                 end_time = datetime.strptime(line[:23], '%Y-%m-%d %H:%M:%S,%f')
 40 |                 site_matches = re.search(r'Visited (\d+) sites and errored on (\d+)', line)
 41 |                 num_visited = int(site_matches.group(1))
 42 |                 num_errored = int(site_matches.group(2))
 43 |                 num_sites = num_visited + num_errored
 44 |                 error_rate = num_errored / num_sites * 100
 45 | 
 46 |     total_time = end_time - start_time
 47 | 
 48 |     return {
 49 |         "error_rate": error_rate,
 50 |         "num_links_failed": num_links_failed,
 51 |         "num_links": num_links,
 52 |         "num_restarts": num_restarts,
 53 |         "num_sites": num_sites,
 54 |         "num_timeouts_ext": num_ext_timeouts,
 55 |         "num_timeouts_site": num_site_timeouts,
 56 |         "num_visited": num_visited,
 57 |         "speed": num_sites / total_time.total_seconds() * 60 * 60,
 58 |         "time_end": end_time,
 59 |         "time_start": start_time,
 60 |         "time_total": total_time,
 61 |     }
 62 | 
 63 | def out(*args):
 64 |     print(f"{args[0]:<25}", *args[1:])
 65 | 
 66 | def print_scan_stats(path):
 67 |     logfile_glob = 'log.???.txt'
 68 |     if not any(True for _ in path.glob(logfile_glob)):
 69 |         logfile_glob = 'log.????.txt'
 70 |         if not any(True for _ in path.glob(logfile_glob)):
 71 |             print(f"Failed to find log files in {path}\n")
 72 |             return
 73 | 
 74 |     log_stats = []
 75 |     for log_path in sorted(path.glob(logfile_glob), key=os.path.getmtime):
 76 |         log_stats.append(get_log_stats(log_path))
 77 | 
 78 |     run_start = datetime.fromtimestamp(os.path.getctime(sorted(path.glob('*'), key=os.path.getctime)[0]))
 79 |     run_end = datetime.fromtimestamp(os.path.getmtime(sorted(path.glob(logfile_glob), key=os.path.getmtime)[-1]))
 80 |     sites_success = sum(i['num_visited'] for i in log_stats)
 81 |     sites_total = sum(i['num_sites'] for i in log_stats)
 82 |     links_total = sum(i['num_links'] for i in log_stats)
 83 |     link_click_rate = round(links_total / sites_success * 100, 1)
 84 |     link_failure_rate = round(sum(i['num_links_failed'] for i in log_stats) * 100 / links_total, 1) if links_total else 0
 85 |     error_avg = round(sum(i['error_rate'] for i in log_stats) / len(log_stats), 1)
 86 |     error_max = round(max(i['error_rate'] for i in log_stats), 1)
 87 |     restarts_total = sum(i['num_restarts'] for i in log_stats)
 88 |     timeout_rate = round(sum(i['num_timeouts_site'] for i in log_stats) * 100 / sites_total, 1)
 89 |     timeouts_ext_total = sum(i['num_timeouts_ext'] for i in log_stats)
 90 |     scan_time_avg = round(sum(i['time_total'].total_seconds() for i in log_stats) / len(log_stats) / 60 / 60, 1)
 91 |     scan_time_max = round(max(i['time_total'].total_seconds() for i in log_stats) / 60 / 60, 1)
 92 |     run_time_total = round((run_end - run_start).total_seconds() / 60 / 60, 1)
 93 |     speed_avg = round(sum(i['speed'] for i in log_stats) / len(log_stats), 1)
 94 |     speed_min = round(min(i['speed'] for i in log_stats), 1)
 95 | 
 96 |     out("Run path:", path)
 97 |     out("Date started:", run_start)
 98 |     out("Sites:", f"{sites_success} ({sites_total} total)")
 99 |     out("Links clicked:", f"{links_total} ({link_click_rate}% of sites) ({link_failure_rate}% failed)")
100 |     out("Overall error rate:", f"{error_avg}% average ({error_max}% max) ({restarts_total} restarts)")
101 |     out("Timeout rate:", f"{timeout_rate}% of sites ({timeouts_ext_total} extension page timeouts)")
102 |     out("Scan time:", f"{scan_time_avg} hours on average ({scan_time_max} max)")
103 |     out("Run time:", f"{run_time_total} hours")
104 |     out("Speed:", f"{speed_avg} sites/hour on average (slowest: {speed_min} sites/hour)\n")
105 | 
106 | 
107 | if __name__ == '__main__':
108 |     # to limit output, add match pattern strings as positional arguments
109 |     # these are matched (with wildcards) against scan results directory names
110 |     # for example: ./stats.py chrome 20K
111 |     scan_paths = [x for x in pathlib.Path('output').iterdir() if x.is_dir() and
112 |                   all(fnmatch.fnmatch(x, f"*{s}*") for s in sys.argv[1:])]
113 |     # sort by date started
114 |     for path in sorted(scan_paths, key=lambda path: os.path.getctime(sorted(path.glob('*'), key=os.path.getctime)[0])):
115 |         print_scan_stats(path)
116 | 


--------------------------------------------------------------------------------
/main.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | 
  3 | err() {
  4 |   echo "$*" >&2
  5 | }
  6 | 
  7 | parse_args() {
  8 |   local OPTIND # enables multiple calls to getopts in same shell invocation
  9 |   local usage="Usage: $0 [-r]"
 10 | 
 11 |   while getopts 'r' flag; do
 12 |     case "$flag" in
 13 |       r) resume_run=true ;;
 14 |       *) err "$usage"; exit 1 ;;
 15 |     esac
 16 |   done
 17 | 
 18 |   if [ "$resume_run" = true ]; then
 19 |     if [ ! -f output/.run_in_progress ]; then
 20 |       err "No in-progress run found"
 21 |       exit 1
 22 |     fi
 23 |   else
 24 |     if [ -f output/.run_in_progress ]; then
 25 |       err "In-progress run found: $(cat output/.run_in_progress)"
 26 |       err "Either resume with the -r flag or delete output/.run_in_progress"
 27 |       exit 1
 28 |     fi
 29 |   fi
 30 | }
 31 | 
 32 | parse_config() {
 33 |   local settings_file="$1"
 34 |   local name value
 35 | 
 36 |   settings_file=${settings_file:-"settings.ini"}
 37 | 
 38 |   if [ ! -f "$settings_file" ]; then
 39 |     err "Missing $settings_file"
 40 |     exit 1
 41 |   fi
 42 | 
 43 |   while IFS='= ' read -r name value; do
 44 |     # ignore comments, section names and blank lines
 45 |     [ "${name:0:1}" = "#" ] || [ "${name:0:1}" = "[" ] || [ -z "$name" ] && continue
 46 | 
 47 |     if [ -z "$value" ]; then
 48 |       err "Missing $settings_file value for $name"
 49 |       exit 1
 50 |     fi
 51 | 
 52 |     case "$name" in
 53 |       browser) readonly browser="$value" ;;
 54 |       bs_repo_dir) readonly bs_repo_dir="$value" ;;
 55 |       do_image) readonly do_image="$value" ;;
 56 |       do_region) readonly do_region="$value" ;;
 57 |       do_size) readonly do_size="$value" ;;
 58 |       do_ssh_key) [ "$value" != "[REDACTED]" ] && readonly do_ssh_key="$value" ;;
 59 |       droplet_name_prefix) readonly droplet_name_prefix="$value" ;;
 60 |       num_crawlers) readonly num_crawlers="$value" ;;
 61 |       num_sites) readonly num_sites="$value" ;;
 62 |       pb_branch) readonly pb_branch="$value" ;;
 63 |       pb_repo_dir) readonly pb_repo_dir="$value" ;;
 64 |       exclude_suffixes) readonly exclude_suffixes="$value" ;;
 65 |       sitelist) readonly sitelist="$value" ;;
 66 |       *) err "Unknown $settings_file setting: $name"; exit 1 ;;
 67 |     esac
 68 |   done < "$settings_file"
 69 | 
 70 |   # do_ssh_key must be provided as it's required and there is no default
 71 |   if [ -z "$do_ssh_key" ]; then
 72 |     if [ "$settings_file" = settings.ini ]; then
 73 |       err "Missing $settings_file setting: do_ssh_key"
 74 |       exit 1
 75 |     else
 76 |       # try getting the key from the default settings file
 77 |       while IFS='= ' read -r name value; do
 78 |         if [ "$name" = "do_ssh_key" ]; then
 79 |           readonly do_ssh_key="$value"
 80 |           break
 81 |         fi
 82 |       done < settings.ini
 83 |       if [ -z "$do_ssh_key" ]; then
 84 |         err "Unable to find do_ssh_key in settings.ini"
 85 |         exit 1
 86 |       fi
 87 |     fi
 88 |   fi
 89 | 
 90 |   if [ -z "$num_crawlers" ] || [ "$num_crawlers" -lt 1 ] || [ "$num_crawlers" -gt 100 ]; then
 91 |     err "num_crawlers must be > 0 and <= 100"
 92 |     exit 1
 93 |   fi
 94 | 
 95 |   if [ -z "$num_sites" ] || [ "$num_sites" -lt 1 ] || [ "$num_sites" -gt 1000000 ]; then
 96 |     err "num_sites must be > 0 and <= 1,000,000"
 97 |     exit 1
 98 |   fi
 99 | }
100 | 
101 | confirm_run() {
102 |   # TODO hardcoded X sites/hour crawler speed
103 |   local time_estimate price speed=200 cost_estimate
104 | 
105 |   cat << EOF
106 | Starting distributed Badger Sett run:
107 | 
108 |   sites:        $(numfmt --to=si "$num_sites")
109 |   sitelist:     ${sitelist:-"default"}
110 |   Droplets:     $num_crawlers $do_size in $do_region
111 |   browser:      ${browser^}
112 |   PB branch:    $pb_branch
113 | 
114 | EOF
115 | 
116 |   # TODO update Droplet creation estimate
117 |   # about 27 seconds per Droplet at the start (45 mins for 100 Droplets),
118 |   # plus however long it takes to scan the number of sites in a chunk
119 |   time_estimate=$(echo "(27 * $num_crawlers / 60 / 60) + ($num_sites / $num_crawlers / $speed)" | bc -l)
120 | 
121 |   price=$(doctl compute size list --format Slug,PriceHourly | grep "$do_size " | awk '{print $2}')
122 |   [ -z "$price" ] && { err "Failed to look up Droplet prices. Is doctl authenticated?"; exit 1; }
123 | 
124 |   cost_estimate=$(echo "$time_estimate * $price * $num_crawlers" | bc -l)
125 | 
126 |   printf "This will take ~%.1f hours and cost ~\$%.0f\n" "$time_estimate" "$cost_estimate"
127 |   read -p "Continue (y/n)? " -n 1 -r
128 |   echo
129 |   if [ "$REPLY" = y ] || [ "$REPLY" = Y ]; then
130 |     return
131 |   fi
132 | 
133 |   exit 0
134 | }
135 | 
136 | init_sitelists() {
137 |   local lines_per_list
138 |   local tempfile=output/sitelist.txt
139 | 
140 |   if [ -n "$sitelist" ]; then
141 |     set -- --domain-list="$sitelist" "$@"
142 |   fi
143 | 
144 |   if [ -n "$exclude_suffixes" ]; then
145 |     set -- --exclude="$exclude_suffixes" "$@"
146 |   fi
147 | 
148 |   if ! "$bs_repo_dir"/crawler.py chrome "$num_sites" --exclude-failures-since='1 month' --get-sitelist-only "$@" > $tempfile; then
149 |     rm $tempfile
150 |     return 1
151 |   fi
152 | 
153 |   # randomize to even out performance (top sites should produce fewer errors)
154 |   shuf $tempfile --output $tempfile
155 | 
156 |   # create chunked site lists
157 |   # note: we will use +1 droplet when there is a division remainder
158 |   # TODO could be an extra droplet just to visit a single site ...
159 |   lines_per_list=$((num_sites / num_crawlers))
160 |   split --suffix-length=3 --numeric-suffixes=1 --lines="$lines_per_list" $tempfile "$results_folder"/sitelist.split.
161 | 
162 |   rm $tempfile
163 | }
164 | 
165 | create_droplet() {
166 |   local droplet="$1"
167 |   local ret retry_count=0
168 | 
169 |   echo "Creating $droplet ($do_region $do_image $do_size)"
170 | 
171 |   until doctl compute droplet create "$droplet" --region "$do_region" --image "$do_image" --size "$do_size" --ssh-keys "$do_ssh_key" >/dev/null; ret=$?; [ $ret -eq 0 ]; do
172 |     echo "Retrying creating $droplet after delay ..."
173 |     retry_count=$((retry_count + 1))
174 |     sleep $(((5 + RANDOM % 16) * retry_count)) # between 5*N and 20*N seconds
175 |   done
176 | 
177 |   # wait for active status
178 |   retry_count=0
179 |   sleep 5
180 |   until [ "$(doctl compute droplet get "$droplet" --template "{{.Status}}" 2>/dev/null)" = "active" ]; do
181 |     if [ $retry_count -gt 3 ]; then
182 |       echo "Still waiting for $droplet to become active ..."
183 |     fi
184 |     retry_count=$((retry_count + 1))
185 |     sleep $((5 * retry_count)) # 5*N seconds
186 |   done
187 | 
188 |   return $ret
189 | }
190 | 
191 | get_droplet_ip() {
192 |   local droplet="$1"
193 |   local ip_file="$results_folder"/"$droplet".ip
194 |   local ip
195 | 
196 |   if [ ! -f "$ip_file" ]; then
197 |     while [ -z "$ip" ]; do
198 |       ip=$(doctl compute droplet get "$droplet" --template "{{.PublicIPv4}}" 2>/dev/null)
199 |       [ -z "$ip" ] && sleep 5
200 |     done
201 |     echo "$ip" > "$ip_file"
202 |   fi
203 | 
204 |   cat "$ip_file"
205 | }
206 | 
207 | ssh_fn() {
208 |   local retry=true
209 |   if [ "$1" = noretry ]; then
210 |     retry=false
211 |     shift
212 |   fi
213 | 
214 |   local ret
215 | 
216 |   set -- -q -o StrictHostKeyChecking=accept-new -o UserKnownHostsFile="$results_folder/known_hosts" -o BatchMode=yes "$@"
217 |   while ssh "$@"; ret=$?; [ $ret -eq 255 ]; do
218 |     [ $retry = false ] && break
219 |     err "Waiting to retry SSH: $*"
220 |     sleep 10
221 |   done
222 | 
223 |   return $ret
224 | }
225 | 
226 | rsync_fn() {
227 |   local ret
228 |   set -- -q -e 'ssh -q -o StrictHostKeyChecking=accept-new -o UserKnownHostsFile='"$results_folder"'/known_hosts -o BatchMode=yes' "$@"
229 |   while rsync "$@"; ret=$?; [ $ret -ne 0 ] && [ $ret -ne 23 ]; do
230 |     err "Waiting to retry rsync (failed with $ret): $*"
231 |     sleep 10
232 |   done
233 |   return $ret
234 | }
235 | 
236 | install_dependencies() {
237 |   local droplet="$1"
238 |   local droplet_ip="$2"
239 |   local aptget_with_opts='DEBIAN_FRONTEND=noninteractive apt-get -qq -o DPkg::Lock::Timeout=60 -o Dpkg::Use-Pty=0'
240 | 
241 |   echo "Installing dependencies on $droplet ($droplet_ip) ..."
242 |   while true; do
243 |     ssh_fn root@"$droplet_ip" "$aptget_with_opts update >/dev/null 2>&1"
244 |     ssh_fn root@"$droplet_ip" "$aptget_with_opts install ca-certificates curl gnupg >/dev/null 2>&1"
245 |     ssh_fn root@"$droplet_ip" 'install -m 0755 -d /etc/apt/keyrings'
246 |     ssh_fn root@"$droplet_ip" 'curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg'
247 |     ssh_fn root@"$droplet_ip" 'chmod a+r /etc/apt/keyrings/docker.gpg'
248 |     # shellcheck disable=SC2016
249 |     ssh_fn root@"$droplet_ip" 'echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" > /etc/apt/sources.list.d/docker.list'
250 |     ssh_fn root@"$droplet_ip" "$aptget_with_opts update >/dev/null 2>&1"
251 |     ssh_fn root@"$droplet_ip" "$aptget_with_opts install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin >/dev/null 2>&1"
252 |     if ssh_fn root@"$droplet_ip" "command -v docker >/dev/null 2>&1"; then
253 |       break
254 |     fi
255 |     sleep 10
256 |   done
257 | }
258 | 
259 | init_scan() {
260 |   local droplet="$1"
261 |   local domains_chunk="$2"
262 |   local exclude="$3"
263 | 
264 |   local droplet_ip chunk_size
265 | 
266 |   droplet_ip=$(get_droplet_ip "$droplet")
267 | 
268 |   # wait for cloud-init to finish
269 |   # (discard stderr because we may have to retry SSH a few times
270 |   # as it might not yet be ready, but this isn't interesting)
271 |   ssh_fn root@"$droplet_ip" 'cloud-init status --wait >/dev/null' 2>/dev/null
272 | 
273 |   # create non-root user
274 |   ssh_fn root@"$droplet_ip" 'useradd -m crawluser && cp -r /root/.ssh /home/crawluser/ && chown -R crawluser:crawluser /home/crawluser/.ssh'
275 | 
276 |   install_dependencies "$droplet" "$droplet_ip"
277 | 
278 |   # add non-root user to docker group
279 |   ssh_fn root@"$droplet_ip" 'usermod -aG docker crawluser'
280 | 
281 |   # check out Badger Sett
282 |   until ssh_fn crawluser@"$droplet_ip" 'git clone -q --depth 1 https://github.com/EFForg/badger-sett.git'; do
283 |     sleep 10
284 |   done
285 | 
286 |   # remove previous scan results to avoid any potential confusion
287 |   ssh_fn crawluser@"$droplet_ip" 'rm badger-sett/results.json badger-sett/log.txt'
288 | 
289 |   # copy domain list
290 |   rsync_fn "$domains_chunk" crawluser@"$droplet_ip":badger-sett/domain-lists/domains.txt
291 | 
292 |   echo "Starting scan on $droplet ($droplet_ip) ..."
293 |   chunk_size=$(wc -l < ./"$domains_chunk")
294 |   if [ -n "$exclude" ]; then
295 |     exclude="--exclude=$exclude"
296 |   fi
297 |   # TODO support configuring --load-extension
298 |   ssh_fn crawluser@"$droplet_ip" "BROWSER=$browser GIT_PUSH=0 RUN_BY_CRON=1 PB_BRANCH=$pb_branch nohup ./badger-sett/runscan.sh $chunk_size --no-blocking --domain-list ./domain-lists/domains.txt --exclude-failures-since=off $exclude </dev/null >runscan.out 2>&1 &"
299 |   # TODO if Docker image fails to install (unknown layer in Dockerfile),
300 |   # TODO we run into log.txt rsync errors as we fail to detect the scan actually failed/never started
301 |   # TODO update scan_terminated() to be more robust? or, detect and handle when runscan.sh fails?
302 | }
303 | 
304 | scan_terminated() {
305 |   local droplet_ip="$1"
306 | 
307 |   if ssh_fn crawluser@"$droplet_ip" '[ ! -d ./badger-sett/.scan_in_progress ]' 2>/dev/null; then
308 |     return 0
309 |   fi
310 | 
311 |   return 1
312 | }
313 | 
314 | scan_succeeded() {
315 |   local droplet_ip="$1"
316 |   local scan_result
317 | 
318 |   scan_result=$(ssh_fn crawluser@"$droplet_ip" "tail -n1 runscan.out" 2>/dev/null)
319 | 
320 |   # successful scan
321 |   [ "${scan_result:0:16}" = "Scan successful." ] && return 0
322 | 
323 |   # failed scan
324 |   return 1
325 | }
326 | 
327 | extract_results() {
328 |   local droplet="$1"
329 |   local droplet_ip="$2"
330 |   local chunk="$3"
331 | 
332 |   if scan_succeeded "$droplet_ip"; then
333 |     # extract results
334 |     rsync_fn crawluser@"$droplet_ip":badger-sett/results.json "$results_folder"/results."$chunk".json 2>/dev/null
335 |     # and screenshots, if any
336 |     if ssh_fn crawluser@"$droplet_ip" '[ -d ./badger-sett/screenshots ]' 2>/dev/null; then
337 |       mkdir -p "$results_folder"/screenshots
338 |       rsync_fn crawluser@"$droplet_ip":badger-sett/screenshots/* "$results_folder"/screenshots
339 |     fi
340 |   else
341 |     # extract Docker output log
342 |     if ! rsync_fn crawluser@"$droplet_ip":runscan.out "$results_folder"/erroredscan."$chunk".out 2>/dev/null; then
343 |       echo "Missing Docker output log" > "$results_folder"/erroredscan."$chunk".out
344 |     fi
345 |   fi
346 | 
347 |   # extract Badger Sett log
348 |   if ! rsync_fn crawluser@"$droplet_ip":badger-sett/log.txt "$results_folder"/log."$chunk".txt 2>/dev/null; then
349 |     echo "Missing Badger Sett log" > "$results_folder"/log."$chunk".txt
350 |   fi
351 | 
352 |   until doctl compute droplet delete -f "$droplet"; do
353 |     sleep 10
354 |   done
355 |   rm -f "$results_folder"/"$droplet".ip "$results_folder"/"$droplet".status
356 | }
357 | 
358 | print_progress() {
359 |   local curval="$1"
360 |   local total="$2"
361 |   local pct=$((curval * 100 / total))
362 |   local num_filled=$((pct * 3 / 10)) # between 0 and 30 chars
363 |   local bar_fill bar_empty
364 |   printf -v bar_fill "%${num_filled}s"
365 |   printf -v bar_empty "%$((30 - num_filled))s" # num_empty
366 |   printf "[${bar_fill// /░}${bar_empty}] %*s  $pct%%\n" $((${#total} * 2 + 2)) "$curval/$total"
367 | }
368 | 
369 | manage_scan() {
370 |   local domains_chunk="$1"
371 |   local chunk=${domains_chunk##*.}
372 |   local droplet="${droplet_name_prefix}${chunk}"
373 |   local status_file="$results_folder"/"$droplet".status
374 |   local droplet_ip num_visited chunk_size status
375 | 
376 |   while true; do
377 |     # retry failed scans
378 |     if [ -f "$results_folder"/erroredscan."$chunk".out ]; then
379 |       if create_droplet "$droplet"; then
380 |         # back up Docker and Badger Sett logs
381 |         mv "$results_folder"/erroredscan."$chunk"{,."$(date +"%s")"}.out
382 |         mv "$results_folder"/log."$chunk"{,."$(date +"%s")"}.txt
383 | 
384 |         init_scan "$droplet" "$domains_chunk" "$exclude_suffixes"
385 |       fi
386 |     fi
387 | 
388 |     # skip finished and errored scans
389 |     [ -f "$results_folder"/log."$chunk".txt ] && return
390 | 
391 |     droplet_ip=$(get_droplet_ip "$droplet")
392 | 
393 |     num_visited=$(ssh_fn noretry crawluser@"$droplet_ip" 'if [ -f ./badger-sett/docker-out/log.txt ]; then grep -E "Visiting [0-9]+:" ./badger-sett/docker-out/log.txt | tail -n1 | sed "s/.*Visiting \([0-9]\+\):.*/\1/"; fi' 2>/dev/null)
394 | 
395 |     if [ $? -eq 255 ]; then
396 |       # SSH error
397 |       sleep 5
398 |       continue
399 |     fi
400 | 
401 |     # TODO make failed scan detection more robust:
402 |     # TODO we could have a failed scan where log.txt is still in docker-out/
403 |     # TODO which currently means we'll be stuck in a hopeless "stale" loop
404 |     # TODO so we should also check that the scan is actually running
405 |     if [ -z "$num_visited" ]; then
406 |       # empty num_visited can happen in the beginning but also at the end,
407 |       # after docker-out/log.txt was moved but before it was extracted
408 | 
409 |       if scan_terminated "$droplet_ip"; then
410 |         extract_results "$droplet" "$droplet_ip" "$chunk"
411 |         return
412 |       else
413 |         # wait until we detect scan termination, or num_visited gets populated
414 |         sleep 10
415 |         continue
416 |       fi
417 |     fi
418 | 
419 |     chunk_size=$(wc -l < ./"$domains_chunk")
420 |     status=$(print_progress "$num_visited" "$chunk_size")
421 | 
422 |     # we got a new progress update
423 |     if [ ! -f "$status_file" ] || [ "$status" != "$(cat "$status_file")" ]; then
424 |       echo "$status" > "$status_file"
425 | 
426 |     # no change in progress and the status file is now stale
427 |     elif [ ! "$(find "$status_file" -newermt "6 minutes ago")" ]; then
428 |       echo "stalled" > "$status_file"
429 | 
430 |       # force a restart by killing the browser
431 |       if [ "$browser" = chrome ]; then
432 |         ssh_fn crawluser@"$droplet_ip" 'pkill chrome'
433 |       elif [ "$browser" = firefox ]; then
434 |         ssh_fn crawluser@"$droplet_ip" 'pkill firefox-bin'
435 |       fi
436 |     fi
437 | 
438 |     return
439 |   done
440 | }
441 | 
442 | onint() {
443 |   # send HUP to the whole process group
444 |   # to avoid leaving subprocesses behind after a Ctrl-C
445 |   kill -HUP -$$
446 | }
447 | 
448 | onhup() {
449 |   echo
450 |   exit
451 | }
452 | 
453 | manage_scans() {
454 |   local all_done domains_chunk chunk droplet
455 |   declare -i num_lines=0
456 | 
457 |   trap onhup HUP
458 |   trap onint INT
459 | 
460 |   while true; do
461 |     all_done=true
462 | 
463 |     # update status files, restart stalled scans,
464 |     # and retry failed scans asynchronously
465 |     for domains_chunk in "$results_folder"/sitelist.split.*; do
466 |       [ -f "$domains_chunk" ] || continue
467 | 
468 |       # skip finished scans
469 |       if [ -f "$results_folder"/log."${domains_chunk##*.}".txt ] && \
470 |         [ ! -f "$results_folder"/erroredscan."${domains_chunk##*.}".out ]; then
471 |         continue
472 |       fi
473 | 
474 |       manage_scan "$domains_chunk" &
475 |     done
476 | 
477 |     wait
478 | 
479 |     # erase previous progress output if any
480 |     # TODO can't scroll beyond the number of lines that fit in the window
481 |     while [ $num_lines -gt 0 ]; do
482 |       # ANSI escape sequences for cursor movement:
483 |       # https://tldp.org/HOWTO/Bash-Prompt-HOWTO/x361.html
484 |       # TODO if we produce ANY output (like error messages) that's not covered by num_lines,
485 |       # TODO we fail to erase that number of droplet status lines
486 |       echo -ne '\033M\r\033[K' # scroll up a line and erase previous output
487 |       num_lines=$((num_lines - 1))
488 |     done
489 | 
490 |     # print statuses
491 |     for domains_chunk in "$results_folder"/sitelist.split.*; do
492 |       [ -f "$domains_chunk" ] || continue
493 |       chunk=${domains_chunk##*.}
494 |       droplet="${droplet_name_prefix}${chunk}"
495 | 
496 |       if [ -f "$results_folder"/erroredscan."$chunk".out ]; then
497 |         all_done=false
498 |         echo "$droplet failed"
499 |         num_lines=$((num_lines + 1))
500 |       elif [ -f "$results_folder"/results."$chunk".json ]; then
501 |         : # noop
502 |       elif [ -f "$results_folder"/log."$chunk".txt ]; then
503 |         echo "$droplet ??? (see $results_folder/log.${chunk}.txt)"
504 |         num_lines=$((num_lines + 1))
505 |       else
506 |         all_done=false
507 |         echo "$droplet $(cat "$results_folder"/"$droplet".status)"
508 |         num_lines=$((num_lines + 1))
509 |       fi
510 |     done
511 | 
512 |     # TODO ETA Xh:Ym
513 |     #echo "Last update: $(date +'%Y-%m-%dT%H:%M:%S%z')"
514 |     #echo "$total/$num_sites"
515 | 
516 |     [ $all_done = true ] && break
517 | 
518 |     sleep 30
519 | 
520 |   done
521 | 
522 |   # restore default signal behavior
523 |   trap - INT
524 |   trap - HUP
525 | }
526 | 
527 | merge_results() {
528 |   for results_chunk in "$results_folder"/results.*.json; do
529 |     [ -f "$results_chunk" ] || continue
530 |     set -- --load-data="$results_chunk" "$@"
531 |   done
532 | 
533 |   echo "${bs_repo_dir}/crawler.py chrome 0 --pb-dir $pb_repo_dir $*"
534 |   if ! "$bs_repo_dir"/crawler.py chrome 0 --pb-dir "$pb_repo_dir" "$@"; then
535 |     return 1
536 |   fi
537 |   mv results.json "$results_folder"/
538 | 
539 |   echo "${bs_repo_dir}/crawler.py chrome 0 --no-blocking --pb-dir $pb_repo_dir $*"
540 |   if ! "$bs_repo_dir"/crawler.py chrome 0 --no-blocking --pb-dir "$pb_repo_dir" "$@"; then
541 |     return 1
542 |   fi
543 |   mv results.json "$results_folder"/results-noblocking.json
544 | }
545 | 
546 | main() {
547 |   # cli args
548 |   local resume_run=false
549 | 
550 |   # settings.ini settings with default values
551 |   local browser=chrome
552 |   local do_image=ubuntu-24-04-x64
553 |   local do_region=nyc2
554 |   local do_size=s-1vcpu-1gb
555 |   local do_ssh_key=
556 |   local droplet_name_prefix=badger-sett-scanner-
557 |   local pb_branch=master
558 |   local num_crawlers num_sites exclude_suffixes sitelist
559 |   local bs_repo_dir pb_repo_dir
560 | 
561 |   # loop vars and misc.
562 |   local domains_chunk droplet
563 |   local results_folder
564 | 
565 |   parse_args "$@"
566 | 
567 |   if [ "$resume_run" = true ]; then
568 |     results_folder=$(cat output/.run_in_progress)
569 |     echo "Resuming run in $results_folder"
570 |     parse_config "$results_folder"/run_settings.ini
571 |   else
572 |     parse_config
573 | 
574 |     # confirm before starting
575 |     confirm_run
576 | 
577 |     # validate here and not in parse_config because
578 |     # we don't care about $sitelist when resuming a run
579 |     if [ -n "$sitelist" ] && [ ! -f "$sitelist" ]; then
580 |       err "Custom list file not found: $sitelist"
581 |       exit 1
582 |     fi
583 | 
584 |     results_folder="output/$(numfmt --to=si "$num_sites")${sitelist+"-CUSTOM_LIST"}-${browser}-${num_crawlers}-${do_size//-/_}-${do_region}-$(date +"%s")"
585 |     echo "Creating $results_folder"
586 |     mkdir -p "$results_folder"
587 | 
588 |     # save run params
589 |     cp settings.ini "$results_folder"/run_settings.ini
590 |     sed -i.bak 's/^do_ssh_key=.\+$/do_ssh_key=[REDACTED]/' "$results_folder"/run_settings.ini && rm "$results_folder"/run_settings.ini.bak
591 | 
592 |     if ! init_sitelists; then
593 |       echo "Failed generating the site list ... Check bs_repo_dir config value and/or enable the Python virtual environment for Badger Sett, and try again"
594 |       exit 1
595 |     fi
596 | 
597 |     # create droplets and initiate scans
598 |     for domains_chunk in "$results_folder"/sitelist.split.*; do
599 |       [ -f "$domains_chunk" ] || continue
600 |       {
601 |         droplet="${droplet_name_prefix}${domains_chunk##*.}"
602 |         if create_droplet "$droplet"; then
603 |           init_scan "$droplet" "$domains_chunk" "$exclude_suffixes"
604 |         else
605 |           err "Failed to create $droplet"
606 |           mv "$domains_chunk" "$results_folder"/NO_DROPLET."${domains_chunk##*.}"
607 |         fi
608 |       } &
609 |     done
610 | 
611 |     wait
612 |     echo "$results_folder" > output/.run_in_progress
613 |     echo "This run is now resumable (using the -r flag)"
614 |   fi
615 | 
616 |   # periodically poll for status, print progress, and clean up when finished
617 |   manage_scans
618 | 
619 |   echo "All scans finished"
620 |   rm output/.run_in_progress
621 | 
622 |   echo "Merging results ..."
623 |   merge_results || echo "Failed merging results ... Check bs_repo_dir and pb_repo_dir config values and/or enable the Python virtual environment for Badger Sett"
624 | 
625 |   # TODO summarize error rates (warn about outliers?), restarts, retries (stalls)
626 | 
627 |   if doctl compute droplet list --format Name | grep -q "$droplet_name_prefix"; then
628 |     sleep 10
629 |     if doctl compute droplet list --format Name | grep -q "$droplet_name_prefix"; then
630 |       err "WARNING: Not all Droplets deleted?"
631 |       err "Check with 'doctl compute droplet list --format ID,Name,PublicIPv4,Status'"
632 |     fi
633 |   fi
634 | 
635 |   echo "All done"
636 | }
637 | 
638 | main "$@"
639 | 


--------------------------------------------------------------------------------