├── .hgignore
├── README.md
├── build.py
├── chrome
│   └── locale
│       └── en-US
│           └── meta.properties
├── dependencies
├── ensure_dependencies.py
├── icon.png
├── icon64.png
├── lib
│   ├── child
│   │   └── frameScript.js
│   ├── commandLine.js
│   ├── crawler.js
│   └── main.js
├── metadata.gecko
└── run.py

/.hgignore:
--------------------------------------------------------------------------------
1 | syntax: glob
2 | 
3 | *.xpi
4 | *.zip
5 | *.pyc
6 | *.sh
7 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | abpcrawler
2 | ==========
3 | 
4 | Firefox extension that loads a range of websites and records which
5 | elements are filtered by [Adblock Plus](http://adblockplus.org).
6 | 
7 | Requirements
8 | ------------
9 | 
10 | * [Mercurial](https://www.mercurial-scm.org/) or [Git](https://git-scm.com/) (whichever you used to clone this repository)
11 | * [Python 2.x](https://www.python.org)
12 | * [The Jinja2 module](http://jinja.pocoo.org/docs)
13 | * [The mozrunner module](https://pypi.python.org/pypi/mozrunner)
14 | 
15 | Running
16 | -------
17 | 
18 | Execute the following:
19 | 
20 |     ./run.py -b /usr/bin/firefox urls.txt outputdir
21 | 
22 | This will run the specified Firefox binary to crawl the URLs from `urls.txt`
23 | (one URL per line). The resulting data and screenshots will be written to the
24 | `outputdir` directory. Firefox will close automatically once all URLs have been
25 | processed.
26 | 
27 | Optionally, you can pass the path to a local Adblock Plus repository with `-a`;
28 | Adblock Plus is then built from that repository instead of being downloaded.
29 | 
30 | License
31 | -------
32 | 
33 | This Source Code is subject to the terms of the Mozilla Public License
34 | version 2.0 (the "License"). You can obtain a copy of the License at
35 | http://mozilla.org/MPL/2.0/.
36 | 
--------------------------------------------------------------------------------
/build.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | 
4 | import os
5 | import sys
6 | import subprocess
7 | 
8 | BASE_DIR = os.path.dirname(os.path.abspath(__file__))
9 | DEPENDENCY_SCRIPT = os.path.join(BASE_DIR, "ensure_dependencies.py")
10 | 
11 | try:
12 |   subprocess.check_call([sys.executable, DEPENDENCY_SCRIPT, BASE_DIR])
13 | except subprocess.CalledProcessError as e:
14 |   print >>sys.stderr, e
15 |   print >>sys.stderr, "Failed to ensure dependencies being up-to-date!"
16 | 
17 | import buildtools.build
18 | buildtools.build.processArgs(BASE_DIR, sys.argv)
19 | 
--------------------------------------------------------------------------------
/chrome/locale/en-US/meta.properties:
--------------------------------------------------------------------------------
1 | # Translators of this locale, separated by commas if there are multiple
2 | translator=Felix H. Dahlke
3 | # Extension title, usually it shouldn't be translated
4 | name=Adblock Plus Crawler
5 | # Extension description, to be displayed in the add-on manager
6 | description=Collects the elements blocked by a specified filter.
7 | -------------------------------------------------------------------------------- /dependencies: -------------------------------------------------------------------------------- 1 | _root = hg:https://hg.adblockplus.org/ git:https://github.com/adblockplus/ 2 | _self = buildtools/ensure_dependencies.py 3 | buildtools = buildtools hg:595808987fd9 git:5f8a4c2e86e11eebca8e4773e03e11a7ee1ba1bd 4 | -------------------------------------------------------------------------------- /ensure_dependencies.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This Source Code Form is subject to the terms of the Mozilla Public 4 | # License, v. 2.0. If a copy of the MPL was not distributed with this 5 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 6 | 7 | import sys 8 | import os 9 | import posixpath 10 | import re 11 | import io 12 | import errno 13 | import logging 14 | import subprocess 15 | import urlparse 16 | import argparse 17 | 18 | from collections import OrderedDict 19 | from ConfigParser import RawConfigParser 20 | 21 | USAGE = ''' 22 | A dependencies file should look like this: 23 | 24 | # VCS-specific root URLs for the repositories 25 | _root = hg:https://hg.adblockplus.org/ git:https://github.com/adblockplus/ 26 | # File to update this script from (optional) 27 | _self = buildtools/ensure_dependencies.py 28 | # Clone elemhidehelper repository into extensions/elemhidehelper directory at 29 | # tag "1.2". 30 | extensions/elemhidehelper = elemhidehelper 1.2 31 | # Clone buildtools repository into buildtools directory at VCS-specific 32 | # revision IDs. 33 | buildtools = buildtools hg:016d16f7137b git:f3f8692f82e5 34 | # Clone the adblockplus repository into adblockplus directory, overwriting the 35 | # usual source URL for Git repository and specifying VCS specific revision IDs. 36 | adblockplus = adblockplus hg:893426c6a6ab git:git@github.com:user/adblockplus.git@b2ffd52b 37 | # Clone the adblockpluschrome repository into the adblockpluschrome directory, 38 | # from a specific Git repository, specifying the revision ID. 39 | adblockpluschrome = git:git@github.com:user/adblockpluschrome.git@1fad3a7 40 | ''' 41 | 42 | SKIP_DEPENDENCY_UPDATES = os.environ.get( 43 | 'SKIP_DEPENDENCY_UPDATES', '' 44 | ).lower() not in ('', '0', 'false') 45 | 46 | 47 | class Mercurial(): 48 | def istype(self, repodir): 49 | return os.path.exists(os.path.join(repodir, '.hg')) 50 | 51 | def clone(self, source, target): 52 | if not source.endswith('/'): 53 | source += '/' 54 | subprocess.check_call(['hg', 'clone', '--quiet', '--noupdate', source, target]) 55 | 56 | def get_revision_id(self, repo, rev=None): 57 | command = ['hg', 'id', '--repository', repo, '--id'] 58 | if rev: 59 | command.extend(['--rev', rev]) 60 | 61 | # Ignore stderr output and return code here: if revision lookup failed we 62 | # should simply return an empty string. 
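    # update_repo() below relies on this behaviour: an empty string is treated as
    # "revision unknown", which makes it pull from the remote and retry the lookup.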
63 | result = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()[0] 64 | return result.strip() 65 | 66 | def pull(self, repo): 67 | subprocess.check_call(['hg', 'pull', '--repository', repo, '--quiet']) 68 | 69 | def update(self, repo, rev, revname): 70 | subprocess.check_call(['hg', 'update', '--repository', repo, '--quiet', '--check', '--rev', rev]) 71 | 72 | def ignore(self, target, repo): 73 | 74 | if not self.istype(target): 75 | 76 | config_path = os.path.join(repo, '.hg', 'hgrc') 77 | ignore_path = os.path.abspath(os.path.join(repo, '.hg', 'dependencies')) 78 | 79 | config = RawConfigParser() 80 | config.read(config_path) 81 | 82 | if not config.has_section('ui'): 83 | config.add_section('ui') 84 | 85 | config.set('ui', 'ignore.dependencies', ignore_path) 86 | with open(config_path, 'w') as stream: 87 | config.write(stream) 88 | 89 | module = os.path.relpath(target, repo) 90 | _ensure_line_exists(ignore_path, module) 91 | 92 | def postprocess_url(self, url): 93 | return url 94 | 95 | 96 | class Git(): 97 | def istype(self, repodir): 98 | return os.path.exists(os.path.join(repodir, '.git')) 99 | 100 | def clone(self, source, target): 101 | source = source.rstrip('/') 102 | if not source.endswith('.git'): 103 | source += '.git' 104 | subprocess.check_call(['git', 'clone', '--quiet', source, target]) 105 | 106 | def get_revision_id(self, repo, rev='HEAD'): 107 | command = ['git', 'rev-parse', '--revs-only', rev + '^{commit}'] 108 | return subprocess.check_output(command, cwd=repo).strip() 109 | 110 | def pull(self, repo): 111 | # Fetch tracked branches, new tags and the list of available remote branches 112 | subprocess.check_call(['git', 'fetch', '--quiet', '--all', '--tags'], cwd=repo) 113 | # Next we need to ensure all remote branches are tracked 114 | newly_tracked = False 115 | remotes = subprocess.check_output(['git', 'branch', '--remotes'], cwd=repo) 116 | for match in re.finditer(r'^\s*(origin/(\S+))$', remotes, re.M): 117 | remote, local = match.groups() 118 | with open(os.devnull, 'wb') as devnull: 119 | if subprocess.call(['git', 'branch', '--track', local, remote], 120 | cwd=repo, stdout=devnull, stderr=devnull) == 0: 121 | newly_tracked = True 122 | # Finally fetch any newly tracked remote branches 123 | if newly_tracked: 124 | subprocess.check_call(['git', 'fetch', '--quiet', 'origin'], cwd=repo) 125 | 126 | def update(self, repo, rev, revname): 127 | subprocess.check_call(['git', 'checkout', '--quiet', revname], cwd=repo) 128 | 129 | def ignore(self, target, repo): 130 | module = os.path.sep + os.path.relpath(target, repo) 131 | exclude_file = os.path.join(repo, '.git', 'info', 'exclude') 132 | _ensure_line_exists(exclude_file, module) 133 | 134 | def postprocess_url(self, url): 135 | # Handle alternative syntax of SSH URLS 136 | if '@' in url and ':' in url and not urlparse.urlsplit(url).scheme: 137 | return 'ssh://' + url.replace(':', '/', 1) 138 | return url 139 | 140 | repo_types = OrderedDict(( 141 | ('hg', Mercurial()), 142 | ('git', Git()), 143 | )) 144 | 145 | # [vcs:]value 146 | item_regexp = re.compile( 147 | '^(?:(' + '|'.join(map(re.escape, repo_types.keys())) + '):)?' 148 | '(.+)$' 149 | ) 150 | 151 | # [url@]rev 152 | source_regexp = re.compile( 153 | '^(?:(.*)@)?' 
154 | '(.+)$' 155 | ) 156 | 157 | 158 | def merge_seqs(seq1, seq2): 159 | """Return a list of any truthy values from the suplied sequences 160 | 161 | (None, 2), (1,) => [1, 2] 162 | None, (1, 2) => [1, 2] 163 | (1, 2), (3, 4) => [3, 4] 164 | """ 165 | return map(lambda item1, item2: item2 or item1, seq1 or (), seq2 or ()) 166 | 167 | 168 | def parse_spec(path, line): 169 | if '=' not in line: 170 | logging.warning('Invalid line in file %s: %s' % (path, line)) 171 | return None, None 172 | 173 | key, value = line.split('=', 1) 174 | key = key.strip() 175 | items = value.split() 176 | if not len(items): 177 | logging.warning('No value specified for key %s in file %s' % (key, path)) 178 | return key, None 179 | 180 | result = OrderedDict() 181 | is_dependency_field = not key.startswith('_') 182 | 183 | for i, item in enumerate(items): 184 | try: 185 | vcs, value = re.search(item_regexp, item).groups() 186 | vcs = vcs or '*' 187 | if is_dependency_field: 188 | if i == 0 and vcs == '*': 189 | # In order to be backwards compatible we have to assume that the first 190 | # source contains only a URL/path for the repo if it does not contain 191 | # the VCS part 192 | url_rev = (value, None) 193 | else: 194 | url_rev = re.search(source_regexp, value).groups() 195 | result[vcs] = merge_seqs(result.get(vcs), url_rev) 196 | else: 197 | if vcs in result: 198 | logging.warning('Ignoring duplicate value for type %r ' 199 | '(key %r in file %r)' % (vcs, key, path)) 200 | result[vcs] = value 201 | except AttributeError: 202 | logging.warning('Ignoring invalid item %r for type %r ' 203 | '(key %r in file %r)' % (item, vcs, key, path)) 204 | continue 205 | return key, result 206 | 207 | 208 | def read_deps(repodir): 209 | result = {} 210 | deps_path = os.path.join(repodir, 'dependencies') 211 | try: 212 | with io.open(deps_path, 'rt', encoding='utf-8') as handle: 213 | for line in handle: 214 | # Remove comments and whitespace 215 | line = re.sub(r'#.*', '', line).strip() 216 | if not line: 217 | continue 218 | 219 | key, spec = parse_spec(deps_path, line) 220 | if spec: 221 | result[key] = spec 222 | return result 223 | except IOError as e: 224 | if e.errno != errno.ENOENT: 225 | raise 226 | return None 227 | 228 | 229 | def safe_join(path, subpath): 230 | # This has been inspired by Flask's safe_join() function 231 | forbidden = {os.sep, os.altsep} - {posixpath.sep, None} 232 | if any(sep in subpath for sep in forbidden): 233 | raise Exception('Illegal directory separator in dependency path %s' % subpath) 234 | 235 | normpath = posixpath.normpath(subpath) 236 | if posixpath.isabs(normpath): 237 | raise Exception('Dependency path %s cannot be absolute' % subpath) 238 | if normpath == posixpath.pardir or normpath.startswith(posixpath.pardir + posixpath.sep): 239 | raise Exception('Dependency path %s has to be inside the repository' % subpath) 240 | return os.path.join(path, *normpath.split(posixpath.sep)) 241 | 242 | 243 | def get_repo_type(repo): 244 | for name, repotype in repo_types.iteritems(): 245 | if repotype.istype(repo): 246 | return name 247 | return 'hg' 248 | 249 | 250 | def ensure_repo(parentrepo, parenttype, target, type, root, sourcename): 251 | if os.path.exists(target): 252 | return 253 | 254 | if SKIP_DEPENDENCY_UPDATES: 255 | logging.warning('SKIP_DEPENDENCY_UPDATES environment variable set, ' 256 | '%s not cloned', target) 257 | return 258 | 259 | postprocess_url = repo_types[type].postprocess_url 260 | root = postprocess_url(root) 261 | sourcename = postprocess_url(sourcename) 262 | 263 
| if os.path.exists(root): 264 | url = os.path.join(root, sourcename) 265 | else: 266 | url = urlparse.urljoin(root, sourcename) 267 | 268 | logging.info('Cloning repository %s into %s' % (url, target)) 269 | repo_types[type].clone(url, target) 270 | repo_types[parenttype].ignore(target, parentrepo) 271 | 272 | 273 | def update_repo(target, type, revision): 274 | resolved_revision = repo_types[type].get_revision_id(target, revision) 275 | current_revision = repo_types[type].get_revision_id(target) 276 | 277 | if resolved_revision != current_revision: 278 | if SKIP_DEPENDENCY_UPDATES: 279 | logging.warning('SKIP_DEPENDENCY_UPDATES environment variable set, ' 280 | '%s not checked out to %s', target, revision) 281 | return 282 | 283 | if not resolved_revision: 284 | logging.info('Revision %s is unknown, downloading remote changes' % revision) 285 | repo_types[type].pull(target) 286 | resolved_revision = repo_types[type].get_revision_id(target, revision) 287 | if not resolved_revision: 288 | raise Exception('Failed to resolve revision %s' % revision) 289 | 290 | logging.info('Updating repository %s to revision %s' % (target, resolved_revision)) 291 | repo_types[type].update(target, resolved_revision, revision) 292 | 293 | 294 | def resolve_deps(repodir, level=0, self_update=True, overrideroots=None, skipdependencies=set()): 295 | config = read_deps(repodir) 296 | if config is None: 297 | if level == 0: 298 | logging.warning('No dependencies file in directory %s, nothing to do...\n%s' % (repodir, USAGE)) 299 | return 300 | if level >= 10: 301 | logging.warning('Too much subrepository nesting, ignoring %s' % repo) 302 | return 303 | 304 | if overrideroots is not None: 305 | config['_root'] = overrideroots 306 | 307 | for dir, sources in config.iteritems(): 308 | if (dir.startswith('_') or 309 | skipdependencies.intersection([s[0] for s in sources if s[0]])): 310 | continue 311 | 312 | target = safe_join(repodir, dir) 313 | parenttype = get_repo_type(repodir) 314 | _root = config.get('_root', {}) 315 | 316 | for key in sources.keys() + _root.keys(): 317 | if key == parenttype or key is None and vcs != '*': 318 | vcs = key 319 | source, rev = merge_seqs(sources.get('*'), sources.get(vcs)) 320 | 321 | if not (vcs and source and rev): 322 | logging.warning('No valid source / revision found to create %s' % target) 323 | continue 324 | 325 | ensure_repo(repodir, parenttype, target, vcs, _root.get(vcs, ''), source) 326 | update_repo(target, vcs, rev) 327 | resolve_deps(target, level + 1, self_update=False, 328 | overrideroots=overrideroots, skipdependencies=skipdependencies) 329 | 330 | if self_update and '_self' in config and '*' in config['_self']: 331 | source = safe_join(repodir, config['_self']['*']) 332 | try: 333 | with io.open(source, 'rb') as handle: 334 | sourcedata = handle.read() 335 | except IOError as e: 336 | if e.errno != errno.ENOENT: 337 | raise 338 | logging.warning("File %s doesn't exist, skipping self-update" % source) 339 | return 340 | 341 | target = __file__ 342 | with io.open(target, 'rb') as handle: 343 | targetdata = handle.read() 344 | 345 | if sourcedata != targetdata: 346 | logging.info("Updating %s from %s, don't forget to commit" % (target, source)) 347 | with io.open(target, 'wb') as handle: 348 | handle.write(sourcedata) 349 | if __name__ == '__main__': 350 | logging.info('Restarting %s' % target) 351 | os.execv(sys.executable, [sys.executable, target] + sys.argv[1:]) 352 | else: 353 | logging.warning('Cannot restart %s automatically, please rerun' % target) 354 | 355 
| 356 | def _ensure_line_exists(path, pattern): 357 | with open(path, 'a+') as f: 358 | file_content = [l.strip() for l in f.readlines()] 359 | if not pattern in file_content: 360 | file_content.append(pattern) 361 | f.seek(0, os.SEEK_SET) 362 | f.truncate() 363 | for l in file_content: 364 | print >>f, l 365 | 366 | if __name__ == '__main__': 367 | logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO) 368 | 369 | parser = argparse.ArgumentParser(description='Verify dependencies for a set of repositories, by default the repository of this script.') 370 | parser.add_argument('repos', metavar='repository', type=str, nargs='*', help='Repository path') 371 | parser.add_argument('-q', '--quiet', action='store_true', help='Suppress informational output') 372 | args = parser.parse_args() 373 | 374 | if args.quiet: 375 | logging.disable(logging.INFO) 376 | 377 | repos = args.repos 378 | if not len(repos): 379 | repos = [os.path.dirname(__file__)] 380 | for repo in repos: 381 | resolve_deps(repo) 382 | -------------------------------------------------------------------------------- /icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adblockplus/abpcrawler/02b38185f064b14b8a86cf551c086c9b84d67721/icon.png -------------------------------------------------------------------------------- /icon64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adblockplus/abpcrawler/02b38185f064b14b8a86cf551c086c9b84d67721/icon64.png -------------------------------------------------------------------------------- /lib/child/frameScript.js: -------------------------------------------------------------------------------- 1 | /* 2 | * This Source Code is subject to the terms of the Mozilla Public License 3 | * version 2.0 (the "License"). You can obtain a copy of the License at 4 | * http://mozilla.org/MPL/2.0/. 5 | */ 6 | 7 | "use strict"; 8 | 9 | const {classes: Cc, interfaces: Ci, utils: Cu, results: Cr} = Components; 10 | 11 | /** 12 | * @param e exception 13 | */ 14 | function reportException(e) 15 | { 16 | let stack = ""; 17 | if (e && typeof e == "object" && "stack" in e) 18 | stack = e.stack + "\n"; 19 | 20 | Cu.reportError(e); 21 | dump(e + "\n" + stack + "\n"); 22 | } 23 | 24 | const {XPCOMUtils} = Cu.import("resource://gre/modules/XPCOMUtils.jsm", {}); 25 | 26 | /** 27 | * Progress listener capturing the data of the current page and calling 28 | * onPageLoaded(data) when loading is finished, where data contains 29 | * HTTP status and headers. 30 | * 31 | * @type nsIWebProgressListener 32 | */ 33 | let webProgressListener = 34 | { 35 | onStateChange: function(webProgress, request, flags, status) 36 | { 37 | if (webProgress.DOMWindow == content && 38 | (flags & Ci.nsIWebProgressListener.STATE_STOP)) 39 | { 40 | // First time we receive STATE_STOP for about:blank and the second time 41 | // for our interested URL which is distinct from about:blank. 42 | // However we should not process about:blank because it can happen that 43 | // the message with information about about:blank is delivered when the 44 | // code in crawler.js is already waiting for a message from this tab. 45 | // Another case we are not interested in is about:newtab. 
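      // Checking the protocol covers both cases, since about:blank and about:newtab
      // (and any other about:* page) share the "about:" scheme.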
46 | if (content.location.protocol == "about:") 47 | return; 48 | let pageInfo = {channelStatus: status}; 49 | if (request instanceof Ci.nsIHttpChannel) 50 | { 51 | try 52 | { 53 | pageInfo.headers = []; 54 | pageInfo.headers.push("HTTP/x.x " + request.responseStatus + " " + request.responseStatusText); 55 | request.visitResponseHeaders((header, value) => pageInfo.headers.push(header + ": " + value)); 56 | } 57 | catch (e) 58 | { 59 | reportException(e); 60 | } 61 | } 62 | onPageLoaded(pageInfo); 63 | } 64 | }, 65 | 66 | onLocationChange: function() {}, 67 | onProgressChange: function() {}, 68 | onStatusChange: function() {}, 69 | onSecurityChange: function() {}, 70 | 71 | QueryInterface: XPCOMUtils.generateQI([Ci.nsIWebProgressListener, Ci.nsISupportsWeakReference]) 72 | }; 73 | 74 | function onPageLoaded(pageInfo) 75 | { 76 | Object.assign(pageInfo, gatherPageInfo(content)); 77 | sendAsyncMessage("abpcrawler:pageInfoGathered", pageInfo); 78 | }; 79 | 80 | let webProgress = docShell.QueryInterface(Ci.nsIInterfaceRequestor).getInterface(Ci.nsIWebProgress); 81 | webProgress.addProgressListener(webProgressListener, Ci.nsIWebProgress.NOTIFY_STATE_WINDOW); 82 | 83 | /** 84 | * Gathers information about a DOM window. 85 | * Currently 86 | * - creates a screenshot of the page 87 | * - serializes the page source code 88 | * @param {nsIDOMWindow} wnd window to process 89 | * @return {Object} the object containing "screenshot" and "source" properties. 90 | */ 91 | function gatherPageInfo(wnd) 92 | { 93 | let document = wnd.document; 94 | let result = {errors:[]}; 95 | if (!document.documentElement) 96 | { 97 | result.errors.push("No document.documentElement"); 98 | return result; 99 | } 100 | 101 | try 102 | { 103 | let canvas = document.createElementNS("http://www.w3.org/1999/xhtml", "canvas"); 104 | canvas.width = document.documentElement.scrollWidth; 105 | canvas.height = document.documentElement.scrollHeight; 106 | let context = canvas.getContext("2d"); 107 | context.drawWindow(wnd, 0, 0, canvas.width, canvas.height, "rgb(255, 255, 255)"); 108 | result.screenshot = canvas.toDataURL("image/jpeg", 0.8); 109 | } 110 | catch (e) 111 | { 112 | reportException(e); 113 | result.errors.push("Cannot make page screenshot"); 114 | } 115 | 116 | try 117 | { 118 | // TODO: Capture frames as well? 119 | let serializer = new wnd.XMLSerializer(); 120 | result.source = serializer.serializeToString(document.documentElement); 121 | } 122 | catch(e) 123 | { 124 | reportException(e); 125 | result.errors.push("Cannot obtain page source code"); 126 | } 127 | 128 | return result; 129 | } 130 | -------------------------------------------------------------------------------- /lib/commandLine.js: -------------------------------------------------------------------------------- 1 | /* 2 | * This Source Code is subject to the terms of the Mozilla Public License 3 | * version 2.0 (the "License"). You can obtain a copy of the License at 4 | * http://mozilla.org/MPL/2.0/. 5 | */ 6 | 7 | "use strict"; 8 | 9 | /** 10 | * @module commandLine 11 | */ 12 | 13 | const {XPCOMUtils} = Cu.import("resource://gre/modules/XPCOMUtils.jsm", {}); 14 | 15 | let CommandLineHandler = 16 | { 17 | // Starting the entry with "k" makes it have slightly higher priority than default command line handlers. 
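  // run.py starts Firefox with "--crawler-port <port>"; handle() below picks up
  // that flag and passes the port on to main.startup().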
18 | classDescription: "k-abpcrawler", 19 | contractID: "@adblockplus.org/abpcrawler/cmdline;1", 20 | classID: Components.ID("{973636c2-e842-11e4-b02c-1681e6b88ec1}"), 21 | xpcom_categories: ["command-line-handler"], 22 | 23 | init: function() 24 | { 25 | let registrar = Components.manager.QueryInterface(Ci.nsIComponentRegistrar); 26 | registrar.registerFactory(this.classID, this.classDescription, this.contractID, this); 27 | 28 | let catMan = Cc["@mozilla.org/categorymanager;1"].getService(Ci.nsICategoryManager); 29 | for each (let category in this.xpcom_categories) 30 | catMan.addCategoryEntry(category, this.classDescription, this.contractID, false, true); 31 | 32 | onShutdown.add((function() 33 | { 34 | for each (let category in this.xpcom_categories) 35 | catMan.deleteCategoryEntry(category, this.classDescription, false); 36 | 37 | registrar.unregisterFactory(this.classID, this); 38 | }).bind(this)); 39 | }, 40 | 41 | createInstance: function(outer, iid) 42 | { 43 | if (outer) 44 | throw Cr.NS_ERROR_NO_AGGREGATION; 45 | return this.QueryInterface(iid); 46 | }, 47 | 48 | helpInfo: " -crawler-port Port that ABP Crawler should communicate to\n", 49 | 50 | handle: function(cmdline) 51 | { 52 | let port = cmdline.handleFlagWithParam("crawler-port", false); 53 | if (port != null) 54 | require("main").startup(parseInt(port)); 55 | }, 56 | 57 | QueryInterface: XPCOMUtils.generateQI([Ci.nsICommandLineHandler, Ci.nsIFactory]) 58 | }; 59 | 60 | CommandLineHandler.init(); 61 | -------------------------------------------------------------------------------- /lib/crawler.js: -------------------------------------------------------------------------------- 1 | /* 2 | * This Source Code is subject to the terms of the Mozilla Public License 3 | * version 2.0 (the "License"). You can obtain a copy of the License at 4 | * http://mozilla.org/MPL/2.0/. 5 | */ 6 | 7 | "use strict"; 8 | 9 | /** 10 | * @module crawler 11 | */ 12 | 13 | const {Services} = Cu.import("resource://gre/modules/Services.jsm", {}); 14 | const {XPCOMUtils} = Cu.import("resource://gre/modules/XPCOMUtils.jsm", {}); 15 | const {Task} = Cu.import("resource://gre/modules/Task.jsm", {}); 16 | const {setTimeout, clearTimeout} = Cu.import("resource://gre/modules/Timer.jsm", {}); 17 | 18 | function abprequire(module) 19 | { 20 | let result = {}; 21 | result.wrappedJSObject = result; 22 | Services.obs.notifyObservers(result, "adblockplus-require", module); 23 | return result.exports; 24 | } 25 | 26 | let {RequestNotifier} = abprequire("requestNotifier"); 27 | let {FilterNotifier} = abprequire("filterNotifier"); 28 | let {FilterStorage} = abprequire("filterStorage"); 29 | 30 | /** 31 | * Allocates tabs on request but not more than maxtabs at the same time. 32 | * 33 | * @param {tabbrowser} browser 34 | * The tabbed browser where tabs should be created 35 | * @param {int} maxtabs 36 | * The maximum number of tabs to be allocated 37 | * @constructor 38 | */ 39 | function TabAllocator(browser, maxtabs) 40 | { 41 | this._browser = browser; 42 | this._tabs = 0; 43 | this._maxtabs = maxtabs; 44 | // The queue containing resolve functions of promises waiting for a tab. 45 | this._resolvers = []; 46 | // Keep at least one tab alive to prevent browser from closing itself. 
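  // The placeholder tab is removed again in _createTab() as soon as the first real tab is ready.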
47 | this._tabKeepingWindowAlive = this._browser.tabs[0]; 48 | this._browser.removeAllTabsBut(this._tabKeepingWindowAlive); 49 | } 50 | TabAllocator.prototype = { 51 | _removeTabKeepingWindowAlive: function() 52 | { 53 | if (!this._tabKeepingWindowAlive) 54 | return; 55 | this._browser.removeTab(this._tabKeepingWindowAlive); 56 | delete this._tabKeepingWindowAlive; 57 | }, 58 | 59 | /** 60 | * Creates a blank tab in this._browser. 61 | * 62 | * @return {Promise.} promise which resolves once the tab is fully initialized. 63 | */ 64 | _createTab: function() 65 | { 66 | this._tabs++; 67 | let tab = this._browser.addTab("about:blank"); 68 | if (tab.linkedBrowser.outerWindowID) 69 | { 70 | this._removeTabKeepingWindowAlive(); 71 | return Promise.resolve(tab); 72 | } 73 | return new Promise((resolve, reject) => 74 | { 75 | let onBrowserInit = (msg) => 76 | { 77 | tab.linkedBrowser.messageManager.removeMessageListener("Browser:Init", onBrowserInit); 78 | this._removeTabKeepingWindowAlive(); 79 | resolve(tab); 80 | }; 81 | // "Browser:Init" message is sent once the browser is ready, see 82 | // https://bugzil.la/1256602#c1 83 | tab.linkedBrowser.messageManager.addMessageListener("Browser:Init", onBrowserInit); 84 | }); 85 | }, 86 | 87 | /** 88 | * Returns a promise that will resolve into a tab once a tab is allocated. 89 | * The tab cannot be used by other tasks until releaseTab() is called. 90 | * 91 | * @result {Promise.} 92 | */ 93 | getTab: function() 94 | { 95 | if (this._tabs < this._maxtabs) 96 | return this._createTab(); 97 | return new Promise((resolve, reject) => this._resolvers.push(resolve)); 98 | }, 99 | 100 | /** 101 | * Adds a tab back to the pool so that it can be used by other tasks. 102 | * 103 | * @param {tab} tab 104 | */ 105 | releaseTab: function(tab) 106 | { 107 | // If we are about to close last tab don't close it immediately to keep 108 | // the window alive. It will be closed when a new tab is created. 109 | if (this._tabs > 1) 110 | this._browser.removeTab(tab); 111 | else 112 | { 113 | // navigate away from previously opened URL 114 | tab.linkedBrowser.loadURI("about:blank", null, null); 115 | this._tabKeepingWindowAlive = tab; 116 | } 117 | 118 | this._tabs--; 119 | if (this._resolvers.length && this._tabs < this._maxtabs) 120 | { 121 | this._resolvers.shift()(this._createTab()); 122 | } 123 | }, 124 | }; 125 | 126 | /** 127 | * Once created, this object will make sure all new windows are dismissed 128 | * immediately. 129 | * 130 | * @constructor 131 | */ 132 | function WindowCloser() 133 | { 134 | Services.obs.addObserver(this, "xul-window-registered", true) 135 | } 136 | WindowCloser.prototype = { 137 | /** 138 | * Deactivates this object. 
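   * (crawl_urls() calls this once all URLs have been processed.)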
139 | */ 140 | stop: function() 141 | { 142 | Services.obs.removeObserver(this, "xul-window-registered") 143 | }, 144 | 145 | observe: function(subject, topic, data) 146 | { 147 | let window = subject.QueryInterface(Ci.nsIInterfaceRequestor) 148 | .getInterface(Ci.nsIDOMWindow) 149 | window.addEventListener("load", function() 150 | { 151 | if (window.document.documentElement.localName == 'dialog') 152 | window.document.documentElement.acceptDialog(); 153 | else 154 | window.close(); 155 | }, false); 156 | }, 157 | 158 | QueryInterface: XPCOMUtils.generateQI([Ci.nsIObserver, Ci.nsISupportsWeakReference]) 159 | }; 160 | 161 | function configureFrameScript() 162 | { 163 | const info = require("info"); 164 | let frameScriptPath = info.addonRoot + "/lib/child/frameScript.js"; 165 | Services.mm.loadFrameScript(frameScriptPath, true); 166 | 167 | onShutdown.add(() => 168 | { 169 | Services.mm.removeDelayedFrameScript(frameScriptPath); 170 | }); 171 | } 172 | 173 | /** 174 | * Starts the crawling session. The crawler opens each URL in a tab and stores 175 | * the results. 176 | * 177 | * @param {Window} window 178 | * The browser window we're operating in 179 | * @param {String[]} urls 180 | * URLs to be crawled 181 | * @param {int} timeout 182 | * Load timeout in milliseconds 183 | * @param {int} maxtabs 184 | * Maximum number of tabs to be opened 185 | * @param {String} targetURL 186 | * URL that should receive the results 187 | * @param {Function} onDone 188 | * The callback which is called after finishing of crawling of all URLs. 189 | */ 190 | function run(window, urls, timeout, maxtabs, targetURL, onDone) 191 | { 192 | configureFrameScript(); 193 | new Promise((resolve, reject) => 194 | { 195 | if (FilterStorage.subscriptions.length > 0) 196 | { 197 | resolve(); 198 | return; 199 | } 200 | let onFiltersLoaded = (action, item, newValue, oldValue) => 201 | { 202 | if (action == "load") 203 | { 204 | FilterNotifier.removeListener(onFiltersLoaded); 205 | resolve(); 206 | } 207 | }; 208 | FilterNotifier.addListener(onFiltersLoaded); 209 | }).then(() => crawl_urls(window, urls, timeout, maxtabs, targetURL, onDone)) 210 | .catch(reportException); 211 | } 212 | exports.run = run; 213 | 214 | /** 215 | * Spawns a {Task} task to crawl each url from urls argument and calls 216 | * onDone when all tasks are finished. 217 | * @param {Window} window 218 | * The browser window we're operating in 219 | * @param {String[]} urls 220 | * URLs to be crawled 221 | * @param {int} timeout 222 | * Load timeout in milliseconds 223 | * @param {int} maxtabs 224 | * Maximum number of tabs to be opened 225 | * @param {String} targetURL 226 | * URL that should receive the results 227 | * @param {Function} onDone 228 | * The callback which is called after finishing of all tasks. 
229 | */ 230 | function crawl_urls(window, urls, timeout, maxtabs, targetURL, onDone) 231 | { 232 | let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs); 233 | 234 | let running = 0; 235 | let windowCloser = new WindowCloser(); 236 | let taskDone = function() 237 | { 238 | running--; 239 | if (running <= 0) 240 | { 241 | windowCloser.stop(); 242 | onDone(); 243 | } 244 | }; 245 | 246 | for (let url of urls) 247 | { 248 | running++; 249 | Task.spawn(crawl_url.bind(null, url, tabAllocator, timeout)).then(function(result) 250 | { 251 | let request = new XMLHttpRequest(); 252 | request.open("POST", targetURL); 253 | request.addEventListener("load", taskDone, false); 254 | request.addEventListener("error", taskDone, false); 255 | request.send(JSON.stringify(result)); 256 | }, function(url, exception) 257 | { 258 | reportException(exception); 259 | 260 | let request = new XMLHttpRequest(); 261 | request.open("POST", targetURL); 262 | request.addEventListener("load", taskDone, false); 263 | request.addEventListener("error", taskDone, false); 264 | request.send(JSON.stringify({ 265 | url: url, 266 | startTime: Date.now(), 267 | error: String(exception) 268 | })); 269 | }.bind(null, url)); 270 | } 271 | } 272 | 273 | /** 274 | * Expects to receive page info gathered in a content process for the specified 275 | * `tab`. If there is no relevant message within specified `timeout` then 276 | * the result promise is resolved with error object. 277 | * @param tab 278 | * Tab in which we are interested in 279 | * @param {int} timeout 280 | * Timeout in milliseconds 281 | * @return {Promise} promise which will be resolved with the received page info 282 | */ 283 | function getPageInfo(tab, timeout) 284 | { 285 | return new Promise((resolve, result) => 286 | { 287 | let mm = tab.linkedBrowser.messageManager; 288 | let timerID; 289 | let onDone = (msg) => 290 | { 291 | mm.removeMessageListener("abpcrawler:pageInfoGathered", onDone); 292 | clearTimeout(timerID); 293 | resolve(msg.data); 294 | } 295 | mm.addMessageListener("abpcrawler:pageInfoGathered", onDone); 296 | timerID = setTimeout(() => onDone({data: {error: "timeout"}}), timeout); 297 | }); 298 | } 299 | 300 | /** 301 | * Crawls a URL. This is a generator meant to be used via a Task object. 
302 | * 303 | * @param {String} url 304 | * @param {TabAllocator} tabAllocator 305 | * @param {int} timeout 306 | * Load timeout in milliseconds 307 | * @result {Object} 308 | * Crawling result 309 | */ 310 | function* crawl_url(url, tabAllocator, timeout) 311 | { 312 | let tab = yield tabAllocator.getTab(); 313 | let result = {url, requests: []}; 314 | let requestNotifier; 315 | try 316 | { 317 | result.startTime = Date.now(); 318 | requestNotifier = new RequestNotifier(tab.linkedBrowser.outerWindowID, 319 | function(entry, scanComplete) 320 | { 321 | if (!entry) 322 | return; 323 | let {type: contentType, location, filter} = entry; 324 | result.requests.push({location, contentType, filter}); 325 | }); 326 | 327 | tab.linkedBrowser.loadURI(url, null, null); 328 | 329 | Object.assign(result, yield getPageInfo(tab, timeout)); 330 | result.finalUrl = tab.linkedBrowser.currentURI.spec; 331 | result.endTime = Date.now(); 332 | } 333 | finally 334 | { 335 | if (requestNotifier) 336 | requestNotifier.shutdown(); 337 | tabAllocator.releaseTab(tab); 338 | } 339 | return result; 340 | } 341 | 342 | function reportException(e) 343 | { 344 | let stack = ""; 345 | if (e && typeof e == "object" && "stack" in e) 346 | stack = e.stack + "\n"; 347 | 348 | Cu.reportError(e); 349 | dump(e + "\n" + stack + "\n"); 350 | } 351 | -------------------------------------------------------------------------------- /lib/main.js: -------------------------------------------------------------------------------- 1 | /* 2 | * This Source Code is subject to the terms of the Mozilla Public License 3 | * version 2.0 (the "License"). You can obtain a copy of the License at 4 | * http://mozilla.org/MPL/2.0/. 5 | */ 6 | 7 | "use strict"; 8 | 9 | /** 10 | * @module main 11 | */ 12 | 13 | const {Services} = Cu.import("resource://gre/modules/Services.jsm", {}); 14 | const {XPCOMUtils} = Cu.import("resource://gre/modules/XPCOMUtils.jsm", {}); 15 | 16 | require("commandLine"); 17 | let {run} = require("crawler"); 18 | 19 | let baseURL = null; 20 | 21 | /** 22 | * Waits for the application to initialize. 23 | * @type {Promise} 24 | */ 25 | let applicationReady = new Promise((resolve, reject) => 26 | { 27 | let observer = { 28 | observe: function(subject, topic, data) 29 | { 30 | Services.obs.removeObserver(this, "sessionstore-windows-restored"); 31 | resolve(); 32 | }, 33 | QueryInterface: XPCOMUtils.generateQI([Ci.nsIObserver, Ci.nsISupportsWeakReference]) 34 | }; 35 | Services.obs.addObserver(observer, "sessionstore-windows-restored", true); 36 | onShutdown.add(() => Services.obs.removeObserver(observer, "sessionstore-windows-restored")); 37 | }); 38 | 39 | /** 40 | * Startup function, called from command line handler. 41 | * 42 | * @param {int} port Port to communicate with 43 | */ 44 | function startup(port) 45 | { 46 | baseURL = "http://localhost:" + port + "/"; 47 | 48 | let request = new XMLHttpRequest(); 49 | request.open("GET", baseURL + "parameters"); 50 | request.addEventListener("load", onParametersLoaded, false); 51 | request.addEventListener("error", onParametersFailed, false); 52 | request.responseType = "json"; 53 | request.send(); 54 | } 55 | exports.startup = startup; 56 | 57 | /** 58 | * Called if parameters loaded succesfully. 
59 | * 60 | * @param {Event} event 61 | */ 62 | function onParametersLoaded(event) 63 | { 64 | let {urls, timeout, maxtabs} = event.target.response; 65 | 66 | applicationReady.then(function() 67 | { 68 | let window = Services.wm.getMostRecentWindow("navigator:browser"); 69 | run(window, urls, timeout, maxtabs, baseURL + "save", function() 70 | { 71 | Services.startup.quit(Services.startup.eAttemptQuit); 72 | }); 73 | }, function(exception) 74 | { 75 | Cu.reportError(exception); 76 | dump(exception + "\n") 77 | }); 78 | } 79 | 80 | /** 81 | * Called if requesting parameters failed. 82 | * 83 | * @param {Event} event 84 | */ 85 | function onParametersFailed(event) 86 | { 87 | Cu.reportError("Failed loading parameters"); 88 | } 89 | -------------------------------------------------------------------------------- /metadata.gecko: -------------------------------------------------------------------------------- 1 | [general] 2 | id=abpcrawler@adblockplus.org 3 | basename=abpcrawler 4 | version=0.1 5 | author=Felix H. Dahlke 6 | 7 | [compat] 8 | firefox=30.0/* 9 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import argparse 5 | import datetime 6 | import errno 7 | import hashlib 8 | import io 9 | import json 10 | import os 11 | import random 12 | import subprocess 13 | import sys 14 | import tempfile 15 | import threading 16 | import urllib 17 | import urlparse 18 | from wsgiref.simple_server import make_server 19 | 20 | from mozprofile import FirefoxProfile 21 | from mozrunner import FirefoxRunner 22 | 23 | 24 | class CrawlerApp: 25 | server = None 26 | 27 | def __init__(self, parameters): 28 | self.parameters = parameters 29 | with io.open(self.parameters.list, 'r', encoding='utf-8') as handle: 30 | self.urls = map(unicode.strip, handle.readlines()) 31 | 32 | def __call__(self, environ, start_response): 33 | path = environ.get('PATH_INFO', '') 34 | if path == '/parameters': 35 | start_response('200 OK', [('Content-Type', 'application/json')]) 36 | return [json.dumps({ 37 | 'urls': self.urls, 38 | 'timeout': self.parameters.timeout * 1000, 39 | 'maxtabs': self.parameters.maxtabs, 40 | })] 41 | elif path == '/save': 42 | try: 43 | request_body_size = int(environ.get('CONTENT_LENGTH', 0)) 44 | except (ValueError): 45 | start_response('400 Bad Request', []) 46 | return '' 47 | 48 | data = json.loads(environ['wsgi.input'].read(request_body_size)) 49 | self.urls.remove(data['url']) 50 | 51 | fullurl = data['url'] 52 | if not urlparse.urlparse(fullurl).scheme: 53 | fullurl = 'http://' + fullurl 54 | parsedurl = urlparse.urlparse(fullurl) 55 | urlhash = hashlib.new('md5', data['url']).hexdigest() 56 | timestamp = datetime.datetime.fromtimestamp(data['startTime'] / 1000.0).strftime('%Y-%m-%dT%H%M%S.%f') 57 | basename = "%s-%s-%s" % (parsedurl.hostname, timestamp, urlhash) 58 | datapath = os.path.join(self.parameters.outdir, basename + ".json") 59 | screenshotpath = os.path.join(self.parameters.outdir, basename + ".jpg") 60 | sourcepath = os.path.join(self.parameters.outdir, basename + ".xml") 61 | 62 | try: 63 | os.makedirs(self.parameters.outdir) 64 | except OSError as e: 65 | if e.errno != errno.EEXIST: 66 | raise 67 | 68 | if "screenshot" in data: 69 | with open(screenshotpath, 'wb') as handle: 70 | handle.write(urllib.urlopen(data["screenshot"]).read()) 71 | del data["screenshot"] 72 | 73 | if "source" in data: 74 | with 
io.open(sourcepath, 'w', encoding='utf-8') as handle: 75 | handle.write(data["source"]) 76 | del data["source"] 77 | 78 | with io.open(datapath, 'w', encoding='utf-8') as handle: 79 | handle.write(unicode(json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)) + u'\n') 80 | start_response('204 No Content', []) 81 | return '' 82 | 83 | start_response('404 Not Found', []) 84 | return '' 85 | 86 | 87 | def run(): 88 | parser = argparse.ArgumentParser(description='Run crawler') 89 | parser.add_argument( 90 | '-b', '--binary', type=str, 91 | help='path to the Firefox binary' 92 | ) 93 | parser.add_argument( 94 | '-a', '--abpdir', type=str, 95 | help='path to the Adblock Plus repository' 96 | ) 97 | parser.add_argument( 98 | '-f', '--filters', metavar='url', type=str, nargs='+', 99 | default=["https://easylist-downloads.adblockplus.org/easylist.txt", "https://easylist-downloads.adblockplus.org/exceptionrules.txt"], 100 | help='filter lists to install in Adblock Plus. The arguments can also have the format path=url, the data will be read from the specified path then.' 101 | ) 102 | parser.add_argument( 103 | '-t', '--timeout', type=int, default=300, 104 | help='Load timeout (seconds)' 105 | ) 106 | parser.add_argument( 107 | '-x', '--maxtabs', type=int, default=15, 108 | help='Maximal number of tabs to open in parallel' 109 | ) 110 | parser.add_argument( 111 | 'list', type=str, 112 | help='URL list to process' 113 | ) 114 | parser.add_argument( 115 | 'outdir', type=str, 116 | help='directory to write data into' 117 | ) 118 | parameters = parser.parse_args() 119 | 120 | import buildtools.packagerGecko as packager 121 | cleanup = [] 122 | try: 123 | base_dir = os.path.dirname(os.path.abspath(__file__)) 124 | handle, crawlerxpi = tempfile.mkstemp(suffix='.xpi') 125 | os.close(handle) 126 | cleanup.append(crawlerxpi) 127 | packager.createBuild(base_dir, outFile=crawlerxpi, releaseBuild=True) 128 | 129 | abpxpi = 'https://addons.mozilla.org/firefox/downloads/latest/1865/addon-1865-latest.xpi' 130 | if parameters.abpdir: 131 | handle, abpxpi = tempfile.mkstemp(suffix='.xpi') 132 | os.close(handle) 133 | cleanup.append(abpxpi) 134 | packager.createBuild(parameters.abpdir, outFile=abpxpi, releaseBuild=True) 135 | 136 | profile = FirefoxProfile( 137 | addons=[ 138 | crawlerxpi, 139 | abpxpi, 140 | ], 141 | preferences={ 142 | 'browser.startup.homepage': 'about:blank', 143 | 'browser.tabs.warnOnCloseOtherTabs': False, 144 | 'browser.uitour.enabled': False, 145 | 'prompts.tab_modal.enabled': False, 146 | 'startup.homepage_welcome_url': 'about:blank', 147 | 'startup.homepage_welcome_url.additional': 'about:blank', 148 | 'xpinstall.signatures.required': False, 149 | } 150 | ) 151 | 152 | abpsettings = os.path.join(profile.profile, 'adblockplus') 153 | os.makedirs(abpsettings) 154 | with open(os.path.join(abpsettings, 'patterns.ini'), 'w') as handle: 155 | print >>handle, '# Adblock Plus preferences' 156 | print >>handle, 'version=4' 157 | for url in parameters.filters: 158 | if '=' in url: 159 | path, url = url.split('=', 1) 160 | with open(path, 'r') as source: 161 | data = source.read() 162 | else: 163 | data = urllib.urlopen(url).read() 164 | print >>handle, '[Subscription]' 165 | print >>handle, 'url=%s' % url 166 | print >>handle, '[Subscription filters]' 167 | print >>handle, '\n'.join(data.splitlines()[1:]) 168 | finally: 169 | for path in cleanup: 170 | os.unlink(path) 171 | 172 | server = None 173 | try: 174 | port = random.randrange(2000, 60000) 175 | print "Communicating with client on port 
%i" % port 176 | 177 | app = CrawlerApp(parameters) 178 | server = make_server('localhost', port, app) 179 | app.server = server 180 | threading.Thread(target=lambda: server.serve_forever()).start() 181 | 182 | runner = FirefoxRunner( 183 | profile=profile, 184 | binary=parameters.binary, 185 | cmdargs=['--crawler-port', str(port)], 186 | env=dict(os.environ, MOZ_CRASHREPORTER_DISABLE='1'), 187 | ) 188 | while app.urls: 189 | runner.start() 190 | runner.wait() 191 | finally: 192 | if server: 193 | server.shutdown() 194 | profile.cleanup() 195 | 196 | if __name__ == '__main__': 197 | BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 198 | DEPENDENCY_SCRIPT = os.path.join(BASE_DIR, "ensure_dependencies.py") 199 | 200 | try: 201 | subprocess.check_call([sys.executable, DEPENDENCY_SCRIPT, BASE_DIR]) 202 | except subprocess.CalledProcessError as e: 203 | print >>sys.stderr, e 204 | print >>sys.stderr, "Failed to ensure dependencies being up-to-date!" 205 | 206 | run() 207 | --------------------------------------------------------------------------------