├── .hgignore
├── README.md
├── build.py
├── chrome
│   └── locale
│       └── en-US
│           └── meta.properties
├── dependencies
├── ensure_dependencies.py
├── icon.png
├── icon64.png
├── lib
│   ├── child
│   │   └── frameScript.js
│   ├── commandLine.js
│   ├── crawler.js
│   └── main.js
├── metadata.gecko
└── run.py

/.hgignore:
--------------------------------------------------------------------------------
1 | syntax: glob
2 | 
3 | *.xpi
4 | *.zip
5 | *.pyc
6 | *.sh
7 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | abpcrawler
2 | ==========
3 | 
4 | Firefox extension that loads a range of websites and records which
5 | elements are filtered by [Adblock Plus](http://adblockplus.org).
6 | 
7 | Requirements
8 | ------------
9 | 
10 | * [Mercurial](https://www.mercurial-scm.org/) or [Git](https://git-scm.com/) (whichever you used to clone this repository)
11 | * [Python 2.x](https://www.python.org)
12 | * [The Jinja2 module](http://jinja.pocoo.org/docs)
13 | * [The mozrunner module](https://pypi.python.org/pypi/mozrunner)
14 | 
15 | Running
16 | -------
17 | 
18 | Execute the following:
19 | 
20 |     ./run.py -b /usr/bin/firefox urls.txt outputdir
21 | 
22 | This will run the specified Firefox binary to crawl the URLs from `urls.txt`
23 | (one URL per line). The resulting data and screenshots will be written to the
24 | `outputdir` directory. Firefox will close automatically once all URLs have been
25 | processed.
26 | 
27 | Optionally, you can pass the path to a local Adblock Plus repository with `-a`;
28 | Adblock Plus is then built from that repository instead of being downloaded.
29 | 
30 | License
31 | -------
32 | 
33 | This Source Code is subject to the terms of the Mozilla Public License
34 | version 2.0 (the "License"). You can obtain a copy of the License at
35 | http://mozilla.org/MPL/2.0/.
36 | 
--------------------------------------------------------------------------------
/build.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | 
4 | import os
5 | import sys
6 | import subprocess
7 | 
8 | BASE_DIR = os.path.dirname(os.path.abspath(__file__))
9 | DEPENDENCY_SCRIPT = os.path.join(BASE_DIR, "ensure_dependencies.py")
10 | 
11 | try:
12 |   subprocess.check_call([sys.executable, DEPENDENCY_SCRIPT, BASE_DIR])
13 | except subprocess.CalledProcessError as e:
14 |   print >>sys.stderr, e
15 |   print >>sys.stderr, "Failed to ensure dependencies being up-to-date!"
16 | 
17 | import buildtools.build
18 | buildtools.build.processArgs(BASE_DIR, sys.argv)
19 | 
--------------------------------------------------------------------------------
/chrome/locale/en-US/meta.properties:
--------------------------------------------------------------------------------
1 | # Translators of this locale, separated by commas if there are multiple
2 | translator=Felix H. Dahlke
3 | # Extension title, usually it shouldn't be translated
4 | name=Adblock Plus Crawler
5 | # Extension description, to be displayed in the add-on manager
6 | description=Collects the elements blocked by a specified filter.
7 | -------------------------------------------------------------------------------- /dependencies: -------------------------------------------------------------------------------- 1 | _root = hg:https://hg.adblockplus.org/ git:https://github.com/adblockplus/ 2 | _self = buildtools/ensure_dependencies.py 3 | buildtools = buildtools hg:595808987fd9 git:5f8a4c2e86e11eebca8e4773e03e11a7ee1ba1bd 4 | -------------------------------------------------------------------------------- /ensure_dependencies.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This Source Code Form is subject to the terms of the Mozilla Public 4 | # License, v. 2.0. If a copy of the MPL was not distributed with this 5 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 6 | 7 | import sys 8 | import os 9 | import posixpath 10 | import re 11 | import io 12 | import errno 13 | import logging 14 | import subprocess 15 | import urlparse 16 | import argparse 17 | 18 | from collections import OrderedDict 19 | from ConfigParser import RawConfigParser 20 | 21 | USAGE = ''' 22 | A dependencies file should look like this: 23 | 24 | # VCS-specific root URLs for the repositories 25 | _root = hg:https://hg.adblockplus.org/ git:https://github.com/adblockplus/ 26 | # File to update this script from (optional) 27 | _self = buildtools/ensure_dependencies.py 28 | # Clone elemhidehelper repository into extensions/elemhidehelper directory at 29 | # tag "1.2". 30 | extensions/elemhidehelper = elemhidehelper 1.2 31 | # Clone buildtools repository into buildtools directory at VCS-specific 32 | # revision IDs. 33 | buildtools = buildtools hg:016d16f7137b git:f3f8692f82e5 34 | # Clone the adblockplus repository into adblockplus directory, overwriting the 35 | # usual source URL for Git repository and specifying VCS specific revision IDs. 36 | adblockplus = adblockplus hg:893426c6a6ab git:git@github.com:user/adblockplus.git@b2ffd52b 37 | # Clone the adblockpluschrome repository into the adblockpluschrome directory, 38 | # from a specific Git repository, specifying the revision ID. 39 | adblockpluschrome = git:git@github.com:user/adblockpluschrome.git@1fad3a7 40 | ''' 41 | 42 | SKIP_DEPENDENCY_UPDATES = os.environ.get( 43 | 'SKIP_DEPENDENCY_UPDATES', '' 44 | ).lower() not in ('', '0', 'false') 45 | 46 | 47 | class Mercurial(): 48 | def istype(self, repodir): 49 | return os.path.exists(os.path.join(repodir, '.hg')) 50 | 51 | def clone(self, source, target): 52 | if not source.endswith('/'): 53 | source += '/' 54 | subprocess.check_call(['hg', 'clone', '--quiet', '--noupdate', source, target]) 55 | 56 | def get_revision_id(self, repo, rev=None): 57 | command = ['hg', 'id', '--repository', repo, '--id'] 58 | if rev: 59 | command.extend(['--rev', rev]) 60 | 61 | # Ignore stderr output and return code here: if revision lookup failed we 62 | # should simply return an empty string. 
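    # update_repo() below relies on this behaviour: an empty string is treated as
    # "revision unknown", which makes it pull from the remote and retry the lookup.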
63 | result = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()[0] 64 | return result.strip() 65 | 66 | def pull(self, repo): 67 | subprocess.check_call(['hg', 'pull', '--repository', repo, '--quiet']) 68 | 69 | def update(self, repo, rev, revname): 70 | subprocess.check_call(['hg', 'update', '--repository', repo, '--quiet', '--check', '--rev', rev]) 71 | 72 | def ignore(self, target, repo): 73 | 74 | if not self.istype(target): 75 | 76 | config_path = os.path.join(repo, '.hg', 'hgrc') 77 | ignore_path = os.path.abspath(os.path.join(repo, '.hg', 'dependencies')) 78 | 79 | config = RawConfigParser() 80 | config.read(config_path) 81 | 82 | if not config.has_section('ui'): 83 | config.add_section('ui') 84 | 85 | config.set('ui', 'ignore.dependencies', ignore_path) 86 | with open(config_path, 'w') as stream: 87 | config.write(stream) 88 | 89 | module = os.path.relpath(target, repo) 90 | _ensure_line_exists(ignore_path, module) 91 | 92 | def postprocess_url(self, url): 93 | return url 94 | 95 | 96 | class Git(): 97 | def istype(self, repodir): 98 | return os.path.exists(os.path.join(repodir, '.git')) 99 | 100 | def clone(self, source, target): 101 | source = source.rstrip('/') 102 | if not source.endswith('.git'): 103 | source += '.git' 104 | subprocess.check_call(['git', 'clone', '--quiet', source, target]) 105 | 106 | def get_revision_id(self, repo, rev='HEAD'): 107 | command = ['git', 'rev-parse', '--revs-only', rev + '^{commit}'] 108 | return subprocess.check_output(command, cwd=repo).strip() 109 | 110 | def pull(self, repo): 111 | # Fetch tracked branches, new tags and the list of available remote branches 112 | subprocess.check_call(['git', 'fetch', '--quiet', '--all', '--tags'], cwd=repo) 113 | # Next we need to ensure all remote branches are tracked 114 | newly_tracked = False 115 | remotes = subprocess.check_output(['git', 'branch', '--remotes'], cwd=repo) 116 | for match in re.finditer(r'^\s*(origin/(\S+))$', remotes, re.M): 117 | remote, local = match.groups() 118 | with open(os.devnull, 'wb') as devnull: 119 | if subprocess.call(['git', 'branch', '--track', local, remote], 120 | cwd=repo, stdout=devnull, stderr=devnull) == 0: 121 | newly_tracked = True 122 | # Finally fetch any newly tracked remote branches 123 | if newly_tracked: 124 | subprocess.check_call(['git', 'fetch', '--quiet', 'origin'], cwd=repo) 125 | 126 | def update(self, repo, rev, revname): 127 | subprocess.check_call(['git', 'checkout', '--quiet', revname], cwd=repo) 128 | 129 | def ignore(self, target, repo): 130 | module = os.path.sep + os.path.relpath(target, repo) 131 | exclude_file = os.path.join(repo, '.git', 'info', 'exclude') 132 | _ensure_line_exists(exclude_file, module) 133 | 134 | def postprocess_url(self, url): 135 | # Handle alternative syntax of SSH URLS 136 | if '@' in url and ':' in url and not urlparse.urlsplit(url).scheme: 137 | return 'ssh://' + url.replace(':', '/', 1) 138 | return url 139 | 140 | repo_types = OrderedDict(( 141 | ('hg', Mercurial()), 142 | ('git', Git()), 143 | )) 144 | 145 | # [vcs:]value 146 | item_regexp = re.compile( 147 | '^(?:(' + '|'.join(map(re.escape, repo_types.keys())) + '):)?' 148 | '(.+)$' 149 | ) 150 | 151 | # [url@]rev 152 | source_regexp = re.compile( 153 | '^(?:(.*)@)?' 
154 | '(.+)$' 155 | ) 156 | 157 | 158 | def merge_seqs(seq1, seq2): 159 | """Return a list of any truthy values from the suplied sequences 160 | 161 | (None, 2), (1,) => [1, 2] 162 | None, (1, 2) => [1, 2] 163 | (1, 2), (3, 4) => [3, 4] 164 | """ 165 | return map(lambda item1, item2: item2 or item1, seq1 or (), seq2 or ()) 166 | 167 | 168 | def parse_spec(path, line): 169 | if '=' not in line: 170 | logging.warning('Invalid line in file %s: %s' % (path, line)) 171 | return None, None 172 | 173 | key, value = line.split('=', 1) 174 | key = key.strip() 175 | items = value.split() 176 | if not len(items): 177 | logging.warning('No value specified for key %s in file %s' % (key, path)) 178 | return key, None 179 | 180 | result = OrderedDict() 181 | is_dependency_field = not key.startswith('_') 182 | 183 | for i, item in enumerate(items): 184 | try: 185 | vcs, value = re.search(item_regexp, item).groups() 186 | vcs = vcs or '*' 187 | if is_dependency_field: 188 | if i == 0 and vcs == '*': 189 | # In order to be backwards compatible we have to assume that the first 190 | # source contains only a URL/path for the repo if it does not contain 191 | # the VCS part 192 | url_rev = (value, None) 193 | else: 194 | url_rev = re.search(source_regexp, value).groups() 195 | result[vcs] = merge_seqs(result.get(vcs), url_rev) 196 | else: 197 | if vcs in result: 198 | logging.warning('Ignoring duplicate value for type %r ' 199 | '(key %r in file %r)' % (vcs, key, path)) 200 | result[vcs] = value 201 | except AttributeError: 202 | logging.warning('Ignoring invalid item %r for type %r ' 203 | '(key %r in file %r)' % (item, vcs, key, path)) 204 | continue 205 | return key, result 206 | 207 | 208 | def read_deps(repodir): 209 | result = {} 210 | deps_path = os.path.join(repodir, 'dependencies') 211 | try: 212 | with io.open(deps_path, 'rt', encoding='utf-8') as handle: 213 | for line in handle: 214 | # Remove comments and whitespace 215 | line = re.sub(r'#.*', '', line).strip() 216 | if not line: 217 | continue 218 | 219 | key, spec = parse_spec(deps_path, line) 220 | if spec: 221 | result[key] = spec 222 | return result 223 | except IOError as e: 224 | if e.errno != errno.ENOENT: 225 | raise 226 | return None 227 | 228 | 229 | def safe_join(path, subpath): 230 | # This has been inspired by Flask's safe_join() function 231 | forbidden = {os.sep, os.altsep} - {posixpath.sep, None} 232 | if any(sep in subpath for sep in forbidden): 233 | raise Exception('Illegal directory separator in dependency path %s' % subpath) 234 | 235 | normpath = posixpath.normpath(subpath) 236 | if posixpath.isabs(normpath): 237 | raise Exception('Dependency path %s cannot be absolute' % subpath) 238 | if normpath == posixpath.pardir or normpath.startswith(posixpath.pardir + posixpath.sep): 239 | raise Exception('Dependency path %s has to be inside the repository' % subpath) 240 | return os.path.join(path, *normpath.split(posixpath.sep)) 241 | 242 | 243 | def get_repo_type(repo): 244 | for name, repotype in repo_types.iteritems(): 245 | if repotype.istype(repo): 246 | return name 247 | return 'hg' 248 | 249 | 250 | def ensure_repo(parentrepo, parenttype, target, type, root, sourcename): 251 | if os.path.exists(target): 252 | return 253 | 254 | if SKIP_DEPENDENCY_UPDATES: 255 | logging.warning('SKIP_DEPENDENCY_UPDATES environment variable set, ' 256 | '%s not cloned', target) 257 | return 258 | 259 | postprocess_url = repo_types[type].postprocess_url 260 | root = postprocess_url(root) 261 | sourcename = postprocess_url(sourcename) 262 | 263 
| if os.path.exists(root): 264 | url = os.path.join(root, sourcename) 265 | else: 266 | url = urlparse.urljoin(root, sourcename) 267 | 268 | logging.info('Cloning repository %s into %s' % (url, target)) 269 | repo_types[type].clone(url, target) 270 | repo_types[parenttype].ignore(target, parentrepo) 271 | 272 | 273 | def update_repo(target, type, revision): 274 | resolved_revision = repo_types[type].get_revision_id(target, revision) 275 | current_revision = repo_types[type].get_revision_id(target) 276 | 277 | if resolved_revision != current_revision: 278 | if SKIP_DEPENDENCY_UPDATES: 279 | logging.warning('SKIP_DEPENDENCY_UPDATES environment variable set, ' 280 | '%s not checked out to %s', target, revision) 281 | return 282 | 283 | if not resolved_revision: 284 | logging.info('Revision %s is unknown, downloading remote changes' % revision) 285 | repo_types[type].pull(target) 286 | resolved_revision = repo_types[type].get_revision_id(target, revision) 287 | if not resolved_revision: 288 | raise Exception('Failed to resolve revision %s' % revision) 289 | 290 | logging.info('Updating repository %s to revision %s' % (target, resolved_revision)) 291 | repo_types[type].update(target, resolved_revision, revision) 292 | 293 | 294 | def resolve_deps(repodir, level=0, self_update=True, overrideroots=None, skipdependencies=set()): 295 | config = read_deps(repodir) 296 | if config is None: 297 | if level == 0: 298 | logging.warning('No dependencies file in directory %s, nothing to do...\n%s' % (repodir, USAGE)) 299 | return 300 | if level >= 10: 301 | logging.warning('Too much subrepository nesting, ignoring %s' % repo) 302 | return 303 | 304 | if overrideroots is not None: 305 | config['_root'] = overrideroots 306 | 307 | for dir, sources in config.iteritems(): 308 | if (dir.startswith('_') or 309 | skipdependencies.intersection([s[0] for s in sources if s[0]])): 310 | continue 311 | 312 | target = safe_join(repodir, dir) 313 | parenttype = get_repo_type(repodir) 314 | _root = config.get('_root', {}) 315 | 316 | for key in sources.keys() + _root.keys(): 317 | if key == parenttype or key is None and vcs != '*': 318 | vcs = key 319 | source, rev = merge_seqs(sources.get('*'), sources.get(vcs)) 320 | 321 | if not (vcs and source and rev): 322 | logging.warning('No valid source / revision found to create %s' % target) 323 | continue 324 | 325 | ensure_repo(repodir, parenttype, target, vcs, _root.get(vcs, ''), source) 326 | update_repo(target, vcs, rev) 327 | resolve_deps(target, level + 1, self_update=False, 328 | overrideroots=overrideroots, skipdependencies=skipdependencies) 329 | 330 | if self_update and '_self' in config and '*' in config['_self']: 331 | source = safe_join(repodir, config['_self']['*']) 332 | try: 333 | with io.open(source, 'rb') as handle: 334 | sourcedata = handle.read() 335 | except IOError as e: 336 | if e.errno != errno.ENOENT: 337 | raise 338 | logging.warning("File %s doesn't exist, skipping self-update" % source) 339 | return 340 | 341 | target = __file__ 342 | with io.open(target, 'rb') as handle: 343 | targetdata = handle.read() 344 | 345 | if sourcedata != targetdata: 346 | logging.info("Updating %s from %s, don't forget to commit" % (target, source)) 347 | with io.open(target, 'wb') as handle: 348 | handle.write(sourcedata) 349 | if __name__ == '__main__': 350 | logging.info('Restarting %s' % target) 351 | os.execv(sys.executable, [sys.executable, target] + sys.argv[1:]) 352 | else: 353 | logging.warning('Cannot restart %s automatically, please rerun' % target) 354 | 355 
| 356 | def _ensure_line_exists(path, pattern): 357 | with open(path, 'a+') as f: 358 | file_content = [l.strip() for l in f.readlines()] 359 | if not pattern in file_content: 360 | file_content.append(pattern) 361 | f.seek(0, os.SEEK_SET) 362 | f.truncate() 363 | for l in file_content: 364 | print >>f, l 365 | 366 | if __name__ == '__main__': 367 | logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO) 368 | 369 | parser = argparse.ArgumentParser(description='Verify dependencies for a set of repositories, by default the repository of this script.') 370 | parser.add_argument('repos', metavar='repository', type=str, nargs='*', help='Repository path') 371 | parser.add_argument('-q', '--quiet', action='store_true', help='Suppress informational output') 372 | args = parser.parse_args() 373 | 374 | if args.quiet: 375 | logging.disable(logging.INFO) 376 | 377 | repos = args.repos 378 | if not len(repos): 379 | repos = [os.path.dirname(__file__)] 380 | for repo in repos: 381 | resolve_deps(repo) 382 | -------------------------------------------------------------------------------- /icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adblockplus/abpcrawler/02b38185f064b14b8a86cf551c086c9b84d67721/icon.png -------------------------------------------------------------------------------- /icon64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adblockplus/abpcrawler/02b38185f064b14b8a86cf551c086c9b84d67721/icon64.png -------------------------------------------------------------------------------- /lib/child/frameScript.js: -------------------------------------------------------------------------------- 1 | /* 2 | * This Source Code is subject to the terms of the Mozilla Public License 3 | * version 2.0 (the "License"). You can obtain a copy of the License at 4 | * http://mozilla.org/MPL/2.0/. 5 | */ 6 | 7 | "use strict"; 8 | 9 | const {classes: Cc, interfaces: Ci, utils: Cu, results: Cr} = Components; 10 | 11 | /** 12 | * @param e exception 13 | */ 14 | function reportException(e) 15 | { 16 | let stack = ""; 17 | if (e && typeof e == "object" && "stack" in e) 18 | stack = e.stack + "\n"; 19 | 20 | Cu.reportError(e); 21 | dump(e + "\n" + stack + "\n"); 22 | } 23 | 24 | const {XPCOMUtils} = Cu.import("resource://gre/modules/XPCOMUtils.jsm", {}); 25 | 26 | /** 27 | * Progress listener capturing the data of the current page and calling 28 | * onPageLoaded(data) when loading is finished, where data contains 29 | * HTTP status and headers. 30 | * 31 | * @type nsIWebProgressListener 32 | */ 33 | let webProgressListener = 34 | { 35 | onStateChange: function(webProgress, request, flags, status) 36 | { 37 | if (webProgress.DOMWindow == content && 38 | (flags & Ci.nsIWebProgressListener.STATE_STOP)) 39 | { 40 | // First time we receive STATE_STOP for about:blank and the second time 41 | // for our interested URL which is distinct from about:blank. 42 | // However we should not process about:blank because it can happen that 43 | // the message with information about about:blank is delivered when the 44 | // code in crawler.js is already waiting for a message from this tab. 45 | // Another case we are not interested in is about:newtab. 
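      // Checking the protocol covers both cases, since about:blank and about:newtab
      // (and any other about:* page) share the "about:" scheme.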
46 | if (content.location.protocol == "about:") 47 | return; 48 | let pageInfo = {channelStatus: status}; 49 | if (request instanceof Ci.nsIHttpChannel) 50 | { 51 | try 52 | { 53 | pageInfo.headers = []; 54 | pageInfo.headers.push("HTTP/x.x " + request.responseStatus + " " + request.responseStatusText); 55 | request.visitResponseHeaders((header, value) => pageInfo.headers.push(header + ": " + value)); 56 | } 57 | catch (e) 58 | { 59 | reportException(e); 60 | } 61 | } 62 | onPageLoaded(pageInfo); 63 | } 64 | }, 65 | 66 | onLocationChange: function() {}, 67 | onProgressChange: function() {}, 68 | onStatusChange: function() {}, 69 | onSecurityChange: function() {}, 70 | 71 | QueryInterface: XPCOMUtils.generateQI([Ci.nsIWebProgressListener, Ci.nsISupportsWeakReference]) 72 | }; 73 | 74 | function onPageLoaded(pageInfo) 75 | { 76 | Object.assign(pageInfo, gatherPageInfo(content)); 77 | sendAsyncMessage("abpcrawler:pageInfoGathered", pageInfo); 78 | }; 79 | 80 | let webProgress = docShell.QueryInterface(Ci.nsIInterfaceRequestor).getInterface(Ci.nsIWebProgress); 81 | webProgress.addProgressListener(webProgressListener, Ci.nsIWebProgress.NOTIFY_STATE_WINDOW); 82 | 83 | /** 84 | * Gathers information about a DOM window. 85 | * Currently 86 | * - creates a screenshot of the page 87 | * - serializes the page source code 88 | * @param {nsIDOMWindow} wnd window to process 89 | * @return {Object} the object containing "screenshot" and "source" properties. 90 | */ 91 | function gatherPageInfo(wnd) 92 | { 93 | let document = wnd.document; 94 | let result = {errors:[]}; 95 | if (!document.documentElement) 96 | { 97 | result.errors.push("No document.documentElement"); 98 | return result; 99 | } 100 | 101 | try 102 | { 103 | let canvas = document.createElementNS("http://www.w3.org/1999/xhtml", "canvas"); 104 | canvas.width = document.documentElement.scrollWidth; 105 | canvas.height = document.documentElement.scrollHeight; 106 | let context = canvas.getContext("2d"); 107 | context.drawWindow(wnd, 0, 0, canvas.width, canvas.height, "rgb(255, 255, 255)"); 108 | result.screenshot = canvas.toDataURL("image/jpeg", 0.8); 109 | } 110 | catch (e) 111 | { 112 | reportException(e); 113 | result.errors.push("Cannot make page screenshot"); 114 | } 115 | 116 | try 117 | { 118 | // TODO: Capture frames as well? 119 | let serializer = new wnd.XMLSerializer(); 120 | result.source = serializer.serializeToString(document.documentElement); 121 | } 122 | catch(e) 123 | { 124 | reportException(e); 125 | result.errors.push("Cannot obtain page source code"); 126 | } 127 | 128 | return result; 129 | } 130 | -------------------------------------------------------------------------------- /lib/commandLine.js: -------------------------------------------------------------------------------- 1 | /* 2 | * This Source Code is subject to the terms of the Mozilla Public License 3 | * version 2.0 (the "License"). You can obtain a copy of the License at 4 | * http://mozilla.org/MPL/2.0/. 5 | */ 6 | 7 | "use strict"; 8 | 9 | /** 10 | * @module commandLine 11 | */ 12 | 13 | const {XPCOMUtils} = Cu.import("resource://gre/modules/XPCOMUtils.jsm", {}); 14 | 15 | let CommandLineHandler = 16 | { 17 | // Starting the entry with "k" makes it have slightly higher priority than default command line handlers. 
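  // run.py starts Firefox with "--crawler-port <port>"; handle() below picks up
  // that flag and passes the port on to main.startup().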
18 | classDescription: "k-abpcrawler", 19 | contractID: "@adblockplus.org/abpcrawler/cmdline;1", 20 | classID: Components.ID("{973636c2-e842-11e4-b02c-1681e6b88ec1}"), 21 | xpcom_categories: ["command-line-handler"], 22 | 23 | init: function() 24 | { 25 | let registrar = Components.manager.QueryInterface(Ci.nsIComponentRegistrar); 26 | registrar.registerFactory(this.classID, this.classDescription, this.contractID, this); 27 | 28 | let catMan = Cc["@mozilla.org/categorymanager;1"].getService(Ci.nsICategoryManager); 29 | for each (let category in this.xpcom_categories) 30 | catMan.addCategoryEntry(category, this.classDescription, this.contractID, false, true); 31 | 32 | onShutdown.add((function() 33 | { 34 | for each (let category in this.xpcom_categories) 35 | catMan.deleteCategoryEntry(category, this.classDescription, false); 36 | 37 | registrar.unregisterFactory(this.classID, this); 38 | }).bind(this)); 39 | }, 40 | 41 | createInstance: function(outer, iid) 42 | { 43 | if (outer) 44 | throw Cr.NS_ERROR_NO_AGGREGATION; 45 | return this.QueryInterface(iid); 46 | }, 47 | 48 | helpInfo: " -crawler-port Port that ABP Crawler should communicate to\n", 49 | 50 | handle: function(cmdline) 51 | { 52 | let port = cmdline.handleFlagWithParam("crawler-port", false); 53 | if (port != null) 54 | require("main").startup(parseInt(port)); 55 | }, 56 | 57 | QueryInterface: XPCOMUtils.generateQI([Ci.nsICommandLineHandler, Ci.nsIFactory]) 58 | }; 59 | 60 | CommandLineHandler.init(); 61 | -------------------------------------------------------------------------------- /lib/crawler.js: -------------------------------------------------------------------------------- 1 | /* 2 | * This Source Code is subject to the terms of the Mozilla Public License 3 | * version 2.0 (the "License"). You can obtain a copy of the License at 4 | * http://mozilla.org/MPL/2.0/. 5 | */ 6 | 7 | "use strict"; 8 | 9 | /** 10 | * @module crawler 11 | */ 12 | 13 | const {Services} = Cu.import("resource://gre/modules/Services.jsm", {}); 14 | const {XPCOMUtils} = Cu.import("resource://gre/modules/XPCOMUtils.jsm", {}); 15 | const {Task} = Cu.import("resource://gre/modules/Task.jsm", {}); 16 | const {setTimeout, clearTimeout} = Cu.import("resource://gre/modules/Timer.jsm", {}); 17 | 18 | function abprequire(module) 19 | { 20 | let result = {}; 21 | result.wrappedJSObject = result; 22 | Services.obs.notifyObservers(result, "adblockplus-require", module); 23 | return result.exports; 24 | } 25 | 26 | let {RequestNotifier} = abprequire("requestNotifier"); 27 | let {FilterNotifier} = abprequire("filterNotifier"); 28 | let {FilterStorage} = abprequire("filterStorage"); 29 | 30 | /** 31 | * Allocates tabs on request but not more than maxtabs at the same time. 32 | * 33 | * @param {tabbrowser} browser 34 | * The tabbed browser where tabs should be created 35 | * @param {int} maxtabs 36 | * The maximum number of tabs to be allocated 37 | * @constructor 38 | */ 39 | function TabAllocator(browser, maxtabs) 40 | { 41 | this._browser = browser; 42 | this._tabs = 0; 43 | this._maxtabs = maxtabs; 44 | // The queue containing resolve functions of promises waiting for a tab. 45 | this._resolvers = []; 46 | // Keep at least one tab alive to prevent browser from closing itself. 
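  // The placeholder tab is removed again in _createTab() as soon as the first real tab is ready.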
47 | this._tabKeepingWindowAlive = this._browser.tabs[0]; 48 | this._browser.removeAllTabsBut(this._tabKeepingWindowAlive); 49 | } 50 | TabAllocator.prototype = { 51 | _removeTabKeepingWindowAlive: function() 52 | { 53 | if (!this._tabKeepingWindowAlive) 54 | return; 55 | this._browser.removeTab(this._tabKeepingWindowAlive); 56 | delete this._tabKeepingWindowAlive; 57 | }, 58 | 59 | /** 60 | * Creates a blank tab in this._browser. 61 | * 62 | * @return {Promise.} promise which resolves once the tab is fully initialized. 63 | */ 64 | _createTab: function() 65 | { 66 | this._tabs++; 67 | let tab = this._browser.addTab("about:blank"); 68 | if (tab.linkedBrowser.outerWindowID) 69 | { 70 | this._removeTabKeepingWindowAlive(); 71 | return Promise.resolve(tab); 72 | } 73 | return new Promise((resolve, reject) => 74 | { 75 | let onBrowserInit = (msg) => 76 | { 77 | tab.linkedBrowser.messageManager.removeMessageListener("Browser:Init", onBrowserInit); 78 | this._removeTabKeepingWindowAlive(); 79 | resolve(tab); 80 | }; 81 | // "Browser:Init" message is sent once the browser is ready, see 82 | // https://bugzil.la/1256602#c1 83 | tab.linkedBrowser.messageManager.addMessageListener("Browser:Init", onBrowserInit); 84 | }); 85 | }, 86 | 87 | /** 88 | * Returns a promise that will resolve into a tab once a tab is allocated. 89 | * The tab cannot be used by other tasks until releaseTab() is called. 90 | * 91 | * @result {Promise.} 92 | */ 93 | getTab: function() 94 | { 95 | if (this._tabs < this._maxtabs) 96 | return this._createTab(); 97 | return new Promise((resolve, reject) => this._resolvers.push(resolve)); 98 | }, 99 | 100 | /** 101 | * Adds a tab back to the pool so that it can be used by other tasks. 102 | * 103 | * @param {tab} tab 104 | */ 105 | releaseTab: function(tab) 106 | { 107 | // If we are about to close last tab don't close it immediately to keep 108 | // the window alive. It will be closed when a new tab is created. 109 | if (this._tabs > 1) 110 | this._browser.removeTab(tab); 111 | else 112 | { 113 | // navigate away from previously opened URL 114 | tab.linkedBrowser.loadURI("about:blank", null, null); 115 | this._tabKeepingWindowAlive = tab; 116 | } 117 | 118 | this._tabs--; 119 | if (this._resolvers.length && this._tabs < this._maxtabs) 120 | { 121 | this._resolvers.shift()(this._createTab()); 122 | } 123 | }, 124 | }; 125 | 126 | /** 127 | * Once created, this object will make sure all new windows are dismissed 128 | * immediately. 129 | * 130 | * @constructor 131 | */ 132 | function WindowCloser() 133 | { 134 | Services.obs.addObserver(this, "xul-window-registered", true) 135 | } 136 | WindowCloser.prototype = { 137 | /** 138 | * Deactivates this object. 
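   * (crawl_urls() calls this once all URLs have been processed.)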
139 | */ 140 | stop: function() 141 | { 142 | Services.obs.removeObserver(this, "xul-window-registered") 143 | }, 144 | 145 | observe: function(subject, topic, data) 146 | { 147 | let window = subject.QueryInterface(Ci.nsIInterfaceRequestor) 148 | .getInterface(Ci.nsIDOMWindow) 149 | window.addEventListener("load", function() 150 | { 151 | if (window.document.documentElement.localName == 'dialog') 152 | window.document.documentElement.acceptDialog(); 153 | else 154 | window.close(); 155 | }, false); 156 | }, 157 | 158 | QueryInterface: XPCOMUtils.generateQI([Ci.nsIObserver, Ci.nsISupportsWeakReference]) 159 | }; 160 | 161 | function configureFrameScript() 162 | { 163 | const info = require("info"); 164 | let frameScriptPath = info.addonRoot + "/lib/child/frameScript.js"; 165 | Services.mm.loadFrameScript(frameScriptPath, true); 166 | 167 | onShutdown.add(() => 168 | { 169 | Services.mm.removeDelayedFrameScript(frameScriptPath); 170 | }); 171 | } 172 | 173 | /** 174 | * Starts the crawling session. The crawler opens each URL in a tab and stores 175 | * the results. 176 | * 177 | * @param {Window} window 178 | * The browser window we're operating in 179 | * @param {String[]} urls 180 | * URLs to be crawled 181 | * @param {int} timeout 182 | * Load timeout in milliseconds 183 | * @param {int} maxtabs 184 | * Maximum number of tabs to be opened 185 | * @param {String} targetURL 186 | * URL that should receive the results 187 | * @param {Function} onDone 188 | * The callback which is called after finishing of crawling of all URLs. 189 | */ 190 | function run(window, urls, timeout, maxtabs, targetURL, onDone) 191 | { 192 | configureFrameScript(); 193 | new Promise((resolve, reject) => 194 | { 195 | if (FilterStorage.subscriptions.length > 0) 196 | { 197 | resolve(); 198 | return; 199 | } 200 | let onFiltersLoaded = (action, item, newValue, oldValue) => 201 | { 202 | if (action == "load") 203 | { 204 | FilterNotifier.removeListener(onFiltersLoaded); 205 | resolve(); 206 | } 207 | }; 208 | FilterNotifier.addListener(onFiltersLoaded); 209 | }).then(() => crawl_urls(window, urls, timeout, maxtabs, targetURL, onDone)) 210 | .catch(reportException); 211 | } 212 | exports.run = run; 213 | 214 | /** 215 | * Spawns a {Task} task to crawl each url from urls argument and calls 216 | * onDone when all tasks are finished. 217 | * @param {Window} window 218 | * The browser window we're operating in 219 | * @param {String[]} urls 220 | * URLs to be crawled 221 | * @param {int} timeout 222 | * Load timeout in milliseconds 223 | * @param {int} maxtabs 224 | * Maximum number of tabs to be opened 225 | * @param {String} targetURL 226 | * URL that should receive the results 227 | * @param {Function} onDone 228 | * The callback which is called after finishing of all tasks. 
229 | */ 230 | function crawl_urls(window, urls, timeout, maxtabs, targetURL, onDone) 231 | { 232 | let tabAllocator = new TabAllocator(window.getBrowser(), maxtabs); 233 | 234 | let running = 0; 235 | let windowCloser = new WindowCloser(); 236 | let taskDone = function() 237 | { 238 | running--; 239 | if (running <= 0) 240 | { 241 | windowCloser.stop(); 242 | onDone(); 243 | } 244 | }; 245 | 246 | for (let url of urls) 247 | { 248 | running++; 249 | Task.spawn(crawl_url.bind(null, url, tabAllocator, timeout)).then(function(result) 250 | { 251 | let request = new XMLHttpRequest(); 252 | request.open("POST", targetURL); 253 | request.addEventListener("load", taskDone, false); 254 | request.addEventListener("error", taskDone, false); 255 | request.send(JSON.stringify(result)); 256 | }, function(url, exception) 257 | { 258 | reportException(exception); 259 | 260 | let request = new XMLHttpRequest(); 261 | request.open("POST", targetURL); 262 | request.addEventListener("load", taskDone, false); 263 | request.addEventListener("error", taskDone, false); 264 | request.send(JSON.stringify({ 265 | url: url, 266 | startTime: Date.now(), 267 | error: String(exception) 268 | })); 269 | }.bind(null, url)); 270 | } 271 | } 272 | 273 | /** 274 | * Expects to receive page info gathered in a content process for the specified 275 | * `tab`. If there is no relevant message within specified `timeout` then 276 | * the result promise is resolved with error object. 277 | * @param tab 278 | * Tab in which we are interested in 279 | * @param {int} timeout 280 | * Timeout in milliseconds 281 | * @return {Promise} promise which will be resolved with the received page info 282 | */ 283 | function getPageInfo(tab, timeout) 284 | { 285 | return new Promise((resolve, result) => 286 | { 287 | let mm = tab.linkedBrowser.messageManager; 288 | let timerID; 289 | let onDone = (msg) => 290 | { 291 | mm.removeMessageListener("abpcrawler:pageInfoGathered", onDone); 292 | clearTimeout(timerID); 293 | resolve(msg.data); 294 | } 295 | mm.addMessageListener("abpcrawler:pageInfoGathered", onDone); 296 | timerID = setTimeout(() => onDone({data: {error: "timeout"}}), timeout); 297 | }); 298 | } 299 | 300 | /** 301 | * Crawls a URL. This is a generator meant to be used via a Task object. 
302 | * 303 | * @param {String} url 304 | * @param {TabAllocator} tabAllocator 305 | * @param {int} timeout 306 | * Load timeout in milliseconds 307 | * @result {Object} 308 | * Crawling result 309 | */ 310 | function* crawl_url(url, tabAllocator, timeout) 311 | { 312 | let tab = yield tabAllocator.getTab(); 313 | let result = {url, requests: []}; 314 | let requestNotifier; 315 | try 316 | { 317 | result.startTime = Date.now(); 318 | requestNotifier = new RequestNotifier(tab.linkedBrowser.outerWindowID, 319 | function(entry, scanComplete) 320 | { 321 | if (!entry) 322 | return; 323 | let {type: contentType, location, filter} = entry; 324 | result.requests.push({location, contentType, filter}); 325 | }); 326 | 327 | tab.linkedBrowser.loadURI(url, null, null); 328 | 329 | Object.assign(result, yield getPageInfo(tab, timeout)); 330 | result.finalUrl = tab.linkedBrowser.currentURI.spec; 331 | result.endTime = Date.now(); 332 | } 333 | finally 334 | { 335 | if (requestNotifier) 336 | requestNotifier.shutdown(); 337 | tabAllocator.releaseTab(tab); 338 | } 339 | return result; 340 | } 341 | 342 | function reportException(e) 343 | { 344 | let stack = ""; 345 | if (e && typeof e == "object" && "stack" in e) 346 | stack = e.stack + "\n"; 347 | 348 | Cu.reportError(e); 349 | dump(e + "\n" + stack + "\n"); 350 | } 351 | -------------------------------------------------------------------------------- /lib/main.js: -------------------------------------------------------------------------------- 1 | /* 2 | * This Source Code is subject to the terms of the Mozilla Public License 3 | * version 2.0 (the "License"). You can obtain a copy of the License at 4 | * http://mozilla.org/MPL/2.0/. 5 | */ 6 | 7 | "use strict"; 8 | 9 | /** 10 | * @module main 11 | */ 12 | 13 | const {Services} = Cu.import("resource://gre/modules/Services.jsm", {}); 14 | const {XPCOMUtils} = Cu.import("resource://gre/modules/XPCOMUtils.jsm", {}); 15 | 16 | require("commandLine"); 17 | let {run} = require("crawler"); 18 | 19 | let baseURL = null; 20 | 21 | /** 22 | * Waits for the application to initialize. 23 | * @type {Promise} 24 | */ 25 | let applicationReady = new Promise((resolve, reject) => 26 | { 27 | let observer = { 28 | observe: function(subject, topic, data) 29 | { 30 | Services.obs.removeObserver(this, "sessionstore-windows-restored"); 31 | resolve(); 32 | }, 33 | QueryInterface: XPCOMUtils.generateQI([Ci.nsIObserver, Ci.nsISupportsWeakReference]) 34 | }; 35 | Services.obs.addObserver(observer, "sessionstore-windows-restored", true); 36 | onShutdown.add(() => Services.obs.removeObserver(observer, "sessionstore-windows-restored")); 37 | }); 38 | 39 | /** 40 | * Startup function, called from command line handler. 41 | * 42 | * @param {int} port Port to communicate with 43 | */ 44 | function startup(port) 45 | { 46 | baseURL = "http://localhost:" + port + "/"; 47 | 48 | let request = new XMLHttpRequest(); 49 | request.open("GET", baseURL + "parameters"); 50 | request.addEventListener("load", onParametersLoaded, false); 51 | request.addEventListener("error", onParametersFailed, false); 52 | request.responseType = "json"; 53 | request.send(); 54 | } 55 | exports.startup = startup; 56 | 57 | /** 58 | * Called if parameters loaded succesfully. 
59 | * 60 | * @param {Event} event 61 | */ 62 | function onParametersLoaded(event) 63 | { 64 | let {urls, timeout, maxtabs} = event.target.response; 65 | 66 | applicationReady.then(function() 67 | { 68 | let window = Services.wm.getMostRecentWindow("navigator:browser"); 69 | run(window, urls, timeout, maxtabs, baseURL + "save", function() 70 | { 71 | Services.startup.quit(Services.startup.eAttemptQuit); 72 | }); 73 | }, function(exception) 74 | { 75 | Cu.reportError(exception); 76 | dump(exception + "\n") 77 | }); 78 | } 79 | 80 | /** 81 | * Called if requesting parameters failed. 82 | * 83 | * @param {Event} event 84 | */ 85 | function onParametersFailed(event) 86 | { 87 | Cu.reportError("Failed loading parameters"); 88 | } 89 | -------------------------------------------------------------------------------- /metadata.gecko: -------------------------------------------------------------------------------- 1 | [general] 2 | id=abpcrawler@adblockplus.org 3 | basename=abpcrawler 4 | version=0.1 5 | author=Felix H. Dahlke 6 | 7 | [compat] 8 | firefox=30.0/* 9 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import argparse 5 | import datetime 6 | import errno 7 | import hashlib 8 | import io 9 | import json 10 | import os 11 | import random 12 | import subprocess 13 | import sys 14 | import tempfile 15 | import threading 16 | import urllib 17 | import urlparse 18 | from wsgiref.simple_server import make_server 19 | 20 | from mozprofile import FirefoxProfile 21 | from mozrunner import FirefoxRunner 22 | 23 | 24 | class CrawlerApp: 25 | server = None 26 | 27 | def __init__(self, parameters): 28 | self.parameters = parameters 29 | with io.open(self.parameters.list, 'r', encoding='utf-8') as handle: 30 | self.urls = map(unicode.strip, handle.readlines()) 31 | 32 | def __call__(self, environ, start_response): 33 | path = environ.get('PATH_INFO', '') 34 | if path == '/parameters': 35 | start_response('200 OK', [('Content-Type', 'application/json')]) 36 | return [json.dumps({ 37 | 'urls': self.urls, 38 | 'timeout': self.parameters.timeout * 1000, 39 | 'maxtabs': self.parameters.maxtabs, 40 | })] 41 | elif path == '/save': 42 | try: 43 | request_body_size = int(environ.get('CONTENT_LENGTH', 0)) 44 | except (ValueError): 45 | start_response('400 Bad Request', []) 46 | return '' 47 | 48 | data = json.loads(environ['wsgi.input'].read(request_body_size)) 49 | self.urls.remove(data['url']) 50 | 51 | fullurl = data['url'] 52 | if not urlparse.urlparse(fullurl).scheme: 53 | fullurl = 'http://' + fullurl 54 | parsedurl = urlparse.urlparse(fullurl) 55 | urlhash = hashlib.new('md5', data['url']).hexdigest() 56 | timestamp = datetime.datetime.fromtimestamp(data['startTime'] / 1000.0).strftime('%Y-%m-%dT%H%M%S.%f') 57 | basename = "%s-%s-%s" % (parsedurl.hostname, timestamp, urlhash) 58 | datapath = os.path.join(self.parameters.outdir, basename + ".json") 59 | screenshotpath = os.path.join(self.parameters.outdir, basename + ".jpg") 60 | sourcepath = os.path.join(self.parameters.outdir, basename + ".xml") 61 | 62 | try: 63 | os.makedirs(self.parameters.outdir) 64 | except OSError as e: 65 | if e.errno != errno.EEXIST: 66 | raise 67 | 68 | if "screenshot" in data: 69 | with open(screenshotpath, 'wb') as handle: 70 | handle.write(urllib.urlopen(data["screenshot"]).read()) 71 | del data["screenshot"] 72 | 73 | if "source" in data: 74 | with 
io.open(sourcepath, 'w', encoding='utf-8') as handle: 75 | handle.write(data["source"]) 76 | del data["source"] 77 | 78 | with io.open(datapath, 'w', encoding='utf-8') as handle: 79 | handle.write(unicode(json.dumps(data, indent=2, ensure_ascii=False, sort_keys=True)) + u'\n') 80 | start_response('204 No Content', []) 81 | return '' 82 | 83 | start_response('404 Not Found', []) 84 | return '' 85 | 86 | 87 | def run(): 88 | parser = argparse.ArgumentParser(description='Run crawler') 89 | parser.add_argument( 90 | '-b', '--binary', type=str, 91 | help='path to the Firefox binary' 92 | ) 93 | parser.add_argument( 94 | '-a', '--abpdir', type=str, 95 | help='path to the Adblock Plus repository' 96 | ) 97 | parser.add_argument( 98 | '-f', '--filters', metavar='url', type=str, nargs='+', 99 | default=["https://easylist-downloads.adblockplus.org/easylist.txt", "https://easylist-downloads.adblockplus.org/exceptionrules.txt"], 100 | help='filter lists to install in Adblock Plus. The arguments can also have the format path=url, the data will be read from the specified path then.' 101 | ) 102 | parser.add_argument( 103 | '-t', '--timeout', type=int, default=300, 104 | help='Load timeout (seconds)' 105 | ) 106 | parser.add_argument( 107 | '-x', '--maxtabs', type=int, default=15, 108 | help='Maximal number of tabs to open in parallel' 109 | ) 110 | parser.add_argument( 111 | 'list', type=str, 112 | help='URL list to process' 113 | ) 114 | parser.add_argument( 115 | 'outdir', type=str, 116 | help='directory to write data into' 117 | ) 118 | parameters = parser.parse_args() 119 | 120 | import buildtools.packagerGecko as packager 121 | cleanup = [] 122 | try: 123 | base_dir = os.path.dirname(os.path.abspath(__file__)) 124 | handle, crawlerxpi = tempfile.mkstemp(suffix='.xpi') 125 | os.close(handle) 126 | cleanup.append(crawlerxpi) 127 | packager.createBuild(base_dir, outFile=crawlerxpi, releaseBuild=True) 128 | 129 | abpxpi = 'https://addons.mozilla.org/firefox/downloads/latest/1865/addon-1865-latest.xpi' 130 | if parameters.abpdir: 131 | handle, abpxpi = tempfile.mkstemp(suffix='.xpi') 132 | os.close(handle) 133 | cleanup.append(abpxpi) 134 | packager.createBuild(parameters.abpdir, outFile=abpxpi, releaseBuild=True) 135 | 136 | profile = FirefoxProfile( 137 | addons=[ 138 | crawlerxpi, 139 | abpxpi, 140 | ], 141 | preferences={ 142 | 'browser.startup.homepage': 'about:blank', 143 | 'browser.tabs.warnOnCloseOtherTabs': False, 144 | 'browser.uitour.enabled': False, 145 | 'prompts.tab_modal.enabled': False, 146 | 'startup.homepage_welcome_url': 'about:blank', 147 | 'startup.homepage_welcome_url.additional': 'about:blank', 148 | 'xpinstall.signatures.required': False, 149 | } 150 | ) 151 | 152 | abpsettings = os.path.join(profile.profile, 'adblockplus') 153 | os.makedirs(abpsettings) 154 | with open(os.path.join(abpsettings, 'patterns.ini'), 'w') as handle: 155 | print >>handle, '# Adblock Plus preferences' 156 | print >>handle, 'version=4' 157 | for url in parameters.filters: 158 | if '=' in url: 159 | path, url = url.split('=', 1) 160 | with open(path, 'r') as source: 161 | data = source.read() 162 | else: 163 | data = urllib.urlopen(url).read() 164 | print >>handle, '[Subscription]' 165 | print >>handle, 'url=%s' % url 166 | print >>handle, '[Subscription filters]' 167 | print >>handle, '\n'.join(data.splitlines()[1:]) 168 | finally: 169 | for path in cleanup: 170 | os.unlink(path) 171 | 172 | server = None 173 | try: 174 | port = random.randrange(2000, 60000) 175 | print "Communicating with client on port 
%i" % port 176 | 177 | app = CrawlerApp(parameters) 178 | server = make_server('localhost', port, app) 179 | app.server = server 180 | threading.Thread(target=lambda: server.serve_forever()).start() 181 | 182 | runner = FirefoxRunner( 183 | profile=profile, 184 | binary=parameters.binary, 185 | cmdargs=['--crawler-port', str(port)], 186 | env=dict(os.environ, MOZ_CRASHREPORTER_DISABLE='1'), 187 | ) 188 | while app.urls: 189 | runner.start() 190 | runner.wait() 191 | finally: 192 | if server: 193 | server.shutdown() 194 | profile.cleanup() 195 | 196 | if __name__ == '__main__': 197 | BASE_DIR = os.path.dirname(os.path.abspath(__file__)) 198 | DEPENDENCY_SCRIPT = os.path.join(BASE_DIR, "ensure_dependencies.py") 199 | 200 | try: 201 | subprocess.check_call([sys.executable, DEPENDENCY_SCRIPT, BASE_DIR]) 202 | except subprocess.CalledProcessError as e: 203 | print >>sys.stderr, e 204 | print >>sys.stderr, "Failed to ensure dependencies being up-to-date!" 205 | 206 | run() 207 | --------------------------------------------------------------------------------