├── .gitignore
├── .travis.yml
├── CHANGES.rst
├── LICENSE.txt
├── MANIFEST.in
├── README.rst
├── adblockparser
    ├── __init__.py
    ├── parser.py
    └── utils.py
├── setup.cfg
├── setup.py
├── tests
    ├── test_options.py
    ├── test_parsing.py
    └── test_rule_types.py
└── tox.ini


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.pyc
 2 | *egg-info
 3 | .tox
 4 | venv
 5 | build
 6 | dist
 7 | .idea
 8 | .ipynb_checkpoints
 9 | MANIFEST
10 | easylist.txt
11 | .cache
12 | .coverage
13 | htmlcov
14 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python: 2.7
 3 | env:
 4 | - TOXENV=py27
 5 | - TOXENV=py33
 6 | - TOXENV=py34
 7 | - TOXENV=pypy
 8 | - TOXENV=py27-re2
 9 | before_install: >
10 |   if [[ "$TOXENV" == *re2 ]]; then
11 |      sudo add-apt-repository -y ppa:pi-rho/security;
12 |      sudo apt-get update -q;
13 |      sudo apt-get install -y libre2-dev;
14 |   fi
15 | 
16 | install:
17 | - pip install -U tox codecov
18 | 
19 | script: tox
20 | 
21 | after_success:
22 | - codecov
23 | 


--------------------------------------------------------------------------------
/CHANGES.rst:
--------------------------------------------------------------------------------
 1 | Changes
 2 | =======
 3 | 
 4 | 0.7 (2016-10-17)
 5 | ----------------
 6 | 
 7 | * Fixed parsing issue with recent easylist.txt;
 8 | * fixed a link to easylist (thanks https://github.com/limonte).
 9 | 
10 | 0.6 (2016-09-10)
11 | ----------------
12 | 
13 | * Added support for regex rules (thanks https://github.com/mlyko).
14 | 
15 | 0.5 (2016-03-04)
16 | ----------------
17 | 
18 | * Fixed an issue with blank lines in filter files
19 |   (thanks https://github.com/skrypka);
20 | * fixed an issue with applying rules with 'domain' option
21 |   when domain doesn't have a dot (e.g. 'localhost');
22 | * Python 2.6 and Python 3.2 support is dropped;
23 |   adblockparser likely still works in these interpreters,
24 |   but this is no longer checked by tests.
25 | 
26 | 0.4 (2015-03-29)
27 | ----------------
28 | 
29 | * AdblockRule now caches the compiled regexes (thanks
30 |   https://github.com/mozbugbox);
31 | * Fixed an issue with "domain" option handling
32 |   (thanks https://github.com/nbraem for the bug report and a test case);
33 | * cleanups and test improvements.
34 | 
35 | 0.3 (2014-07-11)
36 | ----------------
37 | 
38 | * Switch to setuptools;
39 | * better ``__repr__`` for ``AdblockRule``;
40 | * Python 3.4 support is confirmed;
41 | * testing improvements.
42 | 
43 | 0.2 (2014-03-20)
44 | ----------------
45 | 
46 | This release provides much faster `AdblockRules.should_block()` method
47 | for rules without options and rules with 'domain' option.
48 | 
49 | * better combined regex for option-less rules that makes re2 library
50 |   always use DFA without falling back to NFA;
51 | * an index for rules with domains;
52 | * ``params`` method arguments are renamed to ``options`` for consistency.
53 | 
54 | 0.1.1 (2014-03-11)
55 | ------------------
56 | 
57 | By default ``AdblockRules`` autodetects re2 library and uses
58 | it if a compatible version is detected.
59 | 
60 | 0.1 (2014-03-03)
61 | ----------------
62 | 
63 | Initial release.
64 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2014 ScrapingHub Inc.
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.txt
2 | include *.rst
3 | include tox.ini
4 | exclude easylist.txt
5 | recursive-include docs *.txt
6 | recursive-include tests *.py
7 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | adblockparser
  2 | =============
  3 | 
  4 | .. image:: https://img.shields.io/pypi/v/adblockparser.svg
  5 |    :target: https://pypi.python.org/pypi/adblockparser
  6 |    :alt: PyPI Version
  7 | 
  8 | .. image:: https://img.shields.io/pypi/l/adblockparser.svg
  9 |    :target: https://github.com/scrapinghub/adblockparser/blob/master/LICENSE.txt
 10 |    :alt: License
 11 | 
 12 | .. image:: https://img.shields.io/travis/scrapinghub/adblockparser/master.svg
 13 |    :target: https://travis-ci.org/scrapinghub/adblockparser
 14 |    :alt: Build Status
 15 | 
 16 | .. image:: http://codecov.io/github/scrapinghub/adblockparser/coverage.svg?branch=master
 17 |    :target: http://codecov.io/github/scrapinghub/adblockparser?branch=master
 18 |    :alt: Code Coverage
 19 | 
 20 | 
 21 | ``adblockparser`` is a package for working with `Adblock Plus`_ filter rules.
 22 | It can parse Adblock Plus filters and match URLs against them.
 23 | 
 24 | .. _Adblock Plus: https://adblockplus.org
 25 | 
 26 | Installation
 27 | ------------
 28 | 
 29 | ::
 30 | 
 31 |     pip install adblockparser
 32 | 
 33 | Python 2.7 and Python 3.3+ are supported.
 34 | 
 35 | If you plan to use this library with a large number of filters
 36 | installing pyre2_ library is highly recommended (the speedup
 37 | for a list of default EasyList_ filters can be greater than 1000x)::
 38 | 
 39 |     pip install 're2 >= 0.2.21'
 40 | 
 41 | Note that pyre2 library requires C++ re2_ library installed.
 42 | On OS X you can get it using homebrew (``brew install re2``).
 43 | 
 44 | .. _re2: https://github.com/google/re2
 45 | .. _pyre2: https://github.com/axiak/pyre2
 46 | .. _EasyList: https://easylist.to/
 47 | 
 48 | Usage
 49 | -----
 50 | 
 51 | To learn about Adblock Plus filter syntax check these links:
 52 | 
 53 | * https://adblockplus.org/en/filter-cheatsheet
 54 | * https://adblockplus.org/en/filters
 55 | 
 56 | 
 57 | 1. Get filter rules somewhere: write them manually, read lines from a file
 58 |    downloaded from EasyList_, etc.::
 59 | 
 60 |        >>> raw_rules = [
 61 |        ...     "||ads.example.com^",
 62 |        ...     "@@||ads.example.com/notbanner^$~script",
 63 |        ... ]
 64 | 
 65 | 2. Create ``AdblockRules`` instance from rule strings::
 66 | 
 67 |        >>> from adblockparser import AdblockRules
 68 |        >>> rules = AdblockRules(raw_rules)
 69 | 
 70 | 3. Use this instance to check if a URL should be blocked or not::
 71 | 
 72 |        >>> rules.should_block("http://ads.example.com")
 73 |        True
 74 | 
 75 |    Rules with options are ignored unless you pass a dict with options values::
 76 | 
 77 |        >>> rules.should_block("http://ads.example.com/notbanner")
 78 |        True
 79 |        >>> rules.should_block("http://ads.example.com/notbanner", {'script': False})
 80 |        False
 81 |        >>> rules.should_block("http://ads.example.com/notbanner", {'script': True})
 82 |        True
 83 | 
 84 | Consult the Adblock Plus `docs <https://adblockplus.org/en/filters#options>`__
 85 | for a description of the options. These options allow writing filters that
 86 | depend on external information not available in the URL itself.
 87 | 
 88 | Performance
 89 | -----------
 90 | 
 91 | Regex engines
 92 | ^^^^^^^^^^^^^
 93 | 
 94 | ``AdblockRules`` class creates a huge regex to match filters that
 95 | don't use options. pyre2_ library works better than stdlib's re
 96 | with such regexes. If you have pyre2_ installed then ``AdblockRules``
 97 | should work faster, and the speedup can be dramatic - more than 1000x
 98 | in some cases.
 99 | 
100 | Sometimes pyre2 prints something like
101 | ``re2/dfa.cc:459: DFA out of memory: prog size 270515 mem 1713850`` to stderr.
102 | Give re2 library more memory to fix that::
103 | 
104 |     >>> rules = AdblockRules(raw_rules, use_re2=True, max_mem=512*1024*1024)  # doctest: +SKIP
105 | 
106 | Make sure you are not using re2 0.2.20 installed from PyPI: it doesn't work.
107 | 
108 | Parsing rules with options
109 | ^^^^^^^^^^^^^^^^^^^^^^^^^^
110 | 
111 | Rules that have options are currently matched in a loop, one-by-one.
112 | Also, they are checked for compatibility with options passed by user:
113 | for example, if user didn't pass 'script' option (with a ``True`` or ``False``
114 | value), all rules involving ``script`` are discarded.
115 | 
116 | This is slow if you have thousands of such rules. To make it work faster,
117 | explicitly list all options you want to support in ``AdblockRules`` constructor,
118 | disable skipping of unsupported rules, and always pass a dict with all options
119 | to ``should_block`` method::
120 | 
121 |     >>> rules = AdblockRules(
122 |     ...    raw_rules,
123 |     ...    supported_options=['script', 'domain'],
124 |     ...    skip_unsupported_rules=False
125 |     ... )
126 |     >>> options = {'script': False, 'domain': 'www.mystartpage.com'}
127 |     >>> rules.should_block("http://ads.example.com/notbanner", options)
128 |     False
129 | 
130 | This way rules with unsupported options will be filtered once, when
131 | ``AdblockRules`` instance is created.
132 | 
133 | Limitations
134 | -----------
135 | 
136 | There are some known limitations of the current implementation:
137 | 
138 | * element hiding rules are ignored;
139 | * matching URLs against a large number of filters can be slow-ish,
140 |   especially if pyre2_ is not installed and many filter options are enabled;
141 | * ``match-case`` filter option is not properly supported (it is ignored);
142 | * ``document`` filter option is not properly supported;
143 | * rules are not validated *before* parsing, so invalid rules may raise
144 |   inconsistent exceptions or silently work incorrectly.
145 | 
146 | It is possible to remove all these limitations. Pull requests are welcome
147 | if you want to make it happen sooner!
148 | 
149 | Contributing
150 | ------------
151 | 
152 | * source code: https://github.com/scrapinghub/adblockparser
153 | * issue tracker: https://github.com/scrapinghub/adblockparser/issues
154 | 
155 | In order to run tests, install `tox <http://tox.testrun.org>`_ and type
156 | 
157 | ::
158 | 
159 |     tox
160 | 
161 | from the source checkout.
162 | 
163 | The license is MIT.
164 | 


--------------------------------------------------------------------------------
/adblockparser/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from .parser import AdblockRules, AdblockRule, AdblockParsingError
4 | 


--------------------------------------------------------------------------------
/adblockparser/parser.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from __future__ import absolute_import
  3 | import re
  4 | from collections import defaultdict
  5 | from functools import partial
  6 | from adblockparser.utils import split_data
  7 | 
  8 | 
  9 | class AdblockParsingError(ValueError):
 10 |     pass
 11 | 
 12 | 
 13 | class AdblockRule(object):
 14 |     r"""
 15 |     AdBlock Plus rule.
 16 | 
 17 |     Check these links for the format details:
 18 | 
 19 |     * https://adblockplus.org/en/filter-cheatsheet
 20 |     * https://adblockplus.org/en/filters
 21 | 
 22 |     Instantiate AdblockRule with a rule line:
 23 | 
 24 |     >>> from adblockparser import AdblockRule
 25 |     >>> rule = AdblockRule("@@||mydomain.no/artikler/$~third-party")
 26 | 
 27 |     Parsed data is available as rule attributes:
 28 | 
 29 |     >>> rule.is_comment
 30 |     False
 31 |     >>> rule.is_exception
 32 |     True
 33 |     >>> rule.is_html_rule
 34 |     False
 35 |     >>> rule.options
 36 |     {'third-party': False}
 37 |     >>> print(rule.regex)
 38 |     ^(?:[^:/?#]+:)?(?://(?:[^/?#]*\.)?)?mydomain\.no/artikler/
 39 | 
 40 |     To check if rule applies to an URL, use ``match_url`` method::
 41 | 
 42 |     >>> rule = AdblockRule("swf|")
 43 |     >>> rule.match_url("http://example.com/annoyingflash.swf")
 44 |     True
 45 |     >>> rule.match_url("http://example.com/swf/index.html")
 46 |     False
 47 | 
 48 |     Rules involving CSS selectors are detected but not supported well
 49 |     (``match_url`` doesn't work for them):
 50 | 
 51 |     >>> AdblockRule("domain.co.uk,domain2.com#@#.ad_description").is_html_rule
 52 |     True
 53 |     >>> AdblockRule("##.spot-ad").is_html_rule
 54 |     True
 55 |     """
 56 | 
 57 |     BINARY_OPTIONS = [
 58 |         "script",
 59 |         "image",
 60 |         "stylesheet",
 61 |         "object",
 62 |         "xmlhttprequest",
 63 |         "object-subrequest",
 64 |         "subdocument",
 65 |         "document",
 66 |         "elemhide",
 67 |         "other",
 68 |         "background",
 69 |         "xbl",
 70 |         "ping",
 71 |         "dtd",
 72 |         "media",
 73 |         "third-party",
 74 |         "match-case",
 75 |         "collapse",
 76 |         "donottrack",
 77 |         "websocket",
 78 |     ]
 79 |     OPTIONS_SPLIT_PAT = ',(?=~?(?:%s))' % ('|'.join(BINARY_OPTIONS + ["domain"]))
 80 |     OPTIONS_SPLIT_RE = re.compile(OPTIONS_SPLIT_PAT)
 81 | 
 82 |     __slots__ = ['raw_rule_text', 'is_comment', 'is_html_rule', 'is_exception',
 83 |                  'raw_options', 'options', '_options_keys', 'rule_text',
 84 |                  'regex', 'regex_re']
 85 | 
 86 |     def __init__(self, rule_text):
 87 |         self.raw_rule_text = rule_text
 88 |         self.regex_re = None
 89 | 
 90 |         rule_text = rule_text.strip()
 91 |         self.is_comment = not rule_text or rule_text.startswith(('!', '[Adblock'))
 92 |         if self.is_comment:
 93 |             self.is_html_rule = self.is_exception = False
 94 |         else:
 95 |             self.is_html_rule = '##' in rule_text or '#@#' in rule_text  # or rule_text.startswith('#')
 96 |             self.is_exception = rule_text.startswith('@@')
 97 |             if self.is_exception:
 98 |                 rule_text = rule_text[2:]
 99 | 
100 |         if not self.is_comment and '$' in rule_text:
101 |             rule_text, options_text = rule_text.split('$', 1)
102 |             self.raw_options = self._split_options(options_text)
103 |             self.options = dict(self._parse_option(opt) for opt in self.raw_options)
104 |         else:
105 |             self.raw_options = []
106 |             self.options = {}
107 |         self._options_keys = frozenset(self.options.keys()) - set(['match-case'])
108 | 
109 |         self.rule_text = rule_text
110 | 
111 |         if self.is_comment or self.is_html_rule:
112 |             # TODO: add support for HTML rules.
113 |             # We should split the rule into URL and HTML parts,
114 |             # convert URL part to a regex and parse the HTML part.
115 |             self.regex = ''
116 |         else:
117 |             self.regex = self.rule_to_regex(rule_text)
118 | 
119 |     def match_url(self, url, options=None):
120 |         """
121 |         Return if this rule matches the URL.
122 | 
123 |         What to do if rule is matched is up to developer. Most likely
124 |         ``.is_exception`` attribute should be taken in account.
125 |         """
126 |         options = options or {}
127 |         for optname in self.options:
128 |             if optname == 'match-case':  # TODO
129 |                 continue
130 | 
131 |             if optname not in options:
132 |                 raise ValueError("Rule requires option %s" % optname)
133 | 
134 |             if optname == 'domain':
135 |                 if not self._domain_matches(options['domain']):
136 |                     return False
137 |                 continue
138 | 
139 |             if options[optname] != self.options[optname]:
140 |                 return False
141 | 
142 |         return self._url_matches(url)
143 | 
144 |     def _domain_matches(self, domain):
145 |         domain_rules = self.options['domain']
146 |         for domain in _domain_variants(domain):
147 |             if domain in domain_rules:
148 |                 return domain_rules[domain]
149 |         return not any(domain_rules.values())
150 | 
151 |     def _url_matches(self, url):
152 |         if self.regex_re is None:
153 |             self.regex_re = re.compile(self.regex)
154 |         return bool(self.regex_re.search(url))
155 | 
156 |     def matching_supported(self, options=None):
157 |         """
158 |         Return whether this rule can return meaningful result,
159 |         given the `options` dict. If some options are missing,
160 |         then rule shouldn't be matched against, and this function
161 |         returns False.
162 | 
163 |         No options:
164 |         >>> rule = AdblockRule("swf|")
165 |         >>> rule.matching_supported({})
166 |         True
167 | 
168 |         Option is used in the rule, but its value is not available
169 |         at matching time:
170 |         >>> rule = AdblockRule("swf|$third-party")
171 |         >>> rule.matching_supported({})
172 |         False
173 | 
174 |         Option is used in the rule, and option value is available
175 |         at matching time:
176 |         >>> rule = AdblockRule("swf|$third-party")
177 |         >>> rule.matching_supported({'domain': 'example.com', 'third-party': False})
178 |         True
179 | 
180 |         Rule is a comment:
181 |         >>> rule = AdblockRule("!this is not a rule")
182 |         >>> rule.matching_supported({})
183 |         False
184 | 
185 |         """
186 |         if self.is_comment:
187 |             return False
188 | 
189 |         if self.is_html_rule:  # HTML rules are not supported yet
190 |             return False
191 | 
192 |         options = options or {}
193 |         keys = set(options.keys())
194 |         if not keys.issuperset(self._options_keys):
195 |             # some of the required options are not given
196 |             return False
197 | 
198 |         return True
199 | 
200 |     @classmethod
201 |     def _split_options(cls, options_text):
202 |         return cls.OPTIONS_SPLIT_RE.split(options_text)
203 | 
204 |     @classmethod
205 |     def _parse_domain_option(cls, text):
206 |         domains = text[len('domain='):]
207 |         parts = domains.replace(',', '|').split('|')
208 |         return dict(cls._parse_option_negation(p) for p in parts)
209 | 
210 |     @classmethod
211 |     def _parse_option_negation(cls, text):
212 |         return (text.lstrip('~'), not text.startswith('~'))
213 | 
214 |     @classmethod
215 |     def _parse_option(cls, text):
216 |         if text.startswith("domain="):
217 |             return ("domain", cls._parse_domain_option(text))
218 |         return cls._parse_option_negation(text)
219 | 
220 |     @classmethod
221 |     def rule_to_regex(cls, rule):
222 |         """
223 |         Convert AdBlock rule to a regular expression.
224 |         """
225 |         if not rule:
226 |             return rule
227 | 
228 |         # Check if the rule isn't already regexp
229 |         if rule.startswith('/') and rule.endswith('/'):
230 |             if len(rule) > 1:
231 |                 rule = rule[1:-1]
232 |             else:
233 |                 raise AdblockParsingError('Invalid rule')
234 |             return rule
235 | 
236 |         # escape special regex characters
237 |         rule = re.sub(r"([.$+?{}()\[\]\\])", r"\\\1", rule)
238 | 
239 |         # XXX: the resulting regex must use non-capturing groups (?:
240 |         # for performance reasons; also, there is a limit on number
241 |         # of capturing groups, no using them would prevent building
242 |         # a single regex out of several rules.
243 | 
244 |         # Separator character ^ matches anything but a letter, a digit, or
245 |         # one of the following: _ - . %. The end of the address is also
246 |         # accepted as separator.
247 |         rule = rule.replace("^", "(?:[^\w\d_\-.%]|$)")
248 | 
249 |         # * symbol
250 |         rule = rule.replace("*", ".*")
251 | 
252 |         # | in the end means the end of the address
253 |         if rule[-1] == '|':
254 |             rule = rule[:-1] + '$'
255 | 
256 |         # || in the beginning means beginning of the domain name
257 |         if rule[:2] == '||':
258 |             # XXX: it is better to use urlparse for such things,
259 |             # but urlparse doesn't give us a single regex.
260 |             # Regex is based on http://tools.ietf.org/html/rfc3986#appendix-B
261 |             if len(rule) > 2:
262 |                 #          |            | complete part     |
263 |                 #          |  scheme    | of the domain     |
264 |                 rule = r"^(?:[^:/?#]+:)?(?://(?:[^/?#]*\.)?)?" + rule[2:]
265 | 
266 |         elif rule[0] == '|':
267 |             # | in the beginning means start of the address
268 |             rule = '^' + rule[1:]
269 | 
270 |         # other | symbols should be escaped
271 |         # we have "|$" in our regexp - do not touch it
272 |         rule = re.sub("(\|)[^$]", r"\|", rule)
273 | 
274 |         return rule
275 | 
276 |     def __repr__(self):
277 |         return "AdblockRule(%r)" % self.raw_rule_text
278 | 
279 | 
class AdblockRules(object):
    """
    Matcher that checks URLs against a whole collection of AdBlock rules.

    Using it is more efficient than creating AdblockRule objects by hand
    and testing them one after another, because some common cases are
    optimized here (option-less rules are merged into one regex, rules
    requiring a domain are indexed by that domain).
    """

    def __init__(self, rules, supported_options=None, skip_unsupported_rules=True,
                 use_re2='auto', max_mem=256*1024*1024, rule_cls=AdblockRule):

        if supported_options is not None:
            self.supported_options = supported_options
        else:
            self.supported_options = rule_cls.BINARY_OPTIONS + ['domain']

        if use_re2 == 'auto':
            self.uses_re2 = _is_re2_supported()
        else:
            self.uses_re2 = use_re2
        self.re2_max_mem = max_mem
        self.rule_cls = rule_cls
        self.skip_unsupported_rules = skip_unsupported_rules

        # Keep only rules that carry something to match on (a regex or
        # options) and whose options are all among the supported ones.
        every_option_on = dict.fromkeys(self.supported_options, True)
        usable = []
        for candidate in rules:
            parsed = candidate if isinstance(candidate, rule_cls) else rule_cls(candidate)
            if not (parsed.regex or parsed.options):
                continue
            if parsed.matching_supported(every_option_on):
                usable.append(parsed)
        self.rules = usable

        # "advanced" rules have options, "basic" rules have none
        def has_options(rule):
            return rule.options

        with_options, without_options = split_data(self.rules, has_options)

        # Rules that require at least one specific domain are handled
        # separately: when the user passes a domain, rules which require
        # another domain can be discarded. So an index
        # {domain: [rules_which_require_it]} is built, and only the rules
        # which require the given domain are checked.
        # TODO: what about ~rules? Should we match them earlier?
        def needs_some_domain(rule):
            if 'domain' not in rule.options:
                return False
            return any(rule.options["domain"].values())

        domain_bound, domain_free = split_data(with_options, needs_some_domain)

        # option-less rules: one combined regex per black/white list
        self.blacklist, self.whitelist = self._split_bw(without_options)
        self.blacklist_re = _combined_regex(
            [rule.regex for rule in self.blacklist],
            use_re2=self.uses_re2, max_mem=max_mem
        )
        self.whitelist_re = _combined_regex(
            [rule.regex for rule in self.whitelist],
            use_re2=self.uses_re2, max_mem=max_mem
        )

        plain_lists = self._split_bw(domain_free)
        self.blacklist_with_options, self.whitelist_with_options = plain_lists
        indexed_lists = self._split_bw_domain(domain_bound)
        self.blacklist_require_domain, self.whitelist_require_domain = indexed_lists

    def should_block(self, url, options=None):
        """
        Return True if ``url`` is matched by a blocking rule and by no
        exception rule; ``options`` is a dict with option values.
        """
        # TODO: group rules with similar options and match them in bigger steps
        options = options or {}
        if self._is_whitelisted(url, options):
            return False
        return bool(self._is_blacklisted(url, options))

    def _is_whitelisted(self, url, options):
        """Check ``url`` against the exception (``@@``) rules."""
        return self._matches(
            url,
            options,
            self.whitelist_re,
            self.whitelist_require_domain,
            self.whitelist_with_options,
        )

    def _is_blacklisted(self, url, options):
        """Check ``url`` against the blocking rules."""
        return self._matches(
            url,
            options,
            self.blacklist_re,
            self.blacklist_require_domain,
            self.blacklist_with_options,
        )

    def _matches(self, url, options,
                 general_re, domain_required_rules, rules_with_options):
        """
        Tell whether ``url``/``options`` are matched by the given rule set.

        ``general_re`` is a compiled regex for rules without options
        (or None).

        ``domain_required_rules`` is a {domain: [rules_which_require_it]}
        mapping.

        ``rules_with_options`` is a list of AdblockRule instances that
        don't require any domain, but have other options.
        """
        if general_re and general_re.search(url):
            return True

        candidates = []
        if domain_required_rules and 'domain' in options:
            # collect rules registered for the domain or any parent domain
            for variant in _domain_variants(options['domain']):
                candidates.extend(domain_required_rules.get(variant, ()))

        candidates.extend(rules_with_options)

        if self.skip_unsupported_rules:
            candidates = [
                rule for rule in candidates
                if rule.matching_supported(options)
            ]

        for rule in candidates:
            if rule.match_url(url, options):
                return True
        return False

    @classmethod
    def _split_bw(cls, rules):
        """Split ``rules`` into a ``(blacklist, whitelist)`` pair."""
        def is_blocking(rule):
            return not rule.is_exception

        return split_data(rules, is_blocking)

    @classmethod
    def _split_bw_domain(cls, rules):
        """Like ``_split_bw``, but each list is turned into a domain index."""
        blocking, exceptions = cls._split_bw(rules)
        return cls._domain_index(blocking), cls._domain_index(exceptions)

    @classmethod
    def _domain_index(cls, rules):
        """Build a ``{domain: [rules requiring that domain]}`` dict."""
        index = {}
        for rule in rules:
            for name, required in rule.options.get('domain', {}).items():
                if required:
                    index.setdefault(name, []).append(rule)
        return index
415 | 
416 | 
417 | def _domain_variants(domain):
418 |     """
419 |     >>> list(_domain_variants("foo.bar.example.com"))
420 |     ['foo.bar.example.com', 'bar.example.com', 'example.com']
421 |     >>> list(_domain_variants("example.com"))
422 |     ['example.com']
423 |     >>> list(_domain_variants("localhost"))
424 |     ['localhost']
425 |     """
426 |     parts = domain.split('.')
427 |     if len(parts) == 1:
428 |         yield parts[0]
429 |     else:
430 |         for i in range(len(parts), 1, -1):
431 |             yield ".".join(parts[-i:])
432 | 
433 | 
434 | def _combined_regex(regexes, flags=re.IGNORECASE, use_re2=False, max_mem=None):
435 |     """
436 |     Return a compiled regex combined (using OR) from a list of ``regexes``.
437 |     If there is nothing to combine, None is returned.
438 | 
439 |     re2 library (https://github.com/axiak/pyre2) often can match and compile
440 |     large regexes much faster than stdlib re module (10x is not uncommon),
441 |     but there are some gotchas:
442 | 
443 |     * in case of "DFA out of memory" errors use ``max_mem`` argument
444 |       to increase the amount of memory re2 is allowed to use.
445 |     """
446 |     joined_regexes = "|".join(r for r in regexes if r)
447 |     if not joined_regexes:
448 |         return None
449 | 
450 |     if use_re2:
451 |         import re2
452 |         return re2.compile(joined_regexes, flags=flags, max_mem=max_mem)
453 |     return re.compile(joined_regexes, flags=flags)
454 | 
455 | 
456 | def _is_re2_supported():
457 |     try:
458 |         import re2
459 |     except ImportError:
460 |         return False
461 | 
462 |     # re2.match doesn't work in re2 v0.2.20 installed from pypi
463 |     # (it always returns None).
464 |     return re2.match('foo', 'foo') is not None
465 | 


--------------------------------------------------------------------------------
/adblockparser/utils.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from __future__ import absolute_import
 3 | 
 4 | 
 5 | def split_data(iterable, pred):
 6 |     """
 7 |     Split data from ``iterable`` into two lists.
 8 |     Each element is passed to function ``pred``; elements
 9 |     for which ``pred`` returns True are put into ``yes`` list,
10 |     other elements are put into ``no`` list.
11 | 
12 |     >>> split_data(["foo", "Bar", "Spam", "egg"], lambda t: t.istitle())
13 |     (['Bar', 'Spam'], ['foo', 'egg'])
14 |     """
15 |     yes, no = [], []
16 |     for d in iterable:
17 |         if pred(d):
18 |             yes.append(d)
19 |         else:
20 |             no.append(d)
21 |     return yes, no
22 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal=1
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from setuptools import setup
 3 | 
 4 | version = '0.7'
 5 | 
 6 | setup(
 7 |     name='adblockparser',
 8 |     version=version,
 9 |     description="Parser for Adblock Plus rules",
10 |     long_description=open('README.rst').read() + '\n\n' + open('CHANGES.rst').read(),
11 |     classifiers=[
12 |         'Development Status :: 4 - Beta',
13 |         "Intended Audience :: Developers",
14 |         "License :: OSI Approved :: MIT License",
15 |         "Programming Language :: Python :: 2",
16 |         "Programming Language :: Python :: 2.7",
17 |         "Programming Language :: Python :: 3",
18 |         "Programming Language :: Python :: 3.3",
19 |         "Programming Language :: Python :: 3.4",
20 |         "Programming Language :: Python :: 3.5",
21 |     ],
22 |     keywords='adblock easylist',
23 |     author='Mikhail Korobov',
24 |     author_email='kmike84@gmail.com',
25 |     url='https://github.com/scrapinghub/adblockparser',
26 |     license='MIT',
27 |     packages=['adblockparser'],
28 | )
29 | 


--------------------------------------------------------------------------------
/tests/test_options.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from __future__ import absolute_import
 3 | import pytest
 4 | from adblockparser import AdblockRule
 5 | 
 6 | SPLIT_OPTIONS_TESTS = [
 7 |     (
 8 |         "subdocument,third-party",
 9 |         ["subdocument", "third-party"]
10 |     ),
11 |     (
12 |         "object-subrequest,script,domain=~msnbc.msn.com,~www.nbcnews.com",
13 |         ["object-subrequest", "script", "domain=~msnbc.msn.com,~www.nbcnews.com"],
14 |     ),
15 |     (
16 |         "object-subrequest,script,domain=~msnbc.msn.com,~www.nbcnews.com",
17 |         ["object-subrequest", "script", "domain=~msnbc.msn.com,~www.nbcnews.com"],
18 |     ),
19 |     (
20 |         "~document,xbl,domain=~foo,bar,baz,~collapse,domain=foo.xbl|bar",
21 |         ["~document", "xbl", "domain=~foo,bar,baz", "~collapse", "domain=foo.xbl|bar"]
22 |     ),
23 |     (
24 |         "domain=~example.com,foo.example.com,script",
25 |         ["domain=~example.com,foo.example.com", "script"]
26 |     ),
27 | ]
28 | 
# (``domain=...`` option text, expected ``AdblockRule._parse_domain_option``
# result) pairs. In the expected dict a domain maps to False when it is
# prefixed with ``~`` in the option text and to True otherwise.
# Every input appears once: an exact duplicate case only re-runs the same
# assertion under a second test id.
DOMAIN_PARSING_TESTS = [
    ("domain=example.com", {'example.com': True}),
    ("domain=example.com|example.net", {
        'example.com': True,
        'example.net': True
    }),
    ("domain=~example.com", {'example.com': False}),
    ("domain=example.com|~foo.example.com", {
        'example.com': True,
        'foo.example.com': False
    }),
    ("domain=~foo.example.com|example.com", {
        'example.com': True,
        'foo.example.com': False
    }),
    ("domain=example.com,example.net", {
        'example.com': True,
        'example.net': True
    }),
    ("domain=~msnbc.msn.com,~www.nbcnews.com", {
        'msnbc.msn.com': False,
        'www.nbcnews.com': False
    }),
]
57 | 
58 | PARSE_OPTIONS_TESTS = [
59 |     ("domain=foo.bar", {}),
60 |     ("+Ads/$~stylesheet", {'stylesheet': False}),
61 |     ("-advertising-$domain=~advertise.bingads.domain.com", {
62 |         "domain": {'advertise.bingads.domain.com': False}
63 |     }),
64 |     (".se/?placement=$script,third-party", {
65 |         'script': True,
66 |         'third-party': True
67 |     }),
68 |     ("||tst.net^$object-subrequest,third-party,domain=domain1.com|domain5.com", {
69 |         'object-subrequest': True,
70 |         'third-party': True,
71 |         'domain': {
72 |             'domain1.com': True,
73 |             'domain5.com': True,
74 |         }
75 |     })
76 | ]
77 | 
@pytest.mark.parametrize(('text', 'result'), SPLIT_OPTIONS_TESTS)
def test_option_splitting(text, result):
    """``AdblockRule._split_options`` returns the expected list of options."""
    actual = AdblockRule._split_options(text)
    assert actual == result
81 | 
82 | 
@pytest.mark.parametrize(('text', 'result'), DOMAIN_PARSING_TESTS)
def test_domain_parsing(text, result):
    """``AdblockRule._parse_domain_option`` returns the expected domain mapping."""
    actual = AdblockRule._parse_domain_option(text)
    assert actual == result
86 | 
87 | 
@pytest.mark.parametrize(('text', 'result'), PARSE_OPTIONS_TESTS)
def test_options_extraction(text, result):
    """The ``options`` attribute of a parsed rule equals the expected dict."""
    extracted = AdblockRule(text).options
    assert extracted == result
92 | 
93 | 


--------------------------------------------------------------------------------
/tests/test_parsing.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from __future__ import absolute_import
  3 | from adblockparser import AdblockRules, AdblockRule, AdblockParsingError
  4 | 
  5 | import pytest
  6 | 
  7 | try:
  8 |     import re2
  9 |     USE_RE2 = [True, False, 'auto']
 10 | except Exception:
 11 |     USE_RE2 = ['auto']
 12 | 
 13 | # examples are from https://adblockplus.org/en/filter-cheatsheet
 14 | # and https://adblockplus.org/en/filters
 15 | DOCUMENTED_TESTS = {
 16 |     "/banner/*/img^": {
 17 |         "blocks": [
 18 |             "http://example.com/banner/foo/img",
 19 |             "http://example.com/banner/foo/bar/img?param",
 20 |             "http://example.com/banner//img/foo",
 21 |         ],
 22 |         "doesn't block": [
 23 |             "http://example.com/banner/img",
 24 |             "http://example.com/banner/foo/imgraph",
 25 |             "http://example.com/banner/foo/img.gif",
 26 |         ]
 27 |     },
 28 | 
 29 |     "||ads.example.com^": {
 30 |         "blocks": [
 31 |             "http://ads.example.com/foo.gif",
 32 |             "http://server1.ads.example.com/foo.gif",
 33 |             "https://ads.example.com:8000/",
 34 |         ],
 35 |         "doesn't block": [
 36 |             "http://ads.example.com.ua/foo.gif",
 37 |             "http://example.com/redirect/http://ads.example.com/",
 38 |         ]
 39 |     },
 40 | 
 41 |     "|http://example.com/|": {
 42 |         "blocks": [
 43 |             "http://example.com/",
 44 |         ],
 45 |         "doesn't block": [
 46 |             "http://example.com/foo.gif",
 47 |             "http://example.info/redirect/http://example.com/",
 48 |         ]
 49 |     },
 50 | 
 51 |     "swf|": {
 52 |         "blocks": ["http://example.com/annoyingflash.swf"],
 53 |         "doesn't block": ["http://example.com/swf/index.html"]
 54 |     },
 55 | 
 56 |     "|http://baddomain.example/": {
 57 |         "blocks": ["http://baddomain.example/banner.gif"],
 58 |         "doesn't block": ["http://gooddomain.example/analyze?http://baddomain.example"]
 59 |     },
 60 | 
 61 |     "||example.com/banner.gif": {
 62 |         "blocks": [
 63 |             "http://example.com/banner.gif",
 64 |             "https://example.com/banner.gif",
 65 |             "http://www.example.com/banner.gif",
 66 |         ],
 67 |         "doesn't block": [
 68 |             "http://badexample.com/banner.gif",
 69 |             "http://gooddomain.example/analyze?http://example.com/banner.gif",
 70 |         ]
 71 |     },
 72 | 
 73 |     "http://example.com^": {
 74 |         "blocks": [
 75 |             "http://example.com/",
 76 |             "http://example.com:8000/ ",
 77 |         ],
 78 |         "doesn't block": [
 79 |             "http://example.com.ar/",
 80 |         ]
 81 |     },
 82 | 
 83 |     "^example.com^": {
 84 |         "blocks": ["http://example.com:8000/foo.bar?a=12&b=%D1%82%D0%B5%D1%81%D1%82"],
 85 |         "doesn't block": []
 86 |     },
 87 | 
 88 |     "^%D1%82%D0%B5%D1%81%D1%82^": {
 89 |         "blocks": ["http://example.com:8000/foo.bar?a=12&b=%D1%82%D0%B5%D1%81%D1%82"],
 90 |         "doesn't block": []
 91 |     },
 92 | 
 93 |     "^foo.bar^": {
 94 |         "blocks": ["http://example.com:8000/foo.bar?a=12&b=%D1%82%D0%B5%D1%81%D1%82"],
 95 |         "doesn't block": []
 96 |     },
 97 | }
 98 | 
 99 | 
# Maps a tuple of rules to the URLs that the combined rule set is expected
# to block ("blocks") and to let through ("doesn't block").
RULE_EXCEPTION_TESTS = {
    ("adv", "@@advice."): {
        "blocks": ["http://example.com/advert.html"],
        "doesn't block": ["http://example.com/advice.html"]
    },
    ("@@advice.", "adv"): {
        "blocks": ["http://example.com/advert.html"],
        "doesn't block": ["http://example.com/advice.html"]
    },
    ("@@|http://example.com", "@@advice.", "adv", "!foo"): {
        "blocks": [
            "http://examples.com/advert.html"
        ],
        "doesn't block": [
            # Every URL needs its own trailing comma: adjacent string
            # literals are silently concatenated into a single bogus URL,
            # which would leave the individual URLs untested.
            "http://example.com/advice.html",
            "http://example.com/advert.html",
            "http://examples.com/advice.html",
            "http://examples.com/#!foo",
        ]
    },
}
121 | 
122 | 
123 | RULES_WITH_OPTIONS_TESTS = {
124 |     # rule: url, params, matches?
125 |     "||example.com": [
126 |         ("http://example.com", {'third-party': True}, True),
127 |         ("http://example2.com", {'third-party': True}, False),
128 |         ("http://example.com", {'third-party': False}, True),
129 |     ],
130 |     "||example.com^$third-party": [
131 |         ("http://example.com", {'third-party': True}, True),
132 |         ("http://example2.com", {'third-party': True}, False),
133 |         ("http://example.com", {'third-party': False}, False),
134 |     ],
135 |     "||example.com^$third-party,~script": [
136 |         ("http://example.com", {'third-party': True, 'script': True}, False),
137 |         ("http://example.com", {'third-party': True, 'script': False}, True),
138 |         ("http://example2.com", {'third-party': True, 'script': False}, False),
139 |         ("http://example.com", {'third-party': False, 'script': False}, False),
140 |     ],
141 | 
142 |     "adv$domain=example.com|example.net": [
143 |         ("http://example.net/adv", {'domain': 'example.net'}, True),
144 |         ("http://somewebsite.com/adv", {'domain': 'example.com'}, True),
145 |         ("http://www.example.net/adv", {'domain': 'www.example.net'}, True),
146 |         ("http://my.subdomain.example.com/adv", {'domain': 'my.subdomain.example.com'}, True),
147 | 
148 |         ("http://example.com/adv", {'domain': 'badexample.com'}, False),
149 |         ("http://example.com/adv", {'domain': 'otherdomain.net'}, False),
150 |         ("http://example.net/ad", {'domain': 'example.net'}, False),
151 |     ],
152 | 
153 |     "adv$domain=example.com|~foo.example.com": [
154 |         ("http://example.net/adv", {'domain': 'example.com'}, True),
155 |         ("http://example.net/adv", {'domain': 'foo.example.com'}, False),
156 |         ("http://example.net/adv", {'domain': 'www.foo.example.com'}, False),
157 |     ],
158 | 
159 |     "adv$domain=~example.com|foo.example.com": [
160 |         ("http://example.net/adv", {'domain': 'example.com'}, False),
161 |         ("http://example.net/adv", {'domain': 'foo.example.com'}, True),
162 |         ("http://example.net/adv", {'domain': 'www.foo.example.com'}, True),
163 |     ],
164 | 
165 |     "adv$domain=~example.com": [
166 |         ("http://example.net/adv", {'domain': 'otherdomain.com'}, True),
167 |         ("http://somewebsite.com/adv", {'domain': 'example.com'}, False),
168 |     ],
169 | 
170 |     "adv$domain=~example.com|~example.net": [
171 |         ("http://example.net/adv", {'domain': 'example.net'}, False),
172 |         ("http://somewebsite.com/adv", {'domain': 'example.com'}, False),
173 |         ("http://www.example.net/adv", {'domain': 'www.example.net'}, False),
174 |         ("http://my.subdomain.example.com/adv", {'domain': 'my.subdomain.example.com'}, False),
175 | 
176 |         ("http://example.com/adv", {'domain': 'badexample.com'}, True),
177 |         ("http://example.com/adv", {'domain': 'otherdomain.net'}, True),
178 |         ("http://example.net/ad", {'domain': 'example.net'}, False),
179 |     ],
180 | 
181 |     "adv$domain=example.com|~example.net": [
182 |         # ~example.net should be ignored here
183 |         ("http://example.net/adv", {'domain': 'example.net'}, False),
184 |         ("http://somewebsite.com/adv", {'domain': 'example.com'}, True),
185 |         ("http://www.example.net/adv", {'domain': 'www.example.net'}, False),
186 |         ("http://my.subdomain.example.com/adv", {'domain': 'my.subdomain.example.com'}, True),
187 | 
188 |         ("http://example.com/adv", {'domain': 'badexample.com'}, False),
189 |         ("http://example.com/adv", {'domain': 'otherdomain.net'}, False),
190 |         ("http://example.net/ad", {'domain': 'example.net'}, False),
191 |     ],
192 | 
193 |     "adv$domain=example.com,~foo.example.com,script": [
194 |         ("http://example.net/adv", {'domain': 'example.com', 'script': True}, True),
195 |         ("http://example.net/adv", {'domain': 'foo.example.com', 'script': True}, False),
196 |         ("http://example.net/adv", {'domain': 'www.foo.example.com', 'script': True}, False),
197 | 
198 |         ("http://example.net/adv", {'domain': 'example.com', 'script': False}, False),
199 |         ("http://example.net/adv", {'domain': 'foo.example.com', 'script': False}, False),
200 |         ("http://example.net/adv", {'domain': 'www.foo.example.com', 'script': False}, False),
201 |     ],
202 | 
203 |     "$websocket,domain=extratorrent.cc|firstrowau.eu": [
204 |         ("http://example.com", {'domain': 'extratorrent.cc', 'websocket': True}, True),
205 |         ("http://example.com", {'domain': 'extratorrent.cc', 'websocket': False}, False),
206 |     ]
207 | }
208 | 
209 | MULTIRULES_WITH_OPTIONS_TESTS = {
210 |     # rules: url, params, should_block
211 |     ("adv", "@@advice.$~script"): [
212 |         ("http://example.com/advice.html", {'script': False}, False),
213 |         ("http://example.com/advice.html", {'script': True}, True),
214 |         ("http://example.com/advert.html", {'script': False}, True),
215 |         ("http://example.com/advert.html", {'script': True}, True),
216 |     ],
217 | }
218 | 
@pytest.mark.parametrize('use_re2', USE_RE2)
@pytest.mark.parametrize(('rule_text', 'results'), DOCUMENTED_TESTS.items())
def test_documented_examples(rule_text, results, use_re2):
    """
    Check each documented rule both as a single ``AdblockRule`` and as a
    one-rule ``AdblockRules`` set against its listed URLs.
    """
    single_rule = AdblockRule(rule_text)
    rule_set = AdblockRules([rule_text], use_re2=use_re2)

    for blocked_url in results["blocks"]:
        assert single_rule.match_url(blocked_url)
        assert rule_set.should_block(blocked_url)

    for allowed_url in results["doesn't block"]:
        assert not single_rule.match_url(allowed_url)
        assert not rule_set.should_block(allowed_url)
232 | 
233 | 
@pytest.mark.parametrize('use_re2', USE_RE2)
@pytest.mark.parametrize(('rules', 'results'), RULE_EXCEPTION_TESTS.items())
def test_rule_exceptions(rules, results, use_re2):
    """
    Build a rule set from ``rules`` and check which of the listed URLs
    it blocks and which it lets through.
    """
    rule_set = AdblockRules(rules, use_re2=use_re2)

    for blocked_url in results["blocks"]:
        assert rule_set.should_block(blocked_url)

    for allowed_url in results["doesn't block"]:
        assert not rule_set.should_block(allowed_url)
244 | 
245 | 
@pytest.mark.parametrize('use_re2', USE_RE2)
@pytest.mark.parametrize(('rule_text', 'results'), RULES_WITH_OPTIONS_TESTS.items())
def test_rule_with_options(rule_text, results, use_re2):
    """
    For every (url, params, expected) triple, both the single rule and
    the one-rule set must give the expected answer.
    """
    single_rule = AdblockRule(rule_text)
    rule_set = AdblockRules([rule_text], use_re2=use_re2)

    for url, params, expected in results:
        assert single_rule.match_url(url, params) == expected
        assert rule_set.should_block(url, params) == expected
255 | 
256 | 
@pytest.mark.parametrize('use_re2', USE_RE2)
@pytest.mark.parametrize(('rules', 'results'), MULTIRULES_WITH_OPTIONS_TESTS.items())
def test_rules_with_options(rules, results, use_re2):
    """
    A rule set built from several rules must give the expected blocking
    decision for every (url, params, should_block) triple.
    """
    rule_set = AdblockRules(rules, use_re2=use_re2)
    for url, params, expected in results:
        decision = rule_set.should_block(url, params)
        assert decision == expected
263 | 
264 | 
def test_regex_rules():
    """
    A rule written between slashes blocks a URL with digits after
    "banner" and does not block one without digits.
    """
    # Raw string: "\d" is not a valid escape sequence in a normal string
    # literal and triggers a deprecation/syntax warning on newer Pythons.
    # The resulting rule text is unchanged.
    rules = AdblockRules([r"/banner\d+/"])
    assert rules.should_block("banner123")
    assert not rules.should_block("banners")
269 | 
270 | 
def test_rules_supported_options():
    """
    The ``@@advice.$~script`` exception applies with the default
    ``supported_options`` but not with an empty ``supported_options`` list.
    """
    url = "http://example.com/advice.html"

    default_rules = AdblockRules(["adv", "@@advice.$~script"])
    blocked_by_default = default_rules.should_block(url, {'script': False})
    assert not blocked_by_default

    # exception rule should be discarded if "script" option is not supported
    restricted_rules = AdblockRules(["adv", "@@advice.$~script"], supported_options=[])
    blocked_when_restricted = restricted_rules.should_block(url, {'script': False})
    assert blocked_when_restricted
278 | 
279 | 
def test_rules_instantiation():
    """``AdblockRules`` accepts ready-made ``AdblockRule`` objects."""
    url = "http://example.com/adv"
    prebuilt = AdblockRule("adv")
    rule_set = AdblockRules([prebuilt])
    assert prebuilt.match_url(url)
    assert rule_set.should_block(url)
285 | 
286 | 
def test_empty_rules():
    """Empty and whitespace-only rule strings are not kept in ``rules``."""
    raw_rules = ["adv", "", " \t", AdblockRule("adv2")]
    rule_set = AdblockRules(raw_rules)
    kept = len(rule_set.rules)
    assert kept == 2
290 | 
291 | 
def test_empty_regexp_rules():
    """Building a rule set that contains '/' and '//' raises ``AdblockParsingError``."""
    raw_rules = ['adv', '/', '//']
    with pytest.raises(AdblockParsingError):
        AdblockRules(raw_rules)
295 | 


--------------------------------------------------------------------------------
/tests/test_rule_types.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from __future__ import absolute_import
 3 | import pytest
 4 | from adblockparser import AdblockRule
 5 | 
 6 | COMMENT_RULES = [
 7 |     "[Adblock Plus 2.0]",
 8 |     "! Checksum: nVIXktYXKU6M+cu+Txkhuw",
 9 |     "!/cb.php?sub$script,third-party",
10 |     "!@@/cb.php?sub",
11 |     "!###ADSLOT_SKYSCRAPER",
12 |     "! *** easylist:easylist/easylist_whitelist_general_hide.txt ***",
13 | ]
14 | 
15 | HTML_RULES = [
16 |     "###ADSLOT_SKYSCRAPER",
17 |     "@@###ADSLOT_SKYSCRAPER",
18 |     "##.adsBox",
19 |     "eee.se#@##adspace_top",
20 |     "domain1.com,domain2.com#@##adwrapper",
21 |     "edgesuitedomain.net#@##ad-unit",
22 |     "mydomain.com#@#.ad-unit",
23 |     '##a[href^="http://affiliate.sometracker.com/"]',
24 | ]
25 | 
26 | 
@pytest.mark.parametrize("text", COMMENT_RULES)
def test_is_comment(text):
    """
    A comment line is flagged as a comment only, and carries neither
    options nor a regex.
    """
    parsed = AdblockRule(text)
    assert parsed.is_comment
    assert not parsed.is_html_rule
    assert not parsed.is_exception
    assert parsed.options == {}
    assert not parsed.regex
35 | 
36 | 
@pytest.mark.parametrize("text", HTML_RULES)
def test_is_html_rule(text):
    """An element-hiding rule is flagged as an HTML rule, not as a comment."""
    parsed = AdblockRule(text)
    assert parsed.is_html_rule
    assert not parsed.is_comment
42 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | ; this is a tox config for running tests
 2 | ; under different Python interpreters
 3 | 
 4 | [tox]
 5 | envlist = py27,py33,py34,py35,pypy,py27-re2
 6 | 
 7 | [testenv]
 8 | deps=
 9 |     pytest
10 |     pytest-cov
11 | 
12 | commands=
13 |     ; a workaround for tox bug: https://bitbucket.org/hpk42/tox/issue/176/
14 |     pip install -I {toxinidir}
15 |     py.test \
16 |         --cov=adblockparser --cov-report=html --cov-report=term \
17 |         --doctest-modules --doctest-glob *.rst \
18 |         {posargs:adblockparser tests README.rst}
19 | 
20 | [re2]
21 | deps = re2 >= 0.2.21
22 | 
23 | [testenv:py27-re2]
24 | basepython = python2.7
25 | deps=
26 |     {[testenv]deps}
27 |     {[re2]deps}
28 | 


--------------------------------------------------------------------------------