├── .coveragerc ├── .gitignore ├── .travis.yml ├── LICENSE.txt ├── README.rst ├── maybedont ├── __init__.py ├── predict.py ├── scrapy_middleware.py └── utils.py ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── test_middleware.py └── test_predict.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .ipynb_checkpoints/ 3 | venv/ 4 | build/ 5 | dist/ 6 | *.egg-info/ 7 | .tox 8 | .idea 9 | htmlcov 10 | .coverage 11 | .cache 12 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3.5 3 | sudo: false 4 | 5 | branches: 6 | only: 7 | - master 8 | - /^\d\.\d+$/ 9 | 10 | install: 11 | - pip install -U tox codecov 12 | 13 | script: tox 14 | 15 | after_success: 16 | - codecov 17 | 18 | cache: 19 | directories: 20 | - $HOME/.cache/pip 21 | 22 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2018 Hyperion Gray 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so, 8 | subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | MaybeDont 2 | ========= 3 | 4 | .. image:: https://img.shields.io/pypi/v/MaybeDont.svg 5 | :target: https://pypi.python.org/pypi/MaybeDont 6 | :alt: PyPI Version 7 | 8 | .. image:: https://img.shields.io/travis/TeamHG-Memex/MaybeDont/master.svg 9 | :target: http://travis-ci.org/TeamHG-Memex/MaybeDont 10 | :alt: Build Status 11 | 12 | .. image:: https://codecov.io/github/TeamHG-Memex/MaybeDont/coverage.svg?branch=master 13 | :target: https://codecov.io/github/TeamHG-Memex/MaybeDont?branch=master 14 | :alt: Code Coverage 15 | 16 | .. contents:: 17 | 18 | MaybeDont is a library that helps avoid downloading pages with duplicate 19 | content during crawling. It learns which URL components are important and 20 | which are not important during crawling, and tries to predict whether the page 21 | will be a duplicate based on its URL.
22 | 23 | The idea is that if you have a crawler that just 24 | follows all links, it might download a lot of duplicate pages: for example, 25 | for a forum there might be pages like ``/view.php?topicId=10`` and 26 | ``/view.php?topicId=10&start=0`` - the only difference is the added ``start=0``, 27 | and the content of these pages is likely the same. If we knew that adding 28 | ``start=0`` does not change the content, then we would avoid downloading the page 29 | ``/view.php?topicId=10&start=0`` if we had already fetched 30 | ``/view.php?topicId=10``, and thus save time and bandwidth. 31 | 32 | 33 | Duplicate detector 34 | ------------------ 35 | 36 | ``maybedont.DupePredictor`` collects statistics about page URLs and contents, and 37 | is able to predict whether a new URL will bring any new content. 38 | 39 | First, initialize a ``DupePredictor``:: 40 | 41 | from maybedont import DupePredictor 42 | dp = DupePredictor( 43 | texts_sample=[page_1, page_2, page_3], 44 | jaccard_threshold=0.9) # default value 45 | 46 | ``texts_sample`` is a list of page contents. It can be omitted, but it is 47 | recommended to provide it: it is used to learn which parts of the page are 48 | common to a lot of the site's pages, and to exclude these parts from the duplicate 49 | comparison. This helps with pages where the content is small relative to 50 | the site chrome (footer, header, etc.): without removing the chrome, all such 51 | pages would be considered duplicates, as only a tiny fraction of the content 52 | changes. 53 | 54 | Next, we can update the ``DupePredictor`` model with downloaded pages:: 55 | 56 | dp.update_model(url_4, text_4) 57 | dp.update_model(url_5, text_5) 58 | 59 | After a while, ``DupePredictor`` will learn which arguments in URLs 60 | are important, and which can be safely ignored. 61 | ``DupePredictor.get_dupe_prob`` returns the probability of a URL being 62 | a duplicate of some content that has already been seen:: 63 | 64 | dp.get_dupe_prob(url_6) 65 | 66 | Runtime overhead should not be too large: on a crawl with < 100k pages, 67 | the expected time to update the model is 1-5 ms, and below 1 ms 68 | to get the probability. All visited URLs and hashes of content are stored 69 | in memory, along with some indexing structures. 70 | 71 | 72 | Install 73 | ------- 74 | 75 | :: 76 | 77 | pip install MaybeDont 78 | 79 | 80 | Spider middleware 81 | ----------------- 82 | 83 | If you have a `Scrapy <https://scrapy.org>`_ spider, 84 | or are looking for inspiration for a spider 85 | middleware, check out ``maybedont.scrapy_middleware.AvoidDupContentMiddleware``. 86 | First, it collects a queue of documents to learn which page elements 87 | are common on the site, in order to exclude them from content comparison. 88 | After that it builds its ``DupePredictor``, updates it with crawled pages 89 | (only textual pages are taken into account), and starts dropping requests 90 | for duplicate content once it gets confident enough. Not all requests for 91 | duplicates are dropped: with a small probability (currently 5%) requests 92 | are carried out anyway. This makes duplicate detection more robust against 93 | changes in site URL or content structure as the crawl progresses. 94 | 95 | To enable the middleware, the following settings are required:: 96 | 97 | AVOID_DUP_CONTENT_ENABLED = True 98 | DOWNLOADER_MIDDLEWARES['maybedont.scrapy_middleware.AvoidDupContentMiddleware'] = 200 99 | 100 | The middleware is only applied to requests with ``avoid_dup_content`` in 101 | ``request.meta``.
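For example, a spider might enable the middleware and mark its requests like this (a minimal sketch: the spider name, domain and link extraction are illustrative placeholders, while the settings and the ``avoid_dup_content`` meta key are the ones described above)::

    # settings.py
    AVOID_DUP_CONTENT_ENABLED = True
    DOWNLOADER_MIDDLEWARES = {
        'maybedont.scrapy_middleware.AvoidDupContentMiddleware': 200,
    }

    # spider
    import scrapy

    class ForumSpider(scrapy.Spider):
        name = 'forum'
        start_urls = ['http://example.com/']

        def parse(self, response):
            for href in response.css('a::attr(href)').extract():
                # Only requests carrying this meta flag are considered
                # by AvoidDupContentMiddleware; others are never dropped.
                yield scrapy.Request(
                    response.urljoin(href),
                    meta={'avoid_dup_content': True},
                    callback=self.parse)

If ``AVOID_DUP_CONTENT_ENABLED`` is not set, the middleware raises ``NotConfigured`` and stays inactive.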
102 | 103 | Optional settings: 104 | 105 | - ``AVOID_DUP_CONTENT_THRESHOLD = 0.98`` - the minimum duplicate probability at which requests 106 | are dropped. 107 | - ``AVOID_DUP_CONTENT_EXPLORATION = 0.05`` - the probability of still making a 108 | request that would otherwise be dropped. 109 | - ``AVOID_DUP_CONTENT_INITIAL_QUEUE_LIMIT = 300`` - the number of pages that 110 | should be downloaded before the DupePredictor is initialized. 111 | 112 | 113 | How it works 114 | ------------ 115 | 116 | Duplicate detection is based on ``MinHashLSH`` from the 117 | `datasketch <https://github.com/ekzhu/datasketch>`_ library. Text 118 | 4-shingles of words are used for hashing, 119 | not spanning line breaks in the extracted text. 120 | 121 | Several hypotheses about duplicates are tested: 122 | 123 | 1. All URLs with a given URL path are the same (have the same content), 124 | regardless of query parameters; 125 | 2. All URLs which only differ in a given URL query parameter are the same 126 | (e.g. session tokens can be detected this way); 127 | 3. All URLs which have a given path and only differ in a given URL 128 | query parameter are the same; 129 | 4. All URLs which have a given path and query string and only differ 130 | in a single given query parameter are the same; 131 | 5. URLs are the same if they have the same path and only differ 132 | in that some of them have a given param=value query argument added; 133 | 6. URLs are the same if they have a given path and only differ 134 | in a given param=value query argument. 135 | 136 | A Bernoulli distribution is fit for each hypothesis. 137 | 138 | 139 | License 140 | ------- 141 | 142 | License is MIT. 143 | 144 | ---- 145 | 146 | .. image:: https://hyperiongray.s3.amazonaws.com/define-hg.svg 147 | :target: https://www.hyperiongray.com/?pk_campaign=github&pk_kwd=MaybeDont 148 | :alt: define hyperiongray 149 | -------------------------------------------------------------------------------- /maybedont/__init__.py: -------------------------------------------------------------------------------- 1 | from maybedont.predict import DupePredictor 2 | -------------------------------------------------------------------------------- /maybedont/predict.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division 2 | import logging, random, math 3 | from collections import namedtuple, defaultdict 4 | from six.moves.urllib.parse import urlsplit, parse_qs 5 | 6 | from datasketch import MinHashLSH 7 | 8 | from .utils import get_too_common_shingles, get_min_hash, canonicalize_url 9 | 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class DupePredictor(object): 15 | """ Learn to predict if the content is duplicate by the URL. 16 | """ 17 | def __init__(self, texts_sample=None, jaccard_threshold=0.9, num_perm=128, 18 | storage_config=None): 19 | """ Initialize DupePredictor. 20 | :param jaccard_threshold: the minimal Jaccard similarity for pages 21 | to be considered duplicates (intersection of content / union of content). 22 | :param texts_sample: a list of texts to calculate too_common_shingles 23 | - this allows more precise duplicate detection, because now 24 | we know which parts are common to all pages, and which are unique 25 | for each page. 26 | :param storage_config: configuration for a redis backend to persist 27 | minhashes in. Using this backend makes DupePredictor instances 28 | persistent across restarts. The configuration format is: 29 | storage_config={'type': 'redis', 'redis': {'host': 'localhost', 'port': 6379}}.
30 | See https://ekzhu.github.io/datasketch/lsh.html#minhash-lsh-at-scale 31 | """ 32 | self.jaccard_threshold = jaccard_threshold 33 | self.num_perm = num_perm 34 | self.storage_config = storage_config 35 | if storage_config: 36 | self.lsh = MinHashLSH( 37 | threshold=self.jaccard_threshold, num_perm=self.num_perm, 38 | storage_config=self.storage_config) 39 | else: 40 | self.lsh = MinHashLSH( 41 | threshold=self.jaccard_threshold, num_perm=self.num_perm) 42 | self.too_common_shingles = set() 43 | if texts_sample: 44 | self.too_common_shingles = get_too_common_shingles(texts_sample) 45 | 46 | self.seen_urls = {} # url: URLMeta 47 | self.urls_by_path = defaultdict(set) # path: {url} 48 | self.urls_by_path_q = defaultdict(set) # (path, q): {url} 49 | self.urls_by_path_qwp = defaultdict(set) # (path, param, q): {url} 50 | self.params_by_path = defaultdict(set) # path: {param} 51 | self.param_values = defaultdict(set) # (path, param): {value} 52 | 53 | # Duplicate hypotheses: 54 | # (1) All items with same path are duplicates. Key is (path,) 55 | self.path_dupstats = defaultdict(DupStat) 56 | # (2) All items with same path that differ only in given param are 57 | # duplicates. Key is (param,) 58 | self.param_dupstats = defaultdict(DupStat) 59 | # (3) Same but conditioned by path, key is (path, param) 60 | self.path_param_dupstats = defaultdict(DupStat) 61 | # (4) Same but conditioned by path + the rest of the query 62 | # Key is (path, query, param) 63 | self.path_query_param_dupstats = defaultdict(DupStat) 64 | # (5) All items with same path with only added param=value are duplicates 65 | # Key is (param, value) 66 | self.param_value_dupstats = defaultdict(DupStat) 67 | # (6) Same but conditioned by path, key is (path, param, value) 68 | self.path_param_value_dupstats = defaultdict(DupStat) 69 | # TODO - more powerful hypotheses: 70 | # - param + value without path 71 | # - more than one get param 72 | 73 | def get_dupe_prob(self, url): 74 | """ A probability of given url being a duplicate of some content 75 | that has already been seen. 76 | """ 77 | path, query = _parse_url(url) 78 | dupestats = [] 79 | extend_ds = lambda x: dupestats.extend(filter(None, ( 80 | ds_dict.get(key) for ds_dict, key in x))) 81 | if self.urls_by_path.get(path): 82 | extend_ds([(self.path_dupstats, path)]) 83 | # If param is in the query 84 | for param, value in query.items(): 85 | qwp_key = _q_key(_without_key(query, param)) 86 | # Have we seen the query with param changed or removed? 87 | has_changed = self.urls_by_path_qwp.get((path, param, qwp_key)) 88 | has_removed = self.urls_by_path_q.get((path, qwp_key)) 89 | if has_changed or has_removed: 90 | extend_ds(self._param_dupstats(path, param, qwp_key)) 91 | if has_removed: 92 | extend_ds(self._param_value_dupstats(path, param, value)) 93 | # If param is not in the query, but we've crawled a page where it is 94 | q_key = _q_key(query) 95 | for param in (self.params_by_path.get(path, set()) - set(query)): 96 | if self.urls_by_path_qwp.get((path, param, q_key)): 97 | extend_ds(self._param_dupstats(path, param, q_key)) 98 | # FIXME - this could be a long list of param values, 99 | # it's better to somehow store only high-probability values? 100 | for value in self.param_values.get((path, param), set()): 101 | extend_ds(self._param_value_dupstats(path, param, value)) 102 | return max(ds.get_prob() for ds in dupestats) if dupestats else 0. 103 | 104 | def update_model(self, url, text): 105 | """ Update prediction model with a page by given url and text content.
106 | Return a list of item duplicates (for testing purposes). 107 | """ 108 | min_hash = get_min_hash(text, self.too_common_shingles, self.num_perm) 109 | item_url = canonicalize_url(url) 110 | item_path, item_query = _parse_url(item_url) 111 | all_duplicates = [ 112 | (url, self.seen_urls[url]) for url in self.lsh.query(min_hash)] 113 | duplicates = [(url, m.query) for url, m in all_duplicates 114 | if m.path == item_path] 115 | # Hypothesis (1) - just paths 116 | n_path_nodup = self._nodup_filter(min_hash, ( 117 | self.urls_by_path.get(item_path, set()) 118 | .difference(url for url, _ in duplicates))) 119 | self.path_dupstats[item_path].update(len(duplicates), n_path_nodup) 120 | # Other hypotheses, if param is in the query 121 | for param, value in item_query.items(): 122 | self._update_with_param( 123 | duplicates, min_hash, item_path, item_query, param, [value]) 124 | # Other hypotheses, if param is not in the query 125 | for param in ( 126 | self.params_by_path.get(item_path, set()) - set(item_query)): 127 | self._update_with_param( 128 | duplicates, min_hash, item_path, item_query, param, 129 | self.param_values.get((item_path, param), set())) 130 | # Update indexes 131 | for param, value in item_query.items(): 132 | self.urls_by_path_q[item_path, _q_key(item_query)].add(item_url) 133 | item_qwp_key = _q_key(_without_key(item_query, param)) 134 | self.urls_by_path_qwp[item_path, param, item_qwp_key].add(item_url) 135 | self.params_by_path[item_path].add(param) 136 | self.param_values[item_path, param].add(value) 137 | if not item_query: 138 | self.urls_by_path_q[item_path, ()].add(item_url) 139 | self.urls_by_path[item_path].add(item_url) 140 | if item_url in self.lsh: 141 | self.lsh.remove(item_url) 142 | self.lsh.insert(item_url, min_hash) 143 | self.seen_urls[item_url] = URLMeta(item_path, item_query, min_hash) 144 | if len(self.seen_urls) % 100 == 0: 145 | self.log_dupstats() 146 | return all_duplicates 147 | 148 | def _update_with_param(self, duplicates, min_hash, item_path, item_query, 149 | param, values): 150 | # qwp = "query without param" 151 | item_qwp = _without_key(item_query, param) 152 | item_qwp_key = _q_key(item_qwp) 153 | 154 | q_dup = {url for url, q in duplicates 155 | if _without_key(q, param) == item_qwp} 156 | n_q_nodup = self._nodup_filter(min_hash, ( 157 | self.urls_by_path_qwp.get((item_path, param, item_qwp_key), set()) 158 | .union(self.urls_by_path_q.get((item_path, item_qwp_key), set())) 159 | .difference(q_dup))) 160 | if q_dup or n_q_nodup: 161 | for ds_dict, key in self._param_dupstats( 162 | item_path, param, item_qwp_key): 163 | ds_dict[key].update(len(q_dup), n_q_nodup) 164 | if values: 165 | if param in item_query: 166 | qv_dup = {url for url, q in duplicates if q == item_qwp} 167 | n_qv_nodup = self._nodup_filter(min_hash, ( 168 | self.urls_by_path_q.get((item_path, item_qwp_key), set()) 169 | .difference(qv_dup))) 170 | # FIXME - this could be a long list of param values, 171 | # it's better to somehow store only high-probability values? 
172 | for value in values: 173 | if param not in item_query: 174 | qv_dup = {url for url, q in duplicates 175 | if q.get(param) == value and 176 | _without_key(q, param) == item_qwp} 177 | qap_key = _q_key(_with_key_val(item_query, param, value)) 178 | n_qv_nodup = self._nodup_filter(min_hash, ( 179 | self.urls_by_path_q.get((item_path, qap_key), set()) 180 | .difference(qv_dup))) 181 | if qv_dup or n_qv_nodup: 182 | for ds_dict, key in self._param_value_dupstats( 183 | item_path, param, value): 184 | ds_dict[key].update(len(qv_dup), n_qv_nodup) 185 | 186 | def _param_dupstats(self, path, param, qwp_key): 187 | return [ 188 | (self.param_dupstats, param), 189 | (self.path_param_dupstats, (path, param)), 190 | (self.path_query_param_dupstats, (path, param, qwp_key)), 191 | ] 192 | 193 | def _param_value_dupstats(self, path, param, value): 194 | return [ 195 | (self.param_value_dupstats, (param, value)), 196 | (self.path_param_value_dupstats, (path, param, value)), 197 | ] 198 | 199 | def _nodup_filter(self, min_hash, all_urls, max_sample=200): 200 | """ This filters results that are considered not duplicates. 201 | But we really need to check that, because lsh.query does not always 202 | return ALL duplicates, esp. when there are a lot of them, so 203 | here we double-check and return only urls that are NOT duplicates. 204 | Return estimated number of not duplicates. 205 | """ 206 | if not all_urls: 207 | return 0 208 | urls = random.sample(all_urls, max_sample) \ 209 | if len(all_urls) > max_sample else all_urls 210 | filtered = [ 211 | url for url in urls 212 | if min_hash.jaccard(self.seen_urls[url].min_hash) < 213 | self.jaccard_threshold] 214 | return int(len(filtered) / len(urls) * len(all_urls)) 215 | 216 | def log_dupstats(self, min_dup=100): 217 | for ds, name in [ 218 | (self.path_dupstats, 'Path dupstats'), 219 | (self.param_dupstats, 'Param dupstats'), 220 | (self.path_param_dupstats, 'Path-param dupstats'), 221 | (self.path_query_param_dupstats, 'Path-query-param dupstats'), 222 | (self.param_value_dupstats, 'Param-value dupstats'), 223 | (self.path_param_value_dupstats, 'Path-param-value dupstats'), 224 | ]: 225 | _log_dupstats(ds, name, min_dup=min_dup) 226 | 227 | 228 | def _without_key(dict_, key): 229 | return {k: v for k, v in dict_.items() if k != key} 230 | 231 | 232 | def _with_key_val(dict_, key, value): 233 | dict_ = dict(dict_) 234 | dict_[key] = value 235 | return dict_ 236 | 237 | 238 | def _parse_url(url): 239 | p = urlsplit(url) 240 | query = {k: v[0] for k, v in parse_qs(p.query).items() if len(v) == 1} 241 | return ''.join([p.netloc, p.path]), query 242 | 243 | 244 | def _q_key(query): 245 | return tuple(sorted(query.items())) 246 | 247 | 248 | def _log_dupstats(dupstats, name, min_dup): 249 | dupstats_items = [ 250 | (url, dupstat) for url, dupstat in sorted( 251 | dupstats.items(), key=lambda x: x[1].total, reverse=True) 252 | if dupstat.dup > min_dup] 253 | if dupstats_items: 254 | logger.debug('%s:', name) 255 | for url, dupstat in dupstats_items: 256 | logger.debug('%s %s', url, dupstat) 257 | 258 | 259 | URLMeta = namedtuple('URLMeta', ['path', 'query', 'min_hash']) 260 | 261 | 262 | class DupStat(object): 263 | def __init__(self): 264 | self.dup = 0 265 | self.nodup = 0 266 | 267 | @property 268 | def total(self): 269 | return self.dup + self.nodup 270 | 271 | def update(self, dup, nodup): 272 | self.dup += dup 273 | self.nodup += nodup 274 | 275 | def get_prob(self): 276 | if self.total < 5: 277 | return 0. 
278 | a, b = self.dup + 1, self.nodup + 1 279 | n = a + b 280 | p = a / n 281 | q = b / n 282 | # Lower edge of the 95% confidence interval, binomial distribution 283 | return p - 1.96 * math.sqrt(p * q / n) 284 | 285 | def __repr__(self): 286 | return '<DupStat: {:.3f} ({} of {})>'.format( 287 | self.get_prob(), self.dup, self.total) 288 | -------------------------------------------------------------------------------- /maybedont/scrapy_middleware.py: -------------------------------------------------------------------------------- 1 | import logging, random, time 2 | 3 | import scrapy.http.response.text 4 | from scrapy.exceptions import IgnoreRequest, NotConfigured 5 | 6 | from maybedont import DupePredictor 7 | 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class AvoidDupContentMiddleware(object): 13 | """ 14 | Avoid requests for duplicate content. During crawling this middleware 15 | learns what parameters are important (influence content), and what can 16 | be safely ignored. Once it is confident it starts dropping most 17 | requests that are unlikely to get new content. Some requests are still 18 | downloaded to make crawling more robust against changes in site structure. 19 | It is applied only to requests with "avoid_dup_content" in meta. 20 | """ 21 | def __init__(self, initial_queue_limit, threshold, exploration): 22 | self.dupe_predictor = None 23 | # We initialize dupe detector only after gathering enough pages, 24 | # it needs them for better duplicate detection, to know which content 25 | # is common to a lot of pages, and which is unique. 26 | self.initial_queue = [] # (url, text) 27 | self.initial_queue_limit = initial_queue_limit 28 | self.threshold = threshold 29 | self.exploration = exploration 30 | 31 | @classmethod 32 | def from_crawler(cls, crawler): 33 | if not crawler.settings.getbool('AVOID_DUP_CONTENT_ENABLED'): 34 | raise NotConfigured 35 | s = crawler.settings 36 | return cls( 37 | initial_queue_limit=s.getint( 38 | 'AVOID_DUP_CONTENT_INITIAL_QUEUE_LIMIT', 300), 39 | threshold=s.getfloat('AVOID_DUP_CONTENT_THRESHOLD', 0.98), 40 | exploration=s.getfloat('AVOID_DUP_CONTENT_EXPLORATION', 0.05)) 41 | 42 | def process_request(self, request, spider): 43 | if not self.dupe_predictor or self.skip(request): 44 | return 45 | url = request.url 46 | t0 = time.time() 47 | dupe_prob = self.dupe_predictor.get_dupe_prob(url) 48 | t = time.time() - t0 49 | if t > 0.01: 50 | logger.debug('get_dupe_prob took %.4f s for %s', t, url) 51 | if dupe_prob > self.threshold: 52 | if random.random() < self.exploration: 53 | logger.debug('Exploring a likely duplicate %s with prob %.3f', 54 | url, dupe_prob) 55 | else: 56 | logger.debug('Ignoring a likely duplicate %s with prob %.3f', 57 | url, dupe_prob) 58 | raise IgnoreRequest 59 | 60 | def process_response(self, request, response, spider): 61 | if not isinstance(response, scrapy.http.response.text.TextResponse) or self.skip(request): 62 | return response 63 | url, text = response.url, extract_text(response) 64 | t0 = time.time() 65 | if self.dupe_predictor: 66 | self.dupe_predictor.update_model(url, text) 67 | t = time.time() - t0 68 | if t > 0.01: 69 | logger.debug('Updated model in %.4f s for %s', t, url) 70 | else: 71 | self.initial_queue.append((url, text)) 72 | if len(self.initial_queue) >= self.initial_queue_limit: 73 | logger.debug( 74 | 'Gathered enough initial pages, building DupePredictor') 75 | self.dupe_predictor = DupePredictor( 76 | texts_sample=[text for _, text in self.initial_queue]) 77 | # Update model with all the pages we have
missed 78 | for url, text in self.initial_queue: 79 | self.dupe_predictor.update_model(url, text) 80 | self.initial_queue = None 81 | logger.debug('Built DupePredictor in %.4f s', time.time() - t0) 82 | return response 83 | 84 | def skip(self, request): 85 | return not request.meta.get('avoid_dup_content') 86 | 87 | 88 | def extract_text(response): 89 | return '\n'.join(response.xpath('//body').xpath('string()').extract()) 90 | -------------------------------------------------------------------------------- /maybedont/utils.py: -------------------------------------------------------------------------------- 1 | from hashlib import sha1 2 | from collections import defaultdict 3 | 4 | from datasketch import MinHash 5 | 6 | 7 | def shingle_hashes(text): 8 | n = 4 9 | for line in text.split('\n'): 10 | words = line.strip().split() 11 | if words: 12 | for idx in range(min(len(words), n), len(words) + 1): 13 | yield sha1(' '.join( 14 | words[max(0, idx - n) : idx]).encode('utf-8')) 15 | 16 | 17 | def get_too_common_shingles(texts, threshold=0.05): 18 | shingle_counts = defaultdict(int) 19 | n_items = 0 20 | for text in texts: 21 | n_items += 1 22 | hashes = set(shingle_h.digest() for shingle_h in shingle_hashes(text)) 23 | for h in hashes: 24 | shingle_counts[h] += 1 25 | if shingle_counts: 26 | return set(h for h, count in shingle_counts.items() 27 | if count > max(1, threshold * n_items)) 28 | return set() 29 | 30 | 31 | def get_min_hash(text, too_common, num_perm=128): 32 | min_hash = MinHash(num_perm=num_perm) 33 | for shingle_h in shingle_hashes(text): 34 | digest = shingle_h.digest() 35 | if digest not in too_common: 36 | min_hash.update(digest) 37 | return min_hash 38 | 39 | 40 | try: 41 | from scrapy.utils.url import canonicalize_url as _canonicalize_url 42 | def canonicalize_url(url): 43 | return _canonicalize_url(url, keep_fragments=True) 44 | except ImportError: 45 | def canonicalize_url(url): 46 | return url 47 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup 3 | 4 | 5 | def read(fname): 6 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 7 | 8 | 9 | setup( 10 | name='MaybeDont', 11 | version='0.1.0', 12 | author="Konstantin Lopuhin", 13 | author_email="kostia.lopuhin@gmail.com", 14 | description='A component that tries to avoid downloading duplicate content', 15 | license='MIT', 16 | url='https://github.com/TeamHG-Memex/MaybeDont', 17 | packages=['maybedont'], 18 | long_description=read('README.rst'), 19 | install_requires=[ 20 | 'six', 21 | 'datasketch>=0.2.0', 22 | ], 23 | classifiers=[ 24 | 'Development Status :: 3 - Alpha', 25 | 'Topic :: Internet :: WWW/HTTP :: Indexing/Search', 26 | 'License :: OSI Approved :: MIT License', 27 | 'Programming Language :: Python', 28 | 'Programming Language :: Python :: 2', 29 | 'Programming Language :: Python :: 2.7', 30 | 'Programming Language :: Python :: 3', 31 | 'Programming Language :: Python :: 3.3', 32 | 'Programming Language :: Python :: 3.4', 33 | 'Programming Language :: Python :: 3.5', 34 | ], 35 | ) 36 | -------------------------------------------------------------------------------- /tests/__init__.py:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeamHG-Memex/MaybeDont/34721f67b69d426adda324a0ed905d3860828af9/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_middleware.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import scrapy 3 | from scrapy import Request 4 | from scrapy.http.response.html import HtmlResponse 5 | from scrapy.utils.log import configure_logging 6 | from scrapy.exceptions import IgnoreRequest 7 | 8 | from maybedont.scrapy_middleware import AvoidDupContentMiddleware 9 | 10 | 11 | configure_logging() 12 | 13 | 14 | class Spider(scrapy.Spider): 15 | name = 'spider' 16 | 17 | def parse(self, response): 18 | assert response 19 | 20 | 21 | def test_middleware(): 22 | Rq = lambda path: Request( 23 | 'http://example.com{}'.format(path), 24 | meta={'avoid_dup_content': True}) 25 | Rs = lambda req, body: HtmlResponse( 26 | req.url, body=body.encode(), request=req) 27 | mw = AvoidDupContentMiddleware( 28 | initial_queue_limit=1, threshold=0.5, exploration=0.00) 29 | spider = Spider() 30 | req = Rq('/') 31 | mw.process_request(req, spider) 32 | mw.process_response(req, Rs(req, ''), spider) 33 | assert mw.dupe_predictor 34 | n_dropped = 0 35 | for i in range(10): 36 | req = Rq('/viewtopic.php?topic_id={}'.format(i)) 37 | mw.process_request(req, spider) 38 | mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider) 39 | req = Rq('/viewtopic.php?topic_id={}&start=0'.format(i)) 40 | try: 41 | mw.process_request(req, spider) 42 | except IgnoreRequest: 43 | n_dropped += 1 44 | else: 45 | mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider) 46 | mw.dupe_predictor.log_dupstats(min_dup=0) 47 | assert n_dropped == 5 48 | # one request in different order 49 | req = Rq('/viewtopic.php?topic_id=100&start=0') 50 | mw.process_request(req, spider) 51 | mw.process_response(req, Rs(req, ''), spider) 52 | mw.process_request(Rq('/viewtopic.php?topic_id=200'), spider) 53 | with pytest.raises(IgnoreRequest): 54 | mw.process_request(Rq('/viewtopic.php?topic_id=100'), spider) 55 | # test exploration 56 | mw.exploration = 0.5 57 | n_dropped = 0 58 | n_requests = 0 59 | for i in range(150, 170): 60 | req = Rq('/viewtopic.php?topic_id={}'.format(i)) 61 | mw.process_request(req, spider) 62 | mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider) 63 | req = Rq('/viewtopic.php?topic_id={}&start=0'.format(i)) 64 | n_requests += 1 65 | try: 66 | mw.process_request(req, spider) 67 | except IgnoreRequest: 68 | n_dropped += 1 69 | else: 70 | mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider) 71 | assert n_dropped > 0 72 | assert n_dropped < n_requests 73 | 74 | 75 | def test_skip(): 76 | mw = AvoidDupContentMiddleware( 77 | initial_queue_limit=300, threshold=0.98, exploration=0.05) 78 | spider = Spider() 79 | mw.process_request(Request('http://example.com'), spider) 80 | assert len(mw.initial_queue) == 0 81 | req = Request('http://example.com', meta={'avoid_dup_content': True}) 82 | mw.process_request(req, spider) 83 | mw.process_response( 84 | req, HtmlResponse(req.url, body=b'a', request=req), spider) 85 | assert len(mw.initial_queue) == 1 86 | -------------------------------------------------------------------------------- /tests/test_predict.py: -------------------------------------------------------------------------------- 1 | import logging, random 2 | 3 | import pytest 4 | 5 | from 
maybedont import DupePredictor 6 | 7 | 8 | logging.basicConfig(level=logging.DEBUG) 9 | 10 | 11 | def test_path(): 12 | dupe_predictor = DupePredictor() 13 | def gen_urls(): 14 | return ['http://foo.com/d?p{0}={0}'.format(random.randint(1, 100)), 15 | 'http://foo.com/nd?p{0}={0}'.format(random.randint(1, 100))] 16 | for _ in range(100): 17 | url1, url2 = gen_urls() 18 | dupe_predictor.update_model(url1, 'd') 19 | dupe_predictor.update_model( 20 | url2, 'd{}'.format(random.randint(1, 100))) 21 | dupe_predictor.log_dupstats(min_dup=1) 22 | url1, url2 = gen_urls() 23 | assert dupe_predictor.get_dupe_prob(url1) > 0.97 24 | assert dupe_predictor.get_dupe_prob(url2) < 0.97 25 | 26 | 27 | @pytest.mark.parametrize('reverse_update', [True, False]) 28 | @pytest.mark.parametrize('reverse_test', [True, False]) 29 | @pytest.mark.parametrize('is_param', [True, False]) 30 | def test_param(reverse_update, reverse_test, is_param): 31 | dupe_predictor = DupePredictor() 32 | def gen_urls(page): 33 | tpls = ['{}/?page={}', '{}/?page={}&start=0'] if is_param else \ 34 | ['{}/{}', '{}/{}?start=0'] 35 | return [tpl.format('http://foo.com', page) for tpl in tpls] 36 | for i in range(100): 37 | urls = gen_urls(i) 38 | if reverse_update: 39 | urls.reverse() 40 | for url in urls: 41 | dupe_predictor.update_model(url, 'a{}'.format(i)) 42 | dupe_predictor.log_dupstats(min_dup=1) 43 | url1, url2 = gen_urls('b') 44 | if reverse_test: 45 | url1, url2 = url2, url1 46 | dupe_predictor.update_model(url1, 'b') 47 | assert dupe_predictor.get_dupe_prob(url2) > 0.97 48 | for url in gen_urls('c'): 49 | assert dupe_predictor.get_dupe_prob(url) < 0.1 50 | 51 | 52 | @pytest.mark.parametrize('reverse_update', [True, False]) 53 | @pytest.mark.parametrize('reverse_test', [True, False]) 54 | @pytest.mark.parametrize('is_param', [True, False]) 55 | def test_param_value(reverse_update, reverse_test, is_param): 56 | dupe_predictor = DupePredictor() 57 | random.seed(1) 58 | def gen_urls(page): 59 | random_start = random.randint(1, 100) 60 | if is_param: 61 | tpls = ['{}/?page={}', '{}/?page={}&start=0', 62 | '{}/?page={}&start=%s' % random_start] 63 | else: 64 | tpls = ['{}/{}', '{}/{}?start=0', '{}/{}?start=%s' % random_start] 65 | return [tpl.format('http://foo.com', page) for tpl in tpls] 66 | for i in range(100): 67 | urls = gen_urls(i) 68 | with_contents = list(zip(urls, ['a{}'.format(i)] * 2 + 69 | ['r{}'.format(random.randint(1, 100))])) 70 | if reverse_update: 71 | with_contents.reverse() 72 | for url, content in with_contents: 73 | dupe_predictor.update_model(url, content) 74 | dupe_predictor.log_dupstats(min_dup=1) 75 | url1, url2, url3 = gen_urls('b') 76 | if reverse_test: 77 | url1, url2 = url2, url1 # url3 stays the same 78 | dupe_predictor.update_model(url1, 'b') 79 | assert dupe_predictor.get_dupe_prob(url2) > 0.97 80 | assert dupe_predictor.get_dupe_prob(url3) < 0.3 81 | for url in gen_urls('c'): 82 | assert dupe_predictor.get_dupe_prob(url) < 0.3 83 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | ; this is a tox config for running tests 2 | ; under all supported Python interpreters 3 | 4 | [tox] 5 | envlist = py27,py34,py35 6 | 7 | [testenv] 8 | deps= 9 | pytest 10 | pytest-cov 11 | 12 | commands= 13 | pip install -U pip wheel 14 | pip install scrapy==1.1.0 15 | py.test --cov=maybedont {posargs: tests} 16 | --------------------------------------------------------------------------------