`
59 |
60 | Returns JSON `{"status": "PENDING", "job_id": "XXYYZZ", "info": "Processed X out of Y captures."}` describing the status of the job with that job id.
61 |
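For example, polling a running job might look like this (the service URL matches the default run configuration described below; the job id and capture counts are illustrative):

```Shell
curl 'http://127.0.0.1:4000/job?job_id=XXYYZZ'
# {"status": "PENDING", "job_id": "XXYYZZ", "info": "Processed 40 out of 100 captures."}
```
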
62 | ## Installing
63 |
64 | Using Python 3.10 (via conda or any other Python environment manager), create a virtual environment and activate it:
65 | ```Shell
66 | python -m venv venv
67 | . venv/bin/activate
68 | ```
69 |
70 | Install the package and its dependencies:
71 | ```Shell
72 | python setup.py install
73 | ```
74 | Copy the conf.yml.example file to the same directory, removing the .example extension (a sketch of the settings it contains follows the commands below):
75 |
76 | ```
77 | cd wayback_discover_diff
78 | cp conf.yml.example conf.yml
79 | ```
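
For reference, here is a minimal sketch of a configuration covering the settings the application reads (values are illustrative; conf.yml.example is the authoritative template):

```
simhash:
  size: 256
  expire_after: 86400
redis:
  url: redis://localhost:6379/1
  decode_responses: true
  timeout: 10
threads: 5
snapshots:
  number_per_year: -1
  number_per_page: 600
```
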
80 | ## Run
81 | To run the server, start both the web application and the Celery worker:
82 | ```
83 | bash run_gunicorn.sh &
84 | bash run_celery.sh
85 | ```
86 |
87 | Open http://127.0.0.1:4000 in a browser.
88 |
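Once both processes are running, you can exercise the API from the command line. A minimal example (the URL and year are illustrative):

```Shell
# queue a simhash calculation job for a URL and year
curl 'http://127.0.0.1:4000/calculate-simhash?url=example.com&year=2014'
# -> {"status": "started", "job_id": "..."}

# read back the calculated simhashes for that URL and year
curl 'http://127.0.0.1:4000/simhash?url=example.com&year=2014'
```
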
89 | ## Tests
90 | To run the tests, use the provided script:
91 | ```
92 | bash run_tests.sh
93 | ```
94 |
--------------------------------------------------------------------------------
/tests/test_util.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from wayback_discover_diff.util import (url_is_valid, year_simhash,
4 | timestamp_simhash)
5 |
6 |
7 | SAMPLE_REDIS_CONTENT = {
8 | 'com,example)/': {
9 | '20141021062411': 'o52rOf0Hi2o=',
10 | '20140202131837': 'og2jGKWHsy4=',
11 | '20140824062257': 'o52jPP0Hg2o=',
12 | '20160824062257': 'o52jPP0Hg2o='
13 | },
14 | 'com,other)/': {
15 | '2014': '-1'
16 | },
17 | 'org,nonexistingdomain)/': {
18 | '1999': '-1'
19 | },
20 | }
21 |
22 |
23 | class StubRedis(dict):
24 | """Mock Redis connection for unit tests.
25 | """
26 | def __init__(self, *args, **kwargs):
27 | self.update(SAMPLE_REDIS_CONTENT)
28 |
29 | def hset(self, key, hkey, hval):
30 | e = self.get(key)
31 | if e is None:
32 | self[key] = e = {}
33 | else:
34 | assert isinstance(e, dict)
35 | e[hkey] = hval
36 |
37 | def hget(self, key, hkey):
38 | e = self.get(key)
39 | if e is None: return None
40 | assert isinstance(e, dict)
41 | return e.get(hkey)
42 |
43 | def hkeys(self, key):
44 | e = self.get(key)
45 | if e is None: return {}
46 | assert isinstance(e, dict)
47 | return self.get(key).keys()
48 |
49 | def hmget(self, key, hkeys):
50 | # Real Redis hmget returns a list of values in the same order as
51 | # the requested fields, with None for any field that is missing,
52 | # so mimic that here instead of returning a dict.
53 | e = self.get(key)
54 | if e is None: return None
55 | assert isinstance(e, dict)
56 | return [e.get(hkey) for hkey in hkeys]
57 |
58 |
59 | @pytest.fixture
60 | def redis():
61 | return StubRedis()
62 |
63 |
64 | @pytest.mark.parametrize('url,result', [
65 | ('http://example.com/', True),
66 | ('other', False),
67 | ('torrent:something.gr/file', False),
68 | ('tel:00302310123456', False),
69 | ('loudfi1@libero.it', False),
70 | ('http://roblox', False)
71 | ])
72 | def test_url_is_valid(url, result):
73 | assert url_is_valid(url) == result
74 |
75 |
76 | @pytest.mark.parametrize('url,timestamp,simhash', [
77 | ('http://example.com', '20141021062411', 'o52rOf0Hi2o='),
78 | ('http://example.com', '2014102', None),
79 | ('http://other.com', '20141021062411', None),
80 | ])
81 | def test_timestamp_simhash(url, timestamp, simhash, redis):
82 | res = timestamp_simhash(redis, url, timestamp)
83 | if len(res.keys()) == 1:
84 | assert res == {'simhash': simhash}
85 | elif url == 'http://other.com':
86 | assert res == {'status': 'error', 'message': 'NO_CAPTURES'}
87 | else:
88 | assert res == {'status': 'error', 'message': 'CAPTURE_NOT_FOUND'}
89 |
90 |
91 | @pytest.mark.parametrize('url,year,count', [
92 | ('http://example.com', '2014', 3),
93 | ('http://example.com', '2016', 1),
94 | ('http://example.com', '2017', None),
95 | ('http://example.com', '', None),
96 | ('http://other.com', '2014', None)
97 | ])
98 | def test_year_simhash(url, year, count, redis):
99 | """check if year_simhash produced an error response.
100 | """
101 | res = year_simhash(redis, url, year)
102 | if isinstance(res,dict):
103 | if year == '2014':
104 | assert res == {'status': 'error', 'message': 'NO_CAPTURES'}
105 | else:
106 | assert res == {'status': 'error', 'message': 'NOT_CAPTURED'}
107 | if count:
108 | assert len(res[0]) == count
109 |
--------------------------------------------------------------------------------
/tests/test_web.py:
--------------------------------------------------------------------------------
1 | """Test web endpoints.
2 | """
3 | import json
4 | import pytest
5 | from werkzeug.test import Client
6 | from werkzeug.wrappers import Response
7 | from test_util import StubRedis
8 |
9 | from wayback_discover_diff.web import get_app
10 |
11 |
12 | @pytest.fixture
13 | def app():
14 | cfg = dict(redis_uri='redis://localhost/9',
15 | snapshots=dict(number_per_page=100)
16 | )
17 | web_app = get_app(cfg)
18 | web_app.redis = StubRedis()
19 | return web_app
20 |
21 | # TODO we must mock Celery task
22 | # Initialize Celery and register Discover task.
23 | # celery = Celery(__name__, broker='redis://'+str(cfg['redis']['host'])+':'+str(cfg['redis']['port']))
24 | # celery.conf.update(
25 | # CELERY_BROKER_URL='redis://'+str(cfg['redis']['host'])+':'+str(cfg['redis']['port']),
26 | # CELERY_RESULT_BACKEND='redis://'+str(cfg['redis']['host'])+':'+str(cfg['redis']['port'])
27 | # )
28 | # celery.register_task(app)
29 |
30 |
31 | def test_simhash_parameters(app):
32 | client = Client(app, response_wrapper=Response)
33 | resp = client.get('/simhash?timestamp=20141115130953')
34 | assert resp.status_code == 200
35 | data = json.loads(resp.data.decode('utf-8'))
36 | assert data == dict(status='error', info='url param is required.')
37 |
38 | resp = client.get('/simhash?url=example.com')
39 | assert resp.status_code == 200
40 | data = json.loads(resp.data.decode('utf-8'))
41 | assert data == dict(status='error', info='year param is required.')
42 |
43 | resp = client.get('/simhash?url=invalid&timestamp=20141115130953')
44 | assert resp.status_code == 200
45 | data = json.loads(resp.data.decode('utf-8'))
46 | assert data == dict(status='error', info='invalid url format.')
47 |
48 | # StubRedis already has simhash data for 20140202131837 and example.com
49 | resp = client.get('/simhash?url=example.com&timestamp=20140202131837')
50 | data = json.loads(resp.data.decode('utf-8'))
51 | assert data.get('simhash') == 'og2jGKWHsy4='
52 |
53 |
54 | def test_no_entry(app):
55 | client = Client(app, response_wrapper=Response)
56 | resp = client.get('/simhash?timestamp=20180000000000&url=nonexistingdomain.org')
57 | assert resp.status_code == 200
58 | data = json.loads(resp.data.decode('utf-8'))
59 | assert data == {'message': 'CAPTURE_NOT_FOUND', 'status': 'error'}
60 |
61 | # TODO must mock this
62 | # def test_start_task():
63 | # url = 'iskme.org'
64 | # year = '2018'
65 | # job_id = celery.tasks['Discover'].apply(args=[url, year])
66 | # assert job_id is not None
67 |
68 |
69 | def test_simhash_task_parameters(app):
70 | client = Client(app, response_wrapper=Response)
71 | resp = client.get('/calculate-simhash?year=2018')
72 | assert resp.status_code == 200
73 | data = json.loads(resp.data.decode('utf-8'))
74 | assert data == dict(status='error', info='url param is required.')
75 |
76 | resp = client.get('/calculate-simhash?url=example.com&year=XY')
77 | assert resp.status_code == 200
78 | data = json.loads(resp.data.decode('utf-8'))
79 | assert data == dict(status='error', info='year param is required.')
80 |
81 | resp = client.get('/calculate-simhash?url=nonexistingdomain.org')
82 | assert resp.status_code == 200
83 | data = json.loads(resp.data.decode('utf-8'))
84 | assert data == dict(status='error', info='year param is required.')
85 |
86 | resp = client.get('/calculate-simhash?url=nonexistingdomain.org&year=-')
87 | assert resp.status_code == 200
88 | data = json.loads(resp.data.decode('utf-8'))
89 | assert data == dict(status='error', info='year param is required.')
90 |
91 | resp = client.get('/calculate-simhash?url=foo&year=2000')
92 | assert resp.status_code == 200
93 | data = json.loads(resp.data.decode('utf-8'))
94 | assert data == dict(status='error', info='invalid url format.')
95 |
96 |
97 | def test_task_no_snapshots(app):
98 | client = Client(app, response_wrapper=Response)
99 | resp = client.get('/simhash?url=nonexistingdomain.org&year=1999')
100 | data = json.loads(resp.data.decode('utf-8'))
101 | assert data == {'message': 'NO_CAPTURES', 'status': 'error'}
102 |
103 |
104 | # TODO must mock this
105 | # def test_success_calc_simhash():
106 | # url = 'iskme.org'
107 | # year = '2018'
108 | # job = celery.tasks['Discover'].apply(args=[url, year])
109 | # task_info = json.loads(job.info)
110 | # assert task_info.get('duration', -1) != -1
111 |
112 |
113 | def test_root(app):
114 | client = Client(app, response_wrapper=Response)
115 | resp = client.get('/')
116 | assert resp.data
117 |
118 |
119 | def test_job_params(app):
120 | client = Client(app, response_wrapper=Response)
121 | resp = client.get('/job')
122 | data = json.loads(resp.data.decode('utf-8'))
123 | assert data == dict(status='error', info='job_id param is required.')
124 |
--------------------------------------------------------------------------------
/wayback_discover_diff/util.py:
--------------------------------------------------------------------------------
1 | """SPN Utility methods.
2 | """
3 | import logging
4 | from collections import defaultdict
5 | from math import ceil
6 | import os
7 | import re
8 | import yaml
9 | from redis.exceptions import RedisError
10 | from surt import surt
11 | import tldextract
12 |
13 |
14 | def load_config():
15 | """Load conf file defined by ENV var WAYBACK_DISCOVER_DIFF_CONF.
16 | If not available, load ./conf.yml.
17 | """
18 | config = {}
19 | try:
20 | cfg_file = os.environ.get('WAYBACK_DISCOVER_DIFF_CONF')
21 | if not cfg_file:
22 | cfg_file = os.getcwd() + '/conf.yml'
23 | logging.warning('using default configuration from %s', cfg_file)
24 | with open(cfg_file) as cfg:
25 | config = yaml.safe_load(cfg)
26 | logging.debug('config=%s', config)
27 | except OSError:
28 | logging.error('Error loading configuration', exc_info=1)
29 | return config
30 |
31 |
32 | def timestamp_simhash(redis, url, timestamp):
33 | """Get stored simhash data from Redis for URL and timestamp
34 | """
35 | try:
36 | if url and timestamp:
37 | results = redis.hget(surt(url), timestamp)
38 | if results:
39 | return {'simhash': results}
40 | results = redis.hget(surt(url), timestamp[:4])
41 | if results:
42 | return {'status': 'error', 'message': 'NO_CAPTURES'}
43 | except RedisError as exc:
44 | logging.error('error loading simhash data for url %s timestamp %s (%s)',
45 | url, timestamp, exc)
46 | return {'status': 'error', 'message': 'CAPTURE_NOT_FOUND'}
47 |
48 |
49 | def year_simhash(redis, url, year, page=None, snapshots_per_page=None):
50 | """Get stored simhash data for url, year and page (optional).
51 | """
52 | try:
53 | if url and year:
54 | # TODO replace hkeys with hscan
55 | results = redis.hkeys(surt(url))
56 | if results:
57 | timestamps_to_fetch = []
58 | for timestamp in results:
59 | if timestamp == str(year):
60 | return {'status': 'error', 'message': 'NO_CAPTURES'}
61 | if timestamp[:4] == str(year):
62 | timestamps_to_fetch.append(timestamp)
63 | if timestamps_to_fetch:
64 | return handle_results(redis, timestamps_to_fetch, url,
65 | snapshots_per_page, page)
66 | # TODO return empty result and NOT error.
67 | except RedisError as exc:
68 | logging.error('error loading simhash data for url %s year %s page %s (%s)',
69 | url, year, page, exc)
70 | return {'status': 'error', 'message': 'NOT_CAPTURED'}
71 |
72 |
73 | def handle_results(redis, timestamps_to_fetch, url, snapshots_per_page,
74 | page=None):
75 | """Utility method used by `year_simhash`
76 | """
77 | available_simhashes = []
78 | if page:
79 | number_of_pages = ceil(len(timestamps_to_fetch) / snapshots_per_page)
80 | page = min(page, number_of_pages)
81 | if number_of_pages > 0:
82 | timestamps_to_fetch = \
83 | timestamps_to_fetch[(page - 1) * snapshots_per_page:(page * snapshots_per_page)]
84 | else:
85 | number_of_pages = 1
86 | try:
87 | results = redis.hmget(surt(url), timestamps_to_fetch)
88 | # TODO this crashes because of simhash bytes
89 | for i, simhash in enumerate(results):
90 | available_simhashes.append([str(timestamps_to_fetch[i]), simhash])
91 | if page:
92 | available_simhashes.insert(0, ["pages", number_of_pages])
93 | return [available_simhashes, len(timestamps_to_fetch)]
94 | except RedisError as exc:
95 | logging.error('cannot handle results for url %s page %s (%s)',
96 | url, page, exc)
97 | return None
98 |
99 |
100 | EMAIL_RE = re.compile(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)")
101 |
102 |
103 | def url_is_valid(url):
104 | """URL validation.
105 | """
106 | try:
107 | if not url:
108 | return False
109 | if EMAIL_RE.match(url):
110 | return False
111 | ext = tldextract.extract(url)
112 | return ext.domain != '' and ext.suffix != ''
113 | except (ValueError, AttributeError):
114 | return False
115 |
116 |
117 | def compress_captures(captures):
118 | """Input: [["20130603143716","NRyJrLc2FWA="],["20130402202841","FT6d7Jc3vWA="],...]
119 | Output:
120 | Captures: [[2013, [06, [03, ['143716', 0]]],
121 | [04, [02, ['202841', 1]]]
122 | ]]
123 | Hashes: ['NRyJrLc2FWA=', 'FT6d7Jc3vWA=']
124 | """
125 | hashdict = {}
126 | grouped = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
127 | for ts, simhash in captures:
128 | year, month, day, hms = ts[0:4], ts[4:6], ts[6:8], ts[8:]
129 | hashid = hashdict.get(simhash)
130 | if hashid is None:
131 | hashid = len(hashdict)
132 | hashdict[simhash] = hashid
133 | cap = [hms, hashid]
134 | grouped[int(year)][int(month)][int(day)].append(cap)
135 | new_captures = [
136 | [y] + [
137 | [m] + [
138 | [d] + dc for d, dc in mc.items()
139 | ] for m, mc in yc.items()
140 | ] for y, yc in grouped.items()
141 | ]
142 | hashes = [hash for hash, hashid in sorted(hashdict.items(), key=lambda x: x[1])]
143 | return (new_captures, hashes)
144 |
--------------------------------------------------------------------------------
/tests/test_discover.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import mock
3 | from test_util import StubRedis
4 | from wayback_discover_diff.discover import (extract_html_features,
5 | calculate_simhash, custom_hash_function, pack_simhash_to_bytes, Discover)
6 |
7 |
8 | def test_extract_html_features():
9 | # handle html with repeated elements and spaces
10 | html = """
11 | my title
12 |
13 | abc
14 | test
15 |
16 | 123
17 | abc
18 | space
19 |
20 | """
21 | features = {'123': 1, 'abc': 2, 'my': 1, 'test': 1, 'title': 1, 'space': 1}
22 | assert extract_html_features(html) == features
23 |
24 | # handle html with repeated elements, and punctuation
25 | html = """
26 | my title
27 |
28 | abc
29 | a.b.c.
30 | abc.
31 | test
32 | 123
33 | abc
34 |
35 | """
36 | features = {'123': 1, 'a': 1, 'abc': 3, 'b': 1, 'c': 1, 'my': 1, 'test': 1, 'title': 1}
37 | assert extract_html_features(html) == features
38 |
39 | # handle plain text
40 | html = "just a string"
41 | features = {'just': 1, 'a': 1, 'string': 1}
42 | assert extract_html_features(html) == features
43 |
44 | # skip HTML comments
45 | html = """
46 |
47 |
49 |
50 |
51 |
52 |
53 |
54 |
55 | Thank you for closing the message box.
56 | test
57 | """
58 | features = {'box': 1, 'closing': 1, 'for': 1, 'message': 1, 'test': 1,
59 | 'thank': 1, 'the': 1, 'you': 1}
60 | assert extract_html_features(html) == features
61 |
62 | # it doesn't crash with invalid or unicode chars
63 | html = """
64 | Invalid /\x94Invalid\x0b'
65 |
66 | 今日は
67 |
68 |
69 | """
70 | features = {'\x94invalid': 1, 'invalid': 1, '今日は': 1}
71 | assert extract_html_features(html) == features
72 |
73 |
74 | html = """
75 | weird is happening \c\x0b
76 | tagtag
77 | """
78 |
79 | features = {'c': 1, 'weird': 1, 'is': 1, 'happening': 1, 'tag': 2}
80 | assert extract_html_features(html) == features
81 |
82 |
83 | def test_calculate_simhash():
84 | features = {'two': 2, 'three': 3, 'one': 1}
85 | assert calculate_simhash(features, 128) == 66237222457941138286276456718971054176
86 |
87 |
88 | CFG = {
89 | 'simhash': {
90 | 'size': 256,
91 | 'expire_after': 86400
92 | },
93 | 'redis': {
94 | 'url': 'redis://localhost:6379/1',
95 | 'decode_responses': True,
96 | 'timeout': 10
97 | },
98 | 'threads': 5,
99 | 'snapshots': {
100 | 'number_per_year': -1,
101 | 'number_per_page': 600
102 | }
103 | }
104 |
105 | @mock.patch('wayback_discover_diff.discover.StrictRedis')
106 | def test_worker_download(Redis):
107 | Redis.return_value = StubRedis()
108 | task = Discover(CFG)
109 | # This capture performs redirects inside WBM. It has CDX status=200 but
110 | # it's really a redirect (a common WBM issue). We test that
111 | # redirects work fine.
112 | task.url = 'https://iskme.org'
113 | assert task.download_capture('20190103133511')
114 |
115 |
116 | def test_regular_hash():
117 | features = {
118 | '2019': 1,
119 | 'advanced': 1,
120 | 'google': 1,
121 | 'google©': 1,
122 | 'history': 1,
123 | 'insearch': 1,
124 | 'more': 1,
125 | 'optionssign': 1,
126 | 'privacy': 1,
127 | 'programsbusiness': 1,
128 | 'searchimagesmapsplayyoutubenewsgmaildrivemorecalendartranslatemobilebooksshoppingbloggerfinancephotosvideosdocseven': 1,
129 | 'searchlanguage': 1,
130 | 'settingsweb': 1,
131 | 'solutionsabout': 1,
132 | 'terms': 1,
133 | 'toolsadvertising': 1,
134 | '»account': 1
135 | }
136 | h = calculate_simhash(features, 128)
137 | assert h.bit_length() == 128
138 | h_bytes = pack_simhash_to_bytes(h)
139 | assert len(h_bytes) == 16
140 |
141 |
142 | def test_shortened_hash():
143 | h_size = 128
144 | features = {
145 | 'about': 1,
146 | 'accountsearchmapsyoutubeplaynewsgmailcontactsdrivecalendartranslatephotosshoppingmorefinancedocsbooksbloggerhangoutskeepjamboardearthcollectionseven': 1,
147 | 'at': 1,
148 | 'data': 1,
149 | 'feedbackadvertisingbusiness': 1,
150 | 'from': 1,
151 | 'gmailimagessign': 1,
152 | 'google': 3,
153 | 'helpsend': 1,
154 | 'in': 2,
155 | 'inappropriate': 1,
156 | 'library': 1,
157 | 'local': 1,
158 | 'more': 1,
159 | 'new': 1,
160 | 'predictions': 1,
161 | 'privacytermssettingssearch': 1,
162 | 'remove': 1,
163 | 'report': 1,
164 | 'searchhistorysearch': 1,
165 | 'searchyour': 1,
166 | 'settingsadvanced': 1,
167 | 'skills': 1,
168 | 'store': 1,
169 | 'with': 1,
170 | 'your': 1,
171 | '×develop': 1
172 | }
173 | h = calculate_simhash(features, h_size)
174 | assert h.bit_length() != h_size
175 | h_bytes = pack_simhash_to_bytes(h, h_size)
176 | assert len(h_bytes) == h_size // 8
177 |
178 |
179 | def test_simhash_256():
180 | h_size = 256
181 | features = {
182 | '2019': 1,
183 | 'advanced': 1,
184 | 'at': 1,
185 | 'google': 1,
186 | 'googleadvertising': 1,
187 | 'google©': 1,
188 | 'history': 1,
189 | 'insearch': 1,
190 | 'library': 1,
191 | 'local': 1,
192 | 'more': 1,
193 | 'new': 1,
194 | 'optionssign': 1,
195 | 'privacy': 1,
196 | 'programsbusiness': 1,
197 | 'searchimagesmapsplayyoutubenewsgmaildrivemorecalendartranslatemobilebooksshoppingbloggerfinancephotosvideosdocseven': 1,
198 | 'searchlanguage': 1,
199 | 'settingsweb': 1,
200 | 'skills': 1,
201 | 'solutionsabout': 1,
202 | 'terms': 1,
203 | 'toolsdevelop': 1,
204 | 'with': 1,
205 | 'your': 1,
206 | '»account': 1,
207 | }
208 | h = calculate_simhash(features, h_size, custom_hash_function)
209 | assert h.bit_length() == h_size
210 | h_bytes = pack_simhash_to_bytes(h, h_size)
211 | assert len(h_bytes) == h_size // 8
212 |
--------------------------------------------------------------------------------
/wayback_discover_diff/web.py:
--------------------------------------------------------------------------------
1 | """Web endpoints
2 | """
3 | import logging
4 | from time import time
5 | import pkg_resources
6 | from celery import states
7 | from celery.result import AsyncResult
8 | from celery.exceptions import CeleryError
9 | from flask import Flask, request
10 | from redis.exceptions import RedisError
11 | from .stats import statsd_incr
12 | from .util import (year_simhash, timestamp_simhash, url_is_valid,
13 | compress_captures)
14 |
15 | APP = Flask(__name__, instance_relative_config=True)
16 | APP._logger = logging.getLogger('wayback_discover_diff.web')
17 |
18 | def get_app(config):
19 | """Utility method to set APP configuration. Its used by application.py.
20 | """
21 | APP.config.from_mapping(
22 | SECRET_KEY='wayback machine simhash service',
23 | )
24 | APP.config.update(CELERYD_HIJACK_ROOT_LOGGER=False)
25 | APP.config.update(config)
26 | return APP
27 |
28 |
29 | def get_active_task(url, year):
30 | """Check for current simhash processing tasks for targe url & year
31 | """
32 | try:
33 | pending = APP.celery.control.inspect().active()
34 | if pending:
35 | for task in list(pending.values())[0]:
36 | if task['args'] == "['{}', '{}']".format(url, year):
37 | return task
38 | return None
39 | except RedisError:
40 | # Redis connection timeout is quite common in production Celery.
41 | return None
42 |
43 |
44 | @APP.route('/')
45 | def root():
46 | """Return info on the current package version.
47 | """
48 | version = pkg_resources.require("wayback-discover-diff")[0].version
49 | return "wayback-discover-diff service version: %s" % version
50 |
51 |
52 | @APP.route('/simhash')
53 | def simhash():
54 | """Return simhash data for specific URL and year (optional),
55 | page is also optional.
56 | """
57 | try:
58 | statsd_incr('get-simhash-year-request')
59 | url = request.args.get('url')
60 | if not url:
61 | return {'status': 'error', 'info': 'url param is required.'}
62 | if not url_is_valid(url):
63 | return {'status': 'error', 'info': 'invalid url format.'}
64 | timestamp = request.args.get('timestamp')
65 | if not timestamp:
66 | year = request.args.get('year', type=int)
67 | if not year:
68 | return {'status': 'error', 'info': 'year param is required.'}
69 | page = request.args.get('page', type=int)
70 | snapshots_per_page = APP.config.get('snapshots', {}).get('number_per_page')
71 | results_tuple = year_simhash(APP.redis, url, year, page,
72 | snapshots_per_page)
73 | # check if year_simhash produced an error response and return it
74 | if isinstance(results_tuple, dict):
75 | return results_tuple
76 | task = get_active_task(url, year)
77 |
78 | output = dict(captures=results_tuple[0],
79 | total_captures=results_tuple[1],
80 | status='PENDING' if task else 'COMPLETE')
81 | if request.args.get('compress') in ['true', '1']:
82 | (captures, hashes) = compress_captures(output['captures'])
83 | output['captures'] = captures
84 | output['hashes'] = hashes
85 | return output
86 |
87 | results = timestamp_simhash(APP.redis, url, timestamp)
88 | # check if timestamp_simhash produced an error response and return it
89 | if isinstance(results, dict):
90 | return results
91 | task = get_active_task(url, timestamp[:4])
92 | if task:
93 | return {'status': 'PENDING', 'captures': results}
94 | return {'status': 'COMPLETE', 'captures': results}
95 | except (ValueError, CeleryError) as exc:
96 | APP._logger.error('Cannot get simhash of %s', url, exc_info=1)
97 | return {'status': 'error', 'info': 'Internal server error.'}
98 |
99 |
100 | @APP.route('/calculate-simhash')
101 | def request_url():
102 | """Start simhash calculation for URL & year.
103 | Validate url & year params before starting the Celery task.
104 | """
105 | try:
106 | statsd_incr('calculate-simhash-year-request')
107 | url = request.args.get('url')
108 | if not url:
109 | return {'status': 'error', 'info': 'url param is required.'}
110 | if not url_is_valid(url):
111 | return {'status': 'error', 'info': 'invalid url format.'}
112 | year = request.args.get('year', type=int)
113 | if not year:
114 | return {'status': 'error', 'info': 'year param is required.'}
115 | # see if there is an active job for this request
116 | task = get_active_task(url, year)
117 | if task:
118 | return {'status': 'PENDING', 'job_id': task['id']}
119 | res = APP.celery.tasks['Discover'].apply_async(
120 | args=[url, year, time()]
121 | )
122 | return {'status': 'started', 'job_id': res.id}
123 | except CeleryError as exc:
124 | APP._logger.warning('Cannot calculate simhash of %s, %s', url,
125 | year, exc_info=1)
126 | return {'status': 'error', 'info': 'Cannot start calculation.'}
127 | except ValueError as exc:
128 | APP._logger.warning('Cannot calculate simhash of %s, no year',
129 | url, exc_info=1)
130 | return {'status': 'error', 'info': 'year param must be numeric.'}
131 |
132 |
133 | @APP.route('/job')
134 | def job_status():
135 | """Return job status.
136 | """
137 | try:
138 | statsd_incr('status-request')
139 | job_id = request.args.get('job_id')
140 | if not job_id:
141 | return {'status': 'error', 'info': 'job_id param is required.'}
142 | task = AsyncResult(job_id, app=APP.celery)
143 | if task.state == states.PENDING:
144 | if task.info:
145 | info = task.info.get('info', 1)
146 | else:
147 | info = None
148 | # job did not finish yet
149 | return {'status': task.state, 'job_id': task.id, 'info': info}
150 |
151 | if task.info and task.info.get('status', 0) == 'error':
152 | # something went wrong in the background job
153 | return {'info': task.info.get('info', 1), 'job_id': task.id,
154 | 'status': task.info.get('status', 0)}
155 | if task.info:
156 | duration = task.info.get('duration', 1)
157 | else:
158 | duration = 1
159 | return {'status': task.state, 'job_id': task.id, 'duration': duration}
160 | except (CeleryError, AttributeError) as exc:
161 | APP._logger.error('Cannot get job status of %s', job_id, exc_info=1)
162 | return {'status': 'error', 'info': 'Cannot get status.'}
163 |
--------------------------------------------------------------------------------
/wayback_discover_diff/discover.py:
--------------------------------------------------------------------------------
1 | """Celery worker
2 | """
3 | from concurrent.futures import ThreadPoolExecutor
4 | import hashlib
5 | import logging
6 | import string
7 | from time import time
8 | from datetime import datetime
9 | import cProfile
10 | import base64
11 | from itertools import groupby
12 | from celery import Task
13 | import urllib3
14 | from urllib3.exceptions import HTTPError
15 | from redis import StrictRedis, BlockingConnectionPool
16 | from redis.exceptions import RedisError
17 | from simhash import Simhash
18 | from surt import surt
19 | from selectolax.parser import HTMLParser
20 | from werkzeug.urls import url_fix
21 |
22 | from .stats import statsd_incr, statsd_timing
23 |
24 | # https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
25 | urllib3.disable_warnings()
26 |
27 |
28 | TRANSLATOR = str.maketrans(string.punctuation, ' '*len(string.punctuation))
29 |
30 |
31 | def extract_html_features(html):
32 | """Process HTML document and get key features as text. Steps:
33 | kill all script and style elements
34 | get lowercase text
35 | remove all punctuation
36 | break into lines and remove leading and trailing space on each
37 | break multi-headlines into a line each
38 | drop blank lines
39 | return a dict with features and their weights
40 | """
41 | try:
42 | tree = HTMLParser(html)
43 | tree.strip_tags(['script', 'style'])
44 | text = tree.root.text(separator=' ')
45 | if not text:
46 | return {}
47 | except UnicodeDecodeError:
48 | return {}
49 | text = text.lower().translate(TRANSLATOR)
50 | lines = (line.strip() for line in text.splitlines())
51 | chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
52 | text = '\n'.join(chunk for chunk in chunks if chunk)
53 | return {k: sum(1 for _ in g) for k, g in groupby(sorted(text.split()))}
54 |
55 |
56 | def custom_hash_function(x):
57 | """Required by Simhash
58 | """
59 | return int.from_bytes(hashlib.blake2b(x).digest(), byteorder='big')
60 |
61 |
62 | def calculate_simhash(features_dict, simhash_size, hashfunc=None):
63 | """Calculate simhash for features in a dict. `features_dict` contains data
64 | like {'text': weight}
65 | """
66 | if hashfunc:
67 | return Simhash(features_dict, simhash_size, hashfunc=hashfunc).value
68 | return Simhash(features_dict, simhash_size).value
69 |
70 |
71 | def pack_simhash_to_bytes(simhash, simhash_size=None):
72 | """Pack a simhash int into little-endian bytes (simhash_size bits if given)."""
73 | if simhash_size is None:
74 | size_in_bytes = (simhash.bit_length() + 7) // 8
75 | else:
76 | size_in_bytes = simhash_size // 8
77 | return simhash.to_bytes(size_in_bytes, byteorder='little')
78 |
79 |
80 | class Discover(Task):
81 | """Custom Celery Task class.
82 | http://docs.celeryproject.org/en/latest/userguide/tasks.html#custom-task-classes
83 | """
84 | name = 'Discover'
85 | task_id = None
86 | # If a simhash calculation for a URL & year hits more than
87 | # `max_download_errors` download errors, stop it to avoid pointless requests:
88 | # the captures are probably not text/html or there is a problem with the WBM.
89 | max_download_errors = 10
90 | max_capture_download = 1000000
91 |
92 | def __init__(self, cfg):
93 | self.simhash_size = cfg['simhash']['size']
94 | self.simhash_expire = cfg['simhash']['expire_after']
95 | if self.simhash_size > 512:
96 | raise Exception('simhash size must not exceed 512 bits')
97 |
98 | headers = {'User-Agent': 'wayback-discover-diff',
99 | 'Accept-Encoding': 'gzip,deflate',
100 | 'Connection': 'keep-alive'}
101 | cdx_auth_token = cfg.get('cdx_auth_token')
102 | if cdx_auth_token:
103 | headers['cookie'] = 'cdx_auth_token=%s' % cdx_auth_token
104 |
105 | self.http = urllib3.HTTPConnectionPool('web.archive.org', maxsize=50,
106 | retries=2, timeout=20,
107 | headers=headers)
108 | self.redis = StrictRedis(
109 | connection_pool=BlockingConnectionPool.from_url(
110 | **cfg['redis']
111 | )
112 | )
113 | self.tpool = ThreadPoolExecutor(max_workers=cfg['threads'])
114 | self.snapshots_number = cfg['snapshots']['number_per_year']
115 | self.download_errors = 0
116 | # Initialize logger
117 | self._log = logging.getLogger('wayback_discover_diff.worker')
118 |
119 | def download_capture(self, ts):
120 | """Download capture data from the WBM and update job status. Return
121 | data only when its text or html. On download error, increment download_errors
122 | which will stop the task after 10 errors. Fetch data up to a limit
123 | to avoid getting too much (which is unnecessary) and have a consistent
124 | operation time.
125 | """
126 | try:
127 | statsd_incr('download-capture')
128 | self._log.info('fetching capture %s %s', ts, self.url)
129 | res = self.http.request('GET', '/web/{}id_/{}'.format(ts, self.url),
130 | preload_content=False)
131 | data = res.read(self.max_capture_download)
132 | ctype = res.headers.get('content-type')
133 | res.release_conn()
134 | if ctype:
135 | ctype = ctype.lower()
136 | if "text" in ctype or "html" in ctype:
137 | return data
138 | except HTTPError:
139 | self.download_errors += 1
140 | statsd_incr('download-error')
141 | self._log.error('cannot fetch capture %s %s', ts, self.url, exc_info=1)
142 | return None
143 |
144 | def start_profiling(self, capture):
145 | """Used for performance testing only.
146 | """
147 | cProfile.runctx('self.get_calc(capture)',
148 | globals=globals(), locals=locals(),
149 | filename='profile.prof')
150 |
151 | def get_calc(self, capture):
152 | """if a capture with an equal digest has been already processed,
153 | return cached simhash and avoid redownloading and processing. Else,
154 | download capture, extract HTML features and calculate simhash.
155 | If there are already too many download failures, return None without
156 | any processing to avoid pointless requests.
157 | Return None if any problem occurs (e.g. HTTP error or cannot calculate)
158 | """
159 | (timestamp, digest) = capture.split(' ')
160 | simhash_enc = self.seen.get(digest)
161 | if simhash_enc:
162 | self._log.info("already seen %s", digest)
163 | return (timestamp, simhash_enc)
164 |
165 | if self.download_errors >= self.max_download_errors:
166 | statsd_incr('multiple-consecutive-errors')
167 | self._log.error('%d consecutive download errors fetching %s captures',
168 | self.download_errors, self.url)
169 | return None
170 |
171 | response_data = self.download_capture(timestamp)
172 | if response_data:
173 | data = extract_html_features(response_data)
174 | if data:
175 | statsd_incr('calculate-simhash')
176 | self._log.info("calculating simhash")
177 | simhash = calculate_simhash(data, self.simhash_size,
178 | hashfunc=custom_hash_function)
179 | # This encoding is necessary to store simhash data in Redis.
180 | simhash_enc = base64.b64encode(
181 | pack_simhash_to_bytes(simhash, self.simhash_size)
182 | )
183 | self.seen[digest] = simhash_enc
184 | return (timestamp, simhash_enc)
185 | return None
186 |
187 | def run(self, url, year, created):
188 | """Run Celery Task.
189 | """
190 | self.job_id = self.request.id
191 | self.url = url_fix(url)
192 | time_started = datetime.now()
193 | self._log.info('Start calculating simhashes.')
194 | self.download_errors = 0
195 |
196 | statsd_timing('task-wait', time() - created)
197 | if not self.url:
198 | self._log.error('did not give url parameter')
199 | return {'status': 'error', 'info': 'URL is required.'}
200 | if not year:
201 | self._log.error('did not give year parameter')
202 | return {'status': 'error', 'info': 'Year is required.'}
203 | # fetch captures
204 | self.update_state(state='PENDING',
205 | meta={'info': 'Fetching {} captures for year {}'.format(
206 | url, year)})
207 | resp = self.fetch_cdx(url, year)
208 | if resp.get('status') == 'error':
209 | return resp
210 | captures = resp.get('captures')
211 | total = len(captures)
212 | self.seen = dict()
213 | # calculate simhashes in parallel
214 | i = 0
215 | final_results = {}
216 | for res in self.tpool.map(self.get_calc, captures):
217 | if not res:
218 | continue
219 | (timestamp, simhash) = res
220 | if simhash:
221 | final_results[timestamp] = simhash
222 | if i % 10 == 0:
223 | self.update_state(
224 | state='PENDING',
225 | meta={'info': 'Processed %d out of %d captures.' % (i, total)}
226 | )
227 | i += 1
228 |
229 | self._log.info('%d final results for %s and year %s.',
230 | len(final_results), self.url, year)
231 | if final_results:
232 | try:
233 | urlkey = surt(self.url)
234 | self.redis.hmset(urlkey, final_results)
235 | self.redis.expire(urlkey, self.simhash_expire)
236 | except RedisError as exc:
237 | self._log.error('cannot write simhashes to Redis for URL %s',
238 | self.url, exc_info=1)
239 |
240 | duration = (datetime.now() - time_started).seconds
241 | statsd_timing('task-duration', duration)
242 | self._log.info('Simhash calculation finished in %.2fsec.', duration)
243 | return {'duration': str(duration)}
244 |
245 | def fetch_cdx(self, url, year):
246 | """Make a CDX query for timestamp and digest for a specific year.
247 | """
248 | try:
249 | self._log.info('fetching CDX of %s for year %s', url, year)
250 | # Collapse captures by timestamp to get 3 captures per day (max).
251 | # TODO increase that in the future when we can handle more captures.
252 | # It's necessary to reduce the huge number of captures some websites
253 | # have (e.g. twitter.com has 167k captures for 2018). Get only 2xx captures.
254 | fields = {'url': url, 'from': year, 'to': year,
255 | 'statuscode': 200, 'fl': 'timestamp,digest',
256 | 'collapse': 'timestamp:9'}
257 | if self.snapshots_number != -1:
258 | fields['limit'] = self.snapshots_number
259 | response = self.http.request('GET', '/web/timemap', fields=fields)
260 | self._log.info('finished fetching timestamps of %s for year %s',
261 | url, year)
262 | if response.status == 200:
263 | if not response.data:
264 | self._log.info('no captures found for %s %s', url, year)
265 | urlkey = surt(url)
266 | self.redis.hset(urlkey, year, -1)
267 | self.redis.expire(urlkey, self.simhash_expire)
268 | return {'status': 'error',
269 | 'info': 'No captures of {} for year {}'.format(url, year)}
270 | captures_txt = response.data.decode('utf-8')
271 | captures = captures_txt.strip().split("\n")
272 | if captures:
273 | return {'status': 'success', 'captures': captures}
274 | return {'status': 'error',
275 | 'info': 'No captures of {} for year {}'.format(url, year)}
276 | except (ValueError, HTTPError) as exc:
277 | self._log.error('invalid CDX query response for %s %s', url, year,
278 | exc_info=1)
279 | return {'status': 'error', 'info': str(exc)}
280 | except RedisError as exc:
281 | self._log.error('error connecting with Redis for url %s year %s',
282 | url, year, exc_info=1)
283 | return {'status': 'error', 'info': str(exc)}
284 |
--------------------------------------------------------------------------------