├── wayback_discover_diff ├── __init__.py ├── wayback_discover_diff.iml ├── feature_extraction_cli.py ├── application.py ├── conf.yml.example ├── stats.py ├── util.py ├── web.py └── discover.py ├── tox.ini ├── run_celery.sh ├── .gitignore ├── run_gunicorn.sh ├── wayback-discover-diff.iml ├── mypy.ini ├── .circleci └── config.yml ├── setup.py ├── README.md └── tests ├── test_util.py ├── test_web.py └── test_discover.py /wayback_discover_diff/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist=py310 3 | [testenv] 4 | deps= 5 | pytest 6 | mock 7 | commands=py.test 8 | -------------------------------------------------------------------------------- /run_celery.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | WAYBACK_DISCOVER_DIFF_CONF=wayback_discover_diff/conf.yml celery -A wayback_discover_diff.application.CELERY worker --without-gossip --without-mingle 3 | # -l debug 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | .idea/ 3 | .cache/ 4 | *.pyc 5 | __pycache__/ 6 | 7 | instance/ 8 | 9 | .pytest_cache/ 10 | .coverage 11 | htmlcov/ 12 | 13 | dist/ 14 | build/ 15 | *.egg-info/ 16 | .eggs 17 | 18 | .tox 19 | -------------------------------------------------------------------------------- /run_gunicorn.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Initialize options for gunicorn 3 | OPTS=( 4 | --env FLASK_APP=wayback_discover_diff 5 | --env FLASK_DEBUG=1 6 | --env WAYBACK_DISCOVER_DIFF_CONF=wayback_discover_diff/conf.yml 7 | --workers 2 8 | -b 0.0.0.0:8096 9 | --reload 10 | ) 11 | 12 | #Run gunicorn 13 | gunicorn "${OPTS[@]}" wayback_discover_diff.application:APP 14 | -------------------------------------------------------------------------------- /wayback-discover-diff.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /wayback_discover_diff/wayback_discover_diff.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /wayback_discover_diff/feature_extraction_cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Extract and print features from target capture. 3 | Utility script useful to experiment and evaluate feature extraction. 
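   | Usage: python feature_extraction_cli.py <url>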
4 | """ 5 | import pprint 6 | import sys 7 | import urllib3 8 | from wayback_discover_diff.discover import extract_html_features 9 | 10 | url = sys.argv[1] 11 | 12 | http = urllib3.PoolManager() 13 | res = http.request('GET', url) 14 | data = res.data.decode('utf-8') 15 | 16 | features = extract_html_features(data) 17 | pprint.pprint(features) 18 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | python_version = 3.8 3 | # warn_return_any = True 4 | warn_unused_configs = True 5 | no_implicit_optional = True 6 | warn_redundant_casts = True 7 | warn_unused_ignores = True 8 | warn_unreachable = True 9 | 10 | # STRICT 11 | # disallow_untyped_defs = True 12 | 13 | # ref https://justincaustin.com/blog/mypy-tips-and-tricks 14 | strict_optional = True 15 | 16 | # Error output 17 | show_column_numbers = True 18 | show_error_context = True 19 | show_error_codes = True 20 | show_traceback = True 21 | pretty = True 22 | color_output = True 23 | error_summary = True 24 | 25 | [mypy-flask_cors.*] 26 | ignore_missing_imports = True 27 | 28 | [mypy-selectolax.*] 29 | ignore_missing_imports = True 30 | 31 | [mypy-simhash.*] 32 | ignore_missing_imports = True 33 | 34 | [mypy-surt.*] 35 | ignore_missing_imports = True 36 | 37 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | build: 4 | working_directory: ~/wayback-discover-diff 5 | docker: 6 | - image: cimg/python:3.10 7 | steps: 8 | - checkout 9 | 10 | - run: 11 | name: Install Dependencies 12 | command: | 13 | python -m venv venv 14 | . venv/bin/activate 15 | python setup.py develop 16 | # Even though pytest, mock & pylint are defined in setup.py, they 17 | # are not installed. This fixes that. 18 | pip install pytest mock pylint 19 | - run: 20 | name: Tests 21 | command: | 22 | . venv/bin/activate 23 | pytest tests 24 | - run: 25 | name: Code linting 26 | command: | 27 | . venv/bin/activate 28 | pylint wayback_discover_diff 29 | 30 | workflows: 31 | version: 2 32 | build: 33 | jobs: 34 | - build 35 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from setuptools import setup, find_packages 3 | 4 | 5 | if sys.version_info < (3, 10): 6 | raise RuntimeError("Python version is {}. Requires 3.10 or greater." 7 | "".format(sys.version_info)) 8 | 9 | 10 | setup( 11 | name='wayback-discover-diff', 12 | version='0.1.9.6', 13 | description='Calculate wayback machine captures simhash', 14 | packages=find_packages(), 15 | zip_safe=False, 16 | install_requires=[ 17 | 'Werkzeug<3', 18 | 'Flask>=2.1.3,<2.2.0', 19 | 'simhash>=2.1.2', 20 | 'urllib3==1.26.16', 21 | 'PyYAML>=6', 22 | # required for Celery 23 | 'celery==5.4.0', 24 | 'kombu>=5.3.4,<6.0', 25 | 'redis==4.6.0', 26 | 27 | 'hiredis', 28 | 'flask-cors', 29 | 'selectolax>=0.3.21', 30 | 'statsd', 31 | 'surt' 32 | ], 33 | tests_require=[ 34 | 'pytest', 35 | 'mock' 36 | ], 37 | ) 38 | -------------------------------------------------------------------------------- /wayback_discover_diff/application.py: -------------------------------------------------------------------------------- 1 | """application.py -- top-level web application for wayback-discover-diff. 
2 | """ 3 | import logging.config 4 | import os 5 | from celery import Celery 6 | from flask_cors import CORS 7 | from redis import StrictRedis, BlockingConnectionPool 8 | from wayback_discover_diff import stats 9 | from wayback_discover_diff.util import load_config 10 | from wayback_discover_diff.discover import Discover 11 | 12 | # Init config 13 | CFG = load_config() 14 | 15 | # Init logging 16 | logconf = CFG.get('logging') 17 | if logconf: 18 | logging.config.dictConfig(logconf) 19 | 20 | # Init statsd client 21 | stats_conf = CFG.get('statsd') 22 | if isinstance(stats_conf, dict): 23 | stats.configure(**stats_conf) 24 | 25 | # Init Celery app 26 | CELERY = Celery(**CFG['celery']) 27 | CELERY.register_task(Discover(CFG)) 28 | 29 | # Init Flask app 30 | from . import web 31 | APP = web.get_app(CFG) 32 | 33 | # Initialize CORS support 34 | cors = CFG.get('cors') 35 | if cors: 36 | CORS(APP, origins=cors) 37 | 38 | # Initialize Celery and Redis 39 | APP.celery = CELERY 40 | APP.redis = StrictRedis( 41 | connection_pool=BlockingConnectionPool.from_url( 42 | **CFG.get('redis') 43 | ) 44 | ) 45 | 46 | # ensure the instance folder exists 47 | try: 48 | os.makedirs(APP.instance_path) 49 | except OSError: 50 | pass 51 | -------------------------------------------------------------------------------- /wayback_discover_diff/conf.yml.example: -------------------------------------------------------------------------------- 1 | simhash: 2 | size: 256 3 | expire_after: 86400 4 | 5 | redis: 6 | url: "redis://localhost:6379/1" 7 | decode_responses: True 8 | health_check_interval: 30 9 | max_connections: 100 10 | socket_keepalive: True 11 | socket_timeout: 10 12 | retry_on_timeout: True 13 | 14 | test_redis: 15 | port: 6379 16 | host: "localhost" 17 | db: 0 18 | 19 | cdx_auth_token: "xxxx-yyy-zzz-www-xxxxx" 20 | 21 | celery: 22 | result_backend: "redis://localhost:6379/2" 23 | broker_url: "redis://localhost:6379/3" 24 | task_default_queue: "wayback_discover_diff" 25 | task_soft_time_limit: 7200 26 | worker_max_tasks_per_child: 100 27 | 28 | statsd: 29 | host: "graphite.us.archive.org" 30 | port: 8125 31 | 32 | threads: 8 33 | 34 | snapshots: 35 | number_per_year: -1 36 | number_per_page: 600 37 | 38 | cors: 39 | ['http://localhost:3000', 40 | 'http://localhost:3001'] 41 | 42 | logging: 43 | version: 1 44 | disable_existing_loggers: false 45 | handlers: 46 | console: { class: logging.StreamHandler, formatter: default } 47 | formatters: 48 | default: 49 | format: "%(asctime)s %(levelname)s %(thread)d %(name)s %(message)s" 50 | datefmt: "%Y-%m-%d %H:%M:%S" 51 | root: 52 | level: DEBUG 53 | loggers: 54 | wayback_discover_diff.web: 55 | handlers: [console] 56 | level: DEBUG 57 | wayback_discover_diff.worker: 58 | handlers: [console] 59 | level: DEBUG 60 | -------------------------------------------------------------------------------- /wayback_discover_diff/stats.py: -------------------------------------------------------------------------------- 1 | """Statsd methods to record statistics. 2 | """ 3 | import logging 4 | import socket 5 | from contextlib import contextmanager 6 | from timeit import default_timer as time 7 | import statsd 8 | 9 | 10 | STATSD_CLIENT = statsd.StatsClient('localhost', 8125) 11 | 12 | 13 | def configure(host, port): 14 | """Confiugure StatsD client. 15 | """ 16 | global STATSD_CLIENT 17 | _logger = logging.getLogger(__name__) 18 | hostname = socket.getfqdn().split(".")[0] 19 | prefix = 'wb.changes.%s.' 
% hostname
 20 |     try:
 21 |         STATSD_CLIENT = statsd.StatsClient(host=host, port=port, prefix=prefix)
 22 |         _logger.info('configured statsd client %s %s %s', host, port, prefix)
 23 |     except OSError as exc:
 24 |         _logger.error('cannot connect to statsd server %s %s %s (%s)', host,
 25 |                       port, prefix, str(exc))
 26 | 
 27 | 
 28 | def statsd_incr(metric, count=1):
 29 |     """Utility method to increment a statsd metric.
 30 |     """
 31 |     STATSD_CLIENT.incr(metric, count)
 32 | 
 33 | 
 34 | def statsd_timing(metric, dt_sec):
 35 |     """Utility method to record a statsd timing metric. Input is in seconds
 36 |     (usually the difference between two times); we must convert to milliseconds.
 37 |     """
 38 |     STATSD_CLIENT.timing(metric, int(dt_sec * 1000))
 39 | 
 40 | 
 41 | @contextmanager
 42 | def timing(metric):
 43 |     t0 = time()
 44 |     try:
 45 |         yield
 46 |     finally:
 47 |         STATSD_CLIENT.timing(metric, int((time() - t0) * 1000))
 48 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # wayback-discover-diff
 2 | 
 3 | A Python 3.10+ application running a web service that accepts HTTP GET requests and returns JSON:
 4 | 
 5 | - `/calculate-simhash?url={URL}&year={YEAR}`
 6 | 
 7 | Checks if there is a task already running that calculates simhashes for all captures of the target URL in the specified year.
 8 | If there isn't, it creates one.
 9 | 
10 | Returns JSON `{"status": "started", "job_id": "XXYYZZ (uuid)"}`
11 | 
12 | **OR**
13 | 
14 | If there is a task already running, it returns its job_id.
15 | 
16 | Returns JSON `{"status": "PENDING", "job_id": "XXYYZZ (uuid)"}`
17 | 
18 | - `/simhash?url={URL}&timestamp={TIMESTAMP}`
19 | 
20 | Returns JSON `{"simhash": "XXXX"}` if that capture's simhash value has already been calculated
21 | 
22 | **OR**
23 | 
24 | Returns JSON `{"message": "NO_CAPTURES", "status": "error"}` if the WBM has no captures for this year and URL combination.
25 | 
26 | **OR**
27 | 
28 | Returns JSON `{"message": "CAPTURE_NOT_FOUND", "status": "error"}` if the timestamp does not exist.
29 | 
30 | - `/simhash?url={URL}&year={YEAR}`
31 | 
32 | Returns all the timestamps for which a simhash value exists in the DB for that specific URL and year, each entry having the format `["TIMESTAMP_VALUE", "SIMHASH_VALUE"]`.
33 | 
34 | It's also possible to view the same results in a more compact data format using
35 | 
36 | - `/simhash?url={URL}&year={YEAR}&compress=1`
37 | 
38 | Returns JSON `{"captures": [...], "total_captures": XXX, "status": "COMPLETE"}` if there are simhash values in the DB and that job is completed.
39 | 
40 | **OR**
41 | 
42 | Returns JSON `{"captures": [...], "total_captures": XXX, "status": "PENDING"}` if there are simhash values in the DB but that job is still pending.
43 | 
44 | **OR**
45 | 
46 | Returns JSON `{"status": "error", "message": "NOT_CAPTURED"}` if that URL and year combination hasn't been hashed yet.
47 | 
48 | **OR**
49 | 
50 | Returns JSON `{"status": "error", "message": "NO_CAPTURES"}` if the WBM doesn't have snapshots for that year and URL.
51 | 
52 | - `/simhash?url={URL}&year={YEAR}&page={PAGE_NUMBER}`
53 | 
54 | The same as the request above but, depending on the page size set in the conf.yml file, the results are paginated.
The response has the following format: `[["pages","NUMBER_OF_PAGES"],["TIMESTAMP_VALUE", "SIMHASH_VALUE"]]`
55 | 
56 | **The SIMHASH_VALUE is base64 encoded**
57 | 
58 | - `/job?job_id={JOB_ID}`
59 | 
60 | Returns JSON `{"status": "PENDING", "job_id": "XXYYZZ", "info": "X out of Y captures have been processed"}`, i.e. the status of the job matching that specific job id.
61 | 
62 | ## Installing
63 | 
64 | Using conda or another Python environment management system, select Python 3.10, create a virtualenv and activate it:
65 | ```Shell
66 | python -m venv venv
67 | . venv/bin/activate
68 | ```
69 | 
70 | Install the package and its dependencies:
71 | ```Shell
72 | python setup.py install
73 | ```
74 | Copy the conf.yml.example file to conf.yml in the same directory:
75 | 
76 | ```
77 | cd wayback_discover_diff
78 | cp conf.yml.example conf.yml
79 | ```
80 | ## Run
81 | To run this server, run:
82 | ```
83 | bash run_gunicorn.sh &
84 | bash run_celery.sh
85 | ```
86 | 
87 | Open http://127.0.0.1:8096 in a browser (the port is set in run_gunicorn.sh).
88 | 
89 | ## Tests
90 | To run the tests, use pytest:
91 | ```
92 | pytest tests
93 | ```
94 | 
--------------------------------------------------------------------------------
/tests/test_util.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from wayback_discover_diff.util import (url_is_valid, year_simhash,
 4 |                                         timestamp_simhash)
 5 | 
 6 | 
 7 | SAMPLE_REDIS_CONTENT = {
 8 |     'com,example)/': {
 9 |         '20141021062411': 'o52rOf0Hi2o=',
10 |         '20140202131837': 'og2jGKWHsy4=',
11 |         '20140824062257': 'o52jPP0Hg2o=',
12 |         '20160824062257': 'o52jPP0Hg2o='
13 |     },
14 |     'com,other)/': {
15 |         '2014': '-1'
16 |     },
17 |     'org,nonexistingdomain)/': {
18 |         '1999': '-1'
19 |     },
20 | }
21 | 
22 | 
23 | class StubRedis(dict):
24 |     """Mock Redis connection for unit tests.
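   |     Implements only the Redis hash commands used by the code under test:
   |     hset, hget, hkeys and hmget.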
25 | """ 26 | def __init__(self, *args, **kwargs): 27 | self.update(SAMPLE_REDIS_CONTENT) 28 | 29 | def hset(self, key, hkey, hval): 30 | e = self.get(key) 31 | if e is None: 32 | self[key] = e = {} 33 | else: 34 | assert isinstance(e, dict) 35 | e[hkey] = hval 36 | 37 | def hget(self, key, hkey): 38 | e = self.get(key) 39 | if e is None: return None 40 | assert isinstance(e, dict) 41 | return e.get(hkey) 42 | 43 | def hkeys(self, key): 44 | e = self.get(key) 45 | if e is None: return {} 46 | assert isinstance(e, dict) 47 | return self.get(key).keys() 48 | 49 | def hmget(self, key, hkeys): 50 | e = self.get(key) 51 | if e is None: return None 52 | assert isinstance(e, dict) 53 | out = {} 54 | for hkey in hkeys: 55 | out[hkey] = e.get(hkey) 56 | return out 57 | 58 | 59 | @pytest.fixture 60 | def redis(): 61 | return StubRedis() 62 | 63 | 64 | @pytest.mark.parametrize('url,result', [ 65 | ('http://example.com/', True), 66 | ('other', False), 67 | ('torrent:something.gr/file', False), 68 | ('tel:00302310123456', False), 69 | ('loudfi1@libero.it', False), 70 | ('http://roblox', False) 71 | ]) 72 | def test_url_is_valid(url, result): 73 | assert url_is_valid(url) == result 74 | 75 | 76 | @pytest.mark.parametrize('url,timestamp,simhash', [ 77 | ('http://example.com', '20141021062411', 'o52rOf0Hi2o='), 78 | ('http://example.com', '2014102', None), 79 | ('http://other.com', '20141021062411', None), 80 | ]) 81 | def test_timestamp_simhash(url, timestamp, simhash, redis): 82 | res = timestamp_simhash(redis, url, timestamp) 83 | if len(res.keys()) == 1: 84 | assert res == {'simhash': simhash} 85 | elif url == 'http://other.com': 86 | assert res == {'status': 'error', 'message': 'NO_CAPTURES'} 87 | else: 88 | assert res == {'status': 'error', 'message': 'CAPTURE_NOT_FOUND'} 89 | 90 | 91 | @pytest.mark.parametrize('url,year,count', [ 92 | ('http://example.com', '2014', 3), 93 | ('http://example.com', '2016', 1), 94 | ('http://example.com', '2017', None), 95 | ('http://example.com', '', None), 96 | ('http://other.com', '2014', None) 97 | ]) 98 | def test_year_simhash(url, year, count, redis): 99 | """check if year_simhash produced an error response. 100 | """ 101 | res = year_simhash(redis, url, year) 102 | if isinstance(res,dict): 103 | if year == '2014': 104 | assert res == {'status': 'error', 'message': 'NO_CAPTURES'} 105 | else: 106 | assert res == {'status': 'error', 'message': 'NOT_CAPTURED'} 107 | if count: 108 | assert len(res[0]) == count 109 | -------------------------------------------------------------------------------- /tests/test_web.py: -------------------------------------------------------------------------------- 1 | """Test web endpoints. 2 | """ 3 | import json 4 | import pytest 5 | from werkzeug.test import Client 6 | from werkzeug.wrappers import Response 7 | from test_util import StubRedis 8 | 9 | from wayback_discover_diff.web import get_app 10 | 11 | 12 | @pytest.fixture 13 | def app(): 14 | cfg = dict(redis_uri='redis://localhost/9', 15 | snapshots=dict(snapshots_per_page=100) 16 | ) 17 | web_app = get_app(cfg) 18 | web_app.redis = StubRedis() 19 | return web_app 20 | 21 | # TODO we must mock Celery task 22 | # Initialize Celery and register Discover task. 
23 | # celery = Celery(__name__, broker='redis://'+str(cfg['redis']['host'])+':'+str(cfg['redis']['port'])) 24 | # celery.conf.update( 25 | # CELERY_BROKER_URL='redis://'+str(cfg['redis']['host'])+':'+str(cfg['redis']['port']), 26 | # CELERY_RESULT_BACKEND='redis://'+str(cfg['redis']['host'])+':'+str(cfg['redis']['port']) 27 | # ) 28 | # celery.register_task(app) 29 | 30 | 31 | def test_simhash_parameters(app): 32 | client = Client(app, response_wrapper=Response) 33 | resp = client.get('/simhash?timestamp=20141115130953') 34 | assert resp.status_code == 200 35 | data = json.loads(resp.data.decode('utf-8')) 36 | assert data == dict(status='error', info='url param is required.') 37 | 38 | resp = client.get('/simhash?url=example.com') 39 | assert resp.status_code == 200 40 | data = json.loads(resp.data.decode('utf-8')) 41 | assert data == dict(status='error', info='year param is required.') 42 | 43 | resp = client.get('/simhash?url=invalid×tamp=20141115130953') 44 | assert resp.status_code == 200 45 | data = json.loads(resp.data.decode('utf-8')) 46 | assert data == dict(status='error', info='invalid url format.') 47 | 48 | # StubRedis already has simhash data for 20140202131837 and example.com 49 | resp = client.get('/simhash?url=example.com×tamp=20140202131837') 50 | data = json.loads(resp.data.decode('utf-8')) 51 | assert data.get('simhash') == 'og2jGKWHsy4=' 52 | 53 | 54 | def test_no_entry(app): 55 | client = Client(app, response_wrapper=Response) 56 | resp = client.get('/simhash?timestamp=20180000000000&url=nonexistingdomain.org') 57 | assert resp.status_code == 200 58 | data = json.loads(resp.data.decode('utf-8')) 59 | assert data == {'message': 'CAPTURE_NOT_FOUND', 'status': 'error'} 60 | 61 | # TODO must mock this 62 | # def test_start_task(): 63 | # url = 'iskme.org' 64 | # year = '2018' 65 | # job_id = celery.tasks['Discover'].apply(args=[url, year]) 66 | # assert job_id is not None 67 | 68 | 69 | def test_simhash_task_parameters(app): 70 | client = Client(app, response_wrapper=Response) 71 | resp = client.get('/calculate-simhash?year=2018') 72 | assert resp.status_code == 200 73 | data = json.loads(resp.data.decode('utf-8')) 74 | assert data == dict(status='error', info='url param is required.') 75 | 76 | resp = client.get('/calculate-simhash?url=example.com&year=XY') 77 | assert resp.status_code == 200 78 | data = json.loads(resp.data.decode('utf-8')) 79 | assert data == dict(status='error', info='year param is required.') 80 | 81 | resp = client.get('/calculate-simhash?url=nonexistingdomain.org') 82 | assert resp.status_code == 200 83 | data = json.loads(resp.data.decode('utf-8')) 84 | assert data == dict(status='error', info='year param is required.') 85 | 86 | resp = client.get('/calculate-simhash?url=nonexistingdomain.org&year=-') 87 | assert resp.status_code == 200 88 | data = json.loads(resp.data.decode('utf-8')) 89 | assert data == dict(status='error', info='year param is required.') 90 | 91 | resp = client.get('/calculate-simhash?url=foo&year=2000') 92 | assert resp.status_code == 200 93 | data = json.loads(resp.data.decode('utf-8')) 94 | assert data == dict(status='error', info='invalid url format.') 95 | 96 | 97 | def test_task_no_snapshots(app): 98 | client = Client(app, response_wrapper=Response) 99 | resp = client.get('/simhash?url=nonexistingdomain.org&year=1999') 100 | data = json.loads(resp.data.decode('utf-8')) 101 | assert data == {'message': 'NO_CAPTURES', 'status': 'error'} 102 | 103 | 104 | # TODO must mock this 105 | # def test_success_calc_simhash(): 106 | # 
url = 'iskme.org' 107 | # year = '2018' 108 | # job = celery.tasks['Discover'].apply(args=[url, year]) 109 | # task_info = json.loads(job.info) 110 | # assert task_info.get('duration', -1) != -1 111 | 112 | 113 | def test_root(app): 114 | client = Client(app, response_wrapper=Response) 115 | resp = client.get('/') 116 | assert resp.data 117 | 118 | 119 | def test_job_params(app): 120 | client = Client(app, response_wrapper=Response) 121 | resp = client.get('/job') 122 | data = json.loads(resp.data.decode('utf-8')) 123 | assert data == dict(status='error', info='job_id param is required.') 124 | -------------------------------------------------------------------------------- /wayback_discover_diff/util.py: -------------------------------------------------------------------------------- 1 | """SPN Utility methods. 2 | """ 3 | import logging 4 | from collections import defaultdict 5 | from math import ceil 6 | import os 7 | import re 8 | import yaml 9 | from redis.exceptions import RedisError 10 | from surt import surt 11 | import tldextract 12 | 13 | 14 | def load_config(): 15 | """Load conf file defined by ENV var WAYBACK_DISCOVER_DIFF_CONF. 16 | If not available load ./conf.yaml 17 | """ 18 | config = {} 19 | try: 20 | cfg_file = os.environ.get('WAYBACK_DISCOVER_DIFF_CONF') 21 | if not cfg_file: 22 | cfg_file = os.getcwd() + '/conf.yml' 23 | logging.warning('using default configuration from %s', cfg_file) 24 | with open(cfg_file) as cfg: 25 | config = yaml.safe_load(cfg) 26 | logging.debug('config=%s', config) 27 | except OSError: 28 | logging.error('Error loading configuration', exc_info=1) 29 | return config 30 | 31 | 32 | def timestamp_simhash(redis, url, timestamp): 33 | """Get stored simhash data from Redis for URL and timestamp 34 | """ 35 | try: 36 | if url and timestamp: 37 | results = redis.hget(surt(url), timestamp) 38 | if results: 39 | return {'simhash': results} 40 | results = redis.hget(surt(url), timestamp[:4]) 41 | if results: 42 | return {'status': 'error', 'message': 'NO_CAPTURES'} 43 | except RedisError as exc: 44 | logging.error('error loading simhash data for url %s timestamp %s (%s)', 45 | url, timestamp, exc) 46 | return {'status': 'error', 'message': 'CAPTURE_NOT_FOUND'} 47 | 48 | 49 | def year_simhash(redis, url, year, page=None, snapshots_per_page=None): 50 | """Get stored simhash data for url, year and page (optional). 51 | """ 52 | try: 53 | if url and year: 54 | # TODO replace hkeys with hscan 55 | results = redis.hkeys(surt(url)) 56 | if results: 57 | timestamps_to_fetch = [] 58 | for timestamp in results: 59 | if timestamp == str(year): 60 | return {'status': 'error', 'message': 'NO_CAPTURES'} 61 | if timestamp[:4] == str(year): 62 | timestamps_to_fetch.append(timestamp) 63 | if timestamps_to_fetch: 64 | return handle_results(redis, timestamps_to_fetch, url, 65 | snapshots_per_page, page) 66 | # TODO return empty result and NOT error. 
67 | except RedisError as exc: 68 | logging.error('error loading simhash data for url %s year %s page %d (%s)', 69 | url, year, page, exc) 70 | return {'status': 'error', 'message': 'NOT_CAPTURED'} 71 | 72 | 73 | def handle_results(redis, timestamps_to_fetch, url, snapshots_per_page, 74 | page=None): 75 | """Utility method used by `year_simhash` 76 | """ 77 | available_simhashes = [] 78 | if page: 79 | number_of_pages = ceil(len(timestamps_to_fetch) / snapshots_per_page) 80 | page = min(page, number_of_pages) 81 | if number_of_pages > 0: 82 | timestamps_to_fetch = \ 83 | timestamps_to_fetch[(page - 1) * snapshots_per_page:(page * snapshots_per_page)] 84 | else: 85 | number_of_pages = 1 86 | try: 87 | results = redis.hmget(surt(url), timestamps_to_fetch) 88 | # TODO this crashes because of simhash bytes 89 | for i, simhash in enumerate(results): 90 | available_simhashes.append([str(timestamps_to_fetch[i]), simhash]) 91 | if page: 92 | available_simhashes.insert(0, ["pages", number_of_pages]) 93 | return [available_simhashes, len(timestamps_to_fetch)] 94 | except RedisError as exc: 95 | logging.error('cannot handle results for url %s page %d (%s)', 96 | url, page, exc) 97 | return None 98 | 99 | 100 | EMAIL_RE = re.compile(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)") 101 | 102 | 103 | def url_is_valid(url): 104 | """URL validation. 105 | """ 106 | try: 107 | if not url: 108 | return False 109 | if EMAIL_RE.match(url): 110 | return False 111 | ext = tldextract.extract(url) 112 | return ext.domain != '' and ext.suffix != '' 113 | except (ValueError, AttributeError): 114 | return False 115 | 116 | 117 | def compress_captures(captures): 118 | """Input: [["20130603143716","NRyJrLc2FWA="],["20130402202841","FT6d7Jc3vWA="],...] 119 | Output: 120 | Captures: [[2013, [06, [03, ['143716', 0]]], 121 | [04, [02, ['202841', 1]]] 122 | ]] 123 | Hashes: ['NRyJrLc2FWA=', 'FT6d7Jc3vWA='] 124 | """ 125 | hashdict = {} 126 | grouped = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) 127 | for ts, simhash in captures: 128 | year, month, day, hms = ts[0:4], ts[4:6], ts[6:8], ts[8:] 129 | hashid = hashdict.get(simhash) 130 | if hashid is None: 131 | hashid = len(hashdict) 132 | hashdict[simhash] = hashid 133 | cap = [hms, hashid] 134 | grouped[int(year)][int(month)][int(day)].append(cap) 135 | new_captures = [ 136 | [y] + [ 137 | [m] + [ 138 | [d] + dc for d, dc in mc.items() 139 | ] for m, mc in yc.items() 140 | ] for y, yc in grouped.items() 141 | ] 142 | hashes = [hash for hash, hashid in sorted(hashdict.items(), key=lambda x: x[1])] 143 | return (new_captures, hashes) 144 | -------------------------------------------------------------------------------- /tests/test_discover.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import mock 3 | from test_util import StubRedis 4 | from wayback_discover_diff.discover import (extract_html_features, 5 | calculate_simhash, custom_hash_function, pack_simhash_to_bytes, Discover) 6 | 7 | 8 | def test_extract_html_features(): 9 | # handle html with repeated elements and spaces 10 | html = """ 11 | my title 12 | 13 | abc 14 | test 15 | 16 | 123 17 | abc 18 | space 19 | 20 | """ 21 | features = {'123': 1, 'abc': 2, 'my': 1, 'test': 1, 'title': 1, 'space': 1} 22 | assert extract_html_features(html) == features 23 | 24 | # handle html with repeated elements, and punctuation 25 | html = """ 26 | my title 27 | 28 | abc 29 | a.b.c. 30 | abc. 
31 | test 32 | 123 33 | abc 34 | 35 | """ 36 | features = {'123': 1, 'a': 1, 'abc': 3, 'b': 1, 'c': 1, 'my': 1, 'test': 1, 'title': 1} 37 | assert extract_html_features(html) == features 38 | 39 | # handle plain text 40 | html = "just a string" 41 | features = {'just': 1, 'a': 1, 'string': 1} 42 | assert extract_html_features(html) == features 43 | 44 | # skip HTML comments 45 | html = """ 46 | 47 | 49 | 50 | 51 | 52 | 53 | 54 | 55 |

Thank you for closing the message box.

56 | test 57 | """ 58 | features = {'box': 1, 'closing': 1, 'for': 1, 'message': 1, 'test': 1, 59 | 'thank': 1, 'the': 1, 'you': 1} 60 | assert extract_html_features(html) == features 61 | 62 | # it doesn't crash with invalid or unicode chars 63 | html = """ 64 | Invalid /\x94Invalid\x0b' 65 | 66 | 今日は 67 | 68 | 69 | """ 70 | features = {'\x94invalid': 1, 'invalid': 1, '今日は': 1} 71 | assert extract_html_features(html) == features 72 | 73 | 74 | html = """ 75 | weird is happening \c\x0b 76 | tagtag 77 | """ 78 | 79 | features = {'c': 1, 'weird': 1, 'is': 1, 'happening': 1, 'tag': 2} 80 | assert extract_html_features(html) == features 81 | 82 | 83 | def test_calculate_simhash(): 84 | features = {'two': 2, 'three': 3, 'one': 1} 85 | assert calculate_simhash(features, 128) == 66237222457941138286276456718971054176 86 | 87 | 88 | CFG = { 89 | 'simhash': { 90 | 'size': 256, 91 | 'expire_after': 86400 92 | }, 93 | 'redis': { 94 | 'url': 'redis://localhost:6379/1', 95 | 'decode_responses': True, 96 | 'timeout': 10 97 | }, 98 | 'threads': 5, 99 | 'snapshots': { 100 | 'number_per_year': -1, 101 | 'number_per_page': 600 102 | } 103 | } 104 | 105 | @mock.patch('wayback_discover_diff.discover.StrictRedis') 106 | def test_worker_download(Redis): 107 | Redis.return_value = StubRedis() 108 | task = Discover(CFG) 109 | # This capture performs redirects inside WBM. It has CDX status=200 but 110 | # its really a redirect (This is a common WBM issue). We test that 111 | # redirects work fine. 112 | task.url = 'https://iskme.org' 113 | assert task.download_capture('20190103133511') 114 | 115 | 116 | def test_regular_hash(): 117 | features = { 118 | '2019': 1, 119 | 'advanced': 1, 120 | 'google': 1, 121 | 'google©': 1, 122 | 'history': 1, 123 | 'insearch': 1, 124 | 'more': 1, 125 | 'optionssign': 1, 126 | 'privacy': 1, 127 | 'programsbusiness': 1, 128 | 'searchimagesmapsplayyoutubenewsgmaildrivemorecalendartranslatemobilebooksshoppingbloggerfinancephotosvideosdocseven': 1, 129 | 'searchlanguage': 1, 130 | 'settingsweb': 1, 131 | 'solutionsabout': 1, 132 | 'terms': 1, 133 | 'toolsadvertising': 1, 134 | '»account': 1 135 | } 136 | h = calculate_simhash(features, 128) 137 | assert h.bit_length() == 128 138 | h_bytes = pack_simhash_to_bytes(h) 139 | assert len(h_bytes) == 16 140 | 141 | 142 | def test_shortened_hash(): 143 | h_size = 128 144 | features = { 145 | 'about': 1, 146 | 'accountsearchmapsyoutubeplaynewsgmailcontactsdrivecalendartranslatephotosshoppingmorefinancedocsbooksbloggerhangoutskeepjamboardearthcollectionseven': 1, 147 | 'at': 1, 148 | 'data': 1, 149 | 'feedbackadvertisingbusiness': 1, 150 | 'from': 1, 151 | 'gmailimagessign': 1, 152 | 'google': 3, 153 | 'helpsend': 1, 154 | 'in': 2, 155 | 'inappropriate': 1, 156 | 'library': 1, 157 | 'local': 1, 158 | 'more': 1, 159 | 'new': 1, 160 | 'predictions': 1, 161 | 'privacytermssettingssearch': 1, 162 | 'remove': 1, 163 | 'report': 1, 164 | 'searchhistorysearch': 1, 165 | 'searchyour': 1, 166 | 'settingsadvanced': 1, 167 | 'skills': 1, 168 | 'store': 1, 169 | 'with': 1, 170 | 'your': 1, 171 | '×develop': 1 172 | } 173 | h = calculate_simhash(features, h_size) 174 | assert h.bit_length() != h_size 175 | h_bytes = pack_simhash_to_bytes(h, h_size) 176 | assert len(h_bytes) == h_size // 8 177 | 178 | 179 | def test_simhash_256(): 180 | h_size = 256 181 | features = { 182 | '2019': 1, 183 | 'advanced': 1, 184 | 'at': 1, 185 | 'google': 1, 186 | 'googleadvertising': 1, 187 | 'google©': 1, 188 | 'history': 1, 189 | 'insearch': 1, 190 | 'library': 1, 191 | 
'local': 1, 192 | 'more': 1, 193 | 'new': 1, 194 | 'optionssign': 1, 195 | 'privacy': 1, 196 | 'programsbusiness': 1, 197 | 'searchimagesmapsplayyoutubenewsgmaildrivemorecalendartranslatemobilebooksshoppingbloggerfinancephotosvideosdocseven': 1, 198 | 'searchlanguage': 1, 199 | 'settingsweb': 1, 200 | 'skills': 1, 201 | 'solutionsabout': 1, 202 | 'terms': 1, 203 | 'toolsdevelop': 1, 204 | 'with': 1, 205 | 'your': 1, 206 | '»account': 1, 207 | } 208 | h = calculate_simhash(features, h_size, custom_hash_function) 209 | assert h.bit_length() == h_size 210 | h_bytes = pack_simhash_to_bytes(h, h_size) 211 | assert len(h_bytes) == h_size // 8 212 | -------------------------------------------------------------------------------- /wayback_discover_diff/web.py: -------------------------------------------------------------------------------- 1 | """Web endpoints 2 | """ 3 | import logging 4 | from time import time 5 | import pkg_resources 6 | from celery import states 7 | from celery.result import AsyncResult 8 | from celery.exceptions import CeleryError 9 | from flask import Flask, request 10 | from redis.exceptions import RedisError 11 | from .stats import statsd_incr 12 | from .util import (year_simhash, timestamp_simhash, url_is_valid, 13 | compress_captures) 14 | 15 | APP = Flask(__name__, instance_relative_config=True) 16 | APP._logger = logging.getLogger('wayback_discover_diff.web') 17 | 18 | def get_app(config): 19 | """Utility method to set APP configuration. Its used by application.py. 20 | """ 21 | APP.config.from_mapping( 22 | SECRET_KEY='wayback machine simhash service', 23 | ) 24 | APP.config.update(CELERYD_HIJACK_ROOT_LOGGER=False) 25 | APP.config.update(config) 26 | return APP 27 | 28 | 29 | def get_active_task(url, year): 30 | """Check for current simhash processing tasks for targe url & year 31 | """ 32 | try: 33 | pending = APP.celery.control.inspect().active() 34 | if pending: 35 | for task in list(pending.values())[0]: 36 | if task['args'] == "['{}', '{}']".format(url, year): 37 | return task 38 | return None 39 | except RedisError: 40 | # Redis connection timeout is quite common in production Celery. 41 | return None 42 | 43 | 44 | @APP.route('/') 45 | def root(): 46 | """Return info on the current package version. 47 | """ 48 | version = pkg_resources.require("wayback-discover-diff")[0].version 49 | return "wayback-discover-diff service version: %s" % version 50 | 51 | 52 | @APP.route('/simhash') 53 | def simhash(): 54 | """Return simhash data for specific URL and year (optional), 55 | page is also optional. 
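   |     With a `timestamp` param it returns the simhash of a single capture;
   |     with `compress=true|1` the year results are returned in the compact
   |     format produced by `compress_captures`.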
56 | """ 57 | try: 58 | statsd_incr('get-simhash-year-request') 59 | url = request.args.get('url') 60 | if not url: 61 | return {'status': 'error', 'info': 'url param is required.'} 62 | if not url_is_valid(url): 63 | return {'status': 'error', 'info': 'invalid url format.'} 64 | timestamp = request.args.get('timestamp') 65 | if not timestamp: 66 | year = request.args.get('year', type=int) 67 | if not year: 68 | return {'status': 'error', 'info': 'year param is required.'} 69 | page = request.args.get('page', type=int) 70 | snapshots_per_page = APP.config.get('snapshots', {}).get('number_per_page') 71 | results_tuple = year_simhash(APP.redis, url, year, page, 72 | snapshots_per_page) 73 | # check if year_simhash produced an error response and return it 74 | if isinstance(results_tuple, dict): 75 | return results_tuple 76 | task = get_active_task(url, year) 77 | 78 | output = dict(captures=results_tuple[0], 79 | total_captures=results_tuple[1], 80 | status='PENDING' if task else 'COMPLETE') 81 | if request.args.get('compress') in ['true', '1']: 82 | (captures, hashes) = compress_captures(output['captures']) 83 | output['captures'] = captures 84 | output['hashes'] = hashes 85 | return output 86 | 87 | results = timestamp_simhash(APP.redis, url, timestamp) 88 | # check if timestamp_simhash produced an error response and return it 89 | if isinstance(results, dict): 90 | return results 91 | task = get_active_task(url, timestamp[:4]) 92 | if task: 93 | return {'status': 'PENDING', 'captures': results} 94 | return {'status': 'COMPLETE', 'captures': results} 95 | except (ValueError, CeleryError) as exc: 96 | APP._logger.error('Cannot get simhash of %s', url, exc_info=1) 97 | return {'status': 'error', 'info': 'Internal server error.'} 98 | 99 | 100 | @APP.route('/calculate-simhash') 101 | def request_url(): 102 | """Start simhash calculation for URL & year. 103 | Validate parameters url & timestamp before starting Celery task. 104 | """ 105 | try: 106 | statsd_incr('calculate-simhash-year-request') 107 | url = request.args.get('url') 108 | if not url: 109 | return {'status': 'error', 'info': 'url param is required.'} 110 | if not url_is_valid(url): 111 | return {'status': 'error', 'info': 'invalid url format.'} 112 | year = request.args.get('year', type=int) 113 | if not year: 114 | return {'status': 'error', 'info': 'year param is required.'} 115 | # see if there is an active job for this request 116 | task = get_active_task(url, year) 117 | if task: 118 | return {'status': 'PENDING', 'job_id': task['id']} 119 | res = APP.celery.tasks['Discover'].apply_async( 120 | args=[url, year, time()] 121 | ) 122 | return {'status': 'started', 'job_id': res.id} 123 | except CeleryError as exc: 124 | APP._logger.warning('Cannot calculate simhash of %s, %s', url, 125 | year, exc_info=1) 126 | return {'status': 'error', 'info': 'Cannot start calculation.'} 127 | except ValueError as exc: 128 | APP._logger.warning('Cannot calculate simhash of %s, no year', 129 | url, exc_info=1) 130 | return {'status': 'error', 'info': 'year param must be numeric.'} 131 | 132 | 133 | @APP.route('/job') 134 | def job_status(): 135 | """Return job status. 
136 | """ 137 | try: 138 | statsd_incr('status-request') 139 | job_id = request.args.get('job_id') 140 | if not job_id: 141 | return {'status': 'error', 'info': 'job_id param is required.'} 142 | task = AsyncResult(job_id, app=APP.celery) 143 | if task.state == states.PENDING: 144 | if task.info: 145 | info = task.info.get('info', 1) 146 | else: 147 | info = None 148 | # job did not finish yet 149 | return {'status': task.state, 'job_id': task.id, 'info': info} 150 | 151 | if task.info and task.info.get('status', 0) == 'error': 152 | # something went wrong in the background job 153 | return {'info': task.info.get('info', 1), 'job_id': task.id, 154 | 'status': task.info.get('status', 0)} 155 | if task.info: 156 | duration = task.info.get('duration', 1) 157 | else: 158 | duration = 1 159 | return {'status': task.state, 'job_id': task.id, 'duration': duration} 160 | except (CeleryError, AttributeError) as exc: 161 | APP._logger.error('Cannot get job status of %s', job_id, exc_info=1) 162 | return {'status': 'error', 'info': 'Cannot get status.'} 163 | -------------------------------------------------------------------------------- /wayback_discover_diff/discover.py: -------------------------------------------------------------------------------- 1 | """Celery worker 2 | """ 3 | from concurrent.futures import ThreadPoolExecutor 4 | import hashlib 5 | import logging 6 | import string 7 | from time import time 8 | from datetime import datetime 9 | import cProfile 10 | import base64 11 | from itertools import groupby 12 | from celery import Task 13 | import urllib3 14 | from urllib3.exceptions import HTTPError 15 | from redis import StrictRedis, BlockingConnectionPool 16 | from redis.exceptions import RedisError 17 | from simhash import Simhash 18 | from surt import surt 19 | from selectolax.parser import HTMLParser 20 | from werkzeug.urls import url_fix 21 | 22 | from .stats import statsd_incr, statsd_timing 23 | 24 | # https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings 25 | urllib3.disable_warnings() 26 | 27 | 28 | TRANSLATOR = str.maketrans(string.punctuation, ' '*len(string.punctuation)) 29 | 30 | 31 | def extract_html_features(html): 32 | """Process HTML document and get key features as text. Steps: 33 | kill all script and style elements 34 | get lowercase text 35 | remove all punctuation 36 | break into lines and remove leading and trailing space on each 37 | break multi-headlines into a line each 38 | drop blank lines 39 | return a dict with features and their weights 40 | """ 41 | try: 42 | tree = HTMLParser(html) 43 | tree.strip_tags(['script', 'style']) 44 | text = tree.root.text(separator=' ') 45 | if not text: 46 | return {} 47 | except UnicodeDecodeError: 48 | return {} 49 | text = text.lower().translate(TRANSLATOR) 50 | lines = (line.strip() for line in text.splitlines()) 51 | chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) 52 | text = '\n'.join(chunk for chunk in chunks if chunk) 53 | return {k: sum(1 for _ in g) for k, g in groupby(sorted(text.split()))} 54 | 55 | 56 | def custom_hash_function(x): 57 | """Required by Simhash 58 | """ 59 | return int.from_bytes(hashlib.blake2b(x).digest(), byteorder='big') 60 | 61 | 62 | def calculate_simhash(features_dict, simhash_size, hashfunc=None): 63 | """Calculate simhash for features in a dict. 
`features_dict` contains data 64 | like {'text': weight} 65 | """ 66 | if hashfunc: 67 | return Simhash(features_dict, simhash_size, hashfunc=hashfunc).value 68 | return Simhash(features_dict, simhash_size).value 69 | 70 | 71 | def pack_simhash_to_bytes(simhash, simhash_size=None): 72 | # simhash_value = simhash.value 73 | if simhash_size is None: 74 | size_in_bytes = (simhash.bit_length() + 7) // 8 75 | else: 76 | size_in_bytes = simhash_size // 8 77 | return simhash.to_bytes(size_in_bytes, byteorder='little') 78 | 79 | 80 | class Discover(Task): 81 | """Custom Celery Task class. 82 | http://docs.celeryproject.org/en/latest/userguide/tasks.html#custom-task-classes 83 | """ 84 | name = 'Discover' 85 | task_id = None 86 | # If a simhash calculation for a URL & year does more than 87 | # `max_download_errors`, stop it to avoid pointless requests. The captures 88 | # are not text/html or there is a problem with the WBM. 89 | max_download_errors = 10 90 | max_capture_download = 1000000 91 | 92 | def __init__(self, cfg): 93 | self.simhash_size = cfg['simhash']['size'] 94 | self.simhash_expire = cfg['simhash']['expire_after'] 95 | if self.simhash_size > 512: 96 | raise Exception('do not support simhash longer than 512') 97 | 98 | headers = {'User-Agent': 'wayback-discover-diff', 99 | 'Accept-Encoding': 'gzip,deflate', 100 | 'Connection': 'keep-alive'} 101 | cdx_auth_token = cfg.get('cdx_auth_token') 102 | if cdx_auth_token: 103 | headers['cookie'] = 'cdx_auth_token=%s' % cdx_auth_token 104 | 105 | self.http = urllib3.HTTPConnectionPool('web.archive.org', maxsize=50, 106 | retries=2, timeout=20, 107 | headers=headers) 108 | self.redis = StrictRedis( 109 | connection_pool=BlockingConnectionPool.from_url( 110 | **cfg['redis'] 111 | ) 112 | ) 113 | self.tpool = ThreadPoolExecutor(max_workers=cfg['threads']) 114 | self.snapshots_number = cfg['snapshots']['number_per_year'] 115 | self.download_errors = 0 116 | # Initialize logger 117 | self._log = logging.getLogger('wayback_discover_diff.worker') 118 | 119 | def download_capture(self, ts): 120 | """Download capture data from the WBM and update job status. Return 121 | data only when its text or html. On download error, increment download_errors 122 | which will stop the task after 10 errors. Fetch data up to a limit 123 | to avoid getting too much (which is unnecessary) and have a consistent 124 | operation time. 125 | """ 126 | try: 127 | statsd_incr('download-capture') 128 | self._log.info('fetching capture %s %s', ts, self.url) 129 | res = self.http.request('GET', '/web/{}id_/{}'.format(ts, self.url), 130 | preload_content=False) 131 | data = res.read(self.max_capture_download) 132 | ctype = res.headers.get('content-type') 133 | res.release_conn() 134 | if ctype: 135 | ctype = ctype.lower() 136 | if "text" in ctype or "html" in ctype: 137 | return data 138 | except HTTPError: 139 | self.download_errors += 1 140 | statsd_incr('download-error') 141 | self._log.error('cannot fetch capture %s %s', ts, self.url, exc_info=1) 142 | return None 143 | 144 | def start_profiling(self, snapshot, index): 145 | """Used for performance testing only. 146 | """ 147 | cProfile.runctx('self.get_calc(snapshot, index)', 148 | globals=globals(), locals=locals(), 149 | filename='profile.prof') 150 | 151 | def get_calc(self, capture): 152 | """if a capture with an equal digest has been already processed, 153 | return cached simhash and avoid redownloading and processing. Else, 154 | download capture, extract HTML features and calculate simhash. 
155 | If there are already too many download failures, return None without 156 | any processing to avoid pointless requests. 157 | Return None if any problem occurs (e.g. HTTP error or cannot calculate) 158 | """ 159 | (timestamp, digest) = capture.split(' ') 160 | simhash_enc = self.seen.get(digest) 161 | if simhash_enc: 162 | self._log.info("already seen %s", digest) 163 | return (timestamp, simhash_enc) 164 | 165 | if self.download_errors >= self.max_download_errors: 166 | statsd_incr('multiple-consecutive-errors') 167 | self._log.error('%d consecutive download errors fetching %s captures', 168 | self.download_errors, self.url) 169 | return None 170 | 171 | response_data = self.download_capture(timestamp) 172 | if response_data: 173 | data = extract_html_features(response_data) 174 | if data: 175 | statsd_incr('calculate-simhash') 176 | self._log.info("calculating simhash") 177 | simhash = calculate_simhash(data, self.simhash_size, 178 | hashfunc=custom_hash_function) 179 | # This encoding is necessary to store simhash data in Redis. 180 | simhash_enc = base64.b64encode( 181 | pack_simhash_to_bytes(simhash, self.simhash_size) 182 | ) 183 | self.seen[digest] = simhash_enc 184 | return (timestamp, simhash_enc) 185 | return None 186 | 187 | def run(self, url, year, created): 188 | """Run Celery Task. 189 | """ 190 | self.job_id = self.request.id 191 | self.url = url_fix(url) 192 | time_started = datetime.now() 193 | self._log.info('Start calculating simhashes.') 194 | self.download_errors = 0 195 | 196 | statsd_timing('task-wait', time() - created) 197 | if not self.url: 198 | self._log.error('did not give url parameter') 199 | return {'status': 'error', 'info': 'URL is required.'} 200 | if not year: 201 | self._log.error('did not give year parameter') 202 | return {'status': 'error', 'info': 'Year is required.'} 203 | # fetch captures 204 | self.update_state(state='PENDING', 205 | meta={'info': 'Fetching {} captures for year {}'.format( 206 | url, year)}) 207 | resp = self.fetch_cdx(url, year) 208 | if resp.get('status') == 'error': 209 | return resp 210 | captures = resp.get('captures') 211 | total = len(captures) 212 | self.seen = dict() 213 | # calculate simhashes in parallel 214 | i = 0 215 | final_results = {} 216 | for res in self.tpool.map(self.get_calc, captures): 217 | if not res: 218 | continue 219 | (timestamp, simhash) = res 220 | if simhash: 221 | final_results[timestamp] = simhash 222 | if i % 10 == 0: 223 | self.update_state( 224 | state='PENDING', 225 | meta={'info': 'Processed %d out of %d captures.' % (i, total)} 226 | ) 227 | i += 1 228 | 229 | self._log.info('%d final results for %s and year %s.', 230 | len(final_results), self.url, year) 231 | if final_results: 232 | try: 233 | urlkey = surt(self.url) 234 | self.redis.hmset(urlkey, final_results) 235 | self.redis.expire(urlkey, self.simhash_expire) 236 | except RedisError as exc: 237 | self._log.error('cannot write simhashes to Redis for URL %s', 238 | self.url, exc_info=1) 239 | 240 | duration = (datetime.now() - time_started).seconds 241 | statsd_timing('task-duration', duration) 242 | self._log.info('Simhash calculation finished in %.2fsec.', duration) 243 | return {'duration': str(duration)} 244 | 245 | def fetch_cdx(self, url, year): 246 | """Make a CDX query for timestamp and digest for a specific year. 247 | """ 248 | try: 249 | self._log.info('fetching CDX of %s for year %s', url, year) 250 | # Collapse captures by timestamp to get 3 captures per day (max). 
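   |             # 'collapse': 'timestamp:9' groups captures by the first 9 timestamp
   |             # digits (YYYYMMDDH), keeping at most one capture per 10-hour block,
   |             # i.e. at most 3 captures per day.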
251 |             # TODO increase that in the future when we can handle more captures.
252 |             # It's necessary to reduce the huge number of captures some websites
253 |             # have (e.g. twitter.com has 167k captures for 2018). Get only 2xx captures.
254 |             fields = {'url': url, 'from': year, 'to': year,
255 |                       'statuscode': 200, 'fl': 'timestamp,digest',
256 |                       'collapse': 'timestamp:9'}
257 |             if self.snapshots_number != -1:
258 |                 fields['limit'] = self.snapshots_number
259 |             response = self.http.request('GET', '/web/timemap', fields=fields)
260 |             self._log.info('finished fetching timestamps of %s for year %s',
261 |                            url, year)
262 |             if response.status == 200:
263 |                 if not response.data:
264 |                     self._log.info('no captures found for %s %s', url, year)
265 |                     urlkey = surt(url)
266 |                     self.redis.hset(urlkey, year, -1)
267 |                     self.redis.expire(urlkey, self.simhash_expire)
268 |                     return {'status': 'error',
269 |                             'info': 'No captures of {} for year {}'.format(url, year)}
270 |                 captures_txt = response.data.decode('utf-8')
271 |                 captures = captures_txt.strip().split("\n")
272 |                 if captures:
273 |                     return {'status': 'success', 'captures': captures}
274 |             return {'status': 'error',
275 |                     'info': 'No captures of {} for year {}'.format(url, year)}
276 |         except (ValueError, HTTPError) as exc:
277 |             self._log.error('invalid CDX query response for %s %s', url, year,
278 |                             exc_info=1)
279 |             return {'status': 'error', 'info': str(exc)}
280 |         except RedisError as exc:
281 |             self._log.error('error connecting with Redis for url %s year %s',
282 |                             url, year, exc_info=1)
283 |             return {'status': 'error', 'info': str(exc)}
284 | 
--------------------------------------------------------------------------------
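
A minimal usage sketch for the HTTP API documented in README.md, assuming the service is running locally via run_gunicorn.sh (which binds 0.0.0.0:8096) and run_celery.sh; the target URL "example.com" and year 2014 are placeholder values.

```python
# Minimal client sketch for the wayback-discover-diff HTTP API (see README.md).
# Assumes the service listens on localhost:8096 as configured in run_gunicorn.sh;
# "example.com" and "2014" are placeholders.
import json
import time

import urllib3

BASE = "http://localhost:8096"
http = urllib3.PoolManager()


def get_json(path, **params):
    """GET a path with query params and decode the JSON response."""
    res = http.request("GET", BASE + path, fields=params)
    return json.loads(res.data.decode("utf-8"))


# Start (or re-use) a simhash calculation job for a URL/year pair.
job = get_json("/calculate-simhash", url="example.com", year="2014")
print(job)  # {"status": "started", "job_id": "..."} or {"status": "PENDING", ...}

# Poll the job until it is no longer pending.
while get_json("/job", job_id=job["job_id"]).get("status") == "PENDING":
    time.sleep(2)

# Fetch the per-capture simhashes (base64-encoded) calculated for that year.
print(get_json("/simhash", url="example.com", year="2014"))
```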