├── .gitignore
├── README.md
├── setup.py
├── test.py
└── waybackprov.py
/.gitignore:
--------------------------------------------------------------------------------
__pycache__
.pytest_cache
*.log
dist
waybackprov.egg-info
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Give *waybackprov* a URL and it will summarize which Internet Archive
collections have archived the URL. This kind of information can sometimes
provide insight into why a particular web resource or set of web resources was
archived from the web.

## Install

    pip install waybackprov

## Basic Usage

To check a particular URL, here's how it works:

    % waybackprov https://twitter.com/EPAScottPruitt
    364 https://archive.org/details/focused_crawls
    306 https://archive.org/details/edgi_monitor
    151 https://archive.org/details/www3.epa.gov
     60 https://archive.org/details/epa.gov4
     47 https://archive.org/details/epa.gov5
    ...

The first column contains the number of crawls for a particular URL, and the
second column contains the URL for the Internet Archive collection that added
it.

## Time

By default waybackprov only looks at crawls from the previous year through the
current year. If you would like it to examine a different range of years, use
the `--start` and `--end` options:

    % waybackprov --start 2016 --end 2018 https://twitter.com/EPAScottPruitt

## Multiple Pages

If you would like to look at all URLs under a particular URL prefix, you can use
the `--prefix` option:

    % waybackprov --prefix https://twitter.com/EPAScottPruitt

This will use the Internet Archive's [CDX API](https://github.com/webrecorder/pywb/wiki/CDX-Server-API)
to also include URLs that are extensions of the URL you supply, so it would
include, for example:

    https://twitter.com/EPAScottPruitt/status/1309839080398339

But it can also include things you may not want, such as:

    https://twitter.com/EPAScottPruitt/status/1309839080398339/media/1

To further limit the URLs, use the `--match` parameter to supply a regular
expression so that only matching URLs are checked. Narrowing down the URLs you
are interested in is highly recommended, since it prevents lots of lookups for
the CSS, JavaScript and image files that are components of the resource that
was initially crawled.

    % waybackprov --prefix --match 'status/\d+$' https://twitter.com/EPAScottPruitt

## Collections

One thing to remember when interpreting this data is that collections can
contain other collections. For example, the *edgi_monitor* collection is a
sub-collection of *focused_crawls*.

If you use the `--collapse` option, only the most specific collection will be
reported for a given crawl. So if *coll1* is part of *coll2*, which is part of
*coll3*, only *coll1* will be reported instead of *coll1*, *coll2* and *coll3*.
This does involve collection metadata lookups against the Internet Archive API,
so it slows performance significantly.

## JSON and CSV

If you would rather see the raw data as JSON or CSV, use the `--format` option.
When you use either of these formats you will see the metadata for each crawl,
rather than a summary.
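
For example, `--format json` prints a JSON list with one record per crawl. The
keys below are the fields waybackprov collects for each crawl; the values here
are only illustrative. The CSV output contains the same fields as columns.

    % waybackprov --format json https://twitter.com/EPAScottPruitt
    [
      {
        "status": 200,
        "timestamp": 20180307053537,
        "collections": [
          "focused_crawls",
          "edgi_monitor"
        ],
        "url": "https://twitter.com/EPAScottPruitt",
        "wayback_url": "https://web.archive.org/web/20180307053537/https://twitter.com/EPAScottPruitt"
      }
    ]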

## Log

If you would like to see detailed information about what *waybackprov* is doing,
use the `--log` option to supply a file path to log to:

    % waybackprov --log waybackprov.log https://example.com/

## Test

If you would like to test it, first install [pytest] and then run:

    pytest test.py

[pytest]: https://docs.pytest.org/en/latest/
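
## Library

The command line program is a small wrapper around the functions in
waybackprov.py, so you can also call them from Python directly. Here is a
minimal sketch (the URL is just a placeholder); each crawl is a dictionary with
the same fields as the JSON output above:

    from waybackprov import get_crawls

    for crawl in get_crawls('https://example.com/', start_year=2016, end_year=2018):
        print(crawl['timestamp'], crawl['wayback_url'])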
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

with open("README.md") as f:
    long_description = f.read()

if __name__ == "__main__":
    setup(
        name='waybackprov',
        version='0.0.9',
        url='https://github.com/edsu/waybackprov',
        author='Ed Summers',
        author_email='ehs@pobox.com',
        py_modules=['waybackprov'],
        description='Checks the provenance of a URL in the Wayback Machine',
        long_description=long_description,
        long_description_content_type="text/markdown",
        python_requires='>=3.0',
        entry_points={'console_scripts': ['waybackprov = waybackprov:main']}
    )
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
import os
import pytest
import logging

from waybackprov import *

logging.basicConfig(filename='test.log', filemode='w', level=logging.INFO)

def test_coll():
    coll = get_collection('ArchiveIt-Collection-2410')
    assert coll['title'] == 'University of Maryland'

def test_get_crawls():
    crawls = list(get_crawls('https://mith.umd.edu'))
    assert len(crawls) > 0
    assert crawls[0]['timestamp']
    assert crawls[0]['url']
    assert crawls[0]['status']
    assert crawls[0]['collections']
    assert len(crawls[0]['collections']) > 0

def test_depth():
    assert get_depth('ArchiveIt-Collection-2410') == 4
    assert get_depth('wikipediaoutlinks00003') == 3

def test_deepest_collection():
    colls = [
        'ArchiveIt-Partner-408',
        'archiveitdigitalcollection',
        'web',
        'archiveitpartners',
        'ArchiveIt-Collection-2410'
    ]
    assert deepest_collection(colls) == 'ArchiveIt-Collection-2410'

def test_loop():
    # weirdly, some collections can contain themselves when there is a loop,
    # e.g. coll1 contains coll2 and coll2 contains coll1
    assert get_depth('ArchiveIt-Partner-1140') == 4

def test_prefix():
    crawls = get_crawls('https://twitter.com/Guccifer_2', prefix=True, match=r'/status/\d+$')
    crawl = next(crawls)
    assert crawl['url']

def test_cdx():
    urls = cdx('https://twitter.com/Guccifer_2', match=r'/status/\d+$', start_year=2016, end_year=2018)
    assert len(list(urls)) == 132

def test_missing():
    crawls = list(get_crawls('https://twitter.com/slavresistance/status/1016697918970105857/'))
    assert len(crawls) == 0
--------------------------------------------------------------------------------
/waybackprov.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import re
import csv
import sys
import json
import time
import codecs
import logging
import operator
import datetime
import optparse
import collections

from functools import reduce
from urllib.parse import quote
from urllib.request import urlopen

colls = {}

def main():
    now = datetime.datetime.now()

    parser = optparse.OptionParser('waybackprov.py [options] <url>')
    parser.add_option('--start', default=now.year - 1, help='start year')
    parser.add_option('--end', default=now.year, help='end year')
    parser.add_option('--format', choices=['text', 'csv', 'json'],
        default='text', help='output data')
    parser.add_option('--collapse', action='store_true',
        help='only display most specific collection')
    parser.add_option('--prefix', action='store_true',
        help='use url as a prefix')
    parser.add_option('--match', help='limit to urls that match pattern')
    parser.add_option('--log', help='where to log activity to')
    opts, args = parser.parse_args()

    if opts.log:
        logging.basicConfig(
            filename=opts.log,
            format='%(asctime)s - %(levelname)s - %(message)s',
            level=logging.INFO
        )
    else:
        logging.basicConfig(
            format='%(asctime)s - %(levelname)s - %(message)s',
            level=logging.WARNING
        )
    if len(args) != 1:
        parser.error('You must supply a URL to lookup')

    url = args[0]

    crawl_data = get_crawls(url,
        start_year=opts.start,
        end_year=opts.end,
        collapse=opts.collapse,
        prefix=opts.prefix,
        match=opts.match
    )

    if opts.format == 'text':
        crawls = 0
        coll_urls = {}
        coll_counter = collections.Counter()
        for crawl in crawl_data:
            crawls += 1
            coll_counter.update(crawl['collections'])
            for coll in crawl['collections']:
                # keep track of urls in each collection
                if coll not in coll_urls:
                    coll_urls[coll] = set()
                coll_urls[coll].add(crawl['url'])

        if len(coll_counter) == 0:
            print('No results for %s-%s, consider using --start and --end to broaden.' % (opts.start, opts.end))
            return

        max_pos = str(len(str(coll_counter.most_common(1)[0][1])))
        if opts.prefix:
            str_format = '%' + max_pos + 'i %' + max_pos + 'i https://archive.org/details/%s'
        else:
            str_format = '%' + max_pos + 'i https://archive.org/details/%s'

        for coll_id, count in coll_counter.most_common():
            if opts.prefix:
                print(str_format % (count, len(coll_urls[coll_id]), coll_id))
            else:
                print(str_format % (count, coll_id))

        print('')
        print('total crawls %s-%s: %s' % (opts.start, opts.end, crawls))
        if opts.prefix:
            total_urls = len(reduce(operator.or_, coll_urls.values()))
            print('total urls: %s' % total_urls)

    elif opts.format == 'json':
        data = list(crawl_data)
        print(json.dumps(data, indent=2))

    elif opts.format == 'csv':
        w = csv.DictWriter(sys.stdout,
            fieldnames=['timestamp', 'status', 'collections', 'url', 'wayback_url'])
        w.writeheader()
        for crawl in crawl_data:
            crawl['collections'] = ','.join(crawl['collections'])
            w.writerow(crawl)

def get_crawls(url, start_year=None, end_year=None, collapse=False,
               prefix=False, match=None):

    if prefix:
        # a prefix query gets its crawls from the URLs found in the CDX
        # index, which include the prefix URL itself, so stop afterwards
        for year, sub_url in cdx(url, match=match, start_year=start_year,
                end_year=end_year):
            yield from get_crawls(sub_url, start_year=year, end_year=year,
                collapse=collapse)
        return

    if start_year is None:
        start_year = datetime.datetime.now().year - 1
    else:
        start_year = int(start_year)
    if end_year is None:
        end_year = datetime.datetime.now().year
    else:
        end_year = int(end_year)

    api = 'https://web.archive.org/__wb/calendarcaptures?url=%s&selected_year=%s'
    for year in range(start_year, end_year + 1):
        # This calendar data structure reflects the layout of a calendar
        # month, so some spots in the first and last row are null, and a
        # day has no data at all if the URL wasn't crawled on that day.
        logging.info("getting calendar year %s for %s", year, url)
        cal = get_json(api % (url, year))
        found = False
        for month in cal:
            for week in month:
                for day in week:
                    if day is None or day == {}:
                        continue
                    # note: we can't seem to rely on 'cnt' as a count
                    for i in range(0, len(day['st'])):
                        c = {
                            'status': day['st'][i],
                            'timestamp': day['ts'][i],
                            'collections': day['why'][i],
                            'url': url
                        }
                        c['wayback_url'] = 'https://web.archive.org/web/%s/%s' % (c['timestamp'], url)
                        if c['collections'] is None:
                            continue
                        if collapse and len(c['collections']) > 0:
                            c['collections'] = [deepest_collection(c['collections'])]
                        logging.info('found crawl %s', c)
                        found = True
                        yield c

def deepest_collection(coll_ids):
    return max(coll_ids, key=get_depth)

def get_collection(coll_id):
    # no need to fetch twice
    if coll_id in colls:
        return colls[coll_id]

    logging.info('fetching collection %s', coll_id)

    # get the collection metadata
    url = 'https://archive.org/metadata/%s' % coll_id
    data = get_json(url)['metadata']

    # make collection into a reliable list
    if 'collection' in data:
        if type(data['collection']) == str:
            data['collection'] = [data['collection']]
    else:
        data['collection'] = []

    # so we don't have to look it up again
    colls[coll_id] = data

    return data

def get_depth(coll_id, seen_colls=None):
    coll = get_collection(coll_id)
    if 'depth' in coll:
        return coll['depth']

    logging.info('calculating depth of %s', coll_id)

    if len(coll['collection']) == 0:
        return 0

    # prevent recursive loops
    if seen_colls is None:
        seen_colls = set()
    if coll_id in seen_colls:
        return 0
    seen_colls.add(coll_id)

    depth = max(map(lambda id: get_depth(id, seen_colls) + 1, coll['collection']))

    coll['depth'] = depth
    logging.info('depth %s = %s', coll_id, depth)
    return depth

def get_json(url):
    count = 0
    while True:
        count += 1
        if count >= 10:
            logging.error("giving up on fetching JSON from %s", url)
            raise Exception("unable to get JSON for %s" % url)
        try:
            resp = urlopen(url)
            reader = codecs.getreader('utf-8')
            return json.load(reader(resp))
        except Exception as e:
            logging.error('caught exception: %s', e)
            logging.info('sleeping for %s seconds', count * 10)
            time.sleep(count * 10)

def cdx(url, match=None, start_year=None, end_year=None):
    logging.info('searching cdx for %s with regex %s', url, match)

    if match:
        try:
            pattern = re.compile(match)
        except Exception as e:
            sys.exit('invalid regular expression: {}'.format(e))
    else:
        pattern = None

    cdx_url = 'http://web.archive.org/cdx/search/cdx?url={}&matchType=prefix&from={}&to={}'.format(quote(url), start_year, end_year)
    seen = set()
    results = codecs.decode(urlopen(cdx_url).read(), encoding='utf8')

    for line in results.split('\n'):
        parts = line.split(' ')
        if len(parts) == 7:
            year = int(parts[1][0:4])
            url = parts[2]
            seen_key = '{}:{}'.format(year, url)
            if seen_key in seen:
                continue
            if pattern and not pattern.search(url):
                continue
            seen.add(seen_key)
            logging.info('cdx found %s', url)
            yield (year, url)

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------