├── wayback_machine_scraper
│   ├── __init__.py
│   ├── mirror_spider.py
│   └── __main__.py
├── img
│   └── logo.png
├── .gitignore
├── LICENSE.txt
├── setup.py
└── README.md

--------------------------------------------------------------------------------
/wayback_machine_scraper/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/img/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangaline/wayback-machine-scraper/HEAD/img/logo.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.egg-info
.env
__pycache__
*.pyc
website
dist
build
upload.sh
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
ISC License

Copyright (c) 2017, Evan Sangaline

Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.

THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import find_packages, setup

description = ('A command-line utility for scraping '
               'Wayback Machine snapshots from archive.org.')
long_description = description + \
    (' For further details, '
     'please see the code repository on github: '
     'https://github.com/sangaline/wayback-machine-scraper')


setup(
    name='wayback-machine-scraper',
    version='1.0.7',
    author='Evan Sangaline',
    author_email='evan@intoli.com',
    description=description,
    license='ISC',
    keywords='archive.org scrapy scraper waybackmachine',
    url="https://github.com/sangaline/wayback-machine-scraper",
    packages=find_packages(),
    entry_points={
        'console_scripts': [
            'wayback-machine-scraper = wayback_machine_scraper.__main__:main',
        ],
    },
    long_description=long_description,
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Framework :: Scrapy',
        'Topic :: Utilities',
        'License :: OSI Approved :: ISC License (ISCL)',
    ],
    install_requires=[
        'cryptography',
        'scrapy',
        'scrapy-wayback-machine',
        'twisted',
    ]
)
--------------------------------------------------------------------------------
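The `console_scripts` entry point in `setup.py` is what creates the `wayback-machine-scraper` command when the package is installed: it points at the `main()` function defined in `wayback_machine_scraper/__main__.py`. As a minimal sketch (not part of the repository), running the command is roughly equivalent to calling that function yourself, since `main()` reads its arguments from `sys.argv`:

```python
# rough equivalent of the installed console script (illustrative only);
# main() parses sys.argv and starts the crawl itself
from wayback_machine_scraper.__main__ import main

if __name__ == '__main__':
    main()
```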
--------------------------------------------------------------------------------
/wayback_machine_scraper/mirror_spider.py:
--------------------------------------------------------------------------------
import os
from datetime import datetime

try:
    from urllib.parse import quote_plus
except ImportError:
    from urllib import quote_plus

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from scrapy_wayback_machine import WaybackMachineMiddleware


class MirrorSpider(CrawlSpider):
    name = 'mirror_spider'
    handle_httpstatus_list = [404]

    def __init__(self, domains, directory, allow=(), deny=(), unix=False):
        self.directory = directory
        self.unix = unix
        self.rules = (
            Rule(LinkExtractor(allow=allow, deny=deny), callback='save_page'),
        )

        # parse the allowed domains and start urls
        self.allowed_domains = []
        self.start_urls = []
        for domain in domains:
            url_parts = domain.split('://')
            unqualified_url = url_parts[-1]
            url_scheme = url_parts[0] if len(url_parts) > 1 else 'http'
            full_url = '{0}://{1}'.format(url_scheme, unqualified_url)
            bare_domain = unqualified_url.split('/')[0]
            self.allowed_domains.append(bare_domain)
            self.start_urls.append(full_url)

        super().__init__()

    def parse_start_url(self, response):
        # scrapy doesn't call the callbacks for the start urls by default,
        # this overrides that behavior so that any matching callbacks are called
        for rule in self._rules:
            if rule.link_extractor._link_allowed(response):
                if rule.callback:
                    rule.callback(response)

    def save_page(self, response):
        # ignore 404s
        if response.status == 404:
            return

        # make the parent directory
        url_parts = response.url.split('://')[1].split('/')
        if os.name == 'nt':
            url_parts = [quote_plus(url_part) for url_part in url_parts]
        parent_directory = os.path.join(self.directory, *url_parts)
        os.makedirs(parent_directory, exist_ok=True)

        # construct the output filename
        time = response.meta['wayback_machine_time']
        if self.unix:
            filename = '{0}.snapshot'.format(time.timestamp())
        else:
            filename = '{0}.snapshot'.format(time.strftime(WaybackMachineMiddleware.timestamp_format))
        full_path = os.path.join(parent_directory, filename)

        # write out the file
        with open(full_path, 'wb') as f:
            f.write(response.body)
--------------------------------------------------------------------------------
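To make the path handling in `save_page()` above easier to follow, here is a small standalone sketch that reproduces the same layout outside of the spider. It is not part of the package; `example_snapshot_path` is a hypothetical helper, and the `%Y%m%d%H%M%S` timestamp format is assumed to match `WaybackMachineMiddleware.timestamp_format`, consistent with the `YYYYmmddHHMMSS.snapshot` filenames described in the README.

```python
import os
from datetime import datetime


def example_snapshot_path(directory, url, time, unix=False):
    """Mirror the directory/filename layout used by MirrorSpider.save_page()."""
    # drop the scheme and use the remaining URL segments as directories
    url_parts = url.split('://')[1].split('/')
    parent_directory = os.path.join(directory, *url_parts)
    # each snapshot is named after its archive.org capture time
    if unix:
        filename = '{0}.snapshot'.format(time.timestamp())
    else:
        filename = '{0}.snapshot'.format(time.strftime('%Y%m%d%H%M%S'))
    return os.path.join(parent_directory, filename)


print(example_snapshot_path('website', 'https://news.ycombinator.com/item?id=13857086',
                            datetime(2017, 3, 13, 22, 58, 53)))
# website/news.ycombinator.com/item?id=13857086/20170313225853.snapshot
```

Note that the real spider additionally quotes the path segments with `quote_plus` on Windows and creates the parent directory before writing the response body.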
--------------------------------------------------------------------------------
/wayback_machine_scraper/__main__.py:
--------------------------------------------------------------------------------
import argparse
from pkg_resources import get_distribution

from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from .mirror_spider import MirrorSpider


def main():
    # configure the settings for the crawler and spider
    args = parse_args()
    config = {
        'domains': args.domains,
        'directory': args.output,
        'allow': args.allow,
        'deny': args.deny,
        'unix': args.unix,
    }
    settings = Settings({
        'USER_AGENT': (
            'Wayback Machine Scraper/{0} '
            '(+https://github.com/sangaline/scrapy-wayback-machine)'
        ).format(get_distribution('wayback-machine-scraper').version),
        'LOG_LEVEL': 'DEBUG' if args.verbose else 'INFO',
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_wayback_machine.WaybackMachineMiddleware': 5,
        },
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_DEBUG': args.verbose,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': args.concurrency,
        'WAYBACK_MACHINE_TIME_RANGE': (getattr(args, 'from'), args.to),
    })

    # start the crawler
    process = CrawlerProcess(settings)
    process.crawl(MirrorSpider, **config)
    process.start()


def parse_args():
    formatter = argparse.ArgumentDefaultsHelpFormatter
    parser = argparse.ArgumentParser(formatter_class=formatter, description=(
        'Mirror all Wayback Machine snapshots of one or more domains '
        'within a specified time range.'
    ))
    parser.add_argument('domains', metavar='DOMAIN', nargs='+', help=(
        'Specify the domain(s) to scrape. '
        'Can also be a full URL to specify starting points for the crawler.'
    ))
    parser.add_argument('-o', '--output', metavar='DIRECTORY', default='website', help=(
        'Specify the directory to save the mirrored snapshots in.'
    ))
    parser.add_argument('-f', '--from', metavar='TIMESTAMP', default='10000101', help=(
        'The timestamp for the beginning of the range to scrape. '
        'Can either be YYYYmmdd, YYYYmmddHHMMSS, or a Unix timestamp.'
    ))
    parser.add_argument('-t', '--to', metavar='TIMESTAMP', default='30000101', help=(
        'The timestamp for the end of the range to scrape. '
        'Use the same timestamp as `--from` to specify a single point in time.'
    ))
    parser.add_argument('-a', '--allow', metavar='REGEX', default=(), help=(
        'A regular expression that all scraped URLs must match.'
    ))
    parser.add_argument('-d', '--deny', metavar='REGEX', default=(), help=(
        'A regular expression to exclude matched URLs.'
    ))
    parser.add_argument('-c', '--concurrency', default=10.0, help=(
        'Target concurrency for crawl requests. '
        'The crawl rate will be automatically adjusted to match this target. '
        'Use values less than 1 to be polite and higher values to scrape more quickly.'
    ))
    parser.add_argument('-u', '--unix', action='store_true', help=(
        'Save snapshots as `UNIX_TIMESTAMP.snapshot` instead of '
        'the default `YYYYmmddHHMMSS.snapshot`.'
    ))
    parser.add_argument('-v', '--verbose', action='store_true', help=(
        'Turn on debug logging.'
    ))

    return parser.parse_args()
--------------------------------------------------------------------------------
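If you would rather drive the crawl from Python than from the command line, the settings that `main()` builds above can be reused directly with Scrapy's `CrawlerProcess`. The following is only a sketch: the domain, output directory, and time range are illustrative placeholders, and it assumes the package and its dependencies are installed.

```python
# programmatic sketch of what the CLI does in main() above (illustrative values)
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from wayback_machine_scraper.mirror_spider import MirrorSpider

settings = Settings({
    'DOWNLOADER_MIDDLEWARES': {
        'scrapy_wayback_machine.WaybackMachineMiddleware': 5,
    },
    'AUTOTHROTTLE_ENABLED': True,
    'AUTOTHROTTLE_TARGET_CONCURRENCY': 10.0,
    'WAYBACK_MACHINE_TIME_RANGE': ('20080101', '20090101'),
})

process = CrawlerProcess(settings)
process.crawl(
    MirrorSpider,
    domains=['news.ycombinator.com'],
    directory='website',
    allow=(),
    deny=(),
    unix=False,
)
process.start()
```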
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
![The Wayback Machine Scraper Logo](img/logo.png)

# The Wayback Machine Scraper

This repository provides a command-line utility, `wayback-machine-scraper`, that can be used to scrape or download website data as it appears in [archive.org](http://archive.org)'s [Wayback Machine](https://archive.org/web/).
It crawls through historical snapshots of a website and saves the snapshots to disk.
This can be useful when you're trying to scrape a site whose anti-scraping measures make direct scraping impossible or prohibitively slow.
It's also useful if you want to scrape a website as it appeared at some point in the past or to scrape information that changes over time.

The command-line utility is highly configurable in terms of what it scrapes, but it only saves the unparsed content of the pages on the site.
If you're interested in parsing data from the pages that are crawled, then you might want to check out [scrapy-wayback-machine](https://github.com/sangaline/scrapy-wayback-machine) instead.
It's a downloader middleware that handles all of the tricky parts and passes normal `response` objects to your [Scrapy](https://scrapy.org) spiders with archive timestamp information attached.
The middleware is very unobtrusive and should work seamlessly with existing [Scrapy](https://scrapy.org) middlewares, extensions, and spiders.
It's what `wayback-machine-scraper` uses behind the scenes, and it offers more flexibility for advanced use cases.
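For example, a bare-bones spider built on the middleware might look something like the sketch below. The spider itself is illustrative and not part of this project, but it shows the general shape: enable `WaybackMachineMiddleware`, set `WAYBACK_MACHINE_TIME_RANGE`, and read the snapshot time from `response.meta['wayback_machine_time']` in your callbacks.

```python
# illustrative sketch of a custom spider using scrapy-wayback-machine
import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['https://news.ycombinator.com']
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_wayback_machine.WaybackMachineMiddleware': 5,
        },
        'WAYBACK_MACHINE_TIME_RANGE': ('20080101', '20090101'),
    }

    def parse(self, response):
        # the middleware attaches the snapshot's capture time to each response
        snapshot_time = response.meta['wayback_machine_time']
        yield {
            'time': snapshot_time.isoformat(),
            'title': response.css('title::text').get(),
        }
```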
## Installation

The package can be installed using `pip`.

```bash
pip install wayback-machine-scraper
```

## Command-Line Interface

Writing a custom [Scrapy](https://scrapy.org) spider and using the `WaybackMachineMiddleware` downloader middleware is the preferred way to use this project, but a command-line interface for basic mirroring is also included.
The usage information can be printed by running `wayback-machine-scraper -h`.

```
usage: wayback-machine-scraper [-h] [-o DIRECTORY] [-f TIMESTAMP]
                               [-t TIMESTAMP] [-a REGEX] [-d REGEX]
                               [-c CONCURRENCY] [-u] [-v]
                               DOMAIN [DOMAIN ...]

Mirror all Wayback Machine snapshots of one or more domains within a specified
time range.

positional arguments:
  DOMAIN                Specify the domain(s) to scrape. Can also be a full
                        URL to specify starting points for the crawler.

optional arguments:
  -h, --help            show this help message and exit
  -o DIRECTORY, --output DIRECTORY
                        Specify the directory to save the mirrored snapshots
                        in. (default: website)
  -f TIMESTAMP, --from TIMESTAMP
                        The timestamp for the beginning of the range to
                        scrape. Can either be YYYYmmdd, YYYYmmddHHMMSS, or a
                        Unix timestamp. (default: 10000101)
  -t TIMESTAMP, --to TIMESTAMP
                        The timestamp for the end of the range to scrape. Use
                        the same timestamp as `--from` to specify a single
                        point in time. (default: 30000101)
  -a REGEX, --allow REGEX
                        A regular expression that all scraped URLs must match.
                        (default: ())
  -d REGEX, --deny REGEX
                        A regular expression to exclude matched URLs.
                        (default: ())
  -c CONCURRENCY, --concurrency CONCURRENCY
                        Target concurrency for crawl requests. The crawl rate
                        will be automatically adjusted to match this target.
                        Use values less than 1 to be polite and higher values
                        to scrape more quickly. (default: 10.0)
  -u, --unix            Save snapshots as `UNIX_TIMESTAMP.snapshot` instead of
                        the default `YYYYmmddHHMMSS.snapshot`. (default:
                        False)
  -v, --verbose         Turn on debug logging. (default: False)
```

## Examples

The usage can perhaps be made clearer with a couple of concrete examples.

### A Single Page Over Time

One of the key advantages of `wayback-machine-scraper` over other projects, such as [wayback-machine-downloader](https://github.com/hartator/wayback-machine-downloader), is that it offers the capability to download all available [archive.org](https://archive.org) snapshots.
This can be extremely useful if you're interested in analyzing how pages change over time.

For example, say that you would like to analyze many snapshots of the [Hacker News](https://news.ycombinator.com) front page, as I did when writing [Reverse Engineering the Hacker News Algorithm](http://sangaline.com/post/reverse-engineering-the-hacker-news-ranking-algorithm/).
This can be done by running

```bash
wayback-machine-scraper -a 'news.ycombinator.com$' news.ycombinator.com
```

where the `--allow` regular expression `news.ycombinator.com$` limits the crawl to the front page.
This produces a file structure of

```
website/
└── news.ycombinator.com
    ├── 20070221033032.snapshot
    ├── 20070226001637.snapshot
    ├── 20070405032412.snapshot
    ├── 20070405175109.snapshot
    ├── 20070406195336.snapshot
    ├── 20070601184317.snapshot
    ├── 20070629033202.snapshot
    ├── 20070630222527.snapshot
    ├── 20070630222818.snapshot
    └── etc.
```

with each snapshot file containing the full HTML body of the front page.
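Because each snapshot is just the raw response body, the saved files can be post-processed with whatever tools you prefer.
As a rough illustration (not part of the project), a few lines of Python are enough to walk the front-page snapshots in chronological order:

```python
# illustrative sketch: iterate over saved snapshots in chronological order
from pathlib import Path

for snapshot in sorted(Path('website/news.ycombinator.com').glob('*.snapshot')):
    timestamp = snapshot.stem  # e.g. '20070221033032'
    html = snapshot.read_text(encoding='utf-8', errors='replace')
    print(timestamp, len(html))
```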
A series of snapshots for any page can be obtained in this way as long as suitable regular expressions and start URLs are constructed.
If we are interested in a page other than the homepage, then we should use it as the start URL instead.
To get all of the snapshots for a specific story, we could run

```bash
wayback-machine-scraper -a 'id=13857086$' 'news.ycombinator.com/item?id=13857086'
```

which produces

```
website/
└── news.ycombinator.com
    └── item?id=13857086
        ├── 20170313225853.snapshot
        ├── 20170313231755.snapshot
        ├── 20170314043150.snapshot
        ├── 20170314165633.snapshot
        └── 20170320205604.snapshot
```

### A Full Site Crawl at One Point In Time

If the goal is to take a snapshot of an entire site at once, then this can also be easily achieved.
Specifying the same point in time for both the `--from` and `--to` options ensures that only one snapshot is saved for each URL.
Running

```bash
wayback-machine-scraper -f 20080623 -t 20080623 news.ycombinator.com
```

produces a file structure of

```
website
└── news.ycombinator.com
    ├── 20080621143814.snapshot
    ├── item?id=221868
    │   └── 20080622151531.snapshot
    ├── item?id=222157
    │   └── 20080622151822.snapshot
    ├── item?id=222341
    │   └── 20080620221102.snapshot
    └── etc.
```

with a single snapshot for each page in the crawl, as it appeared closest to June 23, 2008.
--------------------------------------------------------------------------------