├── wayback_machine_scraper
│   ├── __init__.py
│   ├── mirror_spider.py
│   └── __main__.py
├── img
│   └── logo.png
├── .gitignore
├── LICENSE.txt
├── setup.py
└── README.md

--------------------------------------------------------------------------------
/wayback_machine_scraper/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/img/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangaline/wayback-machine-scraper/HEAD/img/logo.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.egg-info
.env
__pycache__
*.pyc
website
dist
build
upload.sh
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
ISC License

Copyright (c) 2017, Evan Sangaline

Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.

THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import find_packages, setup

description = ('A command-line utility for scraping '
               'Wayback Machine snapshots from archive.org.')
long_description = description + \
    (' For further details, '
     'please see the code repository on github: '
     'https://github.com/sangaline/wayback-machine-scraper')


setup(
    name='wayback-machine-scraper',
    version='1.0.7',
    author='Evan Sangaline',
    author_email='evan@intoli.com',
    description=description,
    license='ISC',
    keywords='archive.org scrapy scraper waybackmachine',
    url="https://github.com/sangaline/wayback-machine-scraper",
    packages=find_packages(),
    entry_points={
        'console_scripts': [
            'wayback-machine-scraper = wayback_machine_scraper.__main__:main',
        ],
    },
    long_description=long_description,
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Framework :: Scrapy',
        'Topic :: Utilities',
        'License :: OSI Approved :: ISC License (ISCL)',
    ],
    install_requires=[
        'cryptography',
        'scrapy',
        'scrapy-wayback-machine',
        'twisted',
    ]
)
--------------------------------------------------------------------------------
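The `console_scripts` entry point in `setup.py` is what creates the `wayback-machine-scraper` command when the package is installed: it points at the `main()` function defined in `wayback_machine_scraper/__main__.py`. As a minimal sketch (not part of the repository), running the command is roughly equivalent to calling that function yourself, since `main()` reads its arguments from `sys.argv`:

```python
# rough equivalent of the installed console script (illustrative only);
# main() parses sys.argv and starts the crawl itself
from wayback_machine_scraper.__main__ import main

if __name__ == '__main__':
    main()
```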
--------------------------------------------------------------------------------
/wayback_machine_scraper/mirror_spider.py:
--------------------------------------------------------------------------------
import os
from datetime import datetime

try:
    from urllib.parse import quote_plus
except ImportError:
    from urllib import quote_plus

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from scrapy_wayback_machine import WaybackMachineMiddleware


class MirrorSpider(CrawlSpider):
    name = 'mirror_spider'
    handle_httpstatus_list = [404]

    def __init__(self, domains, directory, allow=(), deny=(), unix=False):
        self.directory = directory
        self.unix = unix
        self.rules = (
            Rule(LinkExtractor(allow=allow, deny=deny), callback='save_page'),
        )

        # parse the allowed domains and start urls
        self.allowed_domains = []
        self.start_urls = []
        for domain in domains:
            url_parts = domain.split('://')
            unqualified_url = url_parts[-1]
            url_scheme = url_parts[0] if len(url_parts) > 1 else 'http'
            full_url = '{0}://{1}'.format(url_scheme, unqualified_url)
            bare_domain = unqualified_url.split('/')[0]
            self.allowed_domains.append(bare_domain)
            self.start_urls.append(full_url)

        super().__init__()

    def parse_start_url(self, response):
        # scrapy doesn't call the callbacks for the start urls by default,
        # this overrides that behavior so that any matching callbacks are called
        for rule in self._rules:
            if rule.link_extractor._link_allowed(response):
                if rule.callback:
                    rule.callback(response)

    def save_page(self, response):
        # ignore 404s
        if response.status == 404:
            return

        # make the parent directory
        url_parts = response.url.split('://')[1].split('/')
        if os.name == 'nt':
            url_parts = [quote_plus(url_part) for url_part in url_parts]
        parent_directory = os.path.join(self.directory, *url_parts)
        os.makedirs(parent_directory, exist_ok=True)

        # construct the output filename
        time = response.meta['wayback_machine_time']
        if self.unix:
            filename = '{0}.snapshot'.format(time.timestamp())
        else:
            filename = '{0}.snapshot'.format(time.strftime(WaybackMachineMiddleware.timestamp_format))
        full_path = os.path.join(parent_directory, filename)

        # write out the file
        with open(full_path, 'wb') as f:
            f.write(response.body)
--------------------------------------------------------------------------------
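To make the path handling in `save_page()` above easier to follow, here is a small standalone sketch that reproduces the same layout outside of the spider. It is not part of the package; `example_snapshot_path` is a hypothetical helper, and the `%Y%m%d%H%M%S` timestamp format is assumed to match `WaybackMachineMiddleware.timestamp_format`, consistent with the `YYYYmmddHHMMSS.snapshot` filenames described in the README.

```python
import os
from datetime import datetime


def example_snapshot_path(directory, url, time, unix=False):
    """Mirror the directory/filename layout used by MirrorSpider.save_page()."""
    # drop the scheme and use the remaining URL segments as directories
    url_parts = url.split('://')[1].split('/')
    parent_directory = os.path.join(directory, *url_parts)
    # each snapshot is named after its archive.org capture time
    if unix:
        filename = '{0}.snapshot'.format(time.timestamp())
    else:
        filename = '{0}.snapshot'.format(time.strftime('%Y%m%d%H%M%S'))
    return os.path.join(parent_directory, filename)


print(example_snapshot_path('website', 'https://news.ycombinator.com/item?id=13857086',
                            datetime(2017, 3, 13, 22, 58, 53)))
# website/news.ycombinator.com/item?id=13857086/20170313225853.snapshot
```

Note that the real spider additionally quotes the path segments with `quote_plus` on Windows and creates the parent directory before writing the response body.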
--------------------------------------------------------------------------------
/wayback_machine_scraper/__main__.py:
--------------------------------------------------------------------------------
import argparse
from pkg_resources import get_distribution

from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from .mirror_spider import MirrorSpider


def main():
    # configure the settings for the crawler and spider
    args = parse_args()
    config = {
        'domains': args.domains,
        'directory': args.output,
        'allow': args.allow,
        'deny': args.deny,
        'unix': args.unix,
    }
    settings = Settings({
        'USER_AGENT': (
            'Wayback Machine Scraper/{0} '
            '(+https://github.com/sangaline/scrapy-wayback-machine)'
        ).format(get_distribution('wayback-machine-scraper').version),
        'LOG_LEVEL': 'DEBUG' if args.verbose else 'INFO',
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_wayback_machine.WaybackMachineMiddleware': 5,
        },
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_DEBUG': args.verbose,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': args.concurrency,
        'WAYBACK_MACHINE_TIME_RANGE': (getattr(args, 'from'), args.to),
    })

    # start the crawler
    process = CrawlerProcess(settings)
    process.crawl(MirrorSpider, **config)
    process.start()


def parse_args():
    formatter = argparse.ArgumentDefaultsHelpFormatter
    parser = argparse.ArgumentParser(formatter_class=formatter, description=(
        'Mirror all Wayback Machine snapshots of one or more domains '
        'within a specified time range.'
    ))
    parser.add_argument('domains', metavar='DOMAIN', nargs='+', help=(
        'Specify the domain(s) to scrape. '
        'Can also be a full URL to specify starting points for the crawler.'
    ))
    parser.add_argument('-o', '--output', metavar='DIRECTORY', default='website', help=(
        'Specify the directory to save the mirrored snapshots in.'
    ))
    parser.add_argument('-f', '--from', metavar='TIMESTAMP', default='10000101', help=(
        'The timestamp for the beginning of the range to scrape. '
        'Can either be YYYYmmdd, YYYYmmddHHMMSS, or a Unix timestamp.'
    ))
    parser.add_argument('-t', '--to', metavar='TIMESTAMP', default='30000101', help=(
        'The timestamp for the end of the range to scrape. '
        'Use the same timestamp as `--from` to specify a single point in time.'
    ))
    parser.add_argument('-a', '--allow', metavar='REGEX', default=(), help=(
        'A regular expression that all scraped URLs must match.'
    ))
    parser.add_argument('-d', '--deny', metavar='REGEX', default=(), help=(
        'A regular expression to exclude matched URLs.'
    ))
    parser.add_argument('-c', '--concurrency', default=10.0, help=(
        'Target concurrency for crawl requests. '
        'The crawl rate will be automatically adjusted to match this target. '
        'Use values less than 1 to be polite and higher values to scrape more quickly.'
    ))
    parser.add_argument('-u', '--unix', action='store_true', help=(
        'Save snapshots as `UNIX_TIMESTAMP.snapshot` instead of '
        'the default `YYYYmmddHHMMSS.snapshot`.'
    ))
    parser.add_argument('-v', '--verbose', action='store_true', help=(
        'Turn on debug logging.'
    ))

    return parser.parse_args()
--------------------------------------------------------------------------------
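If you would rather drive the crawl from Python than from the command line, the settings that `main()` builds above can be reused directly with Scrapy's `CrawlerProcess`. The following is only a sketch: the domain, output directory, and time range are illustrative placeholders, and it assumes the package and its dependencies are installed.

```python
# programmatic sketch of what the CLI does in main() above (illustrative values)
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from wayback_machine_scraper.mirror_spider import MirrorSpider

settings = Settings({
    'DOWNLOADER_MIDDLEWARES': {
        'scrapy_wayback_machine.WaybackMachineMiddleware': 5,
    },
    'AUTOTHROTTLE_ENABLED': True,
    'AUTOTHROTTLE_TARGET_CONCURRENCY': 10.0,
    'WAYBACK_MACHINE_TIME_RANGE': ('20080101', '20090101'),
})

process = CrawlerProcess(settings)
process.crawl(
    MirrorSpider,
    domains=['news.ycombinator.com'],
    directory='website',
    allow=(),
    deny=(),
    unix=False,
)
process.start()
```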
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
![The Wayback Machine Scraper Logo](img/logo.png)

# The Wayback Machine Scraper

This repository provides a command-line utility, `wayback-machine-scraper`, that can be used to scrape or download website data as it appears in [archive.org](http://archive.org)'s [Wayback Machine](https://archive.org/web/).
It crawls through historical snapshots of a website and saves the snapshots to disk.
This can be useful when you're trying to scrape a site whose anti-scraping measures make direct scraping impossible or prohibitively slow.
It's also useful if you want to scrape a website as it appeared at some point in the past or to scrape information that changes over time.

The command-line utility is highly configurable in terms of what it scrapes, but it only saves the unparsed content of the pages on the site.
If you're interested in parsing data from the pages that are crawled, then you might want to check out [scrapy-wayback-machine](https://github.com/sangaline/scrapy-wayback-machine) instead.
It's a downloader middleware that handles all of the tricky parts and passes normal `response` objects to your [Scrapy](https://scrapy.org) spiders with archive timestamp information attached.
The middleware is very unobtrusive and should work seamlessly with existing [Scrapy](https://scrapy.org) middlewares, extensions, and spiders.
It's what `wayback-machine-scraper` uses behind the scenes, and it offers more flexibility for advanced use cases.
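For example, a bare-bones spider built on the middleware might look something like the sketch below. The spider itself is illustrative and not part of this project, but it shows the general shape: enable `WaybackMachineMiddleware`, set `WAYBACK_MACHINE_TIME_RANGE`, and read the snapshot time from `response.meta['wayback_machine_time']` in your callbacks.

```python
# illustrative sketch of a custom spider using scrapy-wayback-machine
import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['https://news.ycombinator.com']
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_wayback_machine.WaybackMachineMiddleware': 5,
        },
        'WAYBACK_MACHINE_TIME_RANGE': ('20080101', '20090101'),
    }

    def parse(self, response):
        # the middleware attaches the snapshot's capture time to each response
        snapshot_time = response.meta['wayback_machine_time']
        yield {
            'time': snapshot_time.isoformat(),
            'title': response.css('title::text').get(),
        }
```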
## Installation

The package can be installed using `pip`.

```bash
pip install wayback-machine-scraper
```

## Command-Line Interface

Writing a custom [Scrapy](https://scrapy.org) spider and using the `WaybackMachineMiddleware` downloader middleware is the preferred way to use this project, but a command-line interface for basic mirroring is also included.
The usage information can be printed by running `wayback-machine-scraper -h`.

```
usage: wayback-machine-scraper [-h] [-o DIRECTORY] [-f TIMESTAMP]
                               [-t TIMESTAMP] [-a REGEX] [-d REGEX]
                               [-c CONCURRENCY] [-u] [-v]
                               DOMAIN [DOMAIN ...]

Mirror all Wayback Machine snapshots of one or more domains within a specified
time range.

positional arguments:
  DOMAIN                Specify the domain(s) to scrape. Can also be a full
                        URL to specify starting points for the crawler.

optional arguments:
  -h, --help            show this help message and exit
  -o DIRECTORY, --output DIRECTORY
                        Specify the directory to save the mirrored snapshots
                        in. (default: website)
  -f TIMESTAMP, --from TIMESTAMP
                        The timestamp for the beginning of the range to
                        scrape. Can either be YYYYmmdd, YYYYmmddHHMMSS, or a
                        Unix timestamp. (default: 10000101)
  -t TIMESTAMP, --to TIMESTAMP
                        The timestamp for the end of the range to scrape. Use
                        the same timestamp as `--from` to specify a single
                        point in time. (default: 30000101)
  -a REGEX, --allow REGEX
                        A regular expression that all scraped URLs must match.
                        (default: ())
  -d REGEX, --deny REGEX
                        A regular expression to exclude matched URLs.
                        (default: ())
  -c CONCURRENCY, --concurrency CONCURRENCY
                        Target concurrency for crawl requests. The crawl rate
                        will be automatically adjusted to match this target.
                        Use values less than 1 to be polite and higher values
                        to scrape more quickly. (default: 10.0)
  -u, --unix            Save snapshots as `UNIX_TIMESTAMP.snapshot` instead of
                        the default `YYYYmmddHHMMSS.snapshot`. (default:
                        False)
  -v, --verbose         Turn on debug logging. (default: False)
```

## Examples

The usage can perhaps be made clearer with a couple of concrete examples.

### A Single Page Over Time

One of the key advantages of `wayback-machine-scraper` over other projects, such as [wayback-machine-downloader](https://github.com/hartator/wayback-machine-downloader), is that it offers the capability to download all available [archive.org](https://archive.org) snapshots.
This can be extremely useful if you're interested in analyzing how pages change over time.

For example, say that you would like to analyze many snapshots of the [Hacker News](https://news.ycombinator.com) front page, as I did when writing [Reverse Engineering the Hacker News Algorithm](http://sangaline.com/post/reverse-engineering-the-hacker-news-ranking-algorithm/).
This can be done by running

```bash
wayback-machine-scraper -a 'news.ycombinator.com$' news.ycombinator.com
```

where the `--allow` regular expression `news.ycombinator.com$` limits the crawl to the front page.
This produces a file structure of

```
website/
└── news.ycombinator.com
    ├── 20070221033032.snapshot
    ├── 20070226001637.snapshot
    ├── 20070405032412.snapshot
    ├── 20070405175109.snapshot
    ├── 20070406195336.snapshot
    ├── 20070601184317.snapshot
    ├── 20070629033202.snapshot
    ├── 20070630222527.snapshot
    ├── 20070630222818.snapshot
    └── etc.
```

with each snapshot file containing the full HTML body of the front page.
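Because each snapshot is just the raw response body, the saved files can be post-processed with whatever tools you prefer.
As a rough illustration (not part of the project), a few lines of Python are enough to walk the front-page snapshots in chronological order:

```python
# illustrative sketch: iterate over saved snapshots in chronological order
from pathlib import Path

for snapshot in sorted(Path('website/news.ycombinator.com').glob('*.snapshot')):
    timestamp = snapshot.stem  # e.g. '20070221033032'
    html = snapshot.read_text(encoding='utf-8', errors='replace')
    print(timestamp, len(html))
```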
A series of snapshots for any page can be obtained in this way as long as suitable regular expressions and start URLs are constructed.
If we are interested in a page other than the homepage, then we should use it as the start URL instead.
To get all of the snapshots for a specific story, we could run

```bash
wayback-machine-scraper -a 'id=13857086$' 'news.ycombinator.com/item?id=13857086'
```

which produces

```
website/
└── news.ycombinator.com
    └── item?id=13857086
        ├── 20170313225853.snapshot
        ├── 20170313231755.snapshot
        ├── 20170314043150.snapshot
        ├── 20170314165633.snapshot
        └── 20170320205604.snapshot
```

### A Full Site Crawl at One Point In Time

If the goal is to take a snapshot of an entire site at once, then this can also be easily achieved.
Specifying the same point in time for both the `--from` and `--to` options ensures that only one snapshot is saved for each URL.
Running

```bash
wayback-machine-scraper -f 20080623 -t 20080623 news.ycombinator.com
```

produces a file structure of

```
website
└── news.ycombinator.com
    ├── 20080621143814.snapshot
    ├── item?id=221868
    │   └── 20080622151531.snapshot
    ├── item?id=222157
    │   └── 20080622151822.snapshot
    ├── item?id=222341
    │   └── 20080620221102.snapshot
    └── etc.
```

with a single snapshot for each page in the crawl, as it appeared closest to June 23, 2008.
--------------------------------------------------------------------------------