├── .dockerignore ├── .env.example ├── .github └── workflows │ ├── docker.yml │ └── ruby.yml ├── .gitignore ├── .travis.yml ├── Dockerfile ├── Gemfile ├── LICENSE ├── README.md ├── Rakefile ├── bin └── wayback_machine_downloader ├── docker-compose.yml ├── entrypoint.sh ├── lib ├── wayback_machine_downloader.rb └── wayback_machine_downloader │ ├── archive_api.rb │ ├── tidy_bytes.rb │ └── to_regex.rb ├── test └── test_wayback_machine_downloader.rb └── wayback_machine_downloader.gemspec /.dockerignore: -------------------------------------------------------------------------------- 1 | *.md 2 | *.yml 3 | 4 | .github 5 | websites -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | DB_HOST="db" 2 | DB_USER="root" 3 | DB_PASSWORD="example1234" 4 | DB_NAME="wayback" -------------------------------------------------------------------------------- /.github/workflows/docker.yml: -------------------------------------------------------------------------------- 1 | name: Create and publish a Docker image 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | tags: 8 | - '*' 9 | 10 | env: 11 | REGISTRY: ghcr.io 12 | IMAGE_NAME: ${{ github.repository }} 13 | 14 | jobs: 15 | build-and-push-image: 16 | runs-on: ubuntu-latest 17 | permissions: 18 | contents: read 19 | packages: write 20 | steps: 21 | - name: Checkout repository 22 | uses: actions/checkout@main 23 | - name: Set up QEMU 24 | uses: docker/setup-qemu-action@master 25 | - name: Set up Docker Buildx 26 | uses: docker/setup-buildx-action@master 27 | - name: Log in to the Container registry 28 | uses: docker/login-action@master 29 | with: 30 | registry: ${{ env.REGISTRY }} 31 | username: ${{ github.actor }} 32 | password: ${{ secrets.GITHUB_TOKEN }} 33 | - name: Extract metadata (tags, labels) for Docker 34 | id: meta 35 | uses: docker/metadata-action@master 36 | with: 37 | images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} 38 | - name: Build and push Docker image 39 | uses: docker/build-push-action@master 40 | with: 41 | context: . 42 | file: ./Dockerfile 43 | platforms: linux/amd64,linux/arm64/v8 44 | push: true 45 | tags: ${{ steps.meta.outputs.tags }} 46 | labels: ${{ steps.meta.outputs.labels }} -------------------------------------------------------------------------------- /.github/workflows/ruby.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 
5 | # This workflow will download a prebuilt Ruby version, install dependencies and run tests with Rake 6 | # For more information see: https://github.com/marketplace/actions/setup-ruby-jruby-and-truffleruby 7 | 8 | name: Ruby 9 | 10 | on: 11 | push: 12 | branches: [ "master" ] 13 | pull_request: 14 | branches: [ "master" ] 15 | 16 | permissions: 17 | contents: read 18 | 19 | jobs: 20 | test: 21 | runs-on: ubuntu-24.04 22 | strategy: 23 | matrix: 24 | ruby-version: ['2.7', '3.0', '3.3'] 25 | 26 | steps: 27 | - uses: actions/checkout@v4.2.2 28 | - name: Setup Ruby, JRuby and TruffleRuby 29 | uses: ruby/setup-ruby@v1.202.0 30 | with: 31 | ruby-version: ${{ matrix.ruby-version }} 32 | bundler-cache: true # runs 'bundle install' and caches installed gems automatically 33 | - name: Run tests 34 | run: bundle exec rake -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## PROJECT::GENERAL 2 | .yardoc 3 | coverage 4 | doc 5 | rdoc 6 | log 7 | websites 8 | .DS_Store 9 | .rake_tasks~ 10 | 11 | ## BUNDLER 12 | *.gem 13 | .bundle 14 | pkg 15 | Gemfile.lock 16 | 17 | ## RBENV 18 | .ruby-version 19 | .rbenv* 20 | 21 | ## ENV 22 | *.env* 23 | !.env*.example 24 | 25 | 26 | ## RCOV 27 | coverage.data 28 | 29 | tmp 30 | 31 | ## RUBINIUS 32 | *.rbc 33 | 34 | test.rb 35 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: ruby 3 | rvm: 4 | - 1.9.2 5 | - 1.9.3 6 | - 2.0.0 7 | - 2.1 8 | - 2.2 9 | - 2.3.1 10 | - jruby 11 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ruby:3.4.4-alpine 2 | USER root 3 | WORKDIR /build 4 | 5 | COPY Gemfile /build/ 6 | COPY *.gemspec /build/ 7 | 8 | RUN bundle config set jobs "$(nproc)" \ 9 | && bundle config set without 'development test' \ 10 | && bundle install 11 | 12 | COPY . /build 13 | 14 | WORKDIR / 15 | ENTRYPOINT [ "/build/bin/wayback_machine_downloader" ] 16 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | gem 'concurrent-ruby', '~> 1.3', '>= 1.3.4' 4 | gemspec -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015-2016 Julian Khaleghy and contributors 4 | See the full list at https://github.com/hartator/wayback-machine-downloader/graphs/contributors 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Wayback Machine Downloader
2 | [![version](https://badge.fury.io/rb/wayback_machine_downloader_straw.svg)](https://rubygems.org/gems/wayback_machine_downloader_straw)
3 | 
4 | This is a fork of the [Wayback Machine Downloader](https://github.com/hartator/wayback-machine-downloader). With this, you can download a website from the Internet Archive Wayback Machine.
5 | 
6 | Included here is partial content from other forks, namely those by [ShiftaDeband](https://github.com/ShiftaDeband/wayback-machine-downloader) and [matthid](https://github.com/matthid/wayback-machine-downloader) (attributions are in the code and go to the original authors), as well as a few additional (future) features.
7 | 
8 | ## ▶️ Quick start
9 | 
10 | Download a website's latest snapshot:
11 | ```bash
12 | ruby wayback_machine_downloader https://example.com
13 | ```
14 | Your files will save to `./websites/example.com/` with their original structure preserved.
15 | 
16 | ## 📥 Installation
17 | ### Requirements
18 | - Ruby 2.3+ ([download Ruby here](https://www.ruby-lang.org/en/downloads/))
19 | - Bundler gem (`gem install bundler`)
20 | 
21 | ### Quick install
22 | It took a while, but we have a gem for this! Install it with:
23 | ```bash
24 | gem install wayback_machine_downloader_straw
25 | ```
26 | To run most commands, just like in the original WMD, you can use:
27 | ```bash
28 | wayback_machine_downloader https://example.com
29 | ```
30 | 
31 | ### Step-by-step setup
32 | 1. **Install Ruby**:
33 | ```bash
34 | ruby -v
35 | ```
36 | This will verify your installation. If not installed, [download Ruby](https://www.ruby-lang.org/en/downloads/) for your OS.
37 | 
38 | 2. **Install dependencies**:
39 | ```bash
40 | bundle install
41 | ```
42 | 
43 | If you encounter an error like `cannot load such file -- concurrent-ruby`, manually install the missing gem:
44 | ```bash
45 | gem install concurrent-ruby
46 | ```
47 | 
48 | 3. **Run it**:
49 | ```bash
50 | cd path/to/wayback-machine-downloader/bin
51 | ruby wayback_machine_downloader https://example.com
52 | ```
53 | For example, if you extracted the contents to a folder named "wayback-machine-downloader" in your Downloads directory, you'd need to type `cd Downloads\wayback-machine-downloader\bin`.
54 | 
55 | *Windows tip*: In File Explorer, Shift + Right Click your `bin` folder → "Open Terminal here".
56 | 
57 | ## 🐳 Docker users
58 | We have a Docker image! See [#Packages](https://github.com/StrawberryMaster/wayback-machine-downloader/pkgs/container/wayback-machine-downloader) for the latest version. You can also build it yourself. Here's how:
59 | 
60 | ```bash
61 | docker build -t wayback_machine_downloader .
62 | docker run -it --rm wayback_machine_downloader [options] URL
63 | ```
64 | 
65 | Or, to run it without cloning the repo (this example fetches smallrockets.com up to the start of 2013):
66 | 
67 | ```bash
68 | docker run -v .:/websites ghcr.io/strawberrymaster/wayback-machine-downloader:master wayback_machine_downloader --to 20130101 smallrockets.com
69 | ```
70 | 
71 | ### 🐳 Using Docker Compose
72 | 
73 | We can also use it with Docker Compose, which makes it easier to extend functionality later (such as storing previous downloads in a database):
74 | ```yaml
75 | # docker-compose.yml
76 | services:
77 |   wayback_machine_downloader:
78 |     build:
79 |       context: .
80 |     tty: true
81 |     image: wayback_machine_downloader:latest
82 |     container_name: wayback_machine_downloader
83 |     environment:
84 |       - ENVIRONMENT=${ENVIRONMENT:-development}
85 |       - OPTIONS=${OPTIONS:-""}
86 |       - TARGET_URL=${TARGET_URL}
87 |     volumes:
88 |       - .:/build:rw
89 |       - ./websites:/build/websites:rw
90 |     command: --directory /build/websites ${OPTIONS} ${TARGET_URL}
91 | ```
92 | #### Usage:
93 | Now you can build a Docker image named "wayback_machine_downloader" with the following command:
94 | ```bash
95 | docker compose up -d --build
96 | ```
97 | 
98 | After that, set the TARGET_URL environment variable:
99 | ```bash
100 | export TARGET_URL="https://example.com/"
101 | ```
102 | 
103 | The **OPTIONS** environment variable is optional; it may include any of the additional settings found in the "**Advanced usage**" section below.
104 | 
105 | Example:
106 | ```bash
107 | export OPTIONS="--list -f 20060121"
108 | ```
109 | 
110 | You can then run the existing container with the following command:
111 | ```bash
112 | docker compose run --rm wayback_machine_downloader https://example.com
113 | ```
114 | 
115 | ## ⚙️ Configuration
116 | There are a few constants that can be edited in the `wayback_machine_downloader.rb` file for your convenience. The default values may be conservative, so you can adjust them to your needs. They are:
117 | ```ruby
118 | DEFAULT_TIMEOUT = 30 # HTTP timeout (in seconds)
119 | MAX_RETRIES = 3 # Number of times to retry failed requests
120 | RETRY_DELAY = 2 # Wait time between retries (seconds)
121 | RATE_LIMIT = 0.25 # Throttle between requests (seconds)
122 | CONNECTION_POOL_SIZE = 10 # Maximum simultaneous connections
123 | MEMORY_BUFFER_SIZE = 16384 # Download buffer size (bytes)
124 | STATE_CDX_FILENAME = '.cdx.json' # Stores snapshot listing
125 | STATE_DB_FILENAME = '.downloaded.txt' # Tracks completed downloads
126 | ```
127 | 
128 | ## 🛠️ Advanced usage
129 | 
130 | ### Basic options
131 | | Option | Description |
132 | |--------|-------------|
133 | | `-d DIR`, `--directory DIR` | Custom output directory |
134 | | `-s`, `--all-timestamps` | Download all historical versions |
135 | | `-f TS`, `--from TS` | Start from timestamp (e.g., 20060121) |
136 | | `-t TS`, `--to TS` | Stop at timestamp |
137 | | `-e`, `--exact-url` | Download exact URL only |
138 | | `-r`, `--rewritten` | Download rewritten Wayback Archive files only |
139 | 
140 | **Example** - Download files to `downloaded-backup` folder
141 | ```bash
142 | ruby wayback_machine_downloader https://example.com --directory downloaded-backup/
143 | ```
144 | By default, Wayback Machine Downloader will download files to ./websites/ followed by the domain name of the website. You may want to save files in a specific directory using this option.
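The CLI is a thin wrapper around the `WaybackMachineDownloader` class, so the same options can also be passed as a hash when using the gem from your own Ruby code. A minimal sketch (the option keys mirror the flags parsed in `bin/wayback_machine_downloader`; the directory value here is only an example):
```ruby
require 'wayback_machine_downloader'

# option keys mirror the CLI flags (see bin/wayback_machine_downloader)
downloader = WaybackMachineDownloader.new(
  base_url: 'https://example.com',  # required, same as the CLI argument
  directory: 'downloaded-backup/',  # -d / --directory
  threads_count: 4                  # -c / --concurrency
)
downloader.download_files
```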
145 | 
146 | **Example 2** - Download historical timestamps:
147 | ```bash
148 | ruby wayback_machine_downloader https://example.com --all-timestamps
149 | ```
150 | This option will download all timestamps/snapshots for a given website. It uses the timestamp of each snapshot as the directory name. In this case, it will download, for example:
151 | ```bash
152 | websites/example.com/20060715085250/index.html
153 | websites/example.com/20051120005053/index.html
154 | websites/example.com/20060111095815/img/logo.png
155 | ...
156 | ```
157 | 
158 | **Example 3** - Download content on or after July 16, 2006:
159 | ```bash
160 | ruby wayback_machine_downloader https://example.com --from 20060716231334
161 | ```
162 | You may want to supply a from timestamp to lock your backup to a specific version of the website. Timestamps can be found inside the urls of the regular Wayback Machine website (e.g., https://web.archive.org/web/20060716231334/http://example.com). You can also use years (2006), years + month (200607), etc. It can be used in combination with a to timestamp.
163 | Wayback Machine Downloader will then fetch only file versions on or after the timestamp specified.
164 | 
165 | **Example 4** - Download content on or before September 16, 2010:
166 | ```bash
167 | ruby wayback_machine_downloader https://example.com --to 20100916231334
168 | ```
169 | You may want to supply a to timestamp to lock your backup to a specific version of the website. Timestamps can be found inside the urls of the regular Wayback Machine website (e.g., https://web.archive.org/web/20100916231334/http://example.com). You can also use years (2010), years + month (201009), etc. It can be used in combination with a from timestamp.
170 | Wayback Machine Downloader will then fetch only file versions on or before the timestamp specified.
171 | 
172 | **Example 5** - Download only the homepage of https://example.com
173 | ```bash
174 | ruby wayback_machine_downloader https://example.com --exact-url
175 | ```
176 | If you want to retrieve only the file matching exactly the url provided, you can use this flag. It will avoid downloading anything else.
177 | 
178 | **Example 6** - Download a rewritten file
179 | ```bash
180 | ruby wayback_machine_downloader https://example.com --rewritten
181 | ```
182 | Useful if you want to download the rewritten files from the Wayback Machine instead of the original ones.
183 | 
184 | ### Filtering Content
185 | | Option | Description |
186 | |--------|-------------|
187 | | `-o FILTER`, `--only FILTER` | Only download matching URLs (supports regex) |
188 | | `-x FILTER`, `--exclude FILTER` | Exclude matching URLs |
189 | 
190 | **Example** - Include only images:
191 | ```bash
192 | ruby wayback_machine_downloader https://example.com -o "/\.(jpg|png)/i"
193 | ```
194 | You may want to retrieve files which are of a certain type (e.g., .pdf, .jpg, .wrd...) or are in a specific directory. To do so, you can supply the --only flag with a string or a regex (using the '/regex/' notation) to limit which files Wayback Machine Downloader will download.
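Under the hood, filter strings are parsed by the bundled `to_regex` helper: a value wrapped in '/.../' (optionally with trailing flags such as `i`) is compiled to a Regexp, while any other string falls back to a case-insensitive substring match. A quick sketch with illustrative values:
```ruby
require 'wayback_machine_downloader/to_regex'

'/\.(jpg|png)/i'.to_regex  #=> /\.(jpg|png)/i (the '/.../' notation compiles to a Regexp)
'my_directory'.to_regex    #=> nil (the downloader then does a substring match instead)
```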
For example, if you only want to download files inside a specific my_directory:
196 | ```bash
197 | ruby wayback_machine_downloader https://example.com --only my_directory
198 | ```
199 | Or if you want to download every image and nothing else:
200 | ```bash
201 | ruby wayback_machine_downloader https://example.com --only "/\.(gif|jpg|jpeg)$/i"
202 | ```
203 | 
204 | **Example 2** - Exclude images:
205 | ```bash
206 | ruby wayback_machine_downloader https://example.com -x "/\.(jpg|png)/i"
207 | ```
208 | You may want to retrieve files which aren't of a certain type (e.g., .pdf, .jpg, .wrd...) or aren't in a specific directory. To do so, you can supply the --exclude flag with a string or a regex (using the '/regex/' notation) to limit which files Wayback Machine Downloader will download.
209 | For example, if you want to avoid downloading files inside my_directory:
210 | ```bash
211 | ruby wayback_machine_downloader https://example.com --exclude my_directory
212 | ```
213 | Or if you want to download everything except images:
214 | ```bash
215 | ruby wayback_machine_downloader https://example.com --exclude "/\.(gif|jpg|jpeg)$/i"
216 | ```
217 | 
218 | ### Performance
219 | | Option | Description |
220 | |--------|-------------|
221 | | `-c NUM`, `--concurrency NUM` | Concurrent downloads (default: 1) |
222 | | `-p NUM`, `--maximum-snapshot NUM` | Max snapshot pages (150k snapshots/page) |
223 | 
224 | **Example** - 20 parallel downloads:
225 | ```bash
226 | ruby wayback_machine_downloader https://example.com --concurrency 20
227 | ```
228 | Specifies the number of files to download at the same time, which can speed up the download of a website significantly. The default is to download one file at a time.
229 | 
230 | **Example 2** - 300 snapshot pages:
231 | ```bash
232 | ruby wayback_machine_downloader https://example.com --maximum-snapshot 300
233 | ```
234 | Specifies the maximum number of snapshot pages to consider, counting an average of 150,000 snapshots per page. 100 is the default maximum number of snapshot pages and should be sufficient for most websites. Use a bigger number if you want to download a very large website.
235 | 
236 | ### Diagnostics
237 | | Option | Description |
238 | |--------|-------------|
239 | | `-a`, `--all` | Include error pages (40x/50x) |
240 | | `-l`, `--list` | List files without downloading |
241 | 
242 | **Example** - Download all files
243 | ```bash
244 | ruby wayback_machine_downloader https://example.com --all
245 | ```
246 | By default, Wayback Machine Downloader limits itself to files that responded with a 200 OK code. If you also need error files (40x and 50x codes) or redirection files (30x codes), you can use the --all or -a flag and Wayback Machine Downloader will download them in addition to the 200 OK files. It will also keep empty files, which are removed by default.
247 | 
248 | **Example 2** - Generate URL list:
249 | ```bash
250 | ruby wayback_machine_downloader https://example.com --list
251 | ```
252 | It will just display the files to be downloaded with their snapshot timestamps and urls. The output format is JSON. It won't download anything. It's useful for debugging or for feeding another application.
253 | 
254 | ### Job management
255 | The downloader automatically saves its progress (`.cdx.json` for snapshot list, `.downloaded.txt` for completed files) in the output directory.
If you run the same command again pointing to the same output directory, it will resume where it left off, skipping already downloaded files. 256 | 257 | > [!NOTE] 258 | > Automatic resumption can be affected by changing the URL, mode selection (like `--all-timestamps`), filtering selections, or other options. If you want to ensure a clean start, use the `--reset` option. 259 | 260 | | Option | Description | 261 | |--------|-------------| 262 | | `--reset` | Delete state files (`.cdx.json`, `.downloaded.txt`) and restart the download from scratch. Does not delete already downloaded website files. | 263 | | `--keep` | Keep state files (`.cdx.json`, `.downloaded.txt`) even after a successful download. By default, these are deleted upon successful completion. | 264 | 265 | **Example** - Restart a download job from the beginning: 266 | ```bash 267 | ruby wayback_machine_downloader https://example.com --reset 268 | ``` 269 | This is useful if you suspect the state files are corrupted or want to ensure a completely fresh download process without deleting the files you already have. 270 | 271 | **Example 2** - Keep state files after download: 272 | ```bash 273 | ruby wayback_machine_downloader https://example.com --keep 274 | ``` 275 | This can be useful for debugging or if you plan to extend the download later with different parameters (e.g., adding `--to` timestamp) while leveraging the existing snapshot list. 276 | 277 | ## 🤝 Contributing 278 | 1. Fork the repository 279 | 2. Create a feature branch 280 | 3. Submit a pull request 281 | 282 | **Run tests** (note, these are still broken!): 283 | ```bash 284 | bundle exec rake test 285 | ``` 286 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'rake/testtask' 2 | 3 | Rake::TestTask.new do |t| 4 | t.libs << 'test' 5 | end 6 | 7 | desc "Run tests" 8 | task :default => :test 9 | -------------------------------------------------------------------------------- /bin/wayback_machine_downloader: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require_relative '../lib/wayback_machine_downloader' 4 | require 'optparse' 5 | require 'pp' 6 | 7 | options = {} 8 | option_parser = OptionParser.new do |opts| 9 | opts.banner = "Usage: wayback_machine_downloader http://example.com" 10 | 11 | opts.separator "" 12 | opts.separator "Download an entire website from the Wayback Machine." 13 | 14 | opts.separator "" 15 | opts.separator "Optional options:" 16 | 17 | opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into", "Default is ./websites/ plus the domain name") do |t| 18 | options[:directory] = t 19 | end 20 | 21 | opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t| 22 | options[:all_timestamps] = true 23 | end 24 | 25 | opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t| 26 | options[:from_timestamp] = t 27 | end 28 | 29 | opts.on("-t", "--to TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 
20100916231334)") do |t|
30 |     options[:to_timestamp] = t
31 |   end
32 | 
33 |   opts.on("-e", "--exact-url", "Download only the url provided and not the full site") do |t|
34 |     options[:exact_url] = t
35 |   end
36 | 
37 |   opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
38 |     options[:only_filter] = t
39 |   end
40 | 
41 |   opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
42 |     options[:exclude_filter] = t
43 |   end
44 | 
45 |   opts.on("-a", "--all", "Expand downloading to error files (40x and 50x) and redirections (30x)") do |t|
46 |     options[:all] = true
47 |   end
48 | 
49 |   opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
50 |     options[:threads_count] = t
51 |   end
52 | 
53 |   opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
54 |     options[:maximum_pages] = t
55 |   end
56 | 
57 |   opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
58 |     options[:list] = true
59 |   end
60 | 
61 |   opts.on("-r", "--rewritten", "Downloads the rewritten Wayback Machine files instead of the original files") do |t|
62 |     options[:rewritten] = true
63 |   end
64 | 
65 |   opts.on("--local", "Rewrite URLs to make them relative for local browsing") do |t|
66 |     options[:rewrite] = true
67 |   end
68 | 
69 |   opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
70 |     options[:reset] = true
71 |   end
72 | 
73 |   opts.on("--keep", "Keep state files (.cdx.json, .downloaded.txt) after a successful download") do |t|
74 |     options[:keep] = true
75 |   end
76 | 
77 |   opts.on("-v", "--version", "Display version") do |t|
78 |     options[:version] = t
79 |   end
80 | end.parse!
81 | 
82 | if (base_url = ARGV[-1])
83 |   options[:base_url] = base_url
84 |   wayback_machine_downloader = WaybackMachineDownloader.new options
85 |   if options[:list]
86 |     wayback_machine_downloader.list_files
87 |   else
88 |     wayback_machine_downloader.download_files
89 |   end
90 | elsif options[:version]
91 |   puts WaybackMachineDownloader::VERSION
92 | else
93 |   puts "You need to specify a website to backup. (e.g., http://example.com)"
94 |   puts "Run `wayback_machine_downloader --help` for more help."
95 | end
96 | 
-------------------------------------------------------------------------------- /docker-compose.yml: --------------------------------------------------------------------------------
1 | services:
2 |   wayback_machine_downloader:
3 |     build:
4 |       context: .
5 |     tty: true
6 |     image: wayback_machine_downloader:latest
7 |     container_name: wayback_machine_downloader
8 |     environment:
9 |       - ENVIRONMENT=${ENVIRONMENT:-production}
10 |       - OPTIONS=${OPTIONS:-""}
11 |       - TARGET_URL=${TARGET_URL}
12 |     volumes:
13 |       - .:/build:rw
14 |       - ./websites:/websites:rw
15 |     command: /build/bin/wayback_machine_downloader ${TARGET_URL} ${OPTIONS}
-------------------------------------------------------------------------------- /entrypoint.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | if [ "$ENVIRONMENT" == "development" ]; then
4 |     echo "Running in development mode. Starting rerun..."
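    # 'rerun' is assumed to be available in the image; it restarts the downloader whenever files under /build change (websites/ is ignored)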
5 | exec rerun --dir /build --ignore "websites/*" -- /build/bin/wayback_machine_downloader "$@" 6 | else 7 | echo "Not in development mode. Skipping rerun." 8 | exec /build/bin/wayback_machine_downloader "$@" 9 | fi -------------------------------------------------------------------------------- /lib/wayback_machine_downloader.rb: -------------------------------------------------------------------------------- 1 | # encoding: UTF-8 2 | 3 | require 'thread' 4 | require 'net/http' 5 | require 'open-uri' 6 | require 'fileutils' 7 | require 'cgi' 8 | require 'json' 9 | require 'time' 10 | require 'concurrent-ruby' 11 | require 'logger' 12 | require 'zlib' 13 | require 'stringio' 14 | require_relative 'wayback_machine_downloader/tidy_bytes' 15 | require_relative 'wayback_machine_downloader/to_regex' 16 | require_relative 'wayback_machine_downloader/archive_api' 17 | 18 | class ConnectionPool 19 | MAX_AGE = 300 20 | CLEANUP_INTERVAL = 60 21 | DEFAULT_TIMEOUT = 30 22 | MAX_RETRIES = 3 23 | 24 | def initialize(size) 25 | @size = size 26 | @pool = Concurrent::Map.new 27 | @creation_times = Concurrent::Map.new 28 | @cleanup_thread = schedule_cleanup 29 | end 30 | 31 | def with_connection(&block) 32 | conn = acquire_connection 33 | begin 34 | yield conn 35 | ensure 36 | release_connection(conn) 37 | end 38 | end 39 | 40 | def shutdown 41 | @cleanup_thread&.exit 42 | @pool.each_value { |conn| conn.finish if conn&.started? } 43 | @pool.clear 44 | @creation_times.clear 45 | end 46 | 47 | private 48 | 49 | def acquire_connection 50 | thread_id = Thread.current.object_id 51 | conn = @pool[thread_id] 52 | 53 | if should_create_new?(conn) 54 | conn&.finish if conn&.started? 55 | conn = create_connection 56 | @pool[thread_id] = conn 57 | @creation_times[thread_id] = Time.now 58 | end 59 | 60 | conn 61 | end 62 | 63 | def release_connection(conn) 64 | return unless conn 65 | if conn.started? && Time.now - @creation_times[Thread.current.object_id] > MAX_AGE 66 | conn.finish 67 | @pool.delete(Thread.current.object_id) 68 | @creation_times.delete(Thread.current.object_id) 69 | end 70 | end 71 | 72 | def should_create_new?(conn) 73 | return true if conn.nil? 74 | return true unless conn.started? 75 | return true if Time.now - @creation_times[Thread.current.object_id] > MAX_AGE 76 | false 77 | end 78 | 79 | def create_connection 80 | http = Net::HTTP.new("web.archive.org", 443) 81 | http.use_ssl = true 82 | http.read_timeout = DEFAULT_TIMEOUT 83 | http.open_timeout = DEFAULT_TIMEOUT 84 | http.keep_alive_timeout = 30 85 | http.max_retries = MAX_RETRIES 86 | http.start 87 | http 88 | end 89 | 90 | def schedule_cleanup 91 | Thread.new do 92 | loop do 93 | cleanup_old_connections 94 | sleep CLEANUP_INTERVAL 95 | end 96 | end 97 | end 98 | 99 | def cleanup_old_connections 100 | current_time = Time.now 101 | @creation_times.each do |thread_id, creation_time| 102 | if current_time - creation_time > MAX_AGE 103 | conn = @pool[thread_id] 104 | conn&.finish if conn&.started? 
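        # drop the stale entry from both maps so the owning thread builds a fresh connection on its next acquire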
105 | @pool.delete(thread_id) 106 | @creation_times.delete(thread_id) 107 | end 108 | end 109 | end 110 | end 111 | 112 | class WaybackMachineDownloader 113 | 114 | include ArchiveAPI 115 | 116 | VERSION = "2.3.8" 117 | DEFAULT_TIMEOUT = 30 118 | MAX_RETRIES = 3 119 | RETRY_DELAY = 2 120 | RATE_LIMIT = 0.25 # Delay between requests in seconds 121 | CONNECTION_POOL_SIZE = 10 122 | MEMORY_BUFFER_SIZE = 16384 # 16KB chunks 123 | STATE_CDX_FILENAME = ".cdx.json" 124 | STATE_DB_FILENAME = ".downloaded.txt" 125 | 126 | attr_accessor :base_url, :exact_url, :directory, :all_timestamps, 127 | :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, 128 | :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite 129 | 130 | def initialize params 131 | validate_params(params) 132 | @base_url = params[:base_url] 133 | @exact_url = params[:exact_url] 134 | @directory = params[:directory] 135 | @all_timestamps = params[:all_timestamps] 136 | @from_timestamp = params[:from_timestamp].to_i 137 | @to_timestamp = params[:to_timestamp].to_i 138 | @only_filter = params[:only_filter] 139 | @exclude_filter = params[:exclude_filter] 140 | @all = params[:all] 141 | @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100 142 | @threads_count = [params[:threads_count].to_i, 1].max 143 | @rewritten = params[:rewritten] 144 | @reset = params[:reset] 145 | @keep = params[:keep] 146 | @timeout = params[:timeout] || DEFAULT_TIMEOUT 147 | @logger = setup_logger 148 | @failed_downloads = Concurrent::Array.new 149 | @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE) 150 | @db_mutex = Mutex.new 151 | @rewrite = params[:rewrite] || false 152 | 153 | handle_reset 154 | end 155 | 156 | def backup_name 157 | url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url 158 | 159 | if url_to_process.include? '//' 160 | url_to_process.split('/')[2] 161 | else 162 | url_to_process 163 | end 164 | end 165 | 166 | def backup_path 167 | if @directory 168 | if @directory[-1] == '/' 169 | @directory 170 | else 171 | @directory + '/' 172 | end 173 | else 174 | 'websites/' + backup_name + '/' 175 | end 176 | end 177 | 178 | def cdx_path 179 | File.join(backup_path, STATE_CDX_FILENAME) 180 | end 181 | 182 | def db_path 183 | File.join(backup_path, STATE_DB_FILENAME) 184 | end 185 | 186 | def handle_reset 187 | if @reset 188 | puts "Resetting download state..." 189 | FileUtils.rm_f(cdx_path) 190 | FileUtils.rm_f(db_path) 191 | puts "Removed state files: #{cdx_path}, #{db_path}" 192 | end 193 | end 194 | 195 | def match_only_filter file_url 196 | if @only_filter 197 | only_filter_regex = @only_filter.to_regex 198 | if only_filter_regex 199 | only_filter_regex =~ file_url 200 | else 201 | file_url.downcase.include? @only_filter.downcase 202 | end 203 | else 204 | true 205 | end 206 | end 207 | 208 | def match_exclude_filter file_url 209 | if @exclude_filter 210 | exclude_filter_regex = @exclude_filter.to_regex 211 | if exclude_filter_regex 212 | exclude_filter_regex =~ file_url 213 | else 214 | file_url.downcase.include? @exclude_filter.downcase 215 | end 216 | else 217 | false 218 | end 219 | end 220 | 221 | def get_all_snapshots_to_consider 222 | if File.exist?(cdx_path) && !@reset 223 | puts "Loading snapshot list from #{cdx_path}" 224 | begin 225 | snapshot_list_to_consider = JSON.parse(File.read(cdx_path)) 226 | puts "Loaded #{snapshot_list_to_consider.length} snapshots from cache." 
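        # return a Concurrent::Array so the cached list can be shared safely across download threads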
227 | puts 228 | return Concurrent::Array.new(snapshot_list_to_consider) 229 | rescue JSON::ParserError => e 230 | puts "Error reading snapshot cache file #{cdx_path}: #{e.message}. Refetching..." 231 | FileUtils.rm_f(cdx_path) 232 | rescue => e 233 | puts "Error loading snapshot cache #{cdx_path}: #{e.message}. Refetching..." 234 | FileUtils.rm_f(cdx_path) 235 | end 236 | end 237 | 238 | snapshot_list_to_consider = Concurrent::Array.new 239 | mutex = Mutex.new 240 | 241 | puts "Getting snapshot pages from Wayback Machine API..." 242 | 243 | # Fetch the initial set of snapshots, sequentially 244 | @connection_pool.with_connection do |connection| 245 | initial_list = get_raw_list_from_api(@base_url, nil, connection) 246 | initial_list ||= [] 247 | mutex.synchronize do 248 | snapshot_list_to_consider.concat(initial_list) 249 | print "." 250 | end 251 | end 252 | 253 | # Fetch additional pages if the exact URL flag is not set 254 | unless @exact_url 255 | page_index = 0 256 | batch_size = [@threads_count, 5].min 257 | continue_fetching = true 258 | 259 | while continue_fetching && page_index < @maximum_pages 260 | # Determine the range of pages to fetch in this batch 261 | end_index = [page_index + batch_size, @maximum_pages].min 262 | current_batch = (page_index...end_index).to_a 263 | 264 | # Create futures for concurrent API calls 265 | futures = current_batch.map do |page| 266 | Concurrent::Future.execute do 267 | result = nil 268 | @connection_pool.with_connection do |connection| 269 | result = get_raw_list_from_api("#{@base_url}/*", page, connection) 270 | end 271 | result ||= [] 272 | [page, result] 273 | end 274 | end 275 | 276 | results = [] 277 | 278 | futures.each do |future| 279 | begin 280 | results << future.value 281 | rescue => e 282 | puts "\nError fetching page #{future}: #{e.message}" 283 | end 284 | end 285 | 286 | # Sort results by page number to maintain order 287 | results.sort_by! { |page, _| page } 288 | 289 | # Process results and check for empty pages 290 | results.each do |page, result| 291 | if result.nil? || result.empty? 292 | continue_fetching = false 293 | break 294 | else 295 | mutex.synchronize do 296 | snapshot_list_to_consider.concat(result) 297 | print "." 298 | end 299 | end 300 | end 301 | 302 | page_index = end_index 303 | 304 | sleep(RATE_LIMIT) if continue_fetching 305 | end 306 | end 307 | 308 | puts " found #{snapshot_list_to_consider.length} snapshots." 309 | 310 | # Save the fetched list to the cache file 311 | begin 312 | FileUtils.mkdir_p(File.dirname(cdx_path)) 313 | File.write(cdx_path, JSON.pretty_generate(snapshot_list_to_consider.to_a)) # Convert Concurrent::Array back to Array for JSON 314 | puts "Saved snapshot list to #{cdx_path}" 315 | rescue => e 316 | puts "Error saving snapshot cache to #{cdx_path}: #{e.message}" 317 | end 318 | puts 319 | 320 | snapshot_list_to_consider 321 | end 322 | 323 | def get_file_list_curated 324 | file_list_curated = Hash.new 325 | get_all_snapshots_to_consider.each do |file_timestamp, file_url| 326 | next unless file_url.include?('/') 327 | file_id = file_url.split('/')[3..-1].join('/') 328 | file_id = CGI::unescape file_id 329 | file_id = file_id.tidy_bytes unless file_id == "" 330 | if file_id.nil? 
331 | puts "Malformed file url, ignoring: #{file_url}" 332 | else 333 | if match_exclude_filter(file_url) 334 | puts "File url matches exclude filter, ignoring: #{file_url}" 335 | elsif not match_only_filter(file_url) 336 | puts "File url doesn't match only filter, ignoring: #{file_url}" 337 | elsif file_list_curated[file_id] 338 | unless file_list_curated[file_id][:timestamp] > file_timestamp 339 | file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp} 340 | end 341 | else 342 | file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp} 343 | end 344 | end 345 | end 346 | file_list_curated 347 | end 348 | 349 | def get_file_list_all_timestamps 350 | file_list_curated = Hash.new 351 | get_all_snapshots_to_consider.each do |file_timestamp, file_url| 352 | next unless file_url.include?('/') 353 | file_id = file_url.split('/')[3..-1].join('/') 354 | file_id_and_timestamp = [file_timestamp, file_id].join('/') 355 | file_id_and_timestamp = CGI::unescape file_id_and_timestamp 356 | file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == "" 357 | if file_id.nil? 358 | puts "Malformed file url, ignoring: #{file_url}" 359 | else 360 | if match_exclude_filter(file_url) 361 | puts "File url matches exclude filter, ignoring: #{file_url}" 362 | elsif not match_only_filter(file_url) 363 | puts "File url doesn't match only filter, ignoring: #{file_url}" 364 | elsif file_list_curated[file_id_and_timestamp] 365 | puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose 366 | else 367 | file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp} 368 | end 369 | end 370 | end 371 | puts "file_list_curated: " + file_list_curated.count.to_s 372 | file_list_curated 373 | end 374 | 375 | 376 | def get_file_list_by_timestamp 377 | if @all_timestamps 378 | file_list_curated = get_file_list_all_timestamps 379 | file_list_curated.map do |file_remote_info| 380 | file_remote_info[1][:file_id] = file_remote_info[0] 381 | file_remote_info[1] 382 | end 383 | else 384 | file_list_curated = get_file_list_curated 385 | file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse 386 | file_list_curated.map do |file_remote_info| 387 | file_remote_info[1][:file_id] = file_remote_info[0] 388 | file_remote_info[1] 389 | end 390 | end 391 | end 392 | 393 | def list_files 394 | # retrieval produces its own output 395 | @orig_stdout = $stdout 396 | $stdout = $stderr 397 | files = get_file_list_by_timestamp 398 | $stdout = @orig_stdout 399 | puts "[" 400 | files[0...-1].each do |file| 401 | puts file.to_json + "," 402 | end 403 | puts files[-1].to_json 404 | puts "]" 405 | end 406 | 407 | def load_downloaded_ids 408 | downloaded_ids = Set.new 409 | if File.exist?(db_path) && !@reset 410 | puts "Loading list of already downloaded files from #{db_path}" 411 | begin 412 | File.foreach(db_path) { |line| downloaded_ids.add(line.strip) } 413 | rescue => e 414 | puts "Error reading downloaded files list #{db_path}: #{e.message}. Assuming no files downloaded." 
415 |         downloaded_ids.clear
416 |       end
417 |     end
418 |     downloaded_ids
419 |   end
420 | 
421 |   def append_to_db(file_id)
422 |     @db_mutex.synchronize do
423 |       begin
424 |         FileUtils.mkdir_p(File.dirname(db_path))
425 |         File.open(db_path, 'a') { |f| f.puts(file_id) }
426 |       rescue => e
427 |         @logger.error("Failed to append downloaded file ID #{file_id} to #{db_path}: #{e.message}")
428 |       end
429 |     end
430 |   end
431 | 
432 |   def download_files
433 |     start_time = Time.now
434 |     puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
435 | 
436 |     FileUtils.mkdir_p(backup_path)
437 | 
438 |     # Load the list of files to potentially download
439 |     files_to_download = file_list_by_timestamp
440 | 
441 |     if files_to_download.empty?
442 |       puts "No files found matching criteria."
443 |       cleanup
444 |       return
445 |     end
446 | 
447 |     total_files = files_to_download.count
448 |     puts "#{total_files} files found matching criteria."
449 | 
450 |     # Load IDs of already downloaded files
451 |     downloaded_ids = load_downloaded_ids
452 |     files_to_process = files_to_download.reject do |file_info|
453 |       downloaded_ids.include?(file_info[:file_id])
454 |     end
455 | 
456 |     remaining_count = files_to_process.count
457 |     skipped_count = total_files - remaining_count
458 | 
459 |     if skipped_count > 0
460 |       puts "Found #{skipped_count} previously downloaded files, skipping them."
461 |     end
462 | 
463 |     if remaining_count == 0
464 |       puts "All matching files have already been downloaded."
465 |       cleanup
466 |       return
467 |     end
468 | 
469 |     puts "#{remaining_count} files to download:"
470 | 
471 |     @processed_file_count = 0
472 |     @total_to_download = remaining_count
473 |     @download_mutex = Mutex.new
474 | 
475 |     thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
476 |     pool = Concurrent::FixedThreadPool.new(thread_count)
477 | 
478 |     files_to_process.each do |file_remote_info|
479 |       pool.post do
480 |         download_success = false
481 |         begin
482 |           @connection_pool.with_connection do |connection|
483 |             result_message = download_file(file_remote_info, connection)
484 |             # assume download success if the result message contains ' -> '
485 |             if result_message && result_message.include?(' -> ')
486 |               download_success = true
487 |             end
488 |             @download_mutex.synchronize do
489 |               @processed_file_count += 1
490 |               # adjust progress message to reflect remaining files
491 |               progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
492 |               puts progress_message if progress_message
493 |             end
494 |           end
495 |           # append to DB only after a successful download, outside the connection block
496 |           if download_success
497 |             append_to_db(file_remote_info[:file_id])
498 |           end
499 |         rescue => e
500 |           @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
501 |           @download_mutex.synchronize do
502 |             @processed_file_count += 1
503 |           end
504 |         end
505 |         sleep(RATE_LIMIT)
506 |       end
507 |     end
508 | 
509 |     pool.shutdown
510 |     pool.wait_for_termination
511 | 
512 |     end_time = Time.now
513 |     puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
514 |     puts "Results saved in #{backup_path}"
515 |     cleanup
516 |   end
517 | 
518 |   def structure_dir_path dir_path
519 |     begin
520 |       FileUtils::mkdir_p dir_path unless File.exist? dir_path
521 |     rescue Errno::EEXIST => e
522 |       error_to_string = e.to_s
523 |       puts "# #{error_to_string}"
524 |       if error_to_string.include?
"File exists @ dir_s_mkdir - " 525 | file_already_existing = error_to_string.split("File exists @ dir_s_mkdir - ")[-1] 526 | elsif error_to_string.include? "File exists - " 527 | file_already_existing = error_to_string.split("File exists - ")[-1] 528 | else 529 | raise "Unhandled directory restructure error # #{error_to_string}" 530 | end 531 | file_already_existing_temporary = file_already_existing + '.temp' 532 | file_already_existing_permanent = file_already_existing + '/index.html' 533 | FileUtils::mv file_already_existing, file_already_existing_temporary 534 | FileUtils::mkdir_p file_already_existing 535 | FileUtils::mv file_already_existing_temporary, file_already_existing_permanent 536 | puts "#{file_already_existing} -> #{file_already_existing_permanent}" 537 | structure_dir_path dir_path 538 | end 539 | end 540 | 541 | def rewrite_urls_to_relative(file_path) 542 | return unless File.exist?(file_path) 543 | 544 | file_ext = File.extname(file_path).downcase 545 | 546 | begin 547 | content = File.binread(file_path) 548 | 549 | if file_ext == '.html' || file_ext == '.htm' 550 | encoding = content.match(/]+)/i)&.captures&.first || 'UTF-8' 551 | content.force_encoding(encoding) rescue content.force_encoding('UTF-8') 552 | else 553 | content.force_encoding('UTF-8') 554 | end 555 | 556 | # URLs in HTML attributes 557 | content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do 558 | prefix, url, suffix = $1, $2, $3 559 | 560 | if url.start_with?('http') 561 | begin 562 | uri = URI.parse(url) 563 | path = uri.path 564 | path = path[1..-1] if path.start_with?('/') 565 | "#{prefix}#{path}#{suffix}" 566 | rescue 567 | "#{prefix}#{url}#{suffix}" 568 | end 569 | elsif url.start_with?('/') 570 | "#{prefix}./#{url[1..-1]}#{suffix}" 571 | else 572 | "#{prefix}#{url}#{suffix}" 573 | end 574 | end 575 | 576 | # URLs in CSS 577 | content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do 578 | url = $1 579 | 580 | if url.start_with?('http') 581 | begin 582 | uri = URI.parse(url) 583 | path = uri.path 584 | path = path[1..-1] if path.start_with?('/') 585 | "url(\"#{path}\")" 586 | rescue 587 | "url(\"#{url}\")" 588 | end 589 | elsif url.start_with?('/') 590 | "url(\"./#{url[1..-1]}\")" 591 | else 592 | "url(\"#{url}\")" 593 | end 594 | end 595 | 596 | # URLs in JavaScript 597 | content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do 598 | quote_start, url, quote_end = $1, $2, $3 599 | 600 | if url.start_with?('http') 601 | begin 602 | uri = URI.parse(url) 603 | path = uri.path 604 | path = path[1..-1] if path.start_with?('/') 605 | "#{quote_start}#{path}#{quote_end}" 606 | rescue 607 | "#{quote_start}#{url}#{quote_end}" 608 | end 609 | elsif url.start_with?('/') 610 | "#{quote_start}./#{url[1..-1]}#{quote_end}" 611 | else 612 | "#{quote_start}#{url}#{quote_end}" 613 | end 614 | end 615 | 616 | # for URLs in HTML attributes that start with a single slash 617 | content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do 618 | prefix, path, suffix = $1, $2, $3 619 | "#{prefix}./#{path}#{suffix}" 620 | end 621 | 622 | # for URLs in CSS that start with a single slash 623 | content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do 624 | path = $1 625 | "url(\"./#{path}\")" 626 | end 627 | 628 | # save the modified content back to the file 629 | File.binwrite(file_path, content) 630 | puts "Rewrote URLs in 
#{file_path} to be relative." 631 | rescue Errno::ENOENT => e 632 | @logger.warn("Error reading file #{file_path}: #{e.message}") 633 | end 634 | end 635 | 636 | def download_file (file_remote_info, http) 637 | current_encoding = "".encoding 638 | file_url = file_remote_info[:file_url].encode(current_encoding) 639 | file_id = file_remote_info[:file_id] 640 | file_timestamp = file_remote_info[:timestamp] 641 | file_path_elements = file_id.split('/') 642 | 643 | if file_id == "" 644 | dir_path = backup_path 645 | file_path = backup_path + 'index.html' 646 | elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.' 647 | dir_path = backup_path + file_path_elements[0..-1].join('/') 648 | file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html' 649 | else 650 | dir_path = backup_path + file_path_elements[0..-2].join('/') 651 | file_path = backup_path + file_path_elements[0..-1].join('/') 652 | end 653 | if Gem.win_platform? 654 | dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) } 655 | file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) } 656 | end 657 | 658 | # check existence *before* download attempt 659 | # this handles cases where a file was created manually or by a previous partial run without a .db entry 660 | if File.exist? file_path 661 | return "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})" 662 | end 663 | 664 | begin 665 | structure_dir_path dir_path 666 | status = download_with_retry(file_path, file_url, file_timestamp, http) 667 | 668 | case status 669 | when :saved 670 | if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i 671 | rewrite_urls_to_relative(file_path) 672 | end 673 | "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})" 674 | when :skipped_not_found 675 | "Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})" 676 | else 677 | # ideally, this case should not be reached if download_with_retry behaves as expected. 678 | @logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}") 679 | "Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})" 680 | end 681 | rescue StandardError => e 682 | msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})" 683 | if File.exist?(file_path) and File.size(file_path) == 0 684 | File.delete(file_path) 685 | msg += "\n#{file_path} was empty and was removed." 686 | end 687 | msg 688 | end 689 | end 690 | 691 | def file_queue 692 | @file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info } 693 | end 694 | 695 | def file_list_by_timestamp 696 | @file_list_by_timestamp ||= get_file_list_by_timestamp 697 | end 698 | 699 | private 700 | 701 | def validate_params(params) 702 | raise ArgumentError, "Base URL is required" unless params[:base_url] 703 | raise ArgumentError, "Maximum pages must be positive" if params[:maximum_pages] && params[:maximum_pages].to_i <= 0 704 | end 705 | 706 | def setup_logger 707 | logger = Logger.new(STDOUT) 708 | logger.level = ENV['DEBUG'] ? 
Logger::DEBUG : Logger::INFO 709 | logger.formatter = proc do |severity, datetime, progname, msg| 710 | "#{datetime.strftime('%Y-%m-%d %H:%M:%S')} [#{severity}] #{msg}\n" 711 | end 712 | logger 713 | end 714 | 715 | def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0) 716 | retries = 0 717 | begin 718 | wayback_url = if @rewritten 719 | "https://web.archive.org/web/#{file_timestamp}/#{file_url}" 720 | else 721 | "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}" 722 | end 723 | 724 | # Escape square brackets because they are not valid in URI() 725 | wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D') 726 | 727 | request = Net::HTTP::Get.new(URI(wayback_url)) 728 | request["Connection"] = "keep-alive" 729 | request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}" 730 | request["Accept-Encoding"] = "gzip, deflate" 731 | 732 | response = connection.request(request) 733 | 734 | save_response_body = lambda do 735 | File.open(file_path, "wb") do |file| 736 | body = response.body 737 | if response['content-encoding'] == 'gzip' && body && !body.empty? 738 | begin 739 | gz = Zlib::GzipReader.new(StringIO.new(body)) 740 | decompressed_body = gz.read 741 | gz.close 742 | file.write(decompressed_body) 743 | rescue Zlib::GzipFile::Error => e 744 | @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}. Writing raw body.") 745 | file.write(body) 746 | end 747 | else 748 | file.write(body) if body 749 | end 750 | end 751 | end 752 | 753 | if @all 754 | case response 755 | when Net::HTTPSuccess, Net::HTTPRedirection, Net::HTTPClientError, Net::HTTPServerError 756 | save_response_body.call 757 | if response.is_a?(Net::HTTPRedirection) 758 | @logger.info("Saved redirect page for #{file_url} (status #{response.code}).") 759 | elsif response.is_a?(Net::HTTPClientError) || response.is_a?(Net::HTTPServerError) 760 | @logger.info("Saved error page for #{file_url} (status #{response.code}).") 761 | end 762 | return :saved 763 | else 764 | # for any other response type when --all is true, treat as an error to be retried or failed 765 | raise "Unhandled HTTP response: #{response.code} #{response.message}" 766 | end 767 | else # not @all (our default behavior) 768 | case response 769 | when Net::HTTPSuccess 770 | save_response_body.call 771 | return :saved 772 | when Net::HTTPRedirection 773 | raise "Too many redirects for #{file_url}" if redirect_count >= 2 774 | location = response['location'] 775 | @logger.warn("Redirect found for #{file_url} -> #{location}") 776 | return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1) 777 | when Net::HTTPTooManyRequests 778 | sleep(RATE_LIMIT * 2) 779 | raise "Rate limited, retrying..." 780 | when Net::HTTPNotFound 781 | @logger.warn("File not found, skipping: #{file_url}") 782 | return :skipped_not_found 783 | else 784 | raise "HTTP Error: #{response.code} #{response.message}" 785 | end 786 | end 787 | 788 | rescue StandardError => e 789 | if retries < MAX_RETRIES 790 | retries += 1 791 | @logger.warn("Retry #{retries}/#{MAX_RETRIES} for #{file_url}: #{e.message}") 792 | sleep(RETRY_DELAY * retries) 793 | retry 794 | else 795 | @failed_downloads << {url: file_url, error: e.message} 796 | raise e 797 | end 798 | end 799 | end 800 | 801 | def cleanup 802 | @connection_pool.shutdown 803 | 804 | if @failed_downloads.any? 
805 | @logger.error("Download completed with errors.") 806 | @logger.error("Failed downloads summary:") 807 | @failed_downloads.each do |failure| 808 | @logger.error(" #{failure[:url]} - #{failure[:error]}") 809 | end 810 | unless @reset 811 | puts "State files kept due to download errors: #{cdx_path}, #{db_path}" 812 | return 813 | end 814 | end 815 | 816 | if !@keep || @reset 817 | puts "Cleaning up state files..." unless @keep && !@reset 818 | FileUtils.rm_f(cdx_path) 819 | FileUtils.rm_f(db_path) 820 | elsif @keep 821 | puts "Keeping state files as requested: #{cdx_path}, #{db_path}" 822 | end 823 | end 824 | end 825 | -------------------------------------------------------------------------------- /lib/wayback_machine_downloader/archive_api.rb: -------------------------------------------------------------------------------- 1 | require 'json' 2 | require 'uri' 3 | 4 | module ArchiveAPI 5 | 6 | def get_raw_list_from_api(url, page_index, http) 7 | # Automatically append /* if the URL doesn't contain a path after the domain 8 | # This is a workaround for an issue with the API and *some* domains. 9 | # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6 10 | if url && !url.match(/^https?:\/\/.*\//i) 11 | url = "#{url}/*" 12 | end 13 | 14 | request_url = URI("https://web.archive.org/cdx/search/cdx") 15 | params = [["output", "json"], ["url", url]] + parameters_for_api(page_index) 16 | request_url.query = URI.encode_www_form(params) 17 | 18 | begin 19 | response = http.get(request_url) 20 | body = response.body.to_s.strip 21 | return [] if body.empty? 22 | json = JSON.parse(body) 23 | 24 | # Check if the response contains the header ["timestamp", "original"] 25 | json.shift if json.first == ["timestamp", "original"] 26 | json 27 | rescue JSON::ParserError, StandardError => e 28 | warn "Failed to fetch data from API: #{e.message}" 29 | [] 30 | end 31 | end 32 | 33 | def parameters_for_api(page_index) 34 | parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]] 35 | parameters.push(["filter", "statuscode:200"]) unless @all 36 | parameters.push(["from", @from_timestamp.to_s]) if @from_timestamp && @from_timestamp != 0 37 | parameters.push(["to", @to_timestamp.to_s]) if @to_timestamp && @to_timestamp != 0 38 | parameters.push(["page", page_index]) if page_index 39 | parameters 40 | end 41 | 42 | end 43 | -------------------------------------------------------------------------------- /lib/wayback_machine_downloader/tidy_bytes.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module TidyBytes 4 | # precomputing CP1252 to UTF-8 mappings for bytes 128-159 5 | CP1252_MAP = (128..159).map do |byte| 6 | case byte 7 | when 128 then [226, 130, 172] # EURO SIGN 8 | when 130 then [226, 128, 154] # SINGLE LOW-9 QUOTATION MARK 9 | when 131 then [198, 146] # LATIN SMALL LETTER F WITH HOOK 10 | when 132 then [226, 128, 158] # DOUBLE LOW-9 QUOTATION MARK 11 | when 133 then [226, 128, 166] # HORIZONTAL ELLIPSIS 12 | when 134 then [226, 128, 160] # DAGGER 13 | when 135 then [226, 128, 161] # DOUBLE DAGGER 14 | when 136 then [203, 134] # MODIFIER LETTER CIRCUMFLEX ACCENT 15 | when 137 then [226, 128, 176] # PER MILLE SIGN 16 | when 138 then [197, 160] # LATIN CAPITAL LETTER S WITH CARON 17 | when 139 then [226, 128, 185] # SINGLE LEFT-POINTING ANGLE QUOTATION MARK 18 | when 140 then [197, 146] # LATIN CAPITAL LIGATURE OE 19 | when 142 then [197, 189] # LATIN CAPITAL LETTER Z WITH 
CARON 20 | when 145 then [226, 128, 152] # LEFT SINGLE QUOTATION MARK 21 | when 146 then [226, 128, 153] # RIGHT SINGLE QUOTATION MARK 22 | when 147 then [226, 128, 156] # LEFT DOUBLE QUOTATION MARK 23 | when 148 then [226, 128, 157] # RIGHT DOUBLE QUOTATION MARK 24 | when 149 then [226, 128, 162] # BULLET 25 | when 150 then [226, 128, 147] # EN DASH 26 | when 151 then [226, 128, 148] # EM DASH 27 | when 152 then [203, 156] # SMALL TILDE 28 | when 153 then [226, 132, 162] # TRADE MARK SIGN 29 | when 154 then [197, 161] # LATIN SMALL LETTER S WITH CARON 30 | when 155 then [226, 128, 186] # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 31 | when 156 then [197, 147] # LATIN SMALL LIGATURE OE 32 | when 158 then [197, 190] # LATIN SMALL LETTER Z WITH CARON 33 | when 159 then [197, 184] # LATIN SMALL LETTER Y WITH DIAERESIS 34 | end 35 | end.freeze 36 | 37 | # precomputing all possible byte conversions 38 | CP1252_TO_UTF8 = Array.new(256) do |b| 39 | if (128..159).cover?(b) 40 | CP1252_MAP[b - 128]&.pack('C*') 41 | elsif b < 128 42 | b.chr 43 | else 44 | b < 192 ? [194, b].pack('C*') : [195, b - 64].pack('C*') 45 | end 46 | end.freeze 47 | 48 | def self.included(base) 49 | base.class_eval do 50 | def tidy_bytes(force = false) 51 | return nil if empty? 52 | 53 | if force 54 | buffer = String.new(capacity: bytesize) 55 | each_byte { |b| buffer << CP1252_TO_UTF8[b] } 56 | return buffer.force_encoding(Encoding::UTF_8) 57 | end 58 | 59 | begin 60 | encode('UTF-8') 61 | rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError 62 | buffer = String.new(capacity: bytesize) 63 | scrub { |b| CP1252_TO_UTF8[b.ord] } 64 | end 65 | end 66 | 67 | def tidy_bytes!(force = false) 68 | result = tidy_bytes(force) 69 | result ? replace(result) : self 70 | end 71 | end 72 | end 73 | end 74 | 75 | class String 76 | include TidyBytes 77 | end -------------------------------------------------------------------------------- /lib/wayback_machine_downloader/to_regex.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ToRegex 4 | module StringMixin 5 | INLINE_OPTIONS = /[imxnesu]*/i.freeze 6 | REGEXP_DELIMITERS = { 7 | '%r{' => '}'.freeze, 8 | '/' => '/'.freeze 9 | }.freeze 10 | 11 | REGEX_FLAGS = { 12 | ignore_case: Regexp::IGNORECASE, 13 | multiline: Regexp::MULTILINE, 14 | extended: Regexp::EXTENDED 15 | }.freeze 16 | 17 | class << self 18 | def literal?(str) 19 | REGEXP_DELIMITERS.none? { |start, ending| str.start_with?(start) && str.match?(/#{ending}#{INLINE_OPTIONS}\z/) } 20 | end 21 | end 22 | 23 | # Get a regex back 24 | # 25 | # Without :literal or :detect, `"foo".to_regex` will return nil. 26 | # 27 | # @param [optional, Hash] options 28 | # @option options [true,false] :literal Treat meta characters and other regexp codes as just text; always return a regexp 29 | # @option options [true,false] :detect If string starts and ends with valid regexp delimiters, treat it as a regexp; otherwise, interpret it literally 30 | # @option options [true,false] :ignore_case /foo/i 31 | # @option options [true,false] :multiline /foo/m 32 | # @option options [true,false] :extended /foo/x 33 | # @option options [true,false] :lang /foo/[nesu] 34 | def to_regex(options = {}) 35 | args = as_regexp(options) 36 | args ? 
/lib/wayback_machine_downloader/to_regex.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 | 
3 | module ToRegex
4 |   module StringMixin
5 |     INLINE_OPTIONS = /[imxnesu]*/i.freeze
6 |     REGEXP_DELIMITERS = {
7 |       '%r{' => '}',
8 |       '/' => '/'
9 |     }.freeze
10 | 
11 |     REGEX_FLAGS = {
12 |       ignore_case: Regexp::IGNORECASE,
13 |       multiline: Regexp::MULTILINE,
14 |       extended: Regexp::EXTENDED
15 |     }.freeze
16 | 
17 |     class << self
18 |       def literal?(str)
19 |         REGEXP_DELIMITERS.none? { |start, ending| str.start_with?(start) && str.match?(/#{ending}#{INLINE_OPTIONS}\z/) }
20 |       end
21 |     end
22 | 
23 |     # Get a regex back.
24 |     #
25 |     # Without :literal or :detect, `"foo".to_regex` will return nil.
26 |     #
27 |     # @param [optional, Hash] options
28 |     # @option options [true,false] :literal Treat meta characters and other regexp codes as plain text; always return a regexp
29 |     # @option options [true,false] :detect If the string starts and ends with valid regexp delimiters, treat it as a regexp; otherwise, interpret it literally
30 |     # @option options [true,false] :ignore_case /foo/i
31 |     # @option options [true,false] :multiline /foo/m
32 |     # @option options [true,false] :extended /foo/x
33 |     # @option options [true,false] :lang /foo/[nesu]
34 |     def to_regex(options = {})
35 |       args = as_regexp(options)
36 |       args ? Regexp.new(*args) : nil
37 |     end
38 | 
39 |     # Return arguments that can be passed to `Regexp.new`
40 |     # @see to_regex
41 |     def as_regexp(options = {})
42 |       raise ArgumentError, '[to_regex] Options must be a Hash' unless options.is_a?(Hash)
43 | 
44 |       str = self
45 |       return if options[:detect] && str.empty?
46 | 
47 |       if should_treat_as_literal?(str, options)
48 |         content = Regexp.escape(str)
49 |       elsif (delim_set = extract_delimiters(str))
50 |         content, options = parse_regexp_string(str, delim_set, options)
51 |         return unless content
52 |       else
53 |         return
54 |       end
55 | 
56 |       build_regexp_args(content, options)
57 |     end
58 | 
59 |     private
60 | 
61 |     def should_treat_as_literal?(str, options)
62 |       options[:literal] || (options[:detect] && ToRegex::StringMixin.literal?(str))
63 |     end
64 | 
65 |     def extract_delimiters(str)
66 |       REGEXP_DELIMITERS.find { |start, _| str.start_with?(start) }
67 |     end
68 | 
69 |     def parse_regexp_string(str, delim_set, options)
70 |       start_delim, end_delim = delim_set
71 |       match = /\A#{start_delim}(.*)#{end_delim}(#{INLINE_OPTIONS})\z/u.match(str)
72 |       return unless match
73 | 
74 |       content = match[1].gsub('\\/', '/')
75 |       parse_inline_options(match[2], options)
76 |       [content, options]
77 |     end
78 | 
79 |     def parse_inline_options(inline_options, options)
80 |       return unless inline_options
81 |       options[:ignore_case] = true if inline_options.include?('i')
82 |       options[:multiline] = true if inline_options.include?('m')
83 |       options[:extended] = true if inline_options.include?('x')
84 |       # 'n', 'N' = none, 'e', 'E' = EUC, 's', 'S' = SJIS, 'u', 'U' = UTF-8
85 |       options[:lang] = inline_options.scan(/[nesu]/i).join.downcase
86 |     end
87 | 
88 |     def build_regexp_args(content, options)
89 |       flags = calculate_flags(options)
90 |       lang = normalize_lang_option(options[:lang])
91 | 
92 |       lang.empty? ? [content, flags] : [content, flags, lang]
93 |     end
94 | 
95 |     def calculate_flags(options)
96 |       REGEX_FLAGS.sum { |key, value| options[key] ? value : 0 }
97 |     end
98 | 
99 |     def normalize_lang_option(lang)
100 |       return '' unless lang
101 |       # the 'u' (UTF-8) encoding flag has been the default since Ruby 1.9, so drop it
102 |       lang.delete('u')
103 |     end
104 |   end
105 | end
106 | 
107 | class String
108 |   include ToRegex::StringMixin
109 | end
--------------------------------------------------------------------------------
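A quick sketch of the String#to_regex contract defined above (return values shown as comments; all inputs illustrative):

    "foo".to_regex                     # => nil (neither :literal nor :detect given)
    "foo".to_regex(literal: true)      # => /foo/
    "foo".to_regex(detect: true)       # => /foo/ (no delimiters, so escaped literally)
    "/foo/i".to_regex(detect: true)    # => /foo/i
    "%r{a/b}m".to_regex(detect: true)  # => /a\/b/m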
/test/test_wayback_machine_downloader.rb:
--------------------------------------------------------------------------------
1 | require 'minitest/autorun'
2 | require 'fileutils'
3 | require 'stringio'
4 | require 'wayback_machine_downloader'
5 | 
6 | class WaybackMachineDownloaderTest < Minitest::Test
7 | 
8 |   def setup
9 |     @wayback_machine_downloader = WaybackMachineDownloader.new(
10 |       base_url: 'https://www.example.com'
11 |     )
12 |     $stdout = StringIO.new
13 |   end
14 | 
15 |   def teardown
16 |     # restore stdout so output from later tests is not swallowed
17 |     $stdout = STDOUT
18 |     FileUtils.rm_rf(@wayback_machine_downloader.backup_path)
19 |   end
20 | 
21 |   def test_base_url_being_set
22 |     assert_equal 'https://www.example.com', @wayback_machine_downloader.base_url
23 |   end
24 | 
25 |   def test_backup_name_being_set
26 |     assert_equal 'www.example.com', @wayback_machine_downloader.backup_name
27 |   end
28 | 
29 |   def test_backup_name_being_set_when_base_url_is_domain
30 |     @wayback_machine_downloader.base_url = 'www.example.com'
31 |     assert_equal 'www.example.com', @wayback_machine_downloader.backup_name
32 |   end
33 | 
34 |   def test_file_list_curated
35 |     assert_equal 20060711191226, @wayback_machine_downloader.get_file_list_curated["linux.htm"][:timestamp]
36 |   end
37 | 
38 |   def test_file_list_by_timestamp
39 |     file_expected = {
40 |       file_url: "http://www.onlyfreegames.net:80/strat.html",
41 |       timestamp: 20060111084756,
42 |       file_id: "strat.html"
43 |     }
44 |     assert_equal file_expected, @wayback_machine_downloader.get_file_list_by_timestamp[-2]
45 |   end
46 | 
47 |   def test_without_exact_url
48 |     @wayback_machine_downloader.exact_url = false
49 |     assert @wayback_machine_downloader.get_file_list_curated.size > 1
50 |   end
51 | 
52 |   def test_exact_url
53 |     @wayback_machine_downloader.exact_url = true
54 |     assert_equal 1, @wayback_machine_downloader.get_file_list_curated.size
55 |   end
56 | 
57 |   def test_file_list_only_filter_without_matches
58 |     @wayback_machine_downloader.only_filter = 'abc123'
59 |     assert_equal 0, @wayback_machine_downloader.get_file_list_curated.size
60 |   end
61 | 
62 |   def test_file_list_only_filter_with_1_match
63 |     @wayback_machine_downloader.only_filter = 'menu.html'
64 |     assert_equal 1, @wayback_machine_downloader.get_file_list_curated.size
65 |   end
66 | 
67 |   def test_file_list_only_filter_with_a_regex
68 |     @wayback_machine_downloader.only_filter = '/\.(gif|je?pg|bmp)$/i'
69 |     assert_equal 37, @wayback_machine_downloader.get_file_list_curated.size
70 |   end
71 | 
72 |   def test_file_list_exclude_filter_without_matches
73 |     @wayback_machine_downloader.exclude_filter = 'abc123'
74 |     assert_equal 68, @wayback_machine_downloader.get_file_list_curated.size
75 |   end
76 | 
77 |   def test_file_list_exclude_filter_with_1_match
78 |     @wayback_machine_downloader.exclude_filter = 'menu.html'
79 |     assert_equal 67, @wayback_machine_downloader.get_file_list_curated.size
80 |   end
81 | 
82 |   def test_file_list_exclude_filter_with_a_regex
83 |     @wayback_machine_downloader.exclude_filter = '/\.(gif|je?pg|bmp)$/i'
84 |     assert_equal 31, @wayback_machine_downloader.get_file_list_curated.size
85 |   end
86 | 
87 |   def test_file_download
88 |     @wayback_machine_downloader.download_files
89 |     linux_page = File.read 'websites/www.onlyfreegames.net/linux.htm'
90 |     assert_includes linux_page, "Linux Games"
91 |   end
92 | 
93 |   def test_all_timestamps_being_respected
94 |     @wayback_machine_downloader.all_timestamps = true
95 |     assert_equal 68, @wayback_machine_downloader.get_file_list_curated.size
96 |   end
97 | 
98 |   def test_from_timestamp_being_respected
99 |     @wayback_machine_downloader.from_timestamp = 20050716231334
100 |     file_url = @wayback_machine_downloader.get_file_list_curated["linux.htm"][:file_url]
101 |     assert_equal "http://www.onlyfreegames.net:80/linux.htm", file_url
102 |   end
103 | 
104 |   def test_to_timestamp_being_respected
105 |     @wayback_machine_downloader.to_timestamp = 20050716231334
106 |     assert_nil @wayback_machine_downloader.get_file_list_curated["linux.htm"]
107 |   end
108 | 
109 |   def test_all_get_file_list_curated_size
110 |     @wayback_machine_downloader.all = true
111 |     assert_equal 69, @wayback_machine_downloader.get_file_list_curated.size
112 |   end
113 | 
114 |   # Testing encoding conflicts needs a different base_url
115 |   def test_nonascii_suburls_download
116 |     @wayback_machine_downloader = WaybackMachineDownloader.new(
117 |       base_url: 'https://en.wikipedia.org/wiki/%C3%84')
118 |     # Once just for the downloading...
119 |     @wayback_machine_downloader.download_files
120 |   end
121 | 
122 |   def test_nonascii_suburls_already_present
123 |     @wayback_machine_downloader = WaybackMachineDownloader.new(
124 |       base_url: 'https://en.wikipedia.org/wiki/%C3%84')
125 |     # ... twice to test the "is already present" case
126 |     @wayback_machine_downloader.download_files
127 |     @wayback_machine_downloader.download_files
128 |   end
129 | 
130 | end
131 | 
--------------------------------------------------------------------------------
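As the filter tests above exercise, only_filter and exclude_filter accept either a plain string or a slash-delimited regex literal; the regex form presumably goes through the String#to_regex mixin defined earlier. A short sketch mirroring the test setup (the URL is illustrative):

    downloader = WaybackMachineDownloader.new(base_url: 'https://www.example.com')
    downloader.only_filter = 'menu.html'              # plain string filter
    downloader.only_filter = '/\.(gif|je?pg|bmp)$/i'  # regex filter, case-insensitive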
/wayback_machine_downloader.gemspec:
--------------------------------------------------------------------------------
1 | Gem::Specification.new do |s|
2 |   s.name = "wayback_machine_downloader_straw"
3 |   s.version = "2.3.8"
4 |   s.executables << "wayback_machine_downloader"
5 |   s.summary = "Download an entire website from the Wayback Machine."
6 |   s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks built-in export functionality; this gem provides exactly that, letting you download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
7 |   s.authors = ["strawberrymaster"]
8 |   s.email = "strawberrymaster@vivaldi.net"
9 |   s.files = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb"]
10 |   s.homepage = "https://github.com/StrawberryMaster/wayback-machine-downloader"
11 |   s.license = "MIT"
12 |   s.required_ruby_version = ">= 3.4.3"
13 |   s.add_runtime_dependency "concurrent-ruby", "~> 1.3", ">= 1.3.4"
14 |   s.add_development_dependency "rake", "~> 12.2"
15 |   s.add_development_dependency "minitest", "~> 5.2"
16 | end
17 | 
--------------------------------------------------------------------------------
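Finally, a minimal end-to-end sketch of the gem in use, matching the API exercised by the test suite (the target site is illustrative):

    # gem install wayback_machine_downloader_straw
    require 'wayback_machine_downloader'

    downloader = WaybackMachineDownloader.new(base_url: 'https://www.example.com')
    downloader.download_files
    # files are written beneath ./websites/www.example.com (the downloader's backup_path)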