├── .dockerignore ├── .env.example ├── .github └── workflows │ ├── docker.yml │ └── ruby.yml ├── .gitignore ├── .travis.yml ├── Dockerfile ├── Gemfile ├── LICENSE ├── README.md ├── Rakefile ├── bin └── wayback_machine_downloader ├── docker-compose.yml ├── entrypoint.sh ├── lib ├── wayback_machine_downloader.rb └── wayback_machine_downloader │ ├── archive_api.rb │ ├── tidy_bytes.rb │ └── to_regex.rb ├── test └── test_wayback_machine_downloader.rb └── wayback_machine_downloader.gemspec /.dockerignore: -------------------------------------------------------------------------------- 1 | *.md 2 | *.yml 3 | 4 | .github 5 | websites -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | DB_HOST="db" 2 | DB_USER="root" 3 | DB_PASSWORD="example1234" 4 | DB_NAME="wayback" -------------------------------------------------------------------------------- /.github/workflows/docker.yml: -------------------------------------------------------------------------------- 1 | name: Create and publish a Docker image 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | tags: 8 | - '*' 9 | 10 | env: 11 | REGISTRY: ghcr.io 12 | IMAGE_NAME: ${{ github.repository }} 13 | 14 | jobs: 15 | build-and-push-image: 16 | runs-on: ubuntu-latest 17 | permissions: 18 | contents: read 19 | packages: write 20 | steps: 21 | - name: Checkout repository 22 | uses: actions/checkout@main 23 | - name: Set up QEMU 24 | uses: docker/setup-qemu-action@master 25 | - name: Set up Docker Buildx 26 | uses: docker/setup-buildx-action@master 27 | - name: Log in to the Container registry 28 | uses: docker/login-action@master 29 | with: 30 | registry: ${{ env.REGISTRY }} 31 | username: ${{ github.actor }} 32 | password: ${{ secrets.GITHUB_TOKEN }} 33 | - name: Extract metadata (tags, labels) for Docker 34 | id: meta 35 | uses: docker/metadata-action@master 36 | with: 37 | images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} 38 | - name: Build and push Docker image 39 | uses: docker/build-push-action@master 40 | with: 41 | context: . 42 | file: ./Dockerfile 43 | platforms: linux/amd64,linux/arm64/v8 44 | push: true 45 | tags: ${{ steps.meta.outputs.tags }} 46 | labels: ${{ steps.meta.outputs.labels }} -------------------------------------------------------------------------------- /.github/workflows/ruby.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. 2 | # They are provided by a third-party and are governed by 3 | # separate terms of service, privacy policy, and support 4 | # documentation. 
5 | # This workflow will download a prebuilt Ruby version, install dependencies and run tests with Rake 6 | # For more information see: https://github.com/marketplace/actions/setup-ruby-jruby-and-truffleruby 7 | 8 | name: Ruby 9 | 10 | on: 11 | push: 12 | branches: [ "master" ] 13 | pull_request: 14 | branches: [ "master" ] 15 | 16 | permissions: 17 | contents: read 18 | 19 | jobs: 20 | test: 21 | runs-on: ubuntu-24.04 22 | strategy: 23 | matrix: 24 | ruby-version: ['2.7', '3.0', '3.3'] 25 | 26 | steps: 27 | - uses: actions/checkout@v4.2.2 28 | - name: Setup Ruby, JRuby and TruffleRuby 29 | uses: ruby/setup-ruby@v1.202.0 30 | with: 31 | ruby-version: ${{ matrix.ruby-version }} 32 | bundler-cache: true # runs 'bundle install' and caches installed gems automatically 33 | - name: Run tests 34 | run: bundle exec rake -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## PROJECT::GENERAL 2 | .yardoc 3 | coverage 4 | doc 5 | rdoc 6 | log 7 | websites 8 | .DS_Store 9 | .rake_tasks~ 10 | 11 | ## BUNDLER 12 | *.gem 13 | .bundle 14 | pkg 15 | Gemfile.lock 16 | 17 | ## RBENV 18 | .ruby-version 19 | .rbenv* 20 | 21 | ## ENV 22 | *.env* 23 | !.env*.example 24 | 25 | 26 | ## RCOV 27 | coverage.data 28 | 29 | tmp 30 | 31 | ## RUBINIUS 32 | *.rbc 33 | 34 | test.rb 35 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: ruby 3 | rvm: 4 | - 1.9.2 5 | - 1.9.3 6 | - 2.0.0 7 | - 2.1 8 | - 2.2 9 | - 2.3.1 10 | - jruby 11 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ruby:3.4.4-alpine 2 | USER root 3 | WORKDIR /build 4 | 5 | COPY Gemfile /build/ 6 | COPY *.gemspec /build/ 7 | 8 | RUN bundle config set jobs "$(nproc)" \ 9 | && bundle config set without 'development test' \ 10 | && bundle install 11 | 12 | COPY . /build 13 | 14 | WORKDIR / 15 | ENTRYPOINT [ "/build/bin/wayback_machine_downloader" ] 16 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | gem 'concurrent-ruby', '~> 1.3', '>= 1.3.4' 4 | gemspec -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015-2016 Julian Khaleghy and contributors 4 | See the full list at https://github.com/hartator/wayback-machine-downloader/graphs/contributors 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Wayback Machine Downloader
2 | [![version](https://badge.fury.io/rb/wayback_machine_downloader_straw.svg)](https://rubygems.org/gems/wayback_machine_downloader_straw)
3 | 
4 | This is a fork of the [Wayback Machine Downloader](https://github.com/hartator/wayback-machine-downloader). With this, you can download a website from the Internet Archive Wayback Machine.
5 | 
6 | Included here is partial content from other forks, namely those by [ShiftaDeband](https://github.com/ShiftaDeband/wayback-machine-downloader) and [matthid](https://github.com/matthid/wayback-machine-downloader) (attributions are in the code and go to the original authors), as well as a few additional (future) features.
7 | 
8 | ## ▶️ Quick start
9 | 
10 | Download a website's latest snapshot:
11 | ```bash
12 | ruby wayback_machine_downloader https://example.com
13 | ```
14 | Your files will save to `./websites/example.com/` with their original structure preserved.
15 | 
16 | ## 📥 Installation
17 | ### Requirements
18 | - Ruby 2.3+ ([download Ruby here](https://www.ruby-lang.org/en/downloads/))
19 | - Bundler gem (`gem install bundler`)
20 | 
21 | ### Quick install
22 | It took a while, but we have a gem for this! Install it with:
23 | ```bash
24 | gem install wayback_machine_downloader_straw
25 | ```
26 | To run most commands, just like in the original WMD, you can use:
27 | ```bash
28 | wayback_machine_downloader https://example.com
29 | ```
30 | 
31 | ### Step-by-step setup
32 | 1. **Install Ruby**:
33 | ```bash
34 | ruby -v
35 | ```
36 | This will verify your installation. If not installed, [download Ruby](https://www.ruby-lang.org/en/downloads/) for your OS.
37 | 
38 | 2. **Install dependencies**:
39 | ```bash
40 | bundle install
41 | ```
42 | 
43 | If you encounter an error like `cannot load such file -- concurrent-ruby`, manually install the missing gem:
44 | ```bash
45 | gem install concurrent-ruby
46 | ```
47 | 
48 | 3. **Run it**:
49 | ```bash
50 | cd path/to/wayback-machine-downloader/bin
51 | ruby wayback_machine_downloader https://example.com
52 | ```
53 | For example, if you extracted the contents to a folder named "wayback-machine-downloader" in your Downloads directory, you'd need to type `cd Downloads\wayback-machine-downloader\bin`.
54 | 
55 | *Windows tip*: In File Explorer, Shift + Right Click your `bin` folder → "Open Terminal here".
56 | 
57 | ## 🐳 Docker users
58 | We have a Docker image! See [#Packages](https://github.com/StrawberryMaster/wayback-machine-downloader/pkgs/container/wayback-machine-downloader) for the latest version. You can also build it yourself. Here's how:
59 | 
60 | ```bash
61 | docker build -t wayback_machine_downloader .
62 | docker run -it --rm wayback_machine_downloader [options] URL
63 | ```
64 | 
65 | Or, to run it without cloning the repo (this example fetches smallrockets.com up to the start of 2013):
66 | 
67 | ```bash
68 | docker run -v .:/websites ghcr.io/strawberrymaster/wayback-machine-downloader:master wayback_machine_downloader --to 20130101 smallrockets.com
69 | ```
70 | 
71 | ### 🐳 Using Docker Compose
72 | 
73 | We can also use it with Docker Compose, which makes it easier to extend functionality later (such as storing previous downloads in a database):
74 | ```yaml
75 | # docker-compose.yml
76 | services:
77 |   wayback_machine_downloader:
78 |     build:
79 |       context: .
80 |     tty: true
81 |     image: wayback_machine_downloader:latest
82 |     container_name: wayback_machine_downloader
83 |     environment:
84 |       - ENVIRONMENT=${ENVIRONMENT:-development}
85 |       - OPTIONS=${OPTIONS:-""}
86 |       - TARGET_URL=${TARGET_URL}
87 |     volumes:
88 |       - .:/build:rw
89 |       - ./websites:/build/websites:rw
90 |     command: --directory /build/websites ${OPTIONS} ${TARGET_URL}
91 | ```
92 | #### Usage:
93 | Now you can build a Docker image named "wayback_machine_downloader" with the following command:
94 | ```bash
95 | docker compose up -d --build
96 | ```
97 | 
98 | After that, set the TARGET_URL environment variable:
99 | ```bash
100 | export TARGET_URL="https://example.com/"
101 | ```
102 | 
103 | The **OPTIONS** environment variable is optional; it may include any of the additional settings found in the "**Advanced usage**" section below.
104 | 
105 | Example:
106 | ```bash
107 | export OPTIONS="--list -f 20060121"
108 | ```
109 | 
110 | You can then run the existing container with the following command:
111 | ```bash
112 | docker compose run --rm wayback_machine_downloader https://example.com
113 | ```
114 | 
115 | ## ⚙️ Configuration
116 | There are a few constants that can be edited in the `wayback_machine_downloader.rb` file for your convenience. The default values may be conservative, so you can adjust them to your needs. They are:
117 | ```ruby
118 | DEFAULT_TIMEOUT = 30 # HTTP timeout (in seconds)
119 | MAX_RETRIES = 3 # Number of times to retry failed requests
120 | RETRY_DELAY = 2 # Wait time between retries (seconds)
121 | RATE_LIMIT = 0.25 # Throttle between requests (seconds)
122 | CONNECTION_POOL_SIZE = 10 # Maximum simultaneous connections
123 | MEMORY_BUFFER_SIZE = 16384 # Download buffer size (bytes)
124 | STATE_CDX_FILENAME = '.cdx.json' # Stores snapshot listing
125 | STATE_DB_FILENAME = '.downloaded.txt' # Tracks completed downloads
126 | ```
127 | 
128 | ## 🛠️ Advanced usage
129 | 
130 | ### Basic options
131 | | Option | Description |
132 | |--------|-------------|
133 | | `-d DIR`, `--directory DIR` | Custom output directory |
134 | | `-s`, `--all-timestamps` | Download all historical versions |
135 | | `-f TS`, `--from TS` | Start from timestamp (e.g., 20060121) |
136 | | `-t TS`, `--to TS` | Stop at timestamp |
137 | | `-e`, `--exact-url` | Download exact URL only |
138 | | `-r`, `--rewritten` | Download rewritten Wayback Archive files only |
139 | 
140 | **Example** - Download files to `downloaded-backup` folder
141 | ```bash
142 | ruby wayback_machine_downloader https://example.com --directory downloaded-backup/
143 | ```
144 | By default, Wayback Machine Downloader will download files to ./websites/ followed by the domain name of the website. You may want to save files in a specific directory using this option.
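The CLI is a thin wrapper around the `WaybackMachineDownloader` class, so the same options can also be passed as a hash when using the gem from your own Ruby code. A minimal sketch (the option keys mirror the flags parsed in `bin/wayback_machine_downloader`; the directory value here is only an example):
```ruby
require 'wayback_machine_downloader'

# option keys mirror the CLI flags (see bin/wayback_machine_downloader)
downloader = WaybackMachineDownloader.new(
  base_url: 'https://example.com',  # required, same as the CLI argument
  directory: 'downloaded-backup/',  # -d / --directory
  threads_count: 4                  # -c / --concurrency
)
downloader.download_files
```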
145 | 
146 | **Example 2** - Download historical timestamps:
147 | ```bash
148 | ruby wayback_machine_downloader https://example.com --all-timestamps
149 | ```
150 | This option will download all timestamps/snapshots for a given website. It uses the timestamp of each snapshot as the directory name. In this case, it will download, for example:
151 | ```bash
152 | websites/example.com/20060715085250/index.html
153 | websites/example.com/20051120005053/index.html
154 | websites/example.com/20060111095815/img/logo.png
155 | ...
156 | ```
157 | 
158 | **Example 3** - Download content on or after July 16, 2006:
159 | ```bash
160 | ruby wayback_machine_downloader https://example.com --from 20060716231334
161 | ```
162 | You may want to supply a from timestamp to lock your backup to a specific version of the website. Timestamps can be found inside the urls of the regular Wayback Machine website (e.g., https://web.archive.org/web/20060716231334/http://example.com). You can also use years (2006), years + month (200607), etc. It can be used in combination with a to timestamp.
163 | Wayback Machine Downloader will then fetch only file versions on or after the timestamp specified.
164 | 
165 | **Example 4** - Download content on or before September 16, 2010:
166 | ```bash
167 | ruby wayback_machine_downloader https://example.com --to 20100916231334
168 | ```
169 | You may want to supply a to timestamp to lock your backup to a specific version of the website. Timestamps can be found inside the urls of the regular Wayback Machine website (e.g., https://web.archive.org/web/20100916231334/http://example.com). You can also use years (2010), years + month (201009), etc. It can be used in combination with a from timestamp.
170 | Wayback Machine Downloader will then fetch only file versions on or before the timestamp specified.
171 | 
172 | **Example 5** - Download only the homepage of https://example.com
173 | ```bash
174 | ruby wayback_machine_downloader https://example.com --exact-url
175 | ```
176 | If you want to retrieve only the file matching exactly the url provided, you can use this flag. It will avoid downloading anything else.
177 | 
178 | **Example 6** - Download a rewritten file
179 | ```bash
180 | ruby wayback_machine_downloader https://example.com --rewritten
181 | ```
182 | Useful if you want to download the rewritten files from the Wayback Machine instead of the original ones.
183 | 
184 | ### Filtering Content
185 | | Option | Description |
186 | |--------|-------------|
187 | | `-o FILTER`, `--only FILTER` | Only download matching URLs (supports regex) |
188 | | `-x FILTER`, `--exclude FILTER` | Exclude matching URLs |
189 | 
190 | **Example** - Include only images:
191 | ```bash
192 | ruby wayback_machine_downloader https://example.com -o "/\.(jpg|png)/i"
193 | ```
194 | You may want to retrieve files which are of a certain type (e.g., .pdf, .jpg, .wrd...) or are in a specific directory. To do so, you can supply the --only flag with a string or a regex (using the '/regex/' notation) to limit which files Wayback Machine Downloader will download.
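Under the hood, filter strings are parsed by the bundled `to_regex` helper: a value wrapped in '/.../' (optionally with trailing flags such as `i`) is compiled to a Regexp, while any other string falls back to a case-insensitive substring match. A quick sketch with illustrative values:
```ruby
require 'wayback_machine_downloader/to_regex'

'/\.(jpg|png)/i'.to_regex  #=> /\.(jpg|png)/i (the '/.../' notation compiles to a Regexp)
'my_directory'.to_regex    #=> nil (the downloader then does a substring match instead)
```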
For example, if you only want to download files inside a specific my_directory:
196 | ```bash
197 | ruby wayback_machine_downloader https://example.com --only my_directory
198 | ```
199 | Or if you want to download every image and nothing else:
200 | ```bash
201 | ruby wayback_machine_downloader https://example.com --only "/\.(gif|jpg|jpeg)$/i"
202 | ```
203 | 
204 | **Example 2** - Exclude images:
205 | ```bash
206 | ruby wayback_machine_downloader https://example.com -x "/\.(jpg|png)/i"
207 | ```
208 | You may want to retrieve files which aren't of a certain type (e.g., .pdf, .jpg, .wrd...) or aren't in a specific directory. To do so, you can supply the --exclude flag with a string or a regex (using the '/regex/' notation) to limit which files Wayback Machine Downloader will download.
209 | For example, if you want to avoid downloading files inside my_directory:
210 | ```bash
211 | ruby wayback_machine_downloader https://example.com --exclude my_directory
212 | ```
213 | Or if you want to download everything except images:
214 | ```bash
215 | ruby wayback_machine_downloader https://example.com --exclude "/\.(gif|jpg|jpeg)$/i"
216 | ```
217 | 
218 | ### Performance
219 | | Option | Description |
220 | |--------|-------------|
221 | | `-c NUM`, `--concurrency NUM` | Concurrent downloads (default: 1) |
222 | | `-p NUM`, `--maximum-snapshot NUM` | Max snapshot pages (150k snapshots/page) |
223 | 
224 | **Example** - 20 parallel downloads:
225 | ```bash
226 | ruby wayback_machine_downloader https://example.com --concurrency 20
227 | ```
228 | Specifies the number of files to download at the same time, which can speed up the download of a website significantly. The default is to download one file at a time.
229 | 
230 | **Example 2** - 300 snapshot pages:
231 | ```bash
232 | ruby wayback_machine_downloader https://example.com --maximum-snapshot 300
233 | ```
234 | Specifies the maximum number of snapshot pages to consider, counting an average of 150,000 snapshots per page. 100 is the default maximum number of snapshot pages and should be sufficient for most websites. Use a bigger number if you want to download a very large website.
235 | 
236 | ### Diagnostics
237 | | Option | Description |
238 | |--------|-------------|
239 | | `-a`, `--all` | Include error pages (40x/50x) |
240 | | `-l`, `--list` | List files without downloading |
241 | 
242 | **Example** - Download all files
243 | ```bash
244 | ruby wayback_machine_downloader https://example.com --all
245 | ```
246 | By default, Wayback Machine Downloader limits itself to files that responded with a 200 OK code. If you also need error files (40x and 50x codes) or redirection files (30x codes), you can use the --all or -a flag and Wayback Machine Downloader will download them in addition to the 200 OK files. It will also keep empty files, which are removed by default.
247 | 
248 | **Example 2** - Generate URL list:
249 | ```bash
250 | ruby wayback_machine_downloader https://example.com --list
251 | ```
252 | It will just display the files to be downloaded with their snapshot timestamps and urls. The output format is JSON. It won't download anything. It's useful for debugging or for feeding another application.
253 | 
254 | ### Job management
255 | The downloader automatically saves its progress (`.cdx.json` for snapshot list, `.downloaded.txt` for completed files) in the output directory.
If you run the same command again pointing to the same output directory, it will resume where it left off, skipping already downloaded files. 256 | 257 | > [!NOTE] 258 | > Automatic resumption can be affected by changing the URL, mode selection (like `--all-timestamps`), filtering selections, or other options. If you want to ensure a clean start, use the `--reset` option. 259 | 260 | | Option | Description | 261 | |--------|-------------| 262 | | `--reset` | Delete state files (`.cdx.json`, `.downloaded.txt`) and restart the download from scratch. Does not delete already downloaded website files. | 263 | | `--keep` | Keep state files (`.cdx.json`, `.downloaded.txt`) even after a successful download. By default, these are deleted upon successful completion. | 264 | 265 | **Example** - Restart a download job from the beginning: 266 | ```bash 267 | ruby wayback_machine_downloader https://example.com --reset 268 | ``` 269 | This is useful if you suspect the state files are corrupted or want to ensure a completely fresh download process without deleting the files you already have. 270 | 271 | **Example 2** - Keep state files after download: 272 | ```bash 273 | ruby wayback_machine_downloader https://example.com --keep 274 | ``` 275 | This can be useful for debugging or if you plan to extend the download later with different parameters (e.g., adding `--to` timestamp) while leveraging the existing snapshot list. 276 | 277 | ## 🤝 Contributing 278 | 1. Fork the repository 279 | 2. Create a feature branch 280 | 3. Submit a pull request 281 | 282 | **Run tests** (note, these are still broken!): 283 | ```bash 284 | bundle exec rake test 285 | ``` 286 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'rake/testtask' 2 | 3 | Rake::TestTask.new do |t| 4 | t.libs << 'test' 5 | end 6 | 7 | desc "Run tests" 8 | task :default => :test 9 | -------------------------------------------------------------------------------- /bin/wayback_machine_downloader: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require_relative '../lib/wayback_machine_downloader' 4 | require 'optparse' 5 | require 'pp' 6 | 7 | options = {} 8 | option_parser = OptionParser.new do |opts| 9 | opts.banner = "Usage: wayback_machine_downloader http://example.com" 10 | 11 | opts.separator "" 12 | opts.separator "Download an entire website from the Wayback Machine." 13 | 14 | opts.separator "" 15 | opts.separator "Optional options:" 16 | 17 | opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into", "Default is ./websites/ plus the domain name") do |t| 18 | options[:directory] = t 19 | end 20 | 21 | opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t| 22 | options[:all_timestamps] = true 23 | end 24 | 25 | opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t| 26 | options[:from_timestamp] = t 27 | end 28 | 29 | opts.on("-t", "--to TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 
20100916231334)") do |t|
30 |     options[:to_timestamp] = t
31 |   end
32 | 
33 |   opts.on("-e", "--exact-url", "Download only the url provided and not the full site") do |t|
34 |     options[:exact_url] = t
35 |   end
36 | 
37 |   opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
38 |     options[:only_filter] = t
39 |   end
40 | 
41 |   opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
42 |     options[:exclude_filter] = t
43 |   end
44 | 
45 |   opts.on("-a", "--all", "Expand downloading to error files (40x and 50x) and redirections (30x)") do |t|
46 |     options[:all] = true
47 |   end
48 | 
49 |   opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time", "Default is one file at a time (ie. 20)") do |t|
50 |     options[:threads_count] = t
51 |   end
52 | 
53 |   opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
54 |     options[:maximum_pages] = t
55 |   end
56 | 
57 |   opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
58 |     options[:list] = true
59 |   end
60 | 
61 |   opts.on("-r", "--rewritten", "Downloads the rewritten Wayback Machine files instead of the original files") do |t|
62 |     options[:rewritten] = true
63 |   end
64 | 
65 |   opts.on("--local", "Rewrite URLs to make them relative for local browsing") do |t|
66 |     options[:rewrite] = true
67 |   end
68 | 
69 |   opts.on("--reset", "Delete state files (.cdx.json, .downloaded.txt) and restart the download from scratch") do |t|
70 |     options[:reset] = true
71 |   end
72 | 
73 |   opts.on("--keep", "Keep state files (.cdx.json, .downloaded.txt) after a successful download") do |t|
74 |     options[:keep] = true
75 |   end
76 | 
77 |   opts.on("-v", "--version", "Display version") do |t|
78 |     options[:version] = t
79 |   end
80 | end.parse!
81 | 
82 | if (base_url = ARGV[-1])
83 |   options[:base_url] = base_url
84 |   wayback_machine_downloader = WaybackMachineDownloader.new options
85 |   if options[:list]
86 |     wayback_machine_downloader.list_files
87 |   else
88 |     wayback_machine_downloader.download_files
89 |   end
90 | elsif options[:version]
91 |   puts WaybackMachineDownloader::VERSION
92 | else
93 |   puts "You need to specify a website to backup. (e.g., http://example.com)"
94 |   puts "Run `wayback_machine_downloader --help` for more help."
95 | end
96 | 
-------------------------------------------------------------------------------- /docker-compose.yml: --------------------------------------------------------------------------------
1 | services:
2 |   wayback_machine_downloader:
3 |     build:
4 |       context: .
5 |     tty: true
6 |     image: wayback_machine_downloader:latest
7 |     container_name: wayback_machine_downloader
8 |     environment:
9 |       - ENVIRONMENT=${ENVIRONMENT:-production}
10 |       - OPTIONS=${OPTIONS:-""}
11 |       - TARGET_URL=${TARGET_URL}
12 |     volumes:
13 |       - .:/build:rw
14 |       - ./websites:/websites:rw
15 |     command: /build/bin/wayback_machine_downloader ${TARGET_URL} ${OPTIONS}
-------------------------------------------------------------------------------- /entrypoint.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | if [ "$ENVIRONMENT" == "development" ]; then
4 |     echo "Running in development mode. Starting rerun..."
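    # 'rerun' is assumed to be available in the image; it restarts the downloader whenever files under /build change (websites/ is ignored)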
5 | exec rerun --dir /build --ignore "websites/*" -- /build/bin/wayback_machine_downloader "$@" 6 | else 7 | echo "Not in development mode. Skipping rerun." 8 | exec /build/bin/wayback_machine_downloader "$@" 9 | fi -------------------------------------------------------------------------------- /lib/wayback_machine_downloader.rb: -------------------------------------------------------------------------------- 1 | # encoding: UTF-8 2 | 3 | require 'thread' 4 | require 'net/http' 5 | require 'open-uri' 6 | require 'fileutils' 7 | require 'cgi' 8 | require 'json' 9 | require 'time' 10 | require 'concurrent-ruby' 11 | require 'logger' 12 | require 'zlib' 13 | require 'stringio' 14 | require_relative 'wayback_machine_downloader/tidy_bytes' 15 | require_relative 'wayback_machine_downloader/to_regex' 16 | require_relative 'wayback_machine_downloader/archive_api' 17 | 18 | class ConnectionPool 19 | MAX_AGE = 300 20 | CLEANUP_INTERVAL = 60 21 | DEFAULT_TIMEOUT = 30 22 | MAX_RETRIES = 3 23 | 24 | def initialize(size) 25 | @size = size 26 | @pool = Concurrent::Map.new 27 | @creation_times = Concurrent::Map.new 28 | @cleanup_thread = schedule_cleanup 29 | end 30 | 31 | def with_connection(&block) 32 | conn = acquire_connection 33 | begin 34 | yield conn 35 | ensure 36 | release_connection(conn) 37 | end 38 | end 39 | 40 | def shutdown 41 | @cleanup_thread&.exit 42 | @pool.each_value { |conn| conn.finish if conn&.started? } 43 | @pool.clear 44 | @creation_times.clear 45 | end 46 | 47 | private 48 | 49 | def acquire_connection 50 | thread_id = Thread.current.object_id 51 | conn = @pool[thread_id] 52 | 53 | if should_create_new?(conn) 54 | conn&.finish if conn&.started? 55 | conn = create_connection 56 | @pool[thread_id] = conn 57 | @creation_times[thread_id] = Time.now 58 | end 59 | 60 | conn 61 | end 62 | 63 | def release_connection(conn) 64 | return unless conn 65 | if conn.started? && Time.now - @creation_times[Thread.current.object_id] > MAX_AGE 66 | conn.finish 67 | @pool.delete(Thread.current.object_id) 68 | @creation_times.delete(Thread.current.object_id) 69 | end 70 | end 71 | 72 | def should_create_new?(conn) 73 | return true if conn.nil? 74 | return true unless conn.started? 75 | return true if Time.now - @creation_times[Thread.current.object_id] > MAX_AGE 76 | false 77 | end 78 | 79 | def create_connection 80 | http = Net::HTTP.new("web.archive.org", 443) 81 | http.use_ssl = true 82 | http.read_timeout = DEFAULT_TIMEOUT 83 | http.open_timeout = DEFAULT_TIMEOUT 84 | http.keep_alive_timeout = 30 85 | http.max_retries = MAX_RETRIES 86 | http.start 87 | http 88 | end 89 | 90 | def schedule_cleanup 91 | Thread.new do 92 | loop do 93 | cleanup_old_connections 94 | sleep CLEANUP_INTERVAL 95 | end 96 | end 97 | end 98 | 99 | def cleanup_old_connections 100 | current_time = Time.now 101 | @creation_times.each do |thread_id, creation_time| 102 | if current_time - creation_time > MAX_AGE 103 | conn = @pool[thread_id] 104 | conn&.finish if conn&.started? 
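        # drop the stale entry from both maps so the owning thread builds a fresh connection on its next acquire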
105 | @pool.delete(thread_id) 106 | @creation_times.delete(thread_id) 107 | end 108 | end 109 | end 110 | end 111 | 112 | class WaybackMachineDownloader 113 | 114 | include ArchiveAPI 115 | 116 | VERSION = "2.3.8" 117 | DEFAULT_TIMEOUT = 30 118 | MAX_RETRIES = 3 119 | RETRY_DELAY = 2 120 | RATE_LIMIT = 0.25 # Delay between requests in seconds 121 | CONNECTION_POOL_SIZE = 10 122 | MEMORY_BUFFER_SIZE = 16384 # 16KB chunks 123 | STATE_CDX_FILENAME = ".cdx.json" 124 | STATE_DB_FILENAME = ".downloaded.txt" 125 | 126 | attr_accessor :base_url, :exact_url, :directory, :all_timestamps, 127 | :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, 128 | :all, :maximum_pages, :threads_count, :logger, :reset, :keep, :rewrite 129 | 130 | def initialize params 131 | validate_params(params) 132 | @base_url = params[:base_url] 133 | @exact_url = params[:exact_url] 134 | @directory = params[:directory] 135 | @all_timestamps = params[:all_timestamps] 136 | @from_timestamp = params[:from_timestamp].to_i 137 | @to_timestamp = params[:to_timestamp].to_i 138 | @only_filter = params[:only_filter] 139 | @exclude_filter = params[:exclude_filter] 140 | @all = params[:all] 141 | @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100 142 | @threads_count = [params[:threads_count].to_i, 1].max 143 | @rewritten = params[:rewritten] 144 | @reset = params[:reset] 145 | @keep = params[:keep] 146 | @timeout = params[:timeout] || DEFAULT_TIMEOUT 147 | @logger = setup_logger 148 | @failed_downloads = Concurrent::Array.new 149 | @connection_pool = ConnectionPool.new(CONNECTION_POOL_SIZE) 150 | @db_mutex = Mutex.new 151 | @rewrite = params[:rewrite] || false 152 | 153 | handle_reset 154 | end 155 | 156 | def backup_name 157 | url_to_process = @base_url.end_with?('/*') ? @base_url.chomp('/*') : @base_url 158 | 159 | if url_to_process.include? '//' 160 | url_to_process.split('/')[2] 161 | else 162 | url_to_process 163 | end 164 | end 165 | 166 | def backup_path 167 | if @directory 168 | if @directory[-1] == '/' 169 | @directory 170 | else 171 | @directory + '/' 172 | end 173 | else 174 | 'websites/' + backup_name + '/' 175 | end 176 | end 177 | 178 | def cdx_path 179 | File.join(backup_path, STATE_CDX_FILENAME) 180 | end 181 | 182 | def db_path 183 | File.join(backup_path, STATE_DB_FILENAME) 184 | end 185 | 186 | def handle_reset 187 | if @reset 188 | puts "Resetting download state..." 189 | FileUtils.rm_f(cdx_path) 190 | FileUtils.rm_f(db_path) 191 | puts "Removed state files: #{cdx_path}, #{db_path}" 192 | end 193 | end 194 | 195 | def match_only_filter file_url 196 | if @only_filter 197 | only_filter_regex = @only_filter.to_regex 198 | if only_filter_regex 199 | only_filter_regex =~ file_url 200 | else 201 | file_url.downcase.include? @only_filter.downcase 202 | end 203 | else 204 | true 205 | end 206 | end 207 | 208 | def match_exclude_filter file_url 209 | if @exclude_filter 210 | exclude_filter_regex = @exclude_filter.to_regex 211 | if exclude_filter_regex 212 | exclude_filter_regex =~ file_url 213 | else 214 | file_url.downcase.include? @exclude_filter.downcase 215 | end 216 | else 217 | false 218 | end 219 | end 220 | 221 | def get_all_snapshots_to_consider 222 | if File.exist?(cdx_path) && !@reset 223 | puts "Loading snapshot list from #{cdx_path}" 224 | begin 225 | snapshot_list_to_consider = JSON.parse(File.read(cdx_path)) 226 | puts "Loaded #{snapshot_list_to_consider.length} snapshots from cache." 
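        # return a Concurrent::Array so the cached list can be shared safely across download threads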
227 | puts 228 | return Concurrent::Array.new(snapshot_list_to_consider) 229 | rescue JSON::ParserError => e 230 | puts "Error reading snapshot cache file #{cdx_path}: #{e.message}. Refetching..." 231 | FileUtils.rm_f(cdx_path) 232 | rescue => e 233 | puts "Error loading snapshot cache #{cdx_path}: #{e.message}. Refetching..." 234 | FileUtils.rm_f(cdx_path) 235 | end 236 | end 237 | 238 | snapshot_list_to_consider = Concurrent::Array.new 239 | mutex = Mutex.new 240 | 241 | puts "Getting snapshot pages from Wayback Machine API..." 242 | 243 | # Fetch the initial set of snapshots, sequentially 244 | @connection_pool.with_connection do |connection| 245 | initial_list = get_raw_list_from_api(@base_url, nil, connection) 246 | initial_list ||= [] 247 | mutex.synchronize do 248 | snapshot_list_to_consider.concat(initial_list) 249 | print "." 250 | end 251 | end 252 | 253 | # Fetch additional pages if the exact URL flag is not set 254 | unless @exact_url 255 | page_index = 0 256 | batch_size = [@threads_count, 5].min 257 | continue_fetching = true 258 | 259 | while continue_fetching && page_index < @maximum_pages 260 | # Determine the range of pages to fetch in this batch 261 | end_index = [page_index + batch_size, @maximum_pages].min 262 | current_batch = (page_index...end_index).to_a 263 | 264 | # Create futures for concurrent API calls 265 | futures = current_batch.map do |page| 266 | Concurrent::Future.execute do 267 | result = nil 268 | @connection_pool.with_connection do |connection| 269 | result = get_raw_list_from_api("#{@base_url}/*", page, connection) 270 | end 271 | result ||= [] 272 | [page, result] 273 | end 274 | end 275 | 276 | results = [] 277 | 278 | futures.each do |future| 279 | begin 280 | results << future.value 281 | rescue => e 282 | puts "\nError fetching page #{future}: #{e.message}" 283 | end 284 | end 285 | 286 | # Sort results by page number to maintain order 287 | results.sort_by! { |page, _| page } 288 | 289 | # Process results and check for empty pages 290 | results.each do |page, result| 291 | if result.nil? || result.empty? 292 | continue_fetching = false 293 | break 294 | else 295 | mutex.synchronize do 296 | snapshot_list_to_consider.concat(result) 297 | print "." 298 | end 299 | end 300 | end 301 | 302 | page_index = end_index 303 | 304 | sleep(RATE_LIMIT) if continue_fetching 305 | end 306 | end 307 | 308 | puts " found #{snapshot_list_to_consider.length} snapshots." 309 | 310 | # Save the fetched list to the cache file 311 | begin 312 | FileUtils.mkdir_p(File.dirname(cdx_path)) 313 | File.write(cdx_path, JSON.pretty_generate(snapshot_list_to_consider.to_a)) # Convert Concurrent::Array back to Array for JSON 314 | puts "Saved snapshot list to #{cdx_path}" 315 | rescue => e 316 | puts "Error saving snapshot cache to #{cdx_path}: #{e.message}" 317 | end 318 | puts 319 | 320 | snapshot_list_to_consider 321 | end 322 | 323 | def get_file_list_curated 324 | file_list_curated = Hash.new 325 | get_all_snapshots_to_consider.each do |file_timestamp, file_url| 326 | next unless file_url.include?('/') 327 | file_id = file_url.split('/')[3..-1].join('/') 328 | file_id = CGI::unescape file_id 329 | file_id = file_id.tidy_bytes unless file_id == "" 330 | if file_id.nil? 
331 | puts "Malformed file url, ignoring: #{file_url}" 332 | else 333 | if match_exclude_filter(file_url) 334 | puts "File url matches exclude filter, ignoring: #{file_url}" 335 | elsif not match_only_filter(file_url) 336 | puts "File url doesn't match only filter, ignoring: #{file_url}" 337 | elsif file_list_curated[file_id] 338 | unless file_list_curated[file_id][:timestamp] > file_timestamp 339 | file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp} 340 | end 341 | else 342 | file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp} 343 | end 344 | end 345 | end 346 | file_list_curated 347 | end 348 | 349 | def get_file_list_all_timestamps 350 | file_list_curated = Hash.new 351 | get_all_snapshots_to_consider.each do |file_timestamp, file_url| 352 | next unless file_url.include?('/') 353 | file_id = file_url.split('/')[3..-1].join('/') 354 | file_id_and_timestamp = [file_timestamp, file_id].join('/') 355 | file_id_and_timestamp = CGI::unescape file_id_and_timestamp 356 | file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == "" 357 | if file_id.nil? 358 | puts "Malformed file url, ignoring: #{file_url}" 359 | else 360 | if match_exclude_filter(file_url) 361 | puts "File url matches exclude filter, ignoring: #{file_url}" 362 | elsif not match_only_filter(file_url) 363 | puts "File url doesn't match only filter, ignoring: #{file_url}" 364 | elsif file_list_curated[file_id_and_timestamp] 365 | puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose 366 | else 367 | file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp} 368 | end 369 | end 370 | end 371 | puts "file_list_curated: " + file_list_curated.count.to_s 372 | file_list_curated 373 | end 374 | 375 | 376 | def get_file_list_by_timestamp 377 | if @all_timestamps 378 | file_list_curated = get_file_list_all_timestamps 379 | file_list_curated.map do |file_remote_info| 380 | file_remote_info[1][:file_id] = file_remote_info[0] 381 | file_remote_info[1] 382 | end 383 | else 384 | file_list_curated = get_file_list_curated 385 | file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse 386 | file_list_curated.map do |file_remote_info| 387 | file_remote_info[1][:file_id] = file_remote_info[0] 388 | file_remote_info[1] 389 | end 390 | end 391 | end 392 | 393 | def list_files 394 | # retrieval produces its own output 395 | @orig_stdout = $stdout 396 | $stdout = $stderr 397 | files = get_file_list_by_timestamp 398 | $stdout = @orig_stdout 399 | puts "[" 400 | files[0...-1].each do |file| 401 | puts file.to_json + "," 402 | end 403 | puts files[-1].to_json 404 | puts "]" 405 | end 406 | 407 | def load_downloaded_ids 408 | downloaded_ids = Set.new 409 | if File.exist?(db_path) && !@reset 410 | puts "Loading list of already downloaded files from #{db_path}" 411 | begin 412 | File.foreach(db_path) { |line| downloaded_ids.add(line.strip) } 413 | rescue => e 414 | puts "Error reading downloaded files list #{db_path}: #{e.message}. Assuming no files downloaded." 
415 |         downloaded_ids.clear
416 |       end
417 |     end
418 |     downloaded_ids
419 |   end
420 | 
421 |   def append_to_db(file_id)
422 |     @db_mutex.synchronize do
423 |       begin
424 |         FileUtils.mkdir_p(File.dirname(db_path))
425 |         File.open(db_path, 'a') { |f| f.puts(file_id) }
426 |       rescue => e
427 |         @logger.error("Failed to append downloaded file ID #{file_id} to #{db_path}: #{e.message}")
428 |       end
429 |     end
430 |   end
431 | 
432 |   def download_files
433 |     start_time = Time.now
434 |     puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
435 | 
436 |     FileUtils.mkdir_p(backup_path)
437 | 
438 |     # Load the list of files to potentially download
439 |     files_to_download = file_list_by_timestamp
440 | 
441 |     if files_to_download.empty?
442 |       puts "No files found matching criteria."
443 |       cleanup
444 |       return
445 |     end
446 | 
447 |     total_files = files_to_download.count
448 |     puts "#{total_files} files found matching criteria."
449 | 
450 |     # Load IDs of already downloaded files
451 |     downloaded_ids = load_downloaded_ids
452 |     files_to_process = files_to_download.reject do |file_info|
453 |       downloaded_ids.include?(file_info[:file_id])
454 |     end
455 | 
456 |     remaining_count = files_to_process.count
457 |     skipped_count = total_files - remaining_count
458 | 
459 |     if skipped_count > 0
460 |       puts "Found #{skipped_count} previously downloaded files, skipping them."
461 |     end
462 | 
463 |     if remaining_count == 0
464 |       puts "All matching files have already been downloaded."
465 |       cleanup
466 |       return
467 |     end
468 | 
469 |     puts "#{remaining_count} files to download:"
470 | 
471 |     @processed_file_count = 0
472 |     @total_to_download = remaining_count
473 |     @download_mutex = Mutex.new
474 | 
475 |     thread_count = [@threads_count, CONNECTION_POOL_SIZE].min
476 |     pool = Concurrent::FixedThreadPool.new(thread_count)
477 | 
478 |     files_to_process.each do |file_remote_info|
479 |       pool.post do
480 |         download_success = false
481 |         begin
482 |           @connection_pool.with_connection do |connection|
483 |             result_message = download_file(file_remote_info, connection)
484 |             # assume download success if the result message contains ' -> '
485 |             if result_message && result_message.include?(' -> ')
486 |               download_success = true
487 |             end
488 |             @download_mutex.synchronize do
489 |               @processed_file_count += 1
490 |               # adjust progress message to reflect remaining files
491 |               progress_message = result_message.sub(/\(#{@processed_file_count}\/\d+\)/, "(#{@processed_file_count}/#{@total_to_download})") if result_message
492 |               puts progress_message if progress_message
493 |             end
494 |           end
495 |           # append to DB only after a successful download, outside the connection block
496 |           if download_success
497 |             append_to_db(file_remote_info[:file_id])
498 |           end
499 |         rescue => e
500 |           @logger.error("Error processing file #{file_remote_info[:file_url]}: #{e.message}")
501 |           @download_mutex.synchronize do
502 |             @processed_file_count += 1
503 |           end
504 |         end
505 |         sleep(RATE_LIMIT)
506 |       end
507 |     end
508 | 
509 |     pool.shutdown
510 |     pool.wait_for_termination
511 | 
512 |     end_time = Time.now
513 |     puts "\nDownload finished in #{(end_time - start_time).round(2)}s."
514 |     puts "Results saved in #{backup_path}"
515 |     cleanup
516 |   end
517 | 
518 |   def structure_dir_path dir_path
519 |     begin
520 |       FileUtils::mkdir_p dir_path unless File.exist? dir_path
521 |     rescue Errno::EEXIST => e
522 |       error_to_string = e.to_s
523 |       puts "# #{error_to_string}"
524 |       if error_to_string.include?
"File exists @ dir_s_mkdir - " 525 | file_already_existing = error_to_string.split("File exists @ dir_s_mkdir - ")[-1] 526 | elsif error_to_string.include? "File exists - " 527 | file_already_existing = error_to_string.split("File exists - ")[-1] 528 | else 529 | raise "Unhandled directory restructure error # #{error_to_string}" 530 | end 531 | file_already_existing_temporary = file_already_existing + '.temp' 532 | file_already_existing_permanent = file_already_existing + '/index.html' 533 | FileUtils::mv file_already_existing, file_already_existing_temporary 534 | FileUtils::mkdir_p file_already_existing 535 | FileUtils::mv file_already_existing_temporary, file_already_existing_permanent 536 | puts "#{file_already_existing} -> #{file_already_existing_permanent}" 537 | structure_dir_path dir_path 538 | end 539 | end 540 | 541 | def rewrite_urls_to_relative(file_path) 542 | return unless File.exist?(file_path) 543 | 544 | file_ext = File.extname(file_path).downcase 545 | 546 | begin 547 | content = File.binread(file_path) 548 | 549 | if file_ext == '.html' || file_ext == '.htm' 550 | encoding = content.match(/]+)/i)&.captures&.first || 'UTF-8' 551 | content.force_encoding(encoding) rescue content.force_encoding('UTF-8') 552 | else 553 | content.force_encoding('UTF-8') 554 | end 555 | 556 | # URLs in HTML attributes 557 | content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do 558 | prefix, url, suffix = $1, $2, $3 559 | 560 | if url.start_with?('http') 561 | begin 562 | uri = URI.parse(url) 563 | path = uri.path 564 | path = path[1..-1] if path.start_with?('/') 565 | "#{prefix}#{path}#{suffix}" 566 | rescue 567 | "#{prefix}#{url}#{suffix}" 568 | end 569 | elsif url.start_with?('/') 570 | "#{prefix}./#{url[1..-1]}#{suffix}" 571 | else 572 | "#{prefix}#{url}#{suffix}" 573 | end 574 | end 575 | 576 | # URLs in CSS 577 | content.gsub!(/url\(\s*["']?https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"'\)]+)["']?\s*\)/i) do 578 | url = $1 579 | 580 | if url.start_with?('http') 581 | begin 582 | uri = URI.parse(url) 583 | path = uri.path 584 | path = path[1..-1] if path.start_with?('/') 585 | "url(\"#{path}\")" 586 | rescue 587 | "url(\"#{url}\")" 588 | end 589 | elsif url.start_with?('/') 590 | "url(\"./#{url[1..-1]}\")" 591 | else 592 | "url(\"#{url}\")" 593 | end 594 | end 595 | 596 | # URLs in JavaScript 597 | content.gsub!(/(["'])https?:\/\/web\.archive\.org\/web\/[0-9]+(?:id_)?\/([^"']+)(["'])/i) do 598 | quote_start, url, quote_end = $1, $2, $3 599 | 600 | if url.start_with?('http') 601 | begin 602 | uri = URI.parse(url) 603 | path = uri.path 604 | path = path[1..-1] if path.start_with?('/') 605 | "#{quote_start}#{path}#{quote_end}" 606 | rescue 607 | "#{quote_start}#{url}#{quote_end}" 608 | end 609 | elsif url.start_with?('/') 610 | "#{quote_start}./#{url[1..-1]}#{quote_end}" 611 | else 612 | "#{quote_start}#{url}#{quote_end}" 613 | end 614 | end 615 | 616 | # for URLs in HTML attributes that start with a single slash 617 | content.gsub!(/(\s(?:href|src|action|data-src|data-url)=["'])\/([^"'\/][^"']*)(["'])/i) do 618 | prefix, path, suffix = $1, $2, $3 619 | "#{prefix}./#{path}#{suffix}" 620 | end 621 | 622 | # for URLs in CSS that start with a single slash 623 | content.gsub!(/url\(\s*["']?\/([^"'\)\/][^"'\)]*?)["']?\s*\)/i) do 624 | path = $1 625 | "url(\"./#{path}\")" 626 | end 627 | 628 | # save the modified content back to the file 629 | File.binwrite(file_path, content) 630 | puts "Rewrote URLs in 
#{file_path} to be relative." 631 | rescue Errno::ENOENT => e 632 | @logger.warn("Error reading file #{file_path}: #{e.message}") 633 | end 634 | end 635 | 636 | def download_file (file_remote_info, http) 637 | current_encoding = "".encoding 638 | file_url = file_remote_info[:file_url].encode(current_encoding) 639 | file_id = file_remote_info[:file_id] 640 | file_timestamp = file_remote_info[:timestamp] 641 | file_path_elements = file_id.split('/') 642 | 643 | if file_id == "" 644 | dir_path = backup_path 645 | file_path = backup_path + 'index.html' 646 | elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.' 647 | dir_path = backup_path + file_path_elements[0..-1].join('/') 648 | file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html' 649 | else 650 | dir_path = backup_path + file_path_elements[0..-2].join('/') 651 | file_path = backup_path + file_path_elements[0..-1].join('/') 652 | end 653 | if Gem.win_platform? 654 | dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) } 655 | file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) } 656 | end 657 | 658 | # check existence *before* download attempt 659 | # this handles cases where a file was created manually or by a previous partial run without a .db entry 660 | if File.exist? file_path 661 | return "#{file_url} # #{file_path} already exists. (#{@processed_file_count + 1}/#{@total_to_download})" 662 | end 663 | 664 | begin 665 | structure_dir_path dir_path 666 | status = download_with_retry(file_path, file_url, file_timestamp, http) 667 | 668 | case status 669 | when :saved 670 | if @rewrite && File.extname(file_path) =~ /\.(html?|css|js)$/i 671 | rewrite_urls_to_relative(file_path) 672 | end 673 | "#{file_url} -> #{file_path} (#{@processed_file_count + 1}/#{@total_to_download})" 674 | when :skipped_not_found 675 | "Skipped (not found): #{file_url} (#{@processed_file_count + 1}/#{@total_to_download})" 676 | else 677 | # ideally, this case should not be reached if download_with_retry behaves as expected. 678 | @logger.warn("Unknown status from download_with_retry for #{file_url}: #{status}") 679 | "Unknown status for #{file_url}: #{status} (#{@processed_file_count + 1}/#{@total_to_download})" 680 | end 681 | rescue StandardError => e 682 | msg = "Failed: #{file_url} # #{e} (#{@processed_file_count + 1}/#{@total_to_download})" 683 | if File.exist?(file_path) and File.size(file_path) == 0 684 | File.delete(file_path) 685 | msg += "\n#{file_path} was empty and was removed." 686 | end 687 | msg 688 | end 689 | end 690 | 691 | def file_queue 692 | @file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info } 693 | end 694 | 695 | def file_list_by_timestamp 696 | @file_list_by_timestamp ||= get_file_list_by_timestamp 697 | end 698 | 699 | private 700 | 701 | def validate_params(params) 702 | raise ArgumentError, "Base URL is required" unless params[:base_url] 703 | raise ArgumentError, "Maximum pages must be positive" if params[:maximum_pages] && params[:maximum_pages].to_i <= 0 704 | end 705 | 706 | def setup_logger 707 | logger = Logger.new(STDOUT) 708 | logger.level = ENV['DEBUG'] ? 
Logger::DEBUG : Logger::INFO 709 | logger.formatter = proc do |severity, datetime, progname, msg| 710 | "#{datetime.strftime('%Y-%m-%d %H:%M:%S')} [#{severity}] #{msg}\n" 711 | end 712 | logger 713 | end 714 | 715 | def download_with_retry(file_path, file_url, file_timestamp, connection, redirect_count = 0) 716 | retries = 0 717 | begin 718 | wayback_url = if @rewritten 719 | "https://web.archive.org/web/#{file_timestamp}/#{file_url}" 720 | else 721 | "https://web.archive.org/web/#{file_timestamp}id_/#{file_url}" 722 | end 723 | 724 | # Escape square brackets because they are not valid in URI() 725 | wayback_url = wayback_url.gsub('[', '%5B').gsub(']', '%5D') 726 | 727 | request = Net::HTTP::Get.new(URI(wayback_url)) 728 | request["Connection"] = "keep-alive" 729 | request["User-Agent"] = "WaybackMachineDownloader/#{VERSION}" 730 | request["Accept-Encoding"] = "gzip, deflate" 731 | 732 | response = connection.request(request) 733 | 734 | save_response_body = lambda do 735 | File.open(file_path, "wb") do |file| 736 | body = response.body 737 | if response['content-encoding'] == 'gzip' && body && !body.empty? 738 | begin 739 | gz = Zlib::GzipReader.new(StringIO.new(body)) 740 | decompressed_body = gz.read 741 | gz.close 742 | file.write(decompressed_body) 743 | rescue Zlib::GzipFile::Error => e 744 | @logger.warn("Failure decompressing gzip file #{file_url}: #{e.message}. Writing raw body.") 745 | file.write(body) 746 | end 747 | else 748 | file.write(body) if body 749 | end 750 | end 751 | end 752 | 753 | if @all 754 | case response 755 | when Net::HTTPSuccess, Net::HTTPRedirection, Net::HTTPClientError, Net::HTTPServerError 756 | save_response_body.call 757 | if response.is_a?(Net::HTTPRedirection) 758 | @logger.info("Saved redirect page for #{file_url} (status #{response.code}).") 759 | elsif response.is_a?(Net::HTTPClientError) || response.is_a?(Net::HTTPServerError) 760 | @logger.info("Saved error page for #{file_url} (status #{response.code}).") 761 | end 762 | return :saved 763 | else 764 | # for any other response type when --all is true, treat as an error to be retried or failed 765 | raise "Unhandled HTTP response: #{response.code} #{response.message}" 766 | end 767 | else # not @all (our default behavior) 768 | case response 769 | when Net::HTTPSuccess 770 | save_response_body.call 771 | return :saved 772 | when Net::HTTPRedirection 773 | raise "Too many redirects for #{file_url}" if redirect_count >= 2 774 | location = response['location'] 775 | @logger.warn("Redirect found for #{file_url} -> #{location}") 776 | return download_with_retry(file_path, location, file_timestamp, connection, redirect_count + 1) 777 | when Net::HTTPTooManyRequests 778 | sleep(RATE_LIMIT * 2) 779 | raise "Rate limited, retrying..." 780 | when Net::HTTPNotFound 781 | @logger.warn("File not found, skipping: #{file_url}") 782 | return :skipped_not_found 783 | else 784 | raise "HTTP Error: #{response.code} #{response.message}" 785 | end 786 | end 787 | 788 | rescue StandardError => e 789 | if retries < MAX_RETRIES 790 | retries += 1 791 | @logger.warn("Retry #{retries}/#{MAX_RETRIES} for #{file_url}: #{e.message}") 792 | sleep(RETRY_DELAY * retries) 793 | retry 794 | else 795 | @failed_downloads << {url: file_url, error: e.message} 796 | raise e 797 | end 798 | end 799 | end 800 | 801 | def cleanup 802 | @connection_pool.shutdown 803 | 804 | if @failed_downloads.any? 
805 | @logger.error("Download completed with errors.") 806 | @logger.error("Failed downloads summary:") 807 | @failed_downloads.each do |failure| 808 | @logger.error(" #{failure[:url]} - #{failure[:error]}") 809 | end 810 | unless @reset 811 | puts "State files kept due to download errors: #{cdx_path}, #{db_path}" 812 | return 813 | end 814 | end 815 | 816 | if !@keep || @reset 817 | puts "Cleaning up state files..." unless @keep && !@reset 818 | FileUtils.rm_f(cdx_path) 819 | FileUtils.rm_f(db_path) 820 | elsif @keep 821 | puts "Keeping state files as requested: #{cdx_path}, #{db_path}" 822 | end 823 | end 824 | end 825 | -------------------------------------------------------------------------------- /lib/wayback_machine_downloader/archive_api.rb: -------------------------------------------------------------------------------- 1 | require 'json' 2 | require 'uri' 3 | 4 | module ArchiveAPI 5 | 6 | def get_raw_list_from_api(url, page_index, http) 7 | # Automatically append /* if the URL doesn't contain a path after the domain 8 | # This is a workaround for an issue with the API and *some* domains. 9 | # See https://github.com/StrawberryMaster/wayback-machine-downloader/issues/6 10 | if url && !url.match(/^https?:\/\/.*\//i) 11 | url = "#{url}/*" 12 | end 13 | 14 | request_url = URI("https://web.archive.org/cdx/search/cdx") 15 | params = [["output", "json"], ["url", url]] + parameters_for_api(page_index) 16 | request_url.query = URI.encode_www_form(params) 17 | 18 | begin 19 | response = http.get(request_url) 20 | body = response.body.to_s.strip 21 | return [] if body.empty? 22 | json = JSON.parse(body) 23 | 24 | # Check if the response contains the header ["timestamp", "original"] 25 | json.shift if json.first == ["timestamp", "original"] 26 | json 27 | rescue JSON::ParserError, StandardError => e 28 | warn "Failed to fetch data from API: #{e.message}" 29 | [] 30 | end 31 | end 32 | 33 | def parameters_for_api(page_index) 34 | parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]] 35 | parameters.push(["filter", "statuscode:200"]) unless @all 36 | parameters.push(["from", @from_timestamp.to_s]) if @from_timestamp && @from_timestamp != 0 37 | parameters.push(["to", @to_timestamp.to_s]) if @to_timestamp && @to_timestamp != 0 38 | parameters.push(["page", page_index]) if page_index 39 | parameters 40 | end 41 | 42 | end 43 | -------------------------------------------------------------------------------- /lib/wayback_machine_downloader/tidy_bytes.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module TidyBytes 4 | # precomputing CP1252 to UTF-8 mappings for bytes 128-159 5 | CP1252_MAP = (128..159).map do |byte| 6 | case byte 7 | when 128 then [226, 130, 172] # EURO SIGN 8 | when 130 then [226, 128, 154] # SINGLE LOW-9 QUOTATION MARK 9 | when 131 then [198, 146] # LATIN SMALL LETTER F WITH HOOK 10 | when 132 then [226, 128, 158] # DOUBLE LOW-9 QUOTATION MARK 11 | when 133 then [226, 128, 166] # HORIZONTAL ELLIPSIS 12 | when 134 then [226, 128, 160] # DAGGER 13 | when 135 then [226, 128, 161] # DOUBLE DAGGER 14 | when 136 then [203, 134] # MODIFIER LETTER CIRCUMFLEX ACCENT 15 | when 137 then [226, 128, 176] # PER MILLE SIGN 16 | when 138 then [197, 160] # LATIN CAPITAL LETTER S WITH CARON 17 | when 139 then [226, 128, 185] # SINGLE LEFT-POINTING ANGLE QUOTATION MARK 18 | when 140 then [197, 146] # LATIN CAPITAL LIGATURE OE 19 | when 142 then [197, 189] # LATIN CAPITAL LETTER Z WITH 
CARON 20 | when 145 then [226, 128, 152] # LEFT SINGLE QUOTATION MARK 21 | when 146 then [226, 128, 153] # RIGHT SINGLE QUOTATION MARK 22 | when 147 then [226, 128, 156] # LEFT DOUBLE QUOTATION MARK 23 | when 148 then [226, 128, 157] # RIGHT DOUBLE QUOTATION MARK 24 | when 149 then [226, 128, 162] # BULLET 25 | when 150 then [226, 128, 147] # EN DASH 26 | when 151 then [226, 128, 148] # EM DASH 27 | when 152 then [203, 156] # SMALL TILDE 28 | when 153 then [226, 132, 162] # TRADE MARK SIGN 29 | when 154 then [197, 161] # LATIN SMALL LETTER S WITH CARON 30 | when 155 then [226, 128, 186] # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 31 | when 156 then [197, 147] # LATIN SMALL LIGATURE OE 32 | when 158 then [197, 190] # LATIN SMALL LETTER Z WITH CARON 33 | when 159 then [197, 184] # LATIN SMALL LETTER Y WITH DIAERESIS 34 | end 35 | end.freeze 36 | 37 | # precomputing all possible byte conversions 38 | CP1252_TO_UTF8 = Array.new(256) do |b| 39 | if (128..159).cover?(b) 40 | CP1252_MAP[b - 128]&.pack('C*') 41 | elsif b < 128 42 | b.chr 43 | else 44 | b < 192 ? [194, b].pack('C*') : [195, b - 64].pack('C*') 45 | end 46 | end.freeze 47 | 48 | def self.included(base) 49 | base.class_eval do 50 | def tidy_bytes(force = false) 51 | return nil if empty? 52 | 53 | if force 54 | buffer = String.new(capacity: bytesize) 55 | each_byte { |b| buffer << CP1252_TO_UTF8[b] } 56 | return buffer.force_encoding(Encoding::UTF_8) 57 | end 58 | 59 | begin 60 | encode('UTF-8') 61 | rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError 62 | buffer = String.new(capacity: bytesize) 63 | scrub { |b| CP1252_TO_UTF8[b.ord] } 64 | end 65 | end 66 | 67 | def tidy_bytes!(force = false) 68 | result = tidy_bytes(force) 69 | result ? replace(result) : self 70 | end 71 | end 72 | end 73 | end 74 | 75 | class String 76 | include TidyBytes 77 | end -------------------------------------------------------------------------------- /lib/wayback_machine_downloader/to_regex.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ToRegex 4 | module StringMixin 5 | INLINE_OPTIONS = /[imxnesu]*/i.freeze 6 | REGEXP_DELIMITERS = { 7 | '%r{' => '}'.freeze, 8 | '/' => '/'.freeze 9 | }.freeze 10 | 11 | REGEX_FLAGS = { 12 | ignore_case: Regexp::IGNORECASE, 13 | multiline: Regexp::MULTILINE, 14 | extended: Regexp::EXTENDED 15 | }.freeze 16 | 17 | class << self 18 | def literal?(str) 19 | REGEXP_DELIMITERS.none? { |start, ending| str.start_with?(start) && str.match?(/#{ending}#{INLINE_OPTIONS}\z/) } 20 | end 21 | end 22 | 23 | # Get a regex back 24 | # 25 | # Without :literal or :detect, `"foo".to_regex` will return nil. 26 | # 27 | # @param [optional, Hash] options 28 | # @option options [true,false] :literal Treat meta characters and other regexp codes as just text; always return a regexp 29 | # @option options [true,false] :detect If string starts and ends with valid regexp delimiters, treat it as a regexp; otherwise, interpret it literally 30 | # @option options [true,false] :ignore_case /foo/i 31 | # @option options [true,false] :multiline /foo/m 32 | # @option options [true,false] :extended /foo/x 33 | # @option options [true,false] :lang /foo/[nesu] 34 | def to_regex(options = {}) 35 | args = as_regexp(options) 36 | args ? 
/lib/wayback_machine_downloader/to_regex.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 | 
3 | module ToRegex
4 |   module StringMixin
5 |     INLINE_OPTIONS = /[imxnesu]*/i.freeze
6 |     REGEXP_DELIMITERS = {
7 |       '%r{' => '}',
8 |       '/' => '/'
9 |     }.freeze
10 | 
11 |     REGEX_FLAGS = {
12 |       ignore_case: Regexp::IGNORECASE,
13 |       multiline: Regexp::MULTILINE,
14 |       extended: Regexp::EXTENDED
15 |     }.freeze
16 | 
17 |     class << self
18 |       def literal?(str)
19 |         REGEXP_DELIMITERS.none? { |start, ending| str.start_with?(start) && str.match?(/#{ending}#{INLINE_OPTIONS}\z/) }
20 |       end
21 |     end
22 | 
23 |     # Get a regex back.
24 |     #
25 |     # Without :literal or :detect, `"foo".to_regex` will return nil.
26 |     #
27 |     # @param [optional, Hash] options
28 |     # @option options [true,false] :literal Treat meta characters and other regexp codes as plain text; always return a regexp
29 |     # @option options [true,false] :detect If the string starts and ends with valid regexp delimiters, treat it as a regexp; otherwise, interpret it literally
30 |     # @option options [true,false] :ignore_case /foo/i
31 |     # @option options [true,false] :multiline /foo/m
32 |     # @option options [true,false] :extended /foo/x
33 |     # @option options [true,false] :lang /foo/[nesu]
34 |     def to_regex(options = {})
35 |       args = as_regexp(options)
36 |       args ? Regexp.new(*args) : nil
37 |     end
38 | 
39 |     # Return arguments that can be passed to `Regexp.new`
40 |     # @see to_regex
41 |     def as_regexp(options = {})
42 |       raise ArgumentError, '[to_regex] Options must be a Hash' unless options.is_a?(Hash)
43 | 
44 |       str = self
45 |       return if options[:detect] && str.empty?
46 | 
47 |       if should_treat_as_literal?(str, options)
48 |         content = Regexp.escape(str)
49 |       elsif (delim_set = extract_delimiters(str))
50 |         content, options = parse_regexp_string(str, delim_set, options)
51 |         return unless content
52 |       else
53 |         return
54 |       end
55 | 
56 |       build_regexp_args(content, options)
57 |     end
58 | 
59 |     private
60 | 
61 |     def should_treat_as_literal?(str, options)
62 |       options[:literal] || (options[:detect] && ToRegex::StringMixin.literal?(str))
63 |     end
64 | 
65 |     def extract_delimiters(str)
66 |       REGEXP_DELIMITERS.find { |start, _| str.start_with?(start) }
67 |     end
68 | 
69 |     def parse_regexp_string(str, delim_set, options)
70 |       start_delim, end_delim = delim_set
71 |       match = /\A#{start_delim}(.*)#{end_delim}(#{INLINE_OPTIONS})\z/u.match(str)
72 |       return unless match
73 | 
74 |       content = match[1].gsub('\\/', '/')
75 |       parse_inline_options(match[2], options)
76 |       [content, options]
77 |     end
78 | 
79 |     def parse_inline_options(inline_options, options)
80 |       return unless inline_options
81 |       options[:ignore_case] = true if inline_options.include?('i')
82 |       options[:multiline] = true if inline_options.include?('m')
83 |       options[:extended] = true if inline_options.include?('x')
84 |       # 'n', 'N' = none, 'e', 'E' = EUC, 's', 'S' = SJIS, 'u', 'U' = UTF-8
85 |       options[:lang] = inline_options.scan(/[nesu]/i).join.downcase
86 |     end
87 | 
88 |     def build_regexp_args(content, options)
89 |       flags = calculate_flags(options)
90 |       lang = normalize_lang_option(options[:lang])
91 | 
92 |       lang.empty? ? [content, flags] : [content, flags, lang]
93 |     end
94 | 
95 |     def calculate_flags(options)
96 |       REGEX_FLAGS.sum { |key, value| options[key] ? value : 0 }
97 |     end
98 | 
99 |     def normalize_lang_option(lang)
100 |       return '' unless lang
101 |       # the 'u' (UTF-8) encoding flag has been the default since Ruby 1.9, so drop it
102 |       lang.delete('u')
103 |     end
104 |   end
105 | end
106 | 
107 | class String
108 |   include ToRegex::StringMixin
109 | end
--------------------------------------------------------------------------------
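A quick sketch of the String#to_regex contract defined above (return values shown as comments; all inputs illustrative):

    "foo".to_regex                     # => nil (neither :literal nor :detect given)
    "foo".to_regex(literal: true)      # => /foo/
    "foo".to_regex(detect: true)       # => /foo/ (no delimiters, so escaped literally)
    "/foo/i".to_regex(detect: true)    # => /foo/i
    "%r{a/b}m".to_regex(detect: true)  # => /a\/b/m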
/test/test_wayback_machine_downloader.rb:
--------------------------------------------------------------------------------
1 | require 'minitest/autorun'
2 | require 'fileutils'
3 | require 'stringio'
4 | require 'wayback_machine_downloader'
5 | 
6 | class WaybackMachineDownloaderTest < Minitest::Test
7 | 
8 |   def setup
9 |     @wayback_machine_downloader = WaybackMachineDownloader.new(
10 |       base_url: 'https://www.example.com'
11 |     )
12 |     $stdout = StringIO.new
13 |   end
14 | 
15 |   def teardown
16 |     # restore stdout so output from later tests is not swallowed
17 |     $stdout = STDOUT
18 |     FileUtils.rm_rf(@wayback_machine_downloader.backup_path)
19 |   end
20 | 
21 |   def test_base_url_being_set
22 |     assert_equal 'https://www.example.com', @wayback_machine_downloader.base_url
23 |   end
24 | 
25 |   def test_backup_name_being_set
26 |     assert_equal 'www.example.com', @wayback_machine_downloader.backup_name
27 |   end
28 | 
29 |   def test_backup_name_being_set_when_base_url_is_domain
30 |     @wayback_machine_downloader.base_url = 'www.example.com'
31 |     assert_equal 'www.example.com', @wayback_machine_downloader.backup_name
32 |   end
33 | 
34 |   def test_file_list_curated
35 |     assert_equal 20060711191226, @wayback_machine_downloader.get_file_list_curated["linux.htm"][:timestamp]
36 |   end
37 | 
38 |   def test_file_list_by_timestamp
39 |     file_expected = {
40 |       file_url: "http://www.onlyfreegames.net:80/strat.html",
41 |       timestamp: 20060111084756,
42 |       file_id: "strat.html"
43 |     }
44 |     assert_equal file_expected, @wayback_machine_downloader.get_file_list_by_timestamp[-2]
45 |   end
46 | 
47 |   def test_without_exact_url
48 |     @wayback_machine_downloader.exact_url = false
49 |     assert @wayback_machine_downloader.get_file_list_curated.size > 1
50 |   end
51 | 
52 |   def test_exact_url
53 |     @wayback_machine_downloader.exact_url = true
54 |     assert_equal 1, @wayback_machine_downloader.get_file_list_curated.size
55 |   end
56 | 
57 |   def test_file_list_only_filter_without_matches
58 |     @wayback_machine_downloader.only_filter = 'abc123'
59 |     assert_equal 0, @wayback_machine_downloader.get_file_list_curated.size
60 |   end
61 | 
62 |   def test_file_list_only_filter_with_1_match
63 |     @wayback_machine_downloader.only_filter = 'menu.html'
64 |     assert_equal 1, @wayback_machine_downloader.get_file_list_curated.size
65 |   end
66 | 
67 |   def test_file_list_only_filter_with_a_regex
68 |     @wayback_machine_downloader.only_filter = '/\.(gif|je?pg|bmp)$/i'
69 |     assert_equal 37, @wayback_machine_downloader.get_file_list_curated.size
70 |   end
71 | 
72 |   def test_file_list_exclude_filter_without_matches
73 |     @wayback_machine_downloader.exclude_filter = 'abc123'
74 |     assert_equal 68, @wayback_machine_downloader.get_file_list_curated.size
75 |   end
76 | 
77 |   def test_file_list_exclude_filter_with_1_match
78 |     @wayback_machine_downloader.exclude_filter = 'menu.html'
79 |     assert_equal 67, @wayback_machine_downloader.get_file_list_curated.size
80 |   end
81 | 
82 |   def test_file_list_exclude_filter_with_a_regex
83 |     @wayback_machine_downloader.exclude_filter = '/\.(gif|je?pg|bmp)$/i'
84 |     assert_equal 31, @wayback_machine_downloader.get_file_list_curated.size
85 |   end
86 | 
87 |   def test_file_download
88 |     @wayback_machine_downloader.download_files
89 |     linux_page = File.read 'websites/www.onlyfreegames.net/linux.htm'
90 |     assert_includes linux_page, "Linux Games"
91 |   end
92 | 
93 |   def test_all_timestamps_being_respected
94 |     @wayback_machine_downloader.all_timestamps = true
95 |     assert_equal 68, @wayback_machine_downloader.get_file_list_curated.size
96 |   end
97 | 
98 |   def test_from_timestamp_being_respected
99 |     @wayback_machine_downloader.from_timestamp = 20050716231334
100 |     file_url = @wayback_machine_downloader.get_file_list_curated["linux.htm"][:file_url]
101 |     assert_equal "http://www.onlyfreegames.net:80/linux.htm", file_url
102 |   end
103 | 
104 |   def test_to_timestamp_being_respected
105 |     @wayback_machine_downloader.to_timestamp = 20050716231334
106 |     assert_nil @wayback_machine_downloader.get_file_list_curated["linux.htm"]
107 |   end
108 | 
109 |   def test_all_get_file_list_curated_size
110 |     @wayback_machine_downloader.all = true
111 |     assert_equal 69, @wayback_machine_downloader.get_file_list_curated.size
112 |   end
113 | 
114 |   # Testing encoding conflicts needs a different base_url
115 |   def test_nonascii_suburls_download
116 |     @wayback_machine_downloader = WaybackMachineDownloader.new(
117 |       base_url: 'https://en.wikipedia.org/wiki/%C3%84')
118 |     # Once just for the downloading...
119 |     @wayback_machine_downloader.download_files
120 |   end
121 | 
122 |   def test_nonascii_suburls_already_present
123 |     @wayback_machine_downloader = WaybackMachineDownloader.new(
124 |       base_url: 'https://en.wikipedia.org/wiki/%C3%84')
125 |     # ... twice to test the "is already present" case
126 |     @wayback_machine_downloader.download_files
127 |     @wayback_machine_downloader.download_files
128 |   end
129 | 
130 | end
131 | 
--------------------------------------------------------------------------------
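As the filter tests above exercise, only_filter and exclude_filter accept either a plain string or a slash-delimited regex literal; the regex form presumably goes through the String#to_regex mixin defined earlier. A short sketch mirroring the test setup (the URL is illustrative):

    downloader = WaybackMachineDownloader.new(base_url: 'https://www.example.com')
    downloader.only_filter = 'menu.html'              # plain string filter
    downloader.only_filter = '/\.(gif|je?pg|bmp)$/i'  # regex filter, case-insensitive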
/wayback_machine_downloader.gemspec:
--------------------------------------------------------------------------------
1 | Gem::Specification.new do |s|
2 |   s.name = "wayback_machine_downloader_straw"
3 |   s.version = "2.3.8"
4 |   s.executables << "wayback_machine_downloader"
5 |   s.summary = "Download an entire website from the Wayback Machine."
6 |   s.description = "Download complete websites from the Internet Archive's Wayback Machine. While the Wayback Machine (archive.org) excellently preserves web history, it lacks built-in export functionality; this gem provides exactly that, letting you download entire archived websites. (This is a significant rewrite of the original wayback_machine_downloader gem by hartator, with enhanced features and performance improvements.)"
7 |   s.authors = ["strawberrymaster"]
8 |   s.email = "strawberrymaster@vivaldi.net"
9 |   s.files = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb"]
10 |   s.homepage = "https://github.com/StrawberryMaster/wayback-machine-downloader"
11 |   s.license = "MIT"
12 |   s.required_ruby_version = ">= 3.4.3"
13 |   s.add_runtime_dependency "concurrent-ruby", "~> 1.3", ">= 1.3.4"
14 |   s.add_development_dependency "rake", "~> 12.2"
15 |   s.add_development_dependency "minitest", "~> 5.2"
16 | end
17 | 
--------------------------------------------------------------------------------
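Finally, a minimal end-to-end sketch of the gem in use, matching the API exercised by the test suite (the target site is illustrative):

    # gem install wayback_machine_downloader_straw
    require 'wayback_machine_downloader'

    downloader = WaybackMachineDownloader.new(base_url: 'https://www.example.com')
    downloader.download_files
    # files are written beneath ./websites/www.example.com (the downloader's backup_path)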