├── .gitignore
├── .travis.yml
├── Dockerfile
├── Gemfile
├── MIT-LICENSE.txt
├── README.md
├── Rakefile
├── bin
│   └── wayback_machine_downloader
├── lib
│   ├── wayback_machine_downloader.rb
│   └── wayback_machine_downloader
│       ├── archive_api.rb
│       ├── tidy_bytes.rb
│       └── to_regex.rb
├── test
│   └── test_wayback_machine_downloader.rb
└── wayback_machine_downloader.gemspec

/.gitignore:
--------------------------------------------------------------------------------
## PROJECT::GENERAL
.yardoc
coverage
doc
rdoc
log
websites
.DS_Store

## BUNDLER
*.gem
.bundle
pkg
Gemfile.lock

## RBENV
.ruby-version
.rbenv*

## RCOV
coverage.data

tmp

## RUBINIUS
*.rbc

test.rb
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
sudo: false
language: ruby
rvm:
  - 1.9.2
  - 1.9.3
  - 2.0.0
  - 2.1
  - 2.2
  - 2.3.1
  - jruby
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM ruby:2.3
COPY . /build
RUN cd build && \
    bundle install
ENTRYPOINT [ "/usr/local/bundle/bin/wayback_machine_downloader" ]
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
source "https://rubygems.org"

gemspec

gem "retryable", "~> 3.0"
--------------------------------------------------------------------------------
/MIT-LICENSE.txt:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015-2016 Julian Khaleghy and contributors
See the full list at https://github.com/hartator/wayback-machine-downloader/graphs/contributors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Wayback Machine Downloader

[![Gem Version](https://badge.fury.io/rb/wayback_machine_downloader.svg)](https://rubygems.org/gems/wayback_machine_downloader/)
[![Build Status](https://travis-ci.org/hartator/wayback-machine-downloader.svg?branch=master)](https://travis-ci.org/hartator/wayback-machine-downloader)

Download an entire website from the Internet Archive Wayback Machine.

## Installation

You need Ruby (>= 1.9.2) installed on your system - if you don't have it already.
Then run:

    gem install wayback_machine_downloader

**Tip:** If you run into permission errors, you might have to add `sudo` in front of this command.

## Basic Usage

Run wayback_machine_downloader with the base url of the website you want to retrieve as a parameter (e.g., http://example.com):

    wayback_machine_downloader http://example.com

## How it works

It will download the last version of every file present on the Wayback Machine to `./websites/example.com/`. It will also re-create the directory structure and auto-create `index.html` pages to work seamlessly with Apache and Nginx. All files downloaded are the original ones and not the Wayback Machine rewritten versions. This way, the URL and link structure is the same as before.

## Advanced Usage

    Usage: wayback_machine_downloader http://example.com

    Download an entire website from the Wayback Machine.

    Optional options:
        -d, --directory PATH             Directory to save the downloaded files into
                                         Default is ./websites/ plus the domain name
        -s, --all-timestamps             Download all snapshots/timestamps for a given website
        -f, --from TIMESTAMP             Only files on or after timestamp supplied (e.g. 20060716231334)
        -t, --to TIMESTAMP               Only files on or before timestamp supplied (e.g. 20100916231334)
        -e, --exact-url                  Download only the url provided and not the full site
        -o, --only ONLY_FILTER           Restrict downloading to urls that match this filter
                                         (use // notation for the filter to be treated as a regex)
        -w, --wait SECONDS               Wait the specified number of seconds between requests
            --random-wait                When used with --wait, randomize number of seconds waited between requests by a factor of 0.5 to 2
            --tries NUMBER               Number of times to retry for non-fatal connection errors (Default is 20)
        -x, --exclude EXCLUDE_FILTER     Skip downloading of urls that match this filter
                                         (use // notation for the filter to be treated as a regex)
        -a, --all                        Expand downloading to error files (40x and 50x) and redirections (30x)
        -c, --concurrency NUMBER         Number of files to download at a time
                                         Default is one file at a time (e.g. 20)
        -p, --maximum-snapshot NUMBER    Maximum snapshot pages to consider (Default is 100)
                                         Each page lists an average of 150,000 snapshots
        -l, --list                       Only list file urls in a JSON format with the archived timestamps, won't download anything
        -v, --version                    Display version

## Specify directory to save files to

    -d, --directory PATH

Optional. By default, Wayback Machine Downloader will download files to `./websites/` followed by the domain name of the website. You may want to save files in a specific directory using this option.

Example:

    wayback_machine_downloader http://example.com --directory downloaded-backup/

## All Timestamps

    -s, --all-timestamps

Optional. This option will download all timestamps/snapshots for a given website, using the timestamp of each snapshot as a directory name.
Example:

    wayback_machine_downloader http://example.com --all-timestamps

Will download:

    websites/example.com/20060715085250/index.html
    websites/example.com/20051120005053/index.html
    websites/example.com/20060111095815/img/logo.png
    ...

## From Timestamp

    -f, --from TIMESTAMP

Optional. You may want to supply a from timestamp to lock your backup to a specific version of the website. Timestamps can be found inside the urls of the regular Wayback Machine website (e.g., http://web.archive.org/web/20060716231334/http://example.com). You can also use years (2006), years + month (200607), etc. It can be used in combination with the To Timestamp option.
Wayback Machine Downloader will then fetch only file versions on or after the timestamp specified.

Example:

    wayback_machine_downloader http://example.com --from 20060716231334

## To Timestamp

    -t, --to TIMESTAMP

Optional. You may want to supply a to timestamp to lock your backup to a specific version of the website. Timestamps can be found inside the urls of the regular Wayback Machine website (e.g., http://web.archive.org/web/20100916231334/http://example.com). You can also use years (2010), years + month (201009), etc. It can be used in combination with the From Timestamp option.
Wayback Machine Downloader will then fetch only file versions on or before the timestamp specified.

Example:

    wayback_machine_downloader http://example.com --to 20100916231334

## Exact Url

    -e, --exact-url

Optional. If you want to retrieve only the file matching exactly the url provided, you can use this flag. It will avoid downloading anything else.

For example, if you only want to download the html homepage file of example.com:

    wayback_machine_downloader http://example.com --exact-url

## Only URL Filter

    -o, --only ONLY_FILTER

Optional. You may want to retrieve files which are of a certain type (e.g., .pdf, .jpg, .wrd...) or are in a specific directory. To do so, you can supply the `--only` flag with a string or a regex (using the '/regex/' notation) to limit which files Wayback Machine Downloader will download.

For example, if you only want to download files inside a specific `my_directory`:

    wayback_machine_downloader http://example.com --only my_directory

Or if you want to download every image and nothing else:

    wayback_machine_downloader http://example.com --only "/\.(gif|jpg|jpeg)$/i"

## Exclude URL Filter

    -x, --exclude EXCLUDE_FILTER

Optional. You may want to retrieve files which aren't of a certain type (e.g., .pdf, .jpg, .wrd...) or aren't in a specific directory. To do so, you can supply the `--exclude` flag with a string or a regex (using the '/regex/' notation) to exclude files Wayback Machine Downloader would otherwise download.

For example, if you want to avoid downloading files inside `my_directory`:

    wayback_machine_downloader http://example.com --exclude my_directory

Or if you want to download everything except images:

    wayback_machine_downloader http://example.com --exclude "/\.(gif|jpg|jpeg)$/i"

## Expand downloading to all file types

    -a, --all

Optional. By default, Wayback Machine Downloader limits itself to files that responded with a 200 OK code.
If you also need error files (40x and 50x codes) or redirection files (30x codes), you can use the `--all` or `-a` flag and Wayback Machine Downloader will download them in addition to the 200 OK files. It will also keep empty files that are removed by default.

Example:

    wayback_machine_downloader http://example.com --all

## Only list files without downloading

    -l, --list

It will just display the files to be downloaded with their snapshot timestamps and urls. The output format is JSON. It won't download anything. It's useful for debugging or for feeding another application.

Example:

    wayback_machine_downloader http://example.com --list

## Maximum number of snapshot pages to consider

    -p, --maximum-snapshot NUMBER

Optional. Specify the maximum number of snapshot pages to consider. Each page lists an average of 150,000 snapshots. 100 is the default maximum and should be sufficient for most websites. Use a bigger number if you want to download a very large website.

Example:

    wayback_machine_downloader http://example.com --maximum-snapshot 300

## Download multiple files at a time

    -c, --concurrency NUMBER

Optional. Specify the number of files you want to download at the same time. This can speed up the download of a website significantly. The default is to download one file at a time.

Example:

    wayback_machine_downloader http://example.com --concurrency 20

## Using the Docker image

As an alternative installation method, we have a Docker image! Retrieve the wayback-machine-downloader Docker image this way:

    docker pull hartator/wayback-machine-downloader

Then, you should be able to use the Docker image to download websites. For example:

    docker run --rm -it -v $PWD/websites:/websites hartator/wayback-machine-downloader http://example.com

## Contributing

Contributions are welcome! Just submit a pull request via GitHub.

To run the tests:

    bundle install
    bundle exec rake test
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
require 'rake/testtask'

Rake::TestTask.new do |t|
  t.libs << 'test'
end

desc "Run tests"
task :default => :test
--------------------------------------------------------------------------------
/bin/wayback_machine_downloader:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby

require_relative '../lib/wayback_machine_downloader'
require 'optparse'
require 'pp'

options = {}
option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: wayback_machine_downloader http://example.com"

  opts.separator ""
  opts.separator "Download an entire website from the Wayback Machine."
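  # Typical invocations, for reference (see README.md for all options):
  #   wayback_machine_downloader http://example.com
  #   wayback_machine_downloader http://example.com --concurrency 20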
  opts.separator ""
  opts.separator "Optional options:"

  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into", "Default is ./websites/ plus the domain name") do |t|
    options[:directory] = t
  end

  opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
    options[:all_timestamps] = true
  end

  opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (e.g. 20060716231334)") do |t|
    options[:from_timestamp] = t
  end

  opts.on("-t", "--to TIMESTAMP", Integer, "Only files on or before timestamp supplied (e.g. 20100916231334)") do |t|
    options[:to_timestamp] = t
  end

  opts.on("-e", "--exact-url", "Download only the url provided and not the full site") do |t|
    options[:exact_url] = t
  end

  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
    options[:only_filter] = t
  end

  opts.on("-w", "--wait SECONDS", Integer, "Wait the specified number of seconds between requests") do |t|
    options[:wait_seconds] = t
  end

  opts.on("--random-wait", "When used with --wait, randomize number of seconds waited between requests by a factor of 0.5 to 2") do |t|
    options[:wait_randomized] = true
  end

  opts.on("--tries NUMBER", Integer, "Number of times to retry for non-fatal connection errors (Default is 20)") do |t|
    options[:tries] = t
  end

  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
    options[:exclude_filter] = t
  end

  opts.on("-a", "--all", "Expand downloading to error files (40x and 50x) and redirections (30x)") do |t|
    options[:all] = true
  end

  opts.on("-c", "--concurrency NUMBER", Integer, "Number of files to download at a time", "Default is one file at a time (e.g. 20)") do |t|
    options[:threads_count] = t
  end

  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Each page lists an average of 150,000 snapshots") do |t|
    options[:maximum_pages] = t
  end

  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
    options[:list] = true
  end

  opts.on("-v", "--version", "Display version") do |t|
    options[:version] = t
  end
end.parse!

if (base_url = ARGV[-1])
  options[:base_url] = base_url
  wayback_machine_downloader = WaybackMachineDownloader.new options
  if options[:list]
    wayback_machine_downloader.list_files
  else
    wayback_machine_downloader.download_files
  end
elsif options[:version]
  puts WaybackMachineDownloader::VERSION
else
  puts "You need to specify a website to backup. (e.g., http://example.com)"
  puts "Run `wayback_machine_downloader --help` for more help."
end
--------------------------------------------------------------------------------
/lib/wayback_machine_downloader.rb:
--------------------------------------------------------------------------------
# encoding: UTF-8

require 'thread'
require 'net/http'
require 'open-uri'
require 'fileutils'
require 'cgi'
require 'json'
require 'retryable'
require_relative 'wayback_machine_downloader/tidy_bytes'
require_relative 'wayback_machine_downloader/to_regex'
require_relative 'wayback_machine_downloader/archive_api'

class WaybackMachineDownloader

  include ArchiveAPI

  VERSION = "2.2.1"

  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
    :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
    :all, :maximum_pages, :threads_count, :wait_seconds, :wait_randomized

  def initialize params
    @base_url = params[:base_url]
    @exact_url = params[:exact_url]
    @directory = params[:directory]
    @all_timestamps = params[:all_timestamps]
    @from_timestamp = params[:from_timestamp].to_i
    @to_timestamp = params[:to_timestamp].to_i
    @only_filter = params[:only_filter]
    @exclude_filter = params[:exclude_filter]
    @all = params[:all]
    @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
    @threads_count = params[:threads_count].to_i
    @wait_seconds = params[:wait_seconds].to_i
    @wait_randomized = params[:wait_randomized]
    @tries = params[:tries] ? params[:tries].to_i : 20
  end

  def backup_name
    if @base_url.include? '//'
      @base_url.split('/')[2]
    else
      @base_url
    end
  end

  def backup_path
    if @directory
      if @directory[-1] == '/'
        @directory
      else
        @directory + '/'
      end
    else
      'websites/' + backup_name + '/'
    end
  end

  def match_only_filter file_url
    if @only_filter
      only_filter_regex = @only_filter.to_regex
      if only_filter_regex
        only_filter_regex =~ file_url
      else
        file_url.downcase.include? @only_filter.downcase
      end
    else
      true
    end
  end

  def match_exclude_filter file_url
    if @exclude_filter
      exclude_filter_regex = @exclude_filter.to_regex
      if exclude_filter_regex
        exclude_filter_regex =~ file_url
      else
        file_url.downcase.include? @exclude_filter.downcase
      end
    else
      false
    end
  end

  def get_all_snapshots_to_consider
    # Note: Passing a page index parameter allows us to get more snapshots,
    # but from a less fresh index
    print "Getting snapshot pages"
    snapshot_list_to_consider = ""
    snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
    print "."
    unless @exact_url
      @maximum_pages.times do |page_index|
        wait
        snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
        break if snapshot_list.empty?
        snapshot_list_to_consider += snapshot_list
        print "."
      end
    end
    puts " found #{snapshot_list_to_consider.lines.count} snapshots to consider."
    puts
    snapshot_list_to_consider
  end

  def get_file_list_curated
    file_list_curated = Hash.new
    get_all_snapshots_to_consider.each_line do |line|
      next unless line.include?('/')
      file_timestamp = line[0..13].to_i
      file_url = line[15..-2]
      file_id = file_url.split('/')[3..-1].join('/')
      file_id = CGI::unescape file_id
      file_id = file_id.tidy_bytes unless file_id == ""
      if file_id.nil?
        puts "Malformed file url, ignoring: #{file_url}"
      else
        if match_exclude_filter(file_url)
          puts "File url matches exclude filter, ignoring: #{file_url}"
        elsif not match_only_filter(file_url)
          puts "File url doesn't match only filter, ignoring: #{file_url}"
        elsif file_list_curated[file_id]
          unless file_list_curated[file_id][:timestamp] > file_timestamp
            file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
          end
        else
          file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
        end
      end
    end
    file_list_curated
  end

  def get_file_list_all_timestamps
    file_list_curated = Hash.new
    get_all_snapshots_to_consider.each_line do |line|
      next unless line.include?('/')
      file_timestamp = line[0..13].to_i
      file_url = line[15..-2]
      file_id = file_url.split('/')[3..-1].join('/')
      file_id_and_timestamp = [file_timestamp, file_id].join('/')
      file_id_and_timestamp = CGI::unescape file_id_and_timestamp
      file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
      if file_id.nil?
        puts "Malformed file url, ignoring: #{file_url}"
      else
        if match_exclude_filter(file_url)
          puts "File url matches exclude filter, ignoring: #{file_url}"
        elsif not match_only_filter(file_url)
          puts "File url doesn't match only filter, ignoring: #{file_url}"
        elsif file_list_curated[file_id_and_timestamp]
          puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
        else
          file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
        end
      end
    end
    puts "file_list_curated: " + file_list_curated.count.to_s
    file_list_curated
  end


  def get_file_list_by_timestamp
    if @all_timestamps
      file_list_curated = get_file_list_all_timestamps
      file_list_curated.map do |file_remote_info|
        file_remote_info[1][:file_id] = file_remote_info[0]
        file_remote_info[1]
      end
    else
      file_list_curated = get_file_list_curated
      file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
      file_list_curated.map do |file_remote_info|
        file_remote_info[1][:file_id] = file_remote_info[0]
        file_remote_info[1]
      end
    end
  end

  def list_files
    # retrieval produces its own output
    files = get_file_list_by_timestamp
    puts "["
    # join with commas so the output is valid JSON (no trailing comma before "]")
    puts files.map(&:to_json).join(",\n")
    puts "]"
  end

  def download_files
    start_time = Time.now
    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
    puts

    if file_list_by_timestamp.count == 0
      puts "No files to download."
      puts "Possible reasons:"
      puts "\t* Site is not in Wayback Machine Archive."
      puts "\t* From timestamp too far in the future." if @from_timestamp and @from_timestamp != 0
      puts "\t* To timestamp too far in the past." if @to_timestamp and @to_timestamp != 0
      puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
      puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
      return
    end

    puts "#{file_list_by_timestamp.count} files to download:"

    threads = []
    @processed_file_count = 0
    @threads_count = 1 if @threads_count == 0
    @threads_count.times do
      threads << Thread.new do
        until file_queue.empty?
          wait
          file_remote_info = file_queue.pop(true) rescue nil
          download_file(file_remote_info) if file_remote_info
        end
      end
    end

    threads.each(&:join)
    end_time = Time.now
    puts
    puts "Download completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
  end

  def structure_dir_path dir_path
    begin
      FileUtils::mkdir_p dir_path unless File.exist? dir_path
    rescue Errno::EEXIST => e
      error_to_string = e.to_s
      puts "# #{error_to_string}"
      if error_to_string.include? "File exists @ dir_s_mkdir - "
        file_already_existing = error_to_string.split("File exists @ dir_s_mkdir - ")[-1]
      elsif error_to_string.include? "File exists - "
        file_already_existing = error_to_string.split("File exists - ")[-1]
      else
        raise "Unhandled directory restructure error # #{error_to_string}"
      end
      file_already_existing_temporary = file_already_existing + '.temp'
      file_already_existing_permanent = file_already_existing + '/index.html'
      FileUtils::mv file_already_existing, file_already_existing_temporary
      FileUtils::mkdir_p file_already_existing
      FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
      puts "#{file_already_existing} -> #{file_already_existing_permanent}"
      structure_dir_path dir_path
    end
  end

  def download_file file_remote_info
    current_encoding = "".encoding
    file_url = file_remote_info[:file_url].encode(current_encoding)
    file_id = file_remote_info[:file_id]
    file_timestamp = file_remote_info[:timestamp]
    file_path_elements = file_id.split('/')
    if file_id == ""
      dir_path = backup_path
      file_path = backup_path + 'index.html'
    elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
      dir_path = backup_path + file_path_elements[0..-1].join('/')
      file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
    else
      dir_path = backup_path + file_path_elements[0..-2].join('/')
      file_path = backup_path + file_path_elements[0..-1].join('/')
    end
    if Gem.win_platform?
      dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
      file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
    end
    unless File.exist? file_path
      begin
        structure_dir_path dir_path
        File.open(file_path, "wb") do |file|
          begin
            Retryable.retryable(tries: @tries, on: Net::ReadTimeout, sleep_method: self.method(:wait)) do
              URI.open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
                file.write(uri.read)
              end
            end
          rescue OpenURI::HTTPError => e
            puts "#{file_url} # #{e}"
            if @all
              file.write(e.io.read)
              puts "#{file_path} saved anyway."
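              # With --all, the body served for the error (e.g., the archived
              # 404 page itself) is kept on disk, so the local copy mirrors
              # what the Wayback Machine returned instead of leaving a gap.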
            end
          rescue StandardError => e
            puts "#{file_url} # #{e}"
          end
        end
      rescue StandardError => e
        puts "#{file_url} # #{e}"
      ensure
        if not @all and File.exist?(file_path) and File.size(file_path) == 0
          File.delete(file_path)
          puts "#{file_path} was empty and was removed."
        end
      end
      semaphore.synchronize do
        @processed_file_count += 1
        puts "#{file_url} -> #{file_path} (#{@processed_file_count}/#{file_list_by_timestamp.size})"
      end
    else
      semaphore.synchronize do
        @processed_file_count += 1
        puts "#{file_url} # #{file_path} already exists. (#{@processed_file_count}/#{file_list_by_timestamp.size})"
      end
    end
  end

  def file_queue
    @file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info }
  end

  def file_list_by_timestamp
    @file_list_by_timestamp ||= get_file_list_by_timestamp
  end

  def semaphore
    @semaphore ||= Mutex.new
  end

  # Also used as Retryable's sleep_method, which calls it with the retry
  # interval as an argument; that argument is ignored in favor of @wait_seconds.
  def wait(_interval = nil)
    if @wait_seconds.positive? && @wait_randomized
      # --random-wait: randomize the wait by a factor of 0.5 to 2
      sleep(@wait_seconds.to_f * (rand(1.5) + 0.5))
    else
      sleep(@wait_seconds)
    end
  end
end
--------------------------------------------------------------------------------
/lib/wayback_machine_downloader/archive_api.rb:
--------------------------------------------------------------------------------
module ArchiveAPI

  def get_raw_list_from_api url, page_index
    request_url = "http://web.archive.org/cdx/search/cdx?url="
    request_url += CGI.escape url
    request_url += parameters_for_api page_index

    Retryable.retryable(tries: @tries, on: Net::ReadTimeout, sleep_method: self.method(:wait)) do
      URI.open(request_url).read
    end
  end

  def parameters_for_api page_index
    parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
    parameters += "&filter=statuscode:200" unless @all
    if @from_timestamp and @from_timestamp != 0
      parameters += "&from=" + @from_timestamp.to_s
    end
    if @to_timestamp and @to_timestamp != 0
      parameters += "&to=" + @to_timestamp.to_s
    end
    if page_index
      parameters += "&page=#{page_index}"
    end
    parameters
  end

end
--------------------------------------------------------------------------------
/lib/wayback_machine_downloader/tidy_bytes.rb:
--------------------------------------------------------------------------------
module TidyBytes

  # CP-1252 decimal byte => UTF-8 approximation as an array of bytes
  CP1252 = {
    128 => [226, 130, 172],
    129 => nil,
    130 => [226, 128, 154],
    131 => [198, 146],
    132 => [226, 128, 158],
    133 => [226, 128, 166],
    134 => [226, 128, 160],
    135 => [226, 128, 161],
    136 => [203, 134],
    137 => [226, 128, 176],
    138 => [197, 160],
    139 => [226, 128, 185],
    140 => [197, 146],
    141 => nil,
    142 => [197, 189],
    143 => nil,
    144 => nil,
    145 => [226, 128, 152],
    146 => [226, 128, 153],
    147 => [226, 128, 156],
    148 => [226, 128, 157],
    149 => [226, 128, 162],
    150 => [226, 128, 147],
    151 => [226, 128, 148],
    152 => [203, 156],
    153 => [226, 132, 162],
    154 => [197, 161],
    155 => [226, 128, 186],
    156 => [197, 147],
    157 => nil,
    158 => [197, 190],
    159 => [197, 184]
  }

  module StringMixin

    # Attempt to replace invalid UTF-8 bytes with valid ones. This method
    # naively assumes if you have invalid UTF-8 bytes, they are either Windows
    # CP-1252 or ISO-8859-1. In practice this isn't a bad assumption, but may not
    # always work.
    #
    # Passing +true+ will forcibly tidy all bytes, assuming that the string's
    # encoding is CP-1252 or ISO-8859-1.
    def tidy_bytes(force = false)

      if force
        return unpack("C*").map do |b|
          tidy_byte(b)
        end.flatten.compact.pack("C*").unpack("U*").pack("U*")
      end

      bytes = unpack("C*")
      conts_expected = 0
      last_lead = 0

      bytes.each_index do |i|

        byte = bytes[i]
        _is_ascii = byte < 128
        is_cont = byte > 127 && byte < 192
        is_lead = byte > 191 && byte < 245
        is_unused = byte > 240
        is_restricted = byte > 244

        # Impossible or highly unlikely byte? Clean it.
        if is_unused || is_restricted
          bytes[i] = tidy_byte(byte)
        elsif is_cont
          # Not expecting a continuation byte? Clean up. Otherwise, now expect one less.
          conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
        else
          if conts_expected > 0
            # Expected continuation, but got ASCII or leading? Clean backwards up to
            # the leading byte.
            begin
              (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
            rescue NoMethodError
              next
            end
            conts_expected = 0
          end
          if is_lead
            # Final byte is leading? Clean it.
            if i == bytes.length - 1
              bytes[i] = tidy_byte(bytes.last)
            else
              # Valid leading byte? Expect continuations determined by position of
              # first zero bit, with max of 3.
              conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
              last_lead = i
            end
          end
        end
      end
      begin
        bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
      rescue ArgumentError
        nil
      end
    end

    # Tidy bytes in-place.
    def tidy_bytes!(force = false)
      replace tidy_bytes(force)
    end

    private

    def tidy_byte(byte)
      byte < 160 ? TidyBytes::CP1252[byte] : byte < 192 ? [194, byte] : [195, byte - 64]
    end

  end
end

class String
  include TidyBytes::StringMixin
end
--------------------------------------------------------------------------------
/lib/wayback_machine_downloader/to_regex.rb:
--------------------------------------------------------------------------------
module ToRegex
  module StringMixin
    class << self
      def literal?(str)
        REGEXP_DELIMITERS.none? { |s, e| str.start_with?(s) and str =~ /#{e}#{INLINE_OPTIONS}\z/ }
      end
    end

    INLINE_OPTIONS = /[imxnesu]*/
    REGEXP_DELIMITERS = {
      '%r{' => '}',
      '/' => '/',
    }

    # Get a regex back
    #
    # Without :literal or :detect, `"foo".to_regex` will return nil.
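    # A few illustrative calls (these follow from the delimiters table above;
    # outputs shown are what `Regexp.new` produces for the parsed parts):
    #   "/foo.*bar/i".to_regex        #=> /foo.*bar/i
    #   "%r{foo/bar}".to_regex        #=> /foo\/bar/
    #   "foo".to_regex                #=> nil
    #   "foo".to_regex(literal: true) #=> /foo/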
    #
    # @param [optional, Hash] options
    # @option options [true,false] :literal Treat meta characters and other regexp codes as just text; always return a regexp
    # @option options [true,false] :detect If string starts and ends with valid regexp delimiters, treat it as a regexp; otherwise, interpret it literally
    # @option options [true,false] :ignore_case /foo/i
    # @option options [true,false] :multiline /foo/m
    # @option options [true,false] :extended /foo/x
    # @option options [true,false] :lang /foo/[nesu]
    def to_regex(options = {})
      if args = as_regexp(options)
        ::Regexp.new(*args)
      end
    end

    # Return arguments that can be passed to `Regexp.new`
    # @see to_regex
    def as_regexp(options = {})
      unless options.is_a?(::Hash)
        raise ::ArgumentError, "[to_regex] Options must be a Hash"
      end
      str = self

      return if options[:detect] and str == ''

      if options[:literal] or (options[:detect] and ToRegex::StringMixin.literal?(str))
        content = ::Regexp.escape str
      elsif delim_set = REGEXP_DELIMITERS.detect { |k, _| str.start_with?(k) }
        delim_start, delim_end = delim_set
        /\A#{delim_start}(.*)#{delim_end}(#{INLINE_OPTIONS})\z/u =~ str
        content = $1
        inline_options = $2
        return unless content.is_a?(::String)
        content.gsub! '\\/', '/'
        if inline_options
          options[:ignore_case] = true if inline_options.include?('i')
          options[:multiline] = true if inline_options.include?('m')
          options[:extended] = true if inline_options.include?('x')
          # 'n', 'N' = none, 'e', 'E' = EUC, 's', 'S' = SJIS, 'u', 'U' = UTF-8
          options[:lang] = inline_options.scan(/[nesu]/i).join.downcase
        end
      else
        return
      end

      ignore_case = options[:ignore_case] ? ::Regexp::IGNORECASE : 0
      multiline = options[:multiline] ? ::Regexp::MULTILINE : 0
      extended = options[:extended] ? ::Regexp::EXTENDED : 0
      lang = options[:lang] || ''
      if ::RUBY_VERSION > '1.9' and lang.include?('u')
        lang = lang.delete 'u'
      end

      if lang.empty?
        [ content, (ignore_case|multiline|extended) ]
      else
        [ content, (ignore_case|multiline|extended), lang ]
      end
    end
  end
end

class String
  include ToRegex::StringMixin
end
--------------------------------------------------------------------------------
/test/test_wayback_machine_downloader.rb:
--------------------------------------------------------------------------------
require 'minitest/autorun'
require 'wayback_machine_downloader'

class WaybackMachineDownloaderTest < Minitest::Test

  def setup
    @wayback_machine_downloader = WaybackMachineDownloader.new(
      base_url: 'http://www.onlyfreegames.net')
    $stdout = StringIO.new
  end

  def teardown
    FileUtils.rm_rf(@wayback_machine_downloader.backup_path)
  end

  def test_base_url_being_set
    assert_equal 'http://www.onlyfreegames.net', @wayback_machine_downloader.base_url
  end

  def test_backup_name_being_set
    assert_equal 'www.onlyfreegames.net', @wayback_machine_downloader.backup_name
  end

  def test_backup_name_being_set_when_base_url_is_domain
    @wayback_machine_downloader.base_url = 'www.onlyfreegames.net'
    assert_equal 'www.onlyfreegames.net', @wayback_machine_downloader.backup_name
  end

  def test_file_list_curated
    assert_equal 20060711191226, @wayback_machine_downloader.get_file_list_curated["linux.htm"][:timestamp]
  end

  def test_file_list_by_timestamp
    file_expected = {
      file_url: "http://www.onlyfreegames.net:80/strat.html",
      timestamp: 20060111084756,
      file_id: "strat.html"
    }
    assert_equal file_expected, @wayback_machine_downloader.get_file_list_by_timestamp[-2]
  end

  def test_without_exact_url
    @wayback_machine_downloader.exact_url = false
    assert @wayback_machine_downloader.get_file_list_curated.size > 1
  end

  def test_exact_url
    @wayback_machine_downloader.exact_url = true
    assert_equal 1, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_only_filter_without_matches
    @wayback_machine_downloader.only_filter = 'abc123'
    assert_equal 0, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_only_filter_with_1_match
    @wayback_machine_downloader.only_filter = 'menu.html'
    assert_equal 1, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_only_filter_with_a_regex
    @wayback_machine_downloader.only_filter = '/\.(gif|je?pg|bmp)$/i'
    assert_equal 37, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_exclude_filter_without_matches
    @wayback_machine_downloader.exclude_filter = 'abc123'
    assert_equal 68, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_exclude_filter_with_1_match
    @wayback_machine_downloader.exclude_filter = 'menu.html'
    assert_equal 67, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_exclude_filter_with_a_regex
    @wayback_machine_downloader.exclude_filter = '/\.(gif|je?pg|bmp)$/i'
    assert_equal 31, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_download
    @wayback_machine_downloader.download_files
    linux_page = File.open 'websites/www.onlyfreegames.net/linux.htm'
    assert_includes linux_page.read, "Linux Games"
  end

  def test_all_timestamps_being_respected
    @wayback_machine_downloader.all_timestamps = true
    assert_equal 68, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_from_timestamp_being_respected
    @wayback_machine_downloader.from_timestamp = 20050716231334
    file_url = @wayback_machine_downloader.get_file_list_curated["linux.htm"][:file_url]
    assert_equal "http://www.onlyfreegames.net:80/linux.htm", file_url
  end

  def test_to_timestamp_being_respected
    @wayback_machine_downloader.to_timestamp = 20050716231334
    assert_nil @wayback_machine_downloader.get_file_list_curated["linux.htm"]
  end

  def test_all_get_file_list_curated_size
    @wayback_machine_downloader.all = true
    assert_equal 69, @wayback_machine_downloader.get_file_list_curated.size
  end

  # Testing encoding conflicts needs a different base_url
  def test_nonascii_suburls_download
    @wayback_machine_downloader = WaybackMachineDownloader.new(
      base_url: 'https://en.wikipedia.org/wiki/%C3%84')
    # Once just for the downloading...
    @wayback_machine_downloader.download_files
  end

  def test_nonascii_suburls_already_present
    @wayback_machine_downloader = WaybackMachineDownloader.new(
      base_url: 'https://en.wikipedia.org/wiki/%C3%84')
    # ... twice to test the "is already present" case
    @wayback_machine_downloader.download_files
    @wayback_machine_downloader.download_files
  end

end
--------------------------------------------------------------------------------
/wayback_machine_downloader.gemspec:
--------------------------------------------------------------------------------
require './lib/wayback_machine_downloader'

Gem::Specification.new do |s|
  s.name = "wayback_machine_downloader"
  s.version = WaybackMachineDownloader::VERSION
  s.executables << "wayback_machine_downloader"
  s.summary = "Download an entire website from the Wayback Machine."
  s.description = "Download an entire website from the Wayback Machine. Wayback Machine by Internet Archive (archive.org) is an awesome tool to view any website at any point of time but lacks an export feature. Wayback Machine Downloader brings exactly this."
  s.authors = ["hartator"]
  s.email = "hartator@gmail.com"
  s.files = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb"]
  s.homepage = "https://github.com/hartator/wayback-machine-downloader"
  s.license = "MIT"
  s.required_ruby_version = '>= 1.9.2'
  s.add_dependency 'retryable', '~> 3.0'
  s.add_development_dependency 'rake', '~> 10.2'
  s.add_development_dependency 'minitest', '~> 5.2'
end
--------------------------------------------------------------------------------
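For reference, the library can also be driven from Ruby directly rather than through the CLI executable. A minimal sketch, using only the constructor params and methods defined in lib/wayback_machine_downloader.rb above (the directory name and timestamps are illustrative):

    require 'wayback_machine_downloader'

    # Grab the latest snapshot of every page under example.com,
    # 20 files at a time, into ./my-backup/.
    downloader = WaybackMachineDownloader.new(
      base_url: 'http://example.com',
      directory: 'my-backup',    # same as --directory
      threads_count: 20,         # same as --concurrency
      from_timestamp: 20060101,  # same as --from
      to_timestamp: 20101231     # same as --to
    )
    downloader.download_files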