├── .gitignore
├── .travis.yml
├── Dockerfile
├── Gemfile
├── MIT-LICENSE.txt
├── README.md
├── Rakefile
├── bin
│   └── wayback_machine_downloader
├── lib
│   ├── wayback_machine_downloader.rb
│   └── wayback_machine_downloader
│       ├── archive_api.rb
│       ├── tidy_bytes.rb
│       └── to_regex.rb
├── test
│   └── test_wayback_machine_downloader.rb
└── wayback_machine_downloader.gemspec

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
## PROJECT::GENERAL
.yardoc
coverage
doc
rdoc
log
websites
.DS_Store
.rake_tasks~

## BUNDLER
*.gem
.bundle
pkg
Gemfile.lock

## RBENV
.ruby-version
.rbenv*

## RCOV
coverage.data

tmp

## RUBINIUS
*.rbc

test.rb

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
sudo: false
language: ruby
rvm:
  - 1.9.2
  - 1.9.3
  - 2.0.0
  - 2.1
  - 2.2
  - 2.3.1
  - jruby

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM ruby:2.3
COPY . /build
RUN cd build && \
    bundle install
ENTRYPOINT [ "/usr/local/bundle/bin/wayback_machine_downloader" ]

--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
source "https://rubygems.org"

gemspec

--------------------------------------------------------------------------------
/MIT-LICENSE.txt:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015-2016 Julian Khaleghy and contributors
See the full list at https://github.com/hartator/wayback-machine-downloader/graphs/contributors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Wayback Machine Downloader

[![Gem Version](https://badge.fury.io/rb/wayback_machine_downloader.svg)](https://rubygems.org/gems/wayback_machine_downloader/)
[![Build Status](https://travis-ci.org/hartator/wayback-machine-downloader.svg?branch=master)](https://travis-ci.org/hartator/wayback-machine-downloader)

Download an entire website from the Internet Archive Wayback Machine.

## Installation

If you don't already have it, install Ruby (>= 1.9.2) on your system.
Then run:

    gem install wayback_machine_downloader

**Tip:** If you run into permission errors, you might have to add `sudo` in front of this command.

## Basic Usage

Run wayback_machine_downloader with the base url of the website you want to retrieve as a parameter (e.g., http://example.com):

    wayback_machine_downloader http://example.com

## How it works

It will download the last version of every file present on the Wayback Machine to `./websites/example.com/`. It will also re-create the directory structure and auto-create `index.html` pages so the result works seamlessly with Apache and Nginx. All downloaded files are the originals and not the Wayback Machine rewritten versions. This way, the URL and link structure is the same as before.
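For example, a site archived with a homepage and a logo image would end up saved as (paths illustrative):

    websites/example.com/index.html
    websites/example.com/img/logo.png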
## Advanced Usage

    Usage: wayback_machine_downloader http://example.com

    Download an entire website from the Wayback Machine.

    Optional options:
        -d, --directory PATH             Directory to save the downloaded files into
                                         Default is ./websites/ plus the domain name
        -s, --all-timestamps             Download all snapshots/timestamps for a given website
        -f, --from TIMESTAMP             Only files on or after timestamp supplied (e.g. 20060716231334)
        -t, --to TIMESTAMP               Only files on or before timestamp supplied (e.g. 20100916231334)
        -e, --exact-url                  Download only the url provided and not the full site
        -o, --only ONLY_FILTER           Restrict downloading to urls that match this filter
                                         (use // notation for the filter to be treated as a regex)
        -x, --exclude EXCLUDE_FILTER     Skip downloading of urls that match this filter
                                         (use // notation for the filter to be treated as a regex)
        -a, --all                        Expand downloading to error files (40x and 50x) and redirections (30x)
        -c, --concurrency NUMBER         Number of files to download at a time
                                         Default is one file at a time (e.g. 20)
        -p, --maximum-snapshot NUMBER    Maximum snapshot pages to consider (Default is 100)
                                         Each page holds an average of 150,000 snapshots
        -l, --list                       Only list file urls in a JSON format with the archived timestamps, won't download anything

## Specify directory to save files to

    -d, --directory PATH

Optional. By default, Wayback Machine Downloader will download files to `./websites/` followed by the domain name of the website. You may want to save files in a specific directory using this option.

Example:

    wayback_machine_downloader http://example.com --directory downloaded-backup/

## All Timestamps

    -s, --all-timestamps

Optional. This option will download all timestamps/snapshots for a given website. It will use the timestamp of each snapshot as the directory name.

Example:

    wayback_machine_downloader http://example.com --all-timestamps

Will download:

    websites/example.com/20060715085250/index.html
    websites/example.com/20051120005053/index.html
    websites/example.com/20060111095815/img/logo.png
    ...

## From Timestamp

    -f, --from TIMESTAMP

Optional. You may want to supply a from timestamp to lock your backup to a specific version of the website. Timestamps can be found inside the urls of the regular Wayback Machine website (e.g., https://web.archive.org/web/20060716231334/http://example.com). You can also use years (2006), years + month (200607), etc. It can be used in combination with To Timestamp.
Wayback Machine Downloader will then fetch only file versions on or after the timestamp specified.

Example:

    wayback_machine_downloader http://example.com --from 20060716231334

## To Timestamp

    -t, --to TIMESTAMP

Optional. You may want to supply a to timestamp to lock your backup to a specific version of the website. Timestamps can be found inside the urls of the regular Wayback Machine website (e.g., https://web.archive.org/web/20100916231334/http://example.com). You can also use years (2010), years + month (201009), etc. It can be used in combination with From Timestamp.
Wayback Machine Downloader will then fetch only file versions on or before the timestamp specified.

Example:

    wayback_machine_downloader http://example.com --to 20100916231334

## Exact Url

    -e, --exact-url

Optional. If you want to retrieve only the file matching exactly the url provided, you can use this flag. It will avoid downloading anything else.

For example, if you want to download only the html homepage file of example.com:

    wayback_machine_downloader http://example.com --exact-url

## Only URL Filter

    -o, --only ONLY_FILTER

Optional. You may want to retrieve files which are of a certain type (e.g., .pdf, .jpg, .wrd...) or are in a specific directory. To do so, you can supply the `--only` flag with a string or a regex (using the '/regex/' notation) to limit which files Wayback Machine Downloader will download.

For example, if you only want to download files inside a specific `my_directory`:

    wayback_machine_downloader http://example.com --only my_directory

Or if you want to download all images and nothing else:

    wayback_machine_downloader http://example.com --only "/\.(gif|jpg|jpeg)$/i"

## Exclude URL Filter

    -x, --exclude EXCLUDE_FILTER

Optional. You may want to retrieve files which aren't of a certain type (e.g., .pdf, .jpg, .wrd...) or aren't in a specific directory. To do so, you can supply the `--exclude` flag with a string or a regex (using the '/regex/' notation) to skip files that Wayback Machine Downloader would otherwise download.

For example, if you want to avoid downloading files inside `my_directory`:

    wayback_machine_downloader http://example.com --exclude my_directory

Or if you want to download everything except images:

    wayback_machine_downloader http://example.com --exclude "/\.(gif|jpg|jpeg)$/i"

## Expand downloading to all file types

    -a, --all

Optional. By default, Wayback Machine Downloader limits itself to files that responded with a 200 OK code. If you also need error files (40x and 50x codes) or redirection files (30x codes), you can use the `--all` or `-a` flag and Wayback Machine Downloader will download them in addition to the 200 OK files. It will also keep empty files that are removed by default.

Example:

    wayback_machine_downloader http://example.com --all

## Only list files without downloading

    -l, --list

It will just display the files to be downloaded with their snapshot timestamps and urls. The output format is JSON. It won't download anything. It's useful for debugging or for feeding another application.

Example:

    wayback_machine_downloader http://example.com --list
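The output looks like this (values illustrative):

    [
    {"file_url":"http://example.com:80/index.html","timestamp":20060716231334,"file_id":"index.html"},
    {"file_url":"http://example.com:80/img/logo.png","timestamp":20060111095815,"file_id":"img/logo.png"}
    ]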
## Maximum number of snapshot pages to consider

    -p, --maximum-snapshot NUMBER

Optional. Specify the maximum number of snapshot pages to consider. Each page holds an average of 150,000 snapshots. 100 is the default maximum number of snapshot pages and should be sufficient for most websites. Use a bigger number if you want to download a very large website.

Example:

    wayback_machine_downloader http://example.com --maximum-snapshot 300

## Download multiple files at a time

    -c, --concurrency NUMBER

Optional. Specify how many files you want to download at the same time. This can speed up the download of a website significantly. Default is to download one file at a time.

Example:

    wayback_machine_downloader http://example.com --concurrency 20

## Using the Docker image

As an alternative installation method, we have a Docker image! Retrieve the wayback-machine-downloader Docker image this way:

    docker pull hartator/wayback-machine-downloader

Then, you should be able to use the Docker image to download websites. For example:

    docker run --rm -it -v $PWD/websites:/websites hartator/wayback-machine-downloader http://example.com
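## Using as a Ruby library

The command-line tool is a thin wrapper around the `WaybackMachineDownloader` class, so you can also drive it from Ruby code. A minimal sketch (the option keys mirror the CLI flags above; values are illustrative):

    require 'wayback_machine_downloader'

    downloader = WaybackMachineDownloader.new(
      base_url:      'http://example.com', # required
      threads_count: 20,                   # same as --concurrency
      directory:     'downloaded-backup/'  # same as --directory
    )
    downloader.download_files

Calling `list_files` instead of `download_files` is the equivalent of passing `--list`.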
## Contributing

Contributions are welcome! Just submit a pull request via GitHub.

To run the tests:

    bundle install
    bundle exec rake test

--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
require 'rake/testtask'

Rake::TestTask.new do |t|
  t.libs << 'test'
end

desc "Run tests"
task :default => :test

--------------------------------------------------------------------------------
/bin/wayback_machine_downloader:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby

require_relative '../lib/wayback_machine_downloader'
require 'optparse'
require 'pp'

options = {}
option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: wayback_machine_downloader http://example.com"

  opts.separator ""
  opts.separator "Download an entire website from the Wayback Machine."

  opts.separator ""
  opts.separator "Optional options:"

  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into", "Default is ./websites/ plus the domain name") do |t|
    options[:directory] = t
  end

  opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
    options[:all_timestamps] = true
  end

  opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (e.g. 20060716231334)") do |t|
    options[:from_timestamp] = t
  end

  opts.on("-t", "--to TIMESTAMP", Integer, "Only files on or before timestamp supplied (e.g. 20100916231334)") do |t|
    options[:to_timestamp] = t
  end

  opts.on("-e", "--exact-url", "Download only the url provided and not the full site") do |t|
    options[:exact_url] = t
  end

  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
    options[:only_filter] = t
  end

  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
    options[:exclude_filter] = t
  end

  opts.on("-a", "--all", "Expand downloading to error files (40x and 50x) and redirections (30x)") do |t|
    options[:all] = true
  end

  opts.on("-c", "--concurrency NUMBER", Integer, "Number of files to download at a time", "Default is one file at a time (e.g. 20)") do |t|
    options[:threads_count] = t
  end

  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Each page holds an average of 150,000 snapshots") do |t|
    options[:maximum_pages] = t
  end

  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
    options[:list] = true
  end

  opts.on("-v", "--version", "Display version") do |t|
    options[:version] = t
  end
end.parse!

if (base_url = ARGV[-1])
  options[:base_url] = base_url
  wayback_machine_downloader = WaybackMachineDownloader.new options
  if options[:list]
    wayback_machine_downloader.list_files
  else
    wayback_machine_downloader.download_files
  end
elsif options[:version]
  puts WaybackMachineDownloader::VERSION
else
  puts "You need to specify a website to backup. (e.g., http://example.com)"
  puts "Run `wayback_machine_downloader --help` for more help."
end

--------------------------------------------------------------------------------
/lib/wayback_machine_downloader.rb:
--------------------------------------------------------------------------------
# encoding: UTF-8

require 'thread'
require 'net/http'
require 'open-uri'
require 'fileutils'
require 'cgi'
require 'json'
require_relative 'wayback_machine_downloader/tidy_bytes'
require_relative 'wayback_machine_downloader/to_regex'
require_relative 'wayback_machine_downloader/archive_api'

class WaybackMachineDownloader

  include ArchiveAPI

  VERSION = "2.3.1"

  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
    :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
    :all, :maximum_pages, :threads_count

  def initialize params
    @base_url = params[:base_url]
    @exact_url = params[:exact_url]
    @directory = params[:directory]
    @all_timestamps = params[:all_timestamps]
    @from_timestamp = params[:from_timestamp].to_i
    @to_timestamp = params[:to_timestamp].to_i
    @only_filter = params[:only_filter]
    @exclude_filter = params[:exclude_filter]
    @all = params[:all]
    @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
    @threads_count = params[:threads_count].to_i
  end

  def backup_name
    if @base_url.include? '//'
      @base_url.split('/')[2]
    else
      @base_url
    end
  end

  def backup_path
    if @directory
      if @directory[-1] == '/'
        @directory
      else
        @directory + '/'
      end
    else
      'websites/' + backup_name + '/'
    end
  end

  def match_only_filter file_url
    if @only_filter
      only_filter_regex = @only_filter.to_regex
      if only_filter_regex
        only_filter_regex =~ file_url
      else
        file_url.downcase.include? @only_filter.downcase
      end
    else
      true
    end
  end

  def match_exclude_filter file_url
    if @exclude_filter
      exclude_filter_regex = @exclude_filter.to_regex
      if exclude_filter_regex
        exclude_filter_regex =~ file_url
      else
        file_url.downcase.include? @exclude_filter.downcase
      end
    else
      false
    end
  end

  def get_all_snapshots_to_consider
    # Note: Passing a page index parameter allows us to get more snapshots,
    # but from a less fresh index
    print "Getting snapshot pages"
    snapshot_list_to_consider = []
    snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
    print "."
    unless @exact_url
      @maximum_pages.times do |page_index|
        snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
        break if snapshot_list.empty?
        snapshot_list_to_consider += snapshot_list
        print "."
      end
    end
    puts " found #{snapshot_list_to_consider.length} snapshots to consider."
    puts
    snapshot_list_to_consider
  end
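  # Each snapshot returned by the CDX API is a [timestamp, original_url] pair,
  # e.g. ["20060716231334", "http://example.com:80/index.html"] (illustrative
  # values). The curation methods below derive a relative file id from the
  # original url by dropping the scheme and host, then keep one entry per id.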
  def get_file_list_curated
    file_list_curated = Hash.new
    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
      next unless file_url.include?('/')
      file_id = file_url.split('/')[3..-1].join('/')
      file_id = CGI::unescape file_id
      file_id = file_id.tidy_bytes unless file_id == ""
      if file_id.nil?
        puts "Malformed file url, ignoring: #{file_url}"
      else
        if match_exclude_filter(file_url)
          puts "File url matches exclude filter, ignoring: #{file_url}"
        elsif not match_only_filter(file_url)
          puts "File url doesn't match only filter, ignoring: #{file_url}"
        elsif file_list_curated[file_id]
          # Keep only the most recent snapshot per file id
          unless file_list_curated[file_id][:timestamp] > file_timestamp
            file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
          end
        else
          file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
        end
      end
    end
    file_list_curated
  end

  def get_file_list_all_timestamps
    file_list_curated = Hash.new
    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
      next unless file_url.include?('/')
      file_id = file_url.split('/')[3..-1].join('/')
      file_id_and_timestamp = [file_timestamp, file_id].join('/')
      file_id_and_timestamp = CGI::unescape file_id_and_timestamp
      file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
      if file_id.nil?
        puts "Malformed file url, ignoring: #{file_url}"
      else
        if match_exclude_filter(file_url)
          puts "File url matches exclude filter, ignoring: #{file_url}"
        elsif not match_only_filter(file_url)
          puts "File url doesn't match only filter, ignoring: #{file_url}"
        elsif file_list_curated[file_id_and_timestamp]
          puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
        else
          file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
        end
      end
    end
    puts "file_list_curated: " + file_list_curated.count.to_s
    file_list_curated
  end

  def get_file_list_by_timestamp
    if @all_timestamps
      file_list_curated = get_file_list_all_timestamps
      file_list_curated.map do |file_remote_info|
        file_remote_info[1][:file_id] = file_remote_info[0]
        file_remote_info[1]
      end
    else
      file_list_curated = get_file_list_curated
      file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
      file_list_curated.map do |file_remote_info|
        file_remote_info[1][:file_id] = file_remote_info[0]
        file_remote_info[1]
      end
    end
  end

  def list_files
    # retrieval produces its own output, so redirect it to stderr
    @orig_stdout = $stdout
    $stdout = $stderr
    files = get_file_list_by_timestamp
    $stdout = @orig_stdout
    puts "["
    files[0...-1].each do |file|
      puts file.to_json + ","
    end
    puts files[-1].to_json
    puts "]"
  end

  def download_files
    start_time = Time.now
    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
    puts

    if file_list_by_timestamp.count == 0
      puts "No files to download."
      puts "Possible reasons:"
      puts "\t* Site is not in Wayback Machine Archive."
      puts "\t* From timestamp too far in the future." if @from_timestamp and @from_timestamp != 0
      puts "\t* To timestamp too far in the past." if @to_timestamp and @to_timestamp != 0
      puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
      puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
      return
    end

    puts "#{file_list_by_timestamp.count} files to download:"

    threads = []
    @processed_file_count = 0
    @threads_count = 1 if @threads_count == 0
    @threads_count.times do
      threads << Thread.new do
        until file_queue.empty?
          file_remote_info = file_queue.pop(true) rescue nil
          download_file(file_remote_info) if file_remote_info
        end
      end
    end

    threads.each(&:join)
    end_time = Time.now
    puts
    puts "Download completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
  end

  def structure_dir_path dir_path
    begin
      FileUtils::mkdir_p dir_path unless File.exist? dir_path
    rescue Errno::EEXIST => e
      error_to_string = e.to_s
      puts "# #{error_to_string}"
      if error_to_string.include? "File exists @ dir_s_mkdir - "
        file_already_existing = error_to_string.split("File exists @ dir_s_mkdir - ")[-1]
      elsif error_to_string.include? "File exists - "
        file_already_existing = error_to_string.split("File exists - ")[-1]
      else
        raise "Unhandled directory restructure error # #{error_to_string}"
      end
      # A file blocks the directory we need: move the file aside, create the
      # directory, then restore the file as that directory's index.html
      file_already_existing_temporary = file_already_existing + '.temp'
      file_already_existing_permanent = file_already_existing + '/index.html'
      FileUtils::mv file_already_existing, file_already_existing_temporary
      FileUtils::mkdir_p file_already_existing
      FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
      puts "#{file_already_existing} -> #{file_already_existing_permanent}"
      structure_dir_path dir_path
    end
  end

  def download_file file_remote_info
    current_encoding = "".encoding
    file_url = file_remote_info[:file_url].encode(current_encoding)
    file_id = file_remote_info[:file_id]
    file_timestamp = file_remote_info[:timestamp]
    file_path_elements = file_id.split('/')
    if file_id == ""
      dir_path = backup_path
      file_path = backup_path + 'index.html'
    elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
      dir_path = backup_path + file_path_elements[0..-1].join('/')
      file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
    else
      dir_path = backup_path + file_path_elements[0..-2].join('/')
      file_path = backup_path + file_path_elements[0..-1].join('/')
    end
    if Gem.win_platform?
      # Percent-encode characters that are invalid in Windows file names
      dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
      file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
    end
    unless File.exist? file_path
      begin
        structure_dir_path dir_path
        open(file_path, "wb") do |file|
          begin
            URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}").open("Accept-Encoding" => "plain") do |uri|
              file.write(uri.read)
            end
          rescue OpenURI::HTTPError => e
            puts "#{file_url} # #{e}"
            if @all
              file.write(e.io.read)
              puts "#{file_path} saved anyway."
            end
          rescue StandardError => e
            puts "#{file_url} # #{e}"
          end
        end
      rescue StandardError => e
        puts "#{file_url} # #{e}"
      ensure
        if not @all and File.exist?(file_path) and File.size(file_path) == 0
          File.delete(file_path)
          puts "#{file_path} was empty and was removed."
        end
      end
      semaphore.synchronize do
        @processed_file_count += 1
        puts "#{file_url} -> #{file_path} (#{@processed_file_count}/#{file_list_by_timestamp.size})"
      end
    else
      semaphore.synchronize do
        @processed_file_count += 1
        puts "#{file_url} # #{file_path} already exists. (#{@processed_file_count}/#{file_list_by_timestamp.size})"
      end
    end
  end

  def file_queue
    @file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info }
  end

  def file_list_by_timestamp
    @file_list_by_timestamp ||= get_file_list_by_timestamp
  end

  def semaphore
    @semaphore ||= Mutex.new
  end
end

--------------------------------------------------------------------------------
/lib/wayback_machine_downloader/archive_api.rb:
--------------------------------------------------------------------------------
require 'json'
require 'uri'

module ArchiveAPI

  # Fetch one page of [timestamp, original_url] snapshot pairs from the
  # Wayback Machine CDX server
  def get_raw_list_from_api url, page_index
    request_url = URI("https://web.archive.org/cdx/search/xd")
    params = [["output", "json"], ["url", url]]
    params += parameters_for_api page_index
    request_url.query = URI.encode_www_form(params)

    begin
      json = JSON.parse(URI(request_url).open.read)
      # Drop the ["timestamp","original"] header row the API returns as the
      # first element of its JSON output
      if (json[0] <=> ["timestamp","original"]) == 0
        json.shift
      end
      json
    rescue JSON::ParserError
      []
    end
  end

  def parameters_for_api page_index
    parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
    if !@all
      parameters.push(["filter", "statuscode:200"])
    end
    if @from_timestamp and @from_timestamp != 0
      parameters.push(["from", @from_timestamp.to_s])
    end
    if @to_timestamp and @to_timestamp != 0
      parameters.push(["to", @to_timestamp.to_s])
    end
    if page_index
      parameters.push(["page", page_index])
    end
    parameters
  end

end

--------------------------------------------------------------------------------
/lib/wayback_machine_downloader/tidy_bytes.rb:
--------------------------------------------------------------------------------
module TidyBytes

  # CP-1252 decimal byte => UTF-8 approximation as an array of bytes
  CP1252 = {
    128 => [226, 130, 172],
    129 => nil,
    130 => [226, 128, 154],
    131 => [198, 146],
    132 => [226, 128, 158],
    133 => [226, 128, 166],
    134 => [226, 128, 160],
    135 => [226, 128, 161],
    136 => [203, 134],
    137 => [226, 128, 176],
    138 => [197, 160],
    139 => [226, 128, 185],
    140 => [197, 146],
    141 => nil,
    142 => [197, 189],
    143 => nil,
    144 => nil,
    145 => [226, 128, 152],
    146 => [226, 128, 153],
    147 => [226, 128, 156],
    148 => [226, 128, 157],
    149 => [226, 128, 162],
    150 => [226, 128, 147],
    151 => [226, 128, 148],
    152 => [203, 156],
    153 => [226, 132, 162],
    154 => [197, 161],
    155 => [226, 128, 186],
    156 => [197, 147],
    157 => nil,
    158 => [197, 190],
    159 => [197, 184]
  }

  module StringMixin

    # Attempt to replace invalid UTF-8 bytes with valid ones. This method
    # naively assumes that if you have invalid UTF-8 bytes, they are either
    # Windows CP-1252 or ISO-8859-1. In practice this isn't a bad assumption,
    # but it may not always work.
    #
    # Passing +true+ will forcibly tidy all bytes, assuming that the string's
    # encoding is CP-1252 or ISO-8859-1.
    def tidy_bytes(force = false)

      if force
        return unpack("C*").map do |b|
          tidy_byte(b)
        end.flatten.compact.pack("C*").unpack("U*").pack("U*")
      end

      bytes = unpack("C*")
      conts_expected = 0
      last_lead = 0

      bytes.each_index do |i|

        byte = bytes[i]
        _is_ascii = byte < 128
        is_cont = byte > 127 && byte < 192
        is_lead = byte > 191 && byte < 245
        is_unused = byte > 240
        is_restricted = byte > 244

        # Impossible or highly unlikely byte? Clean it.
        if is_unused || is_restricted
          bytes[i] = tidy_byte(byte)
        elsif is_cont
          # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
          conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
        else
          if conts_expected > 0
            # Expected continuation, but got ASCII or leading? Clean backwards up to
            # the leading byte.
            begin
              (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
            rescue NoMethodError
              next
            end
            conts_expected = 0
          end
          if is_lead
            # Final byte is leading? Clean it.
            if i == bytes.length - 1
              bytes[i] = tidy_byte(bytes.last)
            else
              # Valid leading byte? Expect continuations determined by position of
              # first zero bit, with max of 3.
              conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
              last_lead = i
            end
          end
        end
      end
      begin
        bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
      rescue ArgumentError
        nil
      end
    end

    # Tidy bytes in-place.
    def tidy_bytes!(force = false)
      replace tidy_bytes(force)
    end

    private

    def tidy_byte(byte)
      byte < 160 ? TidyBytes::CP1252[byte] : byte < 192 ? [194, byte] : [195, byte - 64]
    end

  end
end

class String
  include TidyBytes::StringMixin
end

--------------------------------------------------------------------------------
/lib/wayback_machine_downloader/to_regex.rb:
--------------------------------------------------------------------------------
module ToRegex
  module StringMixin
    class << self
      def literal?(str)
        REGEXP_DELIMITERS.none? { |s, e| str.start_with?(s) and str =~ /#{e}#{INLINE_OPTIONS}\z/ }
      end
    end

    INLINE_OPTIONS = /[imxnesu]*/
    REGEXP_DELIMITERS = {
      '%r{' => '}',
      '/' => '/',
    }

    # Get a regex back
    #
    # Without :literal or :detect, `"foo".to_regex` will return nil.
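    #
    # Examples (illustrative, not part of the original docs):
    #
    #   "/foo/i".to_regex               #=> /foo/i
    #   "foo".to_regex                  #=> nil
    #   "foo".to_regex(literal: true)   #=> /foo/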
    #
    # @param  [optional, Hash] options
    # @option options [true,false] :literal Treat meta characters and other regexp codes as just text; always return a regexp
    # @option options [true,false] :detect If string starts and ends with valid regexp delimiters, treat it as a regexp; otherwise, interpret it literally
    # @option options [true,false] :ignore_case /foo/i
    # @option options [true,false] :multiline /foo/m
    # @option options [true,false] :extended /foo/x
    # @option options [true,false] :lang /foo/[nesu]
    def to_regex(options = {})
      if args = as_regexp(options)
        ::Regexp.new(*args)
      end
    end

    # Return arguments that can be passed to `Regexp.new`
    # @see to_regex
    def as_regexp(options = {})
      unless options.is_a?(::Hash)
        raise ::ArgumentError, "[to_regex] Options must be a Hash"
      end
      str = self

      return if options[:detect] and str == ''

      if options[:literal] or (options[:detect] and ToRegex::StringMixin.literal?(str))
        content = ::Regexp.escape str
      elsif delim_set = REGEXP_DELIMITERS.detect { |k, _| str.start_with?(k) }
        delim_start, delim_end = delim_set
        /\A#{delim_start}(.*)#{delim_end}(#{INLINE_OPTIONS})\z/u =~ str
        content = $1
        inline_options = $2
        return unless content.is_a?(::String)
        content.gsub! '\\/', '/'
        if inline_options
          options[:ignore_case] = true if inline_options.include?('i')
          options[:multiline] = true if inline_options.include?('m')
          options[:extended] = true if inline_options.include?('x')
          # 'n', 'N' = none, 'e', 'E' = EUC, 's', 'S' = SJIS, 'u', 'U' = UTF-8
          options[:lang] = inline_options.scan(/[nesu]/i).join.downcase
        end
      else
        return
      end

      ignore_case = options[:ignore_case] ? ::Regexp::IGNORECASE : 0
      multiline = options[:multiline] ? ::Regexp::MULTILINE : 0
      extended = options[:extended] ? ::Regexp::EXTENDED : 0
      lang = options[:lang] || ''
      if ::RUBY_VERSION > '1.9' and lang.include?('u')
        lang = lang.delete 'u'
      end

      if lang.empty?
        [ content, (ignore_case|multiline|extended) ]
      else
        [ content, (ignore_case|multiline|extended), lang ]
      end
    end
  end
end

class String
  include ToRegex::StringMixin
end

--------------------------------------------------------------------------------
/test/test_wayback_machine_downloader.rb:
--------------------------------------------------------------------------------
require 'minitest/autorun'
require 'wayback_machine_downloader'

class WaybackMachineDownloaderTest < Minitest::Test

  def setup
    @wayback_machine_downloader = WaybackMachineDownloader.new(
      base_url: 'http://www.onlyfreegames.net')
    $stdout = StringIO.new
  end

  def teardown
    FileUtils.rm_rf(@wayback_machine_downloader.backup_path)
  end

  def test_base_url_being_set
    assert_equal 'http://www.onlyfreegames.net', @wayback_machine_downloader.base_url
  end

  def test_backup_name_being_set
    assert_equal 'www.onlyfreegames.net', @wayback_machine_downloader.backup_name
  end

  def test_backup_name_being_set_when_base_url_is_domain
    @wayback_machine_downloader.base_url = 'www.onlyfreegames.net'
    assert_equal 'www.onlyfreegames.net', @wayback_machine_downloader.backup_name
  end

  def test_file_list_curated
    assert_equal 20060711191226, @wayback_machine_downloader.get_file_list_curated["linux.htm"][:timestamp]
  end

  def test_file_list_by_timestamp
    file_expected = {
      file_url: "http://www.onlyfreegames.net:80/strat.html",
      timestamp: 20060111084756,
      file_id: "strat.html"
    }
    assert_equal file_expected, @wayback_machine_downloader.get_file_list_by_timestamp[-2]
  end

  def test_without_exact_url
    @wayback_machine_downloader.exact_url = false
    assert @wayback_machine_downloader.get_file_list_curated.size > 1
  end

  def test_exact_url
    @wayback_machine_downloader.exact_url = true
    assert_equal 1, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_only_filter_without_matches
    @wayback_machine_downloader.only_filter = 'abc123'
    assert_equal 0, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_only_filter_with_1_match
    @wayback_machine_downloader.only_filter = 'menu.html'
    assert_equal 1, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_only_filter_with_a_regex
    @wayback_machine_downloader.only_filter = '/\.(gif|je?pg|bmp)$/i'
    assert_equal 37, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_exclude_filter_without_matches
    @wayback_machine_downloader.exclude_filter = 'abc123'
    assert_equal 68, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_exclude_filter_with_1_match
    @wayback_machine_downloader.exclude_filter = 'menu.html'
    assert_equal 67, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_exclude_filter_with_a_regex
    @wayback_machine_downloader.exclude_filter = '/\.(gif|je?pg|bmp)$/i'
    assert_equal 31, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_download
    @wayback_machine_downloader.download_files
    linux_page = open 'websites/www.onlyfreegames.net/linux.htm'
    assert_includes linux_page.read, "Linux Games"
  end

  def test_all_timestamps_being_respected
    @wayback_machine_downloader.all_timestamps = true
    assert_equal 68, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_from_timestamp_being_respected
    @wayback_machine_downloader.from_timestamp = 20050716231334
    file_url = @wayback_machine_downloader.get_file_list_curated["linux.htm"][:file_url]
    assert_equal "http://www.onlyfreegames.net:80/linux.htm", file_url
  end

  def test_to_timestamp_being_respected
    @wayback_machine_downloader.to_timestamp = 20050716231334
    assert_nil @wayback_machine_downloader.get_file_list_curated["linux.htm"]
  end

  def test_all_get_file_list_curated_size
    @wayback_machine_downloader.all = true
    assert_equal 69, @wayback_machine_downloader.get_file_list_curated.size
  end

  # Testing encoding conflicts needs a different base_url
  def test_nonascii_suburls_download
    @wayback_machine_downloader = WaybackMachineDownloader.new(
      base_url: 'https://en.wikipedia.org/wiki/%C3%84')
    # Once just for the downloading...
    @wayback_machine_downloader.download_files
  end

  def test_nonascii_suburls_already_present
    @wayback_machine_downloader = WaybackMachineDownloader.new(
      base_url: 'https://en.wikipedia.org/wiki/%C3%84')
    # ... twice to test the "is already present" case
    @wayback_machine_downloader.download_files
    @wayback_machine_downloader.download_files
  end

end

--------------------------------------------------------------------------------
/wayback_machine_downloader.gemspec:
--------------------------------------------------------------------------------
require './lib/wayback_machine_downloader'

Gem::Specification.new do |s|
  s.name = "wayback_machine_downloader"
  s.version = WaybackMachineDownloader::VERSION
  s.executables << "wayback_machine_downloader"
  s.summary = "Download an entire website from the Wayback Machine."
  s.description = "Download an entire website from the Wayback Machine. Wayback Machine by Internet Archive (archive.org) is an awesome tool to view any website at any point in time but lacks an export feature. Wayback Machine Downloader brings exactly this."
  s.authors = ["hartator"]
  s.email = "hartator@gmail.com"
  s.files = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb"]
  s.homepage = "https://github.com/hartator/wayback-machine-downloader"
  s.license = "MIT"
  s.required_ruby_version = '>= 1.9.2'
  s.add_development_dependency 'rake', '~> 10.2'
  s.add_development_dependency 'minitest', '~> 5.2'
end
--------------------------------------------------------------------------------