├── .gitignore
├── .travis.yml
├── Dockerfile
├── Gemfile
├── MIT-LICENSE.txt
├── README.md
├── Rakefile
├── bin
│   └── wayback_machine_downloader
├── lib
│   ├── wayback_machine_downloader.rb
│   └── wayback_machine_downloader
│       ├── archive_api.rb
│       ├── tidy_bytes.rb
│       └── to_regex.rb
├── test
│   └── test_wayback_machine_downloader.rb
└── wayback_machine_downloader.gemspec

/.gitignore:
--------------------------------------------------------------------------------
## PROJECT::GENERAL
.yardoc
coverage
doc
rdoc
log
websites
.DS_Store

## BUNDLER
*.gem
.bundle
pkg
Gemfile.lock

## RBENV
.ruby-version
.rbenv*

## RCOV
coverage.data

tmp

## RUBINIUS
*.rbc

test.rb
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
sudo: false
language: ruby
rvm:
  - 1.9.2
  - 1.9.3
  - 2.0.0
  - 2.1
  - 2.2
  - 2.3.1
  - jruby
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM ruby:2.3
COPY . /build
RUN cd build && \
    bundle install
ENTRYPOINT [ "/usr/local/bundle/bin/wayback_machine_downloader" ]
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
source "https://rubygems.org"

gemspec

gem "retryable", "~> 3.0"
--------------------------------------------------------------------------------
/MIT-LICENSE.txt:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015-2016 Julian Khaleghy and contributors
See the full list at https://github.com/hartator/wayback-machine-downloader/graphs/contributors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Wayback Machine Downloader

[![Gem Version](https://badge.fury.io/rb/wayback_machine_downloader.svg)](https://rubygems.org/gems/wayback_machine_downloader/)
[![Build Status](https://travis-ci.org/hartator/wayback-machine-downloader.svg?branch=master)](https://travis-ci.org/hartator/wayback-machine-downloader)

Download an entire website from the Internet Archive Wayback Machine.

## Installation

You need Ruby (>= 1.9.2) installed on your system - if you don't have it already.
Then run:

    gem install wayback_machine_downloader

**Tip:** If you run into permission errors, you might have to add `sudo` in front of this command.

## Basic Usage

Run wayback_machine_downloader with the base url of the website you want to retrieve as a parameter (e.g., http://example.com):

    wayback_machine_downloader http://example.com

## How it works

It will download the last version of every file present on the Wayback Machine to `./websites/example.com/`. It will also re-create the directory structure and auto-create `index.html` pages to work seamlessly with Apache and Nginx. All files downloaded are the original ones and not the Wayback Machine rewritten versions. This way, the URL and link structure is the same as before.

## Advanced Usage

    Usage: wayback_machine_downloader http://example.com

    Download an entire website from the Wayback Machine.

    Optional options:
        -d, --directory PATH             Directory to save the downloaded files into
                                         Default is ./websites/ plus the domain name
        -s, --all-timestamps             Download all snapshots/timestamps for a given website
        -f, --from TIMESTAMP             Only files on or after timestamp supplied (e.g. 20060716231334)
        -t, --to TIMESTAMP               Only files on or before timestamp supplied (e.g. 20100916231334)
        -e, --exact-url                  Download only the url provided and not the full site
        -o, --only ONLY_FILTER           Restrict downloading to urls that match this filter
                                         (use // notation for the filter to be treated as a regex)
        -w, --wait SECONDS               Wait the specified number of seconds between requests
            --random-wait                When used with --wait, randomize number of seconds waited between requests by a factor of 0.5 to 2
            --tries NUMBER               Number of times to retry for non-fatal connection errors (Default is 20)
        -x, --exclude EXCLUDE_FILTER     Skip downloading of urls that match this filter
                                         (use // notation for the filter to be treated as a regex)
        -a, --all                        Expand downloading to error files (40x and 50x) and redirections (30x)
        -c, --concurrency NUMBER         Number of files to download at a time
                                         Default is one file at a time (e.g. 20)
        -p, --maximum-snapshot NUMBER    Maximum snapshot pages to consider (Default is 100)
                                         Each page lists an average of 150,000 snapshots
        -l, --list                       Only list file urls in a JSON format with the archived timestamps, won't download anything
        -v, --version                    Display version

## Specify directory to save files to

    -d, --directory PATH

Optional. By default, Wayback Machine Downloader will download files to `./websites/` followed by the domain name of the website. You may want to save files in a specific directory using this option.

Example:

    wayback_machine_downloader http://example.com --directory downloaded-backup/

## All Timestamps

    -s, --all-timestamps

Optional. This option will download all timestamps/snapshots for a given website, using the timestamp of each snapshot as a directory name.
Example:

    wayback_machine_downloader http://example.com --all-timestamps

Will download:

    websites/example.com/20060715085250/index.html
    websites/example.com/20051120005053/index.html
    websites/example.com/20060111095815/img/logo.png
    ...

## From Timestamp

    -f, --from TIMESTAMP

Optional. You may want to supply a from timestamp to lock your backup to a specific version of the website. Timestamps can be found inside the urls of the regular Wayback Machine website (e.g., http://web.archive.org/web/20060716231334/http://example.com). You can also use years (2006), years + month (200607), etc. It can be used in combination with the To Timestamp option.
Wayback Machine Downloader will then fetch only file versions on or after the timestamp specified.

Example:

    wayback_machine_downloader http://example.com --from 20060716231334

## To Timestamp

    -t, --to TIMESTAMP

Optional. You may want to supply a to timestamp to lock your backup to a specific version of the website. Timestamps can be found inside the urls of the regular Wayback Machine website (e.g., http://web.archive.org/web/20100916231334/http://example.com). You can also use years (2010), years + month (201009), etc. It can be used in combination with the From Timestamp option.
Wayback Machine Downloader will then fetch only file versions on or before the timestamp specified.

Example:

    wayback_machine_downloader http://example.com --to 20100916231334

## Exact Url

    -e, --exact-url

Optional. If you want to retrieve only the file matching exactly the url provided, you can use this flag. It will avoid downloading anything else.

For example, if you only want to download the html homepage file of example.com:

    wayback_machine_downloader http://example.com --exact-url

## Only URL Filter

    -o, --only ONLY_FILTER

Optional. You may want to retrieve files which are of a certain type (e.g., .pdf, .jpg, .wrd...) or are in a specific directory. To do so, you can supply the `--only` flag with a string or a regex (using the '/regex/' notation) to limit which files Wayback Machine Downloader will download.

For example, if you only want to download files inside a specific `my_directory`:

    wayback_machine_downloader http://example.com --only my_directory

Or if you want to download every image and nothing else:

    wayback_machine_downloader http://example.com --only "/\.(gif|jpg|jpeg)$/i"

## Exclude URL Filter

    -x, --exclude EXCLUDE_FILTER

Optional. You may want to retrieve files which aren't of a certain type (e.g., .pdf, .jpg, .wrd...) or aren't in a specific directory. To do so, you can supply the `--exclude` flag with a string or a regex (using the '/regex/' notation) to exclude files Wayback Machine Downloader would otherwise download.

For example, if you want to avoid downloading files inside `my_directory`:

    wayback_machine_downloader http://example.com --exclude my_directory

Or if you want to download everything except images:

    wayback_machine_downloader http://example.com --exclude "/\.(gif|jpg|jpeg)$/i"

## Expand downloading to all file types

    -a, --all

Optional. By default, Wayback Machine Downloader limits itself to files that responded with a 200 OK code.
If you also need error files (40x and 50x codes) or redirection files (30x codes), you can use the `--all` or `-a` flag and Wayback Machine Downloader will download them in addition to the 200 OK files. It will also keep empty files that are removed by default.

Example:

    wayback_machine_downloader http://example.com --all

## Only list files without downloading

    -l, --list

It will just display the files to be downloaded with their snapshot timestamps and urls. The output format is JSON. It won't download anything. It's useful for debugging or for feeding another application.

Example:

    wayback_machine_downloader http://example.com --list

## Maximum number of snapshot pages to consider

    -p, --maximum-snapshot NUMBER

Optional. Specify the maximum number of snapshot pages to consider. Each page lists an average of 150,000 snapshots. 100 is the default maximum and should be sufficient for most websites. Use a bigger number if you want to download a very large website.

Example:

    wayback_machine_downloader http://example.com --maximum-snapshot 300

## Download multiple files at a time

    -c, --concurrency NUMBER

Optional. Specify the number of files you want to download at the same time. This can speed up the download of a website significantly. The default is to download one file at a time.

Example:

    wayback_machine_downloader http://example.com --concurrency 20

## Using the Docker image

As an alternative installation method, we have a Docker image! Retrieve the wayback-machine-downloader Docker image this way:

    docker pull hartator/wayback-machine-downloader

Then, you should be able to use the Docker image to download websites. For example:

    docker run --rm -it -v $PWD/websites:/websites hartator/wayback-machine-downloader http://example.com

## Contributing

Contributions are welcome! Just submit a pull request via GitHub.

To run the tests:

    bundle install
    bundle exec rake test
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
require 'rake/testtask'

Rake::TestTask.new do |t|
  t.libs << 'test'
end

desc "Run tests"
task :default => :test
--------------------------------------------------------------------------------
/bin/wayback_machine_downloader:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby

require_relative '../lib/wayback_machine_downloader'
require 'optparse'
require 'pp'

options = {}
option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: wayback_machine_downloader http://example.com"

  opts.separator ""
  opts.separator "Download an entire website from the Wayback Machine."
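  # Typical invocations, for reference (see README.md for all options):
  #   wayback_machine_downloader http://example.com
  #   wayback_machine_downloader http://example.com --concurrency 20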
  opts.separator ""
  opts.separator "Optional options:"

  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into", "Default is ./websites/ plus the domain name") do |t|
    options[:directory] = t
  end

  opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
    options[:all_timestamps] = true
  end

  opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (e.g. 20060716231334)") do |t|
    options[:from_timestamp] = t
  end

  opts.on("-t", "--to TIMESTAMP", Integer, "Only files on or before timestamp supplied (e.g. 20100916231334)") do |t|
    options[:to_timestamp] = t
  end

  opts.on("-e", "--exact-url", "Download only the url provided and not the full site") do |t|
    options[:exact_url] = t
  end

  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
    options[:only_filter] = t
  end

  opts.on("-w", "--wait SECONDS", Integer, "Wait the specified number of seconds between requests") do |t|
    options[:wait_seconds] = t
  end

  opts.on("--random-wait", "When used with --wait, randomize number of seconds waited between requests by a factor of 0.5 to 2") do |t|
    options[:wait_randomized] = true
  end

  opts.on("--tries NUMBER", Integer, "Number of times to retry for non-fatal connection errors (Default is 20)") do |t|
    options[:tries] = t
  end

  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
    options[:exclude_filter] = t
  end

  opts.on("-a", "--all", "Expand downloading to error files (40x and 50x) and redirections (30x)") do |t|
    options[:all] = true
  end

  opts.on("-c", "--concurrency NUMBER", Integer, "Number of files to download at a time", "Default is one file at a time (e.g. 20)") do |t|
    options[:threads_count] = t
  end

  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Each page lists an average of 150,000 snapshots") do |t|
    options[:maximum_pages] = t
  end

  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
    options[:list] = true
  end

  opts.on("-v", "--version", "Display version") do |t|
    options[:version] = t
  end
end.parse!

if (base_url = ARGV[-1])
  options[:base_url] = base_url
  wayback_machine_downloader = WaybackMachineDownloader.new options
  if options[:list]
    wayback_machine_downloader.list_files
  else
    wayback_machine_downloader.download_files
  end
elsif options[:version]
  puts WaybackMachineDownloader::VERSION
else
  puts "You need to specify a website to backup. (e.g., http://example.com)"
  puts "Run `wayback_machine_downloader --help` for more help."
end
--------------------------------------------------------------------------------
/lib/wayback_machine_downloader.rb:
--------------------------------------------------------------------------------
# encoding: UTF-8

require 'thread'
require 'net/http'
require 'open-uri'
require 'fileutils'
require 'cgi'
require 'json'
require 'retryable'
require_relative 'wayback_machine_downloader/tidy_bytes'
require_relative 'wayback_machine_downloader/to_regex'
require_relative 'wayback_machine_downloader/archive_api'

class WaybackMachineDownloader

  include ArchiveAPI

  VERSION = "2.2.1"

  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
    :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
    :all, :maximum_pages, :threads_count, :wait_seconds, :wait_randomized

  def initialize params
    @base_url = params[:base_url]
    @exact_url = params[:exact_url]
    @directory = params[:directory]
    @all_timestamps = params[:all_timestamps]
    @from_timestamp = params[:from_timestamp].to_i
    @to_timestamp = params[:to_timestamp].to_i
    @only_filter = params[:only_filter]
    @exclude_filter = params[:exclude_filter]
    @all = params[:all]
    @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
    @threads_count = params[:threads_count].to_i
    @wait_seconds = params[:wait_seconds].to_i
    @wait_randomized = params[:wait_randomized]
    @tries = params[:tries] ? params[:tries].to_i : 20
  end

  def backup_name
    if @base_url.include? '//'
      @base_url.split('/')[2]
    else
      @base_url
    end
  end

  def backup_path
    if @directory
      if @directory[-1] == '/'
        @directory
      else
        @directory + '/'
      end
    else
      'websites/' + backup_name + '/'
    end
  end

  def match_only_filter file_url
    if @only_filter
      only_filter_regex = @only_filter.to_regex
      if only_filter_regex
        only_filter_regex =~ file_url
      else
        file_url.downcase.include? @only_filter.downcase
      end
    else
      true
    end
  end

  def match_exclude_filter file_url
    if @exclude_filter
      exclude_filter_regex = @exclude_filter.to_regex
      if exclude_filter_regex
        exclude_filter_regex =~ file_url
      else
        file_url.downcase.include? @exclude_filter.downcase
      end
    else
      false
    end
  end

  def get_all_snapshots_to_consider
    # Note: Passing a page index parameter allows us to get more snapshots,
    # but from a less fresh index
    print "Getting snapshot pages"
    snapshot_list_to_consider = ""
    snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
    print "."
    unless @exact_url
      @maximum_pages.times do |page_index|
        wait
        snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
        break if snapshot_list.empty?
        snapshot_list_to_consider += snapshot_list
        print "."
      end
    end
    puts " found #{snapshot_list_to_consider.lines.count} snapshots to consider."
    puts
    snapshot_list_to_consider
  end

  def get_file_list_curated
    file_list_curated = Hash.new
    get_all_snapshots_to_consider.each_line do |line|
      next unless line.include?('/')
      file_timestamp = line[0..13].to_i
      file_url = line[15..-2]
      file_id = file_url.split('/')[3..-1].join('/')
      file_id = CGI::unescape file_id
      file_id = file_id.tidy_bytes unless file_id == ""
      if file_id.nil?
        puts "Malformed file url, ignoring: #{file_url}"
      else
        if match_exclude_filter(file_url)
          puts "File url matches exclude filter, ignoring: #{file_url}"
        elsif not match_only_filter(file_url)
          puts "File url doesn't match only filter, ignoring: #{file_url}"
        elsif file_list_curated[file_id]
          unless file_list_curated[file_id][:timestamp] > file_timestamp
            file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
          end
        else
          file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
        end
      end
    end
    file_list_curated
  end

  def get_file_list_all_timestamps
    file_list_curated = Hash.new
    get_all_snapshots_to_consider.each_line do |line|
      next unless line.include?('/')
      file_timestamp = line[0..13].to_i
      file_url = line[15..-2]
      file_id = file_url.split('/')[3..-1].join('/')
      file_id_and_timestamp = [file_timestamp, file_id].join('/')
      file_id_and_timestamp = CGI::unescape file_id_and_timestamp
      file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
      if file_id.nil?
        puts "Malformed file url, ignoring: #{file_url}"
      else
        if match_exclude_filter(file_url)
          puts "File url matches exclude filter, ignoring: #{file_url}"
        elsif not match_only_filter(file_url)
          puts "File url doesn't match only filter, ignoring: #{file_url}"
        elsif file_list_curated[file_id_and_timestamp]
          puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
        else
          file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
        end
      end
    end
    puts "file_list_curated: " + file_list_curated.count.to_s
    file_list_curated
  end


  def get_file_list_by_timestamp
    if @all_timestamps
      file_list_curated = get_file_list_all_timestamps
      file_list_curated.map do |file_remote_info|
        file_remote_info[1][:file_id] = file_remote_info[0]
        file_remote_info[1]
      end
    else
      file_list_curated = get_file_list_curated
      file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
      file_list_curated.map do |file_remote_info|
        file_remote_info[1][:file_id] = file_remote_info[0]
        file_remote_info[1]
      end
    end
  end

  def list_files
    # retrieval produces its own output
    files = get_file_list_by_timestamp
    puts "["
    # join with commas so the output is valid JSON (no trailing comma before "]")
    puts files.map(&:to_json).join(",\n")
    puts "]"
  end

  def download_files
    start_time = Time.now
    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
    puts

    if file_list_by_timestamp.count == 0
      puts "No files to download."
      puts "Possible reasons:"
      puts "\t* Site is not in Wayback Machine Archive."
      puts "\t* From timestamp too far in the future." if @from_timestamp and @from_timestamp != 0
      puts "\t* To timestamp too far in the past." if @to_timestamp and @to_timestamp != 0
      puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
      puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
      return
    end

    puts "#{file_list_by_timestamp.count} files to download:"

    threads = []
    @processed_file_count = 0
    @threads_count = 1 if @threads_count == 0
    @threads_count.times do
      threads << Thread.new do
        until file_queue.empty?
          wait
          file_remote_info = file_queue.pop(true) rescue nil
          download_file(file_remote_info) if file_remote_info
        end
      end
    end

    threads.each(&:join)
    end_time = Time.now
    puts
    puts "Download completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
  end

  def structure_dir_path dir_path
    begin
      FileUtils::mkdir_p dir_path unless File.exist? dir_path
    rescue Errno::EEXIST => e
      error_to_string = e.to_s
      puts "# #{error_to_string}"
      if error_to_string.include? "File exists @ dir_s_mkdir - "
        file_already_existing = error_to_string.split("File exists @ dir_s_mkdir - ")[-1]
      elsif error_to_string.include? "File exists - "
        file_already_existing = error_to_string.split("File exists - ")[-1]
      else
        raise "Unhandled directory restructure error # #{error_to_string}"
      end
      file_already_existing_temporary = file_already_existing + '.temp'
      file_already_existing_permanent = file_already_existing + '/index.html'
      FileUtils::mv file_already_existing, file_already_existing_temporary
      FileUtils::mkdir_p file_already_existing
      FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
      puts "#{file_already_existing} -> #{file_already_existing_permanent}"
      structure_dir_path dir_path
    end
  end

  def download_file file_remote_info
    current_encoding = "".encoding
    file_url = file_remote_info[:file_url].encode(current_encoding)
    file_id = file_remote_info[:file_id]
    file_timestamp = file_remote_info[:timestamp]
    file_path_elements = file_id.split('/')
    if file_id == ""
      dir_path = backup_path
      file_path = backup_path + 'index.html'
    elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
      dir_path = backup_path + file_path_elements[0..-1].join('/')
      file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
    else
      dir_path = backup_path + file_path_elements[0..-2].join('/')
      file_path = backup_path + file_path_elements[0..-1].join('/')
    end
    if Gem.win_platform?
      dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
      file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
    end
    unless File.exist? file_path
      begin
        structure_dir_path dir_path
        File.open(file_path, "wb") do |file|
          begin
            Retryable.retryable(tries: @tries, on: Net::ReadTimeout, sleep_method: self.method(:wait)) do
              URI.open("http://web.archive.org/web/#{file_timestamp}id_/#{file_url}", "Accept-Encoding" => "plain") do |uri|
                file.write(uri.read)
              end
            end
          rescue OpenURI::HTTPError => e
            puts "#{file_url} # #{e}"
            if @all
              file.write(e.io.read)
              puts "#{file_path} saved anyway."
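              # With --all, the body served for the error (e.g., the archived
              # 404 page itself) is kept on disk, so the local copy mirrors
              # what the Wayback Machine returned instead of leaving a gap.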
            end
          rescue StandardError => e
            puts "#{file_url} # #{e}"
          end
        end
      rescue StandardError => e
        puts "#{file_url} # #{e}"
      ensure
        if not @all and File.exist?(file_path) and File.size(file_path) == 0
          File.delete(file_path)
          puts "#{file_path} was empty and was removed."
        end
      end
      semaphore.synchronize do
        @processed_file_count += 1
        puts "#{file_url} -> #{file_path} (#{@processed_file_count}/#{file_list_by_timestamp.size})"
      end
    else
      semaphore.synchronize do
        @processed_file_count += 1
        puts "#{file_url} # #{file_path} already exists. (#{@processed_file_count}/#{file_list_by_timestamp.size})"
      end
    end
  end

  def file_queue
    @file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info }
  end

  def file_list_by_timestamp
    @file_list_by_timestamp ||= get_file_list_by_timestamp
  end

  def semaphore
    @semaphore ||= Mutex.new
  end

  # Also used as Retryable's sleep_method, which calls it with the retry
  # interval as an argument; that argument is ignored in favor of @wait_seconds.
  def wait(_interval = nil)
    if @wait_seconds.positive? && @wait_randomized
      # --random-wait: randomize the wait by a factor of 0.5 to 2
      sleep(@wait_seconds.to_f * (rand(1.5) + 0.5))
    else
      sleep(@wait_seconds)
    end
  end
end
--------------------------------------------------------------------------------
/lib/wayback_machine_downloader/archive_api.rb:
--------------------------------------------------------------------------------
module ArchiveAPI

  def get_raw_list_from_api url, page_index
    request_url = "http://web.archive.org/cdx/search/cdx?url="
    request_url += CGI.escape url
    request_url += parameters_for_api page_index

    Retryable.retryable(tries: @tries, on: Net::ReadTimeout, sleep_method: self.method(:wait)) do
      URI.open(request_url).read
    end
  end

  def parameters_for_api page_index
    parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
    parameters += "&filter=statuscode:200" unless @all
    if @from_timestamp and @from_timestamp != 0
      parameters += "&from=" + @from_timestamp.to_s
    end
    if @to_timestamp and @to_timestamp != 0
      parameters += "&to=" + @to_timestamp.to_s
    end
    if page_index
      parameters += "&page=#{page_index}"
    end
    parameters
  end

end
--------------------------------------------------------------------------------
/lib/wayback_machine_downloader/tidy_bytes.rb:
--------------------------------------------------------------------------------
module TidyBytes

  # CP-1252 decimal byte => UTF-8 approximation as an array of bytes
  CP1252 = {
    128 => [226, 130, 172],
    129 => nil,
    130 => [226, 128, 154],
    131 => [198, 146],
    132 => [226, 128, 158],
    133 => [226, 128, 166],
    134 => [226, 128, 160],
    135 => [226, 128, 161],
    136 => [203, 134],
    137 => [226, 128, 176],
    138 => [197, 160],
    139 => [226, 128, 185],
    140 => [197, 146],
    141 => nil,
    142 => [197, 189],
    143 => nil,
    144 => nil,
    145 => [226, 128, 152],
    146 => [226, 128, 153],
    147 => [226, 128, 156],
    148 => [226, 128, 157],
    149 => [226, 128, 162],
    150 => [226, 128, 147],
    151 => [226, 128, 148],
    152 => [203, 156],
    153 => [226, 132, 162],
    154 => [197, 161],
    155 => [226, 128, 186],
    156 => [197, 147],
    157 => nil,
    158 => [197, 190],
    159 => [197, 184]
  }

  module StringMixin

    # Attempt to replace invalid UTF-8 bytes with valid ones. This method
    # naively assumes if you have invalid UTF-8 bytes, they are either Windows
    # CP-1252 or ISO-8859-1. In practice this isn't a bad assumption, but may not
    # always work.
    #
    # Passing +true+ will forcibly tidy all bytes, assuming that the string's
    # encoding is CP-1252 or ISO-8859-1.
    def tidy_bytes(force = false)

      if force
        return unpack("C*").map do |b|
          tidy_byte(b)
        end.flatten.compact.pack("C*").unpack("U*").pack("U*")
      end

      bytes = unpack("C*")
      conts_expected = 0
      last_lead = 0

      bytes.each_index do |i|

        byte = bytes[i]
        _is_ascii = byte < 128
        is_cont = byte > 127 && byte < 192
        is_lead = byte > 191 && byte < 245
        is_unused = byte > 240
        is_restricted = byte > 244

        # Impossible or highly unlikely byte? Clean it.
        if is_unused || is_restricted
          bytes[i] = tidy_byte(byte)
        elsif is_cont
          # Not expecting a continuation byte? Clean up. Otherwise, now expect one less.
          conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
        else
          if conts_expected > 0
            # Expected continuation, but got ASCII or leading? Clean backwards up to
            # the leading byte.
            begin
              (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
            rescue NoMethodError
              next
            end
            conts_expected = 0
          end
          if is_lead
            # Final byte is leading? Clean it.
            if i == bytes.length - 1
              bytes[i] = tidy_byte(bytes.last)
            else
              # Valid leading byte? Expect continuations determined by position of
              # first zero bit, with max of 3.
              conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
              last_lead = i
            end
          end
        end
      end
      begin
        bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
      rescue ArgumentError
        nil
      end
    end

    # Tidy bytes in-place.
    def tidy_bytes!(force = false)
      replace tidy_bytes(force)
    end

    private

    def tidy_byte(byte)
      byte < 160 ? TidyBytes::CP1252[byte] : byte < 192 ? [194, byte] : [195, byte - 64]
    end

  end
end

class String
  include TidyBytes::StringMixin
end
--------------------------------------------------------------------------------
/lib/wayback_machine_downloader/to_regex.rb:
--------------------------------------------------------------------------------
module ToRegex
  module StringMixin
    class << self
      def literal?(str)
        REGEXP_DELIMITERS.none? { |s, e| str.start_with?(s) and str =~ /#{e}#{INLINE_OPTIONS}\z/ }
      end
    end

    INLINE_OPTIONS = /[imxnesu]*/
    REGEXP_DELIMITERS = {
      '%r{' => '}',
      '/' => '/',
    }

    # Get a regex back
    #
    # Without :literal or :detect, `"foo".to_regex` will return nil.
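    # A few illustrative calls (these follow from the delimiters table above;
    # outputs shown are what `Regexp.new` produces for the parsed parts):
    #   "/foo.*bar/i".to_regex        #=> /foo.*bar/i
    #   "%r{foo/bar}".to_regex        #=> /foo\/bar/
    #   "foo".to_regex                #=> nil
    #   "foo".to_regex(literal: true) #=> /foo/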
    #
    # @param [optional, Hash] options
    # @option options [true,false] :literal Treat meta characters and other regexp codes as just text; always return a regexp
    # @option options [true,false] :detect If string starts and ends with valid regexp delimiters, treat it as a regexp; otherwise, interpret it literally
    # @option options [true,false] :ignore_case /foo/i
    # @option options [true,false] :multiline /foo/m
    # @option options [true,false] :extended /foo/x
    # @option options [true,false] :lang /foo/[nesu]
    def to_regex(options = {})
      if args = as_regexp(options)
        ::Regexp.new(*args)
      end
    end

    # Return arguments that can be passed to `Regexp.new`
    # @see to_regex
    def as_regexp(options = {})
      unless options.is_a?(::Hash)
        raise ::ArgumentError, "[to_regex] Options must be a Hash"
      end
      str = self

      return if options[:detect] and str == ''

      if options[:literal] or (options[:detect] and ToRegex::StringMixin.literal?(str))
        content = ::Regexp.escape str
      elsif delim_set = REGEXP_DELIMITERS.detect { |k, _| str.start_with?(k) }
        delim_start, delim_end = delim_set
        /\A#{delim_start}(.*)#{delim_end}(#{INLINE_OPTIONS})\z/u =~ str
        content = $1
        inline_options = $2
        return unless content.is_a?(::String)
        content.gsub! '\\/', '/'
        if inline_options
          options[:ignore_case] = true if inline_options.include?('i')
          options[:multiline] = true if inline_options.include?('m')
          options[:extended] = true if inline_options.include?('x')
          # 'n', 'N' = none, 'e', 'E' = EUC, 's', 'S' = SJIS, 'u', 'U' = UTF-8
          options[:lang] = inline_options.scan(/[nesu]/i).join.downcase
        end
      else
        return
      end

      ignore_case = options[:ignore_case] ? ::Regexp::IGNORECASE : 0
      multiline = options[:multiline] ? ::Regexp::MULTILINE : 0
      extended = options[:extended] ? ::Regexp::EXTENDED : 0
      lang = options[:lang] || ''
      if ::RUBY_VERSION > '1.9' and lang.include?('u')
        lang = lang.delete 'u'
      end

      if lang.empty?
        [ content, (ignore_case|multiline|extended) ]
      else
        [ content, (ignore_case|multiline|extended), lang ]
      end
    end
  end
end

class String
  include ToRegex::StringMixin
end
--------------------------------------------------------------------------------
/test/test_wayback_machine_downloader.rb:
--------------------------------------------------------------------------------
require 'minitest/autorun'
require 'wayback_machine_downloader'

class WaybackMachineDownloaderTest < Minitest::Test

  def setup
    @wayback_machine_downloader = WaybackMachineDownloader.new(
      base_url: 'http://www.onlyfreegames.net')
    $stdout = StringIO.new
  end

  def teardown
    FileUtils.rm_rf(@wayback_machine_downloader.backup_path)
  end

  def test_base_url_being_set
    assert_equal 'http://www.onlyfreegames.net', @wayback_machine_downloader.base_url
  end

  def test_backup_name_being_set
    assert_equal 'www.onlyfreegames.net', @wayback_machine_downloader.backup_name
  end

  def test_backup_name_being_set_when_base_url_is_domain
    @wayback_machine_downloader.base_url = 'www.onlyfreegames.net'
    assert_equal 'www.onlyfreegames.net', @wayback_machine_downloader.backup_name
  end

  def test_file_list_curated
    assert_equal 20060711191226, @wayback_machine_downloader.get_file_list_curated["linux.htm"][:timestamp]
  end

  def test_file_list_by_timestamp
    file_expected = {
      file_url: "http://www.onlyfreegames.net:80/strat.html",
      timestamp: 20060111084756,
      file_id: "strat.html"
    }
    assert_equal file_expected, @wayback_machine_downloader.get_file_list_by_timestamp[-2]
  end

  def test_without_exact_url
    @wayback_machine_downloader.exact_url = false
    assert @wayback_machine_downloader.get_file_list_curated.size > 1
  end

  def test_exact_url
    @wayback_machine_downloader.exact_url = true
    assert_equal 1, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_only_filter_without_matches
    @wayback_machine_downloader.only_filter = 'abc123'
    assert_equal 0, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_only_filter_with_1_match
    @wayback_machine_downloader.only_filter = 'menu.html'
    assert_equal 1, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_only_filter_with_a_regex
    @wayback_machine_downloader.only_filter = '/\.(gif|je?pg|bmp)$/i'
    assert_equal 37, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_exclude_filter_without_matches
    @wayback_machine_downloader.exclude_filter = 'abc123'
    assert_equal 68, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_exclude_filter_with_1_match
    @wayback_machine_downloader.exclude_filter = 'menu.html'
    assert_equal 67, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_exclude_filter_with_a_regex
    @wayback_machine_downloader.exclude_filter = '/\.(gif|je?pg|bmp)$/i'
    assert_equal 31, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_download
    @wayback_machine_downloader.download_files
    linux_page = File.open 'websites/www.onlyfreegames.net/linux.htm'
    assert_includes linux_page.read, "Linux Games"
  end

  def test_all_timestamps_being_respected
    @wayback_machine_downloader.all_timestamps = true
    assert_equal 68, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_from_timestamp_being_respected
    @wayback_machine_downloader.from_timestamp = 20050716231334
    file_url = @wayback_machine_downloader.get_file_list_curated["linux.htm"][:file_url]
    assert_equal "http://www.onlyfreegames.net:80/linux.htm", file_url
  end

  def test_to_timestamp_being_respected
    @wayback_machine_downloader.to_timestamp = 20050716231334
    assert_nil @wayback_machine_downloader.get_file_list_curated["linux.htm"]
  end

  def test_all_get_file_list_curated_size
    @wayback_machine_downloader.all = true
    assert_equal 69, @wayback_machine_downloader.get_file_list_curated.size
  end

  # Testing encoding conflicts needs a different base_url
  def test_nonascii_suburls_download
    @wayback_machine_downloader = WaybackMachineDownloader.new(
      base_url: 'https://en.wikipedia.org/wiki/%C3%84')
    # Once just for the downloading...
    @wayback_machine_downloader.download_files
  end

  def test_nonascii_suburls_already_present
    @wayback_machine_downloader = WaybackMachineDownloader.new(
      base_url: 'https://en.wikipedia.org/wiki/%C3%84')
    # ... twice to test the "is already present" case
    @wayback_machine_downloader.download_files
    @wayback_machine_downloader.download_files
  end

end
--------------------------------------------------------------------------------
/wayback_machine_downloader.gemspec:
--------------------------------------------------------------------------------
require './lib/wayback_machine_downloader'

Gem::Specification.new do |s|
  s.name = "wayback_machine_downloader"
  s.version = WaybackMachineDownloader::VERSION
  s.executables << "wayback_machine_downloader"
  s.summary = "Download an entire website from the Wayback Machine."
  s.description = "Download an entire website from the Wayback Machine. Wayback Machine by Internet Archive (archive.org) is an awesome tool to view any website at any point of time but lacks an export feature. Wayback Machine Downloader brings exactly this."
  s.authors = ["hartator"]
  s.email = "hartator@gmail.com"
  s.files = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb"]
  s.homepage = "https://github.com/hartator/wayback-machine-downloader"
  s.license = "MIT"
  s.required_ruby_version = '>= 1.9.2'
  s.add_dependency 'retryable', '~> 3.0'
  s.add_development_dependency 'rake', '~> 10.2'
  s.add_development_dependency 'minitest', '~> 5.2'
end
--------------------------------------------------------------------------------
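For reference, the library can also be driven from Ruby directly rather than through the CLI executable. A minimal sketch, using only the constructor params and methods defined in lib/wayback_machine_downloader.rb above (the directory name and timestamps are illustrative):

    require 'wayback_machine_downloader'

    # Grab the latest snapshot of every page under example.com,
    # 20 files at a time, into ./my-backup/.
    downloader = WaybackMachineDownloader.new(
      base_url: 'http://example.com',
      directory: 'my-backup',    # same as --directory
      threads_count: 20,         # same as --concurrency
      from_timestamp: 20060101,  # same as --from
      to_timestamp: 20101231     # same as --to
    )
    downloader.download_files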