├── .gitignore
├── .travis.yml
├── Dockerfile
├── Gemfile
├── MIT-LICENSE.txt
├── README.md
├── Rakefile
├── bin
│   └── wayback_machine_downloader
├── lib
│   ├── wayback_machine_downloader.rb
│   └── wayback_machine_downloader
│       ├── archive_api.rb
│       ├── tidy_bytes.rb
│       └── to_regex.rb
├── test
│   └── test_wayback_machine_downloader.rb
└── wayback_machine_downloader.gemspec

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
## PROJECT::GENERAL
.yardoc
coverage
doc
rdoc
log
websites
.DS_Store
.rake_tasks~

## BUNDLER
*.gem
.bundle
pkg
Gemfile.lock

## RBENV
.ruby-version
.rbenv*

## RCOV
coverage.data

tmp

## RUBINIUS
*.rbc

test.rb

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
sudo: false
language: ruby
rvm:
  - 1.9.2
  - 1.9.3
  - 2.0.0
  - 2.1
  - 2.2
  - 2.3.1
  - jruby

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM ruby:2.3
COPY . /build
RUN cd build && \
    bundle install
ENTRYPOINT [ "/usr/local/bundle/bin/wayback_machine_downloader" ]

--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
source "https://rubygems.org"

gemspec

--------------------------------------------------------------------------------
/MIT-LICENSE.txt:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015-2016 Julian Khaleghy and contributors
See the full list at https://github.com/hartator/wayback-machine-downloader/graphs/contributors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Wayback Machine Downloader

[![Gem Version](https://badge.fury.io/rb/wayback_machine_downloader.svg)](https://rubygems.org/gems/wayback_machine_downloader/)
[![Build Status](https://travis-ci.org/hartator/wayback-machine-downloader.svg?branch=master)](https://travis-ci.org/hartator/wayback-machine-downloader)

Download an entire website from the Internet Archive Wayback Machine.

## Installation

If you don't already have it, install Ruby (>= 1.9.2) on your system.
Then run:

    gem install wayback_machine_downloader

**Tip:** If you run into permission errors, you might have to add `sudo` in front of this command.

## Basic Usage

Run wayback_machine_downloader with the base url of the website you want to retrieve as a parameter (e.g., http://example.com):

    wayback_machine_downloader http://example.com

## How it works

It will download the last version of every file present on the Wayback Machine to `./websites/example.com/`. It will also re-create the directory structure and auto-create `index.html` pages so the result works seamlessly with Apache and Nginx. All downloaded files are the originals and not the Wayback Machine rewritten versions. This way, the URL and link structure is the same as before.
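For example, a site archived with a homepage and a logo image would end up saved as (paths illustrative):

    websites/example.com/index.html
    websites/example.com/img/logo.png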
## Advanced Usage

    Usage: wayback_machine_downloader http://example.com

    Download an entire website from the Wayback Machine.

    Optional options:
        -d, --directory PATH             Directory to save the downloaded files into
                                         Default is ./websites/ plus the domain name
        -s, --all-timestamps             Download all snapshots/timestamps for a given website
        -f, --from TIMESTAMP             Only files on or after timestamp supplied (e.g. 20060716231334)
        -t, --to TIMESTAMP               Only files on or before timestamp supplied (e.g. 20100916231334)
        -e, --exact-url                  Download only the url provided and not the full site
        -o, --only ONLY_FILTER           Restrict downloading to urls that match this filter
                                         (use // notation for the filter to be treated as a regex)
        -x, --exclude EXCLUDE_FILTER     Skip downloading of urls that match this filter
                                         (use // notation for the filter to be treated as a regex)
        -a, --all                        Expand downloading to error files (40x and 50x) and redirections (30x)
        -c, --concurrency NUMBER         Number of files to download at a time
                                         Default is one file at a time (e.g. 20)
        -p, --maximum-snapshot NUMBER    Maximum snapshot pages to consider (Default is 100)
                                         Each page holds an average of 150,000 snapshots
        -l, --list                       Only list file urls in a JSON format with the archived timestamps, won't download anything

## Specify directory to save files to

    -d, --directory PATH

Optional. By default, Wayback Machine Downloader will download files to `./websites/` followed by the domain name of the website. You may want to save files in a specific directory using this option.

Example:

    wayback_machine_downloader http://example.com --directory downloaded-backup/

## All Timestamps

    -s, --all-timestamps

Optional. This option will download all timestamps/snapshots for a given website. It will use the timestamp of each snapshot as the directory name.

Example:

    wayback_machine_downloader http://example.com --all-timestamps

Will download:

    websites/example.com/20060715085250/index.html
    websites/example.com/20051120005053/index.html
    websites/example.com/20060111095815/img/logo.png
    ...

## From Timestamp

    -f, --from TIMESTAMP

Optional. You may want to supply a from timestamp to lock your backup to a specific version of the website. Timestamps can be found inside the urls of the regular Wayback Machine website (e.g., https://web.archive.org/web/20060716231334/http://example.com). You can also use years (2006), years + month (200607), etc. It can be used in combination with To Timestamp.
Wayback Machine Downloader will then fetch only file versions on or after the timestamp specified.

Example:

    wayback_machine_downloader http://example.com --from 20060716231334

## To Timestamp

    -t, --to TIMESTAMP

Optional. You may want to supply a to timestamp to lock your backup to a specific version of the website. Timestamps can be found inside the urls of the regular Wayback Machine website (e.g., https://web.archive.org/web/20100916231334/http://example.com). You can also use years (2010), years + month (201009), etc. It can be used in combination with From Timestamp.
Wayback Machine Downloader will then fetch only file versions on or before the timestamp specified.

Example:

    wayback_machine_downloader http://example.com --to 20100916231334

## Exact Url

    -e, --exact-url

Optional. If you want to retrieve only the file matching exactly the url provided, you can use this flag. It will avoid downloading anything else.

For example, if you want to download only the html homepage file of example.com:

    wayback_machine_downloader http://example.com --exact-url

## Only URL Filter

    -o, --only ONLY_FILTER

Optional. You may want to retrieve files which are of a certain type (e.g., .pdf, .jpg, .wrd...) or are in a specific directory. To do so, you can supply the `--only` flag with a string or a regex (using the '/regex/' notation) to limit which files Wayback Machine Downloader will download.

For example, if you only want to download files inside a specific `my_directory`:

    wayback_machine_downloader http://example.com --only my_directory

Or if you want to download all images and nothing else:

    wayback_machine_downloader http://example.com --only "/\.(gif|jpg|jpeg)$/i"

## Exclude URL Filter

    -x, --exclude EXCLUDE_FILTER

Optional. You may want to retrieve files which aren't of a certain type (e.g., .pdf, .jpg, .wrd...) or aren't in a specific directory. To do so, you can supply the `--exclude` flag with a string or a regex (using the '/regex/' notation) to skip files that Wayback Machine Downloader would otherwise download.

For example, if you want to avoid downloading files inside `my_directory`:

    wayback_machine_downloader http://example.com --exclude my_directory

Or if you want to download everything except images:

    wayback_machine_downloader http://example.com --exclude "/\.(gif|jpg|jpeg)$/i"

## Expand downloading to all file types

    -a, --all

Optional. By default, Wayback Machine Downloader limits itself to files that responded with a 200 OK code. If you also need error files (40x and 50x codes) or redirection files (30x codes), you can use the `--all` or `-a` flag and Wayback Machine Downloader will download them in addition to the 200 OK files. It will also keep empty files that are removed by default.

Example:

    wayback_machine_downloader http://example.com --all

## Only list files without downloading

    -l, --list

It will just display the files to be downloaded with their snapshot timestamps and urls. The output format is JSON. It won't download anything. It's useful for debugging or for feeding another application.

Example:

    wayback_machine_downloader http://example.com --list
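The output looks like this (values illustrative):

    [
    {"file_url":"http://example.com:80/index.html","timestamp":20060716231334,"file_id":"index.html"},
    {"file_url":"http://example.com:80/img/logo.png","timestamp":20060111095815,"file_id":"img/logo.png"}
    ]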
## Maximum number of snapshot pages to consider

    -p, --maximum-snapshot NUMBER

Optional. Specify the maximum number of snapshot pages to consider. Each page holds an average of 150,000 snapshots. 100 is the default maximum number of snapshot pages and should be sufficient for most websites. Use a bigger number if you want to download a very large website.

Example:

    wayback_machine_downloader http://example.com --maximum-snapshot 300

## Download multiple files at a time

    -c, --concurrency NUMBER

Optional. Specify how many files you want to download at the same time. This can speed up the download of a website significantly. Default is to download one file at a time.

Example:

    wayback_machine_downloader http://example.com --concurrency 20

## Using the Docker image

As an alternative installation method, we have a Docker image! Retrieve the wayback-machine-downloader Docker image this way:

    docker pull hartator/wayback-machine-downloader

Then, you should be able to use the Docker image to download websites. For example:

    docker run --rm -it -v $PWD/websites:/websites hartator/wayback-machine-downloader http://example.com
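## Using as a Ruby library

The command-line tool is a thin wrapper around the `WaybackMachineDownloader` class, so you can also drive it from Ruby code. A minimal sketch (the option keys mirror the CLI flags above; values are illustrative):

    require 'wayback_machine_downloader'

    downloader = WaybackMachineDownloader.new(
      base_url:      'http://example.com', # required
      threads_count: 20,                   # same as --concurrency
      directory:     'downloaded-backup/'  # same as --directory
    )
    downloader.download_files

Calling `list_files` instead of `download_files` is the equivalent of passing `--list`.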
## Contributing

Contributions are welcome! Just submit a pull request via GitHub.

To run the tests:

    bundle install
    bundle exec rake test

--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
require 'rake/testtask'

Rake::TestTask.new do |t|
  t.libs << 'test'
end

desc "Run tests"
task :default => :test

--------------------------------------------------------------------------------
/bin/wayback_machine_downloader:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby

require_relative '../lib/wayback_machine_downloader'
require 'optparse'
require 'pp'

options = {}
option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: wayback_machine_downloader http://example.com"

  opts.separator ""
  opts.separator "Download an entire website from the Wayback Machine."

  opts.separator ""
  opts.separator "Optional options:"

  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into", "Default is ./websites/ plus the domain name") do |t|
    options[:directory] = t
  end

  opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
    options[:all_timestamps] = true
  end

  opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (e.g. 20060716231334)") do |t|
    options[:from_timestamp] = t
  end

  opts.on("-t", "--to TIMESTAMP", Integer, "Only files on or before timestamp supplied (e.g. 20100916231334)") do |t|
    options[:to_timestamp] = t
  end

  opts.on("-e", "--exact-url", "Download only the url provided and not the full site") do |t|
    options[:exact_url] = t
  end

  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
    options[:only_filter] = t
  end

  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
    options[:exclude_filter] = t
  end

  opts.on("-a", "--all", "Expand downloading to error files (40x and 50x) and redirections (30x)") do |t|
    options[:all] = true
  end

  opts.on("-c", "--concurrency NUMBER", Integer, "Number of files to download at a time", "Default is one file at a time (e.g. 20)") do |t|
    options[:threads_count] = t
  end

  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Each page holds an average of 150,000 snapshots") do |t|
    options[:maximum_pages] = t
  end

  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
    options[:list] = true
  end

  opts.on("-v", "--version", "Display version") do |t|
    options[:version] = t
  end
end.parse!

if (base_url = ARGV[-1])
  options[:base_url] = base_url
  wayback_machine_downloader = WaybackMachineDownloader.new options
  if options[:list]
    wayback_machine_downloader.list_files
  else
    wayback_machine_downloader.download_files
  end
elsif options[:version]
  puts WaybackMachineDownloader::VERSION
else
  puts "You need to specify a website to backup. (e.g., http://example.com)"
  puts "Run `wayback_machine_downloader --help` for more help."
end

--------------------------------------------------------------------------------
/lib/wayback_machine_downloader.rb:
--------------------------------------------------------------------------------
# encoding: UTF-8

require 'thread'
require 'net/http'
require 'open-uri'
require 'fileutils'
require 'cgi'
require 'json'
require_relative 'wayback_machine_downloader/tidy_bytes'
require_relative 'wayback_machine_downloader/to_regex'
require_relative 'wayback_machine_downloader/archive_api'

class WaybackMachineDownloader

  include ArchiveAPI

  VERSION = "2.3.1"

  attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
    :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
    :all, :maximum_pages, :threads_count

  def initialize params
    @base_url = params[:base_url]
    @exact_url = params[:exact_url]
    @directory = params[:directory]
    @all_timestamps = params[:all_timestamps]
    @from_timestamp = params[:from_timestamp].to_i
    @to_timestamp = params[:to_timestamp].to_i
    @only_filter = params[:only_filter]
    @exclude_filter = params[:exclude_filter]
    @all = params[:all]
    @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
    @threads_count = params[:threads_count].to_i
  end

  def backup_name
    if @base_url.include? '//'
      @base_url.split('/')[2]
    else
      @base_url
    end
  end

  def backup_path
    if @directory
      if @directory[-1] == '/'
        @directory
      else
        @directory + '/'
      end
    else
      'websites/' + backup_name + '/'
    end
  end

  def match_only_filter file_url
    if @only_filter
      only_filter_regex = @only_filter.to_regex
      if only_filter_regex
        only_filter_regex =~ file_url
      else
        file_url.downcase.include? @only_filter.downcase
      end
    else
      true
    end
  end

  def match_exclude_filter file_url
    if @exclude_filter
      exclude_filter_regex = @exclude_filter.to_regex
      if exclude_filter_regex
        exclude_filter_regex =~ file_url
      else
        file_url.downcase.include? @exclude_filter.downcase
      end
    else
      false
    end
  end

  def get_all_snapshots_to_consider
    # Note: Passing a page index parameter allows us to get more snapshots,
    # but from a less fresh index
    print "Getting snapshot pages"
    snapshot_list_to_consider = []
    snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
    print "."
    unless @exact_url
      @maximum_pages.times do |page_index|
        snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
        break if snapshot_list.empty?
        snapshot_list_to_consider += snapshot_list
        print "."
      end
    end
    puts " found #{snapshot_list_to_consider.length} snapshots to consider."
    puts
    snapshot_list_to_consider
  end
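  # Each snapshot returned by the CDX API is a [timestamp, original_url] pair,
  # e.g. ["20060716231334", "http://example.com:80/index.html"] (illustrative
  # values). The curation methods below derive a relative file id from the
  # original url by dropping the scheme and host, then keep one entry per id.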
  def get_file_list_curated
    file_list_curated = Hash.new
    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
      next unless file_url.include?('/')
      file_id = file_url.split('/')[3..-1].join('/')
      file_id = CGI::unescape file_id
      file_id = file_id.tidy_bytes unless file_id == ""
      if file_id.nil?
        puts "Malformed file url, ignoring: #{file_url}"
      else
        if match_exclude_filter(file_url)
          puts "File url matches exclude filter, ignoring: #{file_url}"
        elsif not match_only_filter(file_url)
          puts "File url doesn't match only filter, ignoring: #{file_url}"
        elsif file_list_curated[file_id]
          # Keep only the most recent snapshot per file id
          unless file_list_curated[file_id][:timestamp] > file_timestamp
            file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
          end
        else
          file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
        end
      end
    end
    file_list_curated
  end

  def get_file_list_all_timestamps
    file_list_curated = Hash.new
    get_all_snapshots_to_consider.each do |file_timestamp, file_url|
      next unless file_url.include?('/')
      file_id = file_url.split('/')[3..-1].join('/')
      file_id_and_timestamp = [file_timestamp, file_id].join('/')
      file_id_and_timestamp = CGI::unescape file_id_and_timestamp
      file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
      if file_id.nil?
        puts "Malformed file url, ignoring: #{file_url}"
      else
        if match_exclude_filter(file_url)
          puts "File url matches exclude filter, ignoring: #{file_url}"
        elsif not match_only_filter(file_url)
          puts "File url doesn't match only filter, ignoring: #{file_url}"
        elsif file_list_curated[file_id_and_timestamp]
          puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
        else
          file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
        end
      end
    end
    puts "file_list_curated: " + file_list_curated.count.to_s
    file_list_curated
  end

  def get_file_list_by_timestamp
    if @all_timestamps
      file_list_curated = get_file_list_all_timestamps
      file_list_curated.map do |file_remote_info|
        file_remote_info[1][:file_id] = file_remote_info[0]
        file_remote_info[1]
      end
    else
      file_list_curated = get_file_list_curated
      file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
      file_list_curated.map do |file_remote_info|
        file_remote_info[1][:file_id] = file_remote_info[0]
        file_remote_info[1]
      end
    end
  end

  def list_files
    # retrieval produces its own output, so redirect it to stderr
    @orig_stdout = $stdout
    $stdout = $stderr
    files = get_file_list_by_timestamp
    $stdout = @orig_stdout
    puts "["
    files[0...-1].each do |file|
      puts file.to_json + ","
    end
    puts files[-1].to_json
    puts "]"
  end

  def download_files
    start_time = Time.now
    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
    puts

    if file_list_by_timestamp.count == 0
      puts "No files to download."
      puts "Possible reasons:"
      puts "\t* Site is not in Wayback Machine Archive."
      puts "\t* From timestamp too far in the future." if @from_timestamp and @from_timestamp != 0
      puts "\t* To timestamp too far in the past." if @to_timestamp and @to_timestamp != 0
      puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
      puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
      return
    end

    puts "#{file_list_by_timestamp.count} files to download:"

    threads = []
    @processed_file_count = 0
    @threads_count = 1 if @threads_count == 0
    @threads_count.times do
      threads << Thread.new do
        until file_queue.empty?
          file_remote_info = file_queue.pop(true) rescue nil
          download_file(file_remote_info) if file_remote_info
        end
      end
    end

    threads.each(&:join)
    end_time = Time.now
    puts
    puts "Download completed in #{(end_time - start_time).round(2)}s, saved in #{backup_path} (#{file_list_by_timestamp.size} files)"
  end

  def structure_dir_path dir_path
    begin
      FileUtils::mkdir_p dir_path unless File.exist? dir_path
    rescue Errno::EEXIST => e
      error_to_string = e.to_s
      puts "# #{error_to_string}"
      if error_to_string.include? "File exists @ dir_s_mkdir - "
        file_already_existing = error_to_string.split("File exists @ dir_s_mkdir - ")[-1]
      elsif error_to_string.include? "File exists - "
        file_already_existing = error_to_string.split("File exists - ")[-1]
      else
        raise "Unhandled directory restructure error # #{error_to_string}"
      end
      # A file blocks the directory we need: move the file aside, create the
      # directory, then restore the file as that directory's index.html
      file_already_existing_temporary = file_already_existing + '.temp'
      file_already_existing_permanent = file_already_existing + '/index.html'
      FileUtils::mv file_already_existing, file_already_existing_temporary
      FileUtils::mkdir_p file_already_existing
      FileUtils::mv file_already_existing_temporary, file_already_existing_permanent
      puts "#{file_already_existing} -> #{file_already_existing_permanent}"
      structure_dir_path dir_path
    end
  end

  def download_file file_remote_info
    current_encoding = "".encoding
    file_url = file_remote_info[:file_url].encode(current_encoding)
    file_id = file_remote_info[:file_id]
    file_timestamp = file_remote_info[:timestamp]
    file_path_elements = file_id.split('/')
    if file_id == ""
      dir_path = backup_path
      file_path = backup_path + 'index.html'
    elsif file_url[-1] == '/' or not file_path_elements[-1].include? '.'
      dir_path = backup_path + file_path_elements[0..-1].join('/')
      file_path = backup_path + file_path_elements[0..-1].join('/') + '/index.html'
    else
      dir_path = backup_path + file_path_elements[0..-2].join('/')
      file_path = backup_path + file_path_elements[0..-1].join('/')
    end
    if Gem.win_platform?
      # Percent-encode characters that are invalid in Windows file names
      dir_path = dir_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
      file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
    end
    unless File.exist? file_path
      begin
        structure_dir_path dir_path
        open(file_path, "wb") do |file|
          begin
            URI("https://web.archive.org/web/#{file_timestamp}id_/#{file_url}").open("Accept-Encoding" => "plain") do |uri|
              file.write(uri.read)
            end
          rescue OpenURI::HTTPError => e
            puts "#{file_url} # #{e}"
            if @all
              file.write(e.io.read)
              puts "#{file_path} saved anyway."
            end
          rescue StandardError => e
            puts "#{file_url} # #{e}"
          end
        end
      rescue StandardError => e
        puts "#{file_url} # #{e}"
      ensure
        if not @all and File.exist?(file_path) and File.size(file_path) == 0
          File.delete(file_path)
          puts "#{file_path} was empty and was removed."
        end
      end
      semaphore.synchronize do
        @processed_file_count += 1
        puts "#{file_url} -> #{file_path} (#{@processed_file_count}/#{file_list_by_timestamp.size})"
      end
    else
      semaphore.synchronize do
        @processed_file_count += 1
        puts "#{file_url} # #{file_path} already exists. (#{@processed_file_count}/#{file_list_by_timestamp.size})"
      end
    end
  end

  def file_queue
    @file_queue ||= file_list_by_timestamp.each_with_object(Queue.new) { |file_info, q| q << file_info }
  end

  def file_list_by_timestamp
    @file_list_by_timestamp ||= get_file_list_by_timestamp
  end

  def semaphore
    @semaphore ||= Mutex.new
  end
end

--------------------------------------------------------------------------------
/lib/wayback_machine_downloader/archive_api.rb:
--------------------------------------------------------------------------------
require 'json'
require 'uri'

module ArchiveAPI

  # Fetch one page of [timestamp, original_url] snapshot pairs from the
  # Wayback Machine CDX server
  def get_raw_list_from_api url, page_index
    request_url = URI("https://web.archive.org/cdx/search/xd")
    params = [["output", "json"], ["url", url]]
    params += parameters_for_api page_index
    request_url.query = URI.encode_www_form(params)

    begin
      json = JSON.parse(URI(request_url).open.read)
      # Drop the ["timestamp","original"] header row the API returns as the
      # first element of its JSON output
      if (json[0] <=> ["timestamp","original"]) == 0
        json.shift
      end
      json
    rescue JSON::ParserError
      []
    end
  end

  def parameters_for_api page_index
    parameters = [["fl", "timestamp,original"], ["collapse", "digest"], ["gzip", "false"]]
    if !@all
      parameters.push(["filter", "statuscode:200"])
    end
    if @from_timestamp and @from_timestamp != 0
      parameters.push(["from", @from_timestamp.to_s])
    end
    if @to_timestamp and @to_timestamp != 0
      parameters.push(["to", @to_timestamp.to_s])
    end
    if page_index
      parameters.push(["page", page_index])
    end
    parameters
  end

end

--------------------------------------------------------------------------------
/lib/wayback_machine_downloader/tidy_bytes.rb:
--------------------------------------------------------------------------------
module TidyBytes

  # CP-1252 decimal byte => UTF-8 approximation as an array of bytes
  CP1252 = {
    128 => [226, 130, 172],
    129 => nil,
    130 => [226, 128, 154],
    131 => [198, 146],
    132 => [226, 128, 158],
    133 => [226, 128, 166],
    134 => [226, 128, 160],
    135 => [226, 128, 161],
    136 => [203, 134],
    137 => [226, 128, 176],
    138 => [197, 160],
    139 => [226, 128, 185],
    140 => [197, 146],
    141 => nil,
    142 => [197, 189],
    143 => nil,
    144 => nil,
    145 => [226, 128, 152],
    146 => [226, 128, 153],
    147 => [226, 128, 156],
    148 => [226, 128, 157],
    149 => [226, 128, 162],
    150 => [226, 128, 147],
    151 => [226, 128, 148],
    152 => [203, 156],
    153 => [226, 132, 162],
    154 => [197, 161],
    155 => [226, 128, 186],
    156 => [197, 147],
    157 => nil,
    158 => [197, 190],
    159 => [197, 184]
  }

  module StringMixin

    # Attempt to replace invalid UTF-8 bytes with valid ones. This method
    # naively assumes that if you have invalid UTF-8 bytes, they are either
    # Windows CP-1252 or ISO-8859-1. In practice this isn't a bad assumption,
    # but it may not always work.
    #
    # Passing +true+ will forcibly tidy all bytes, assuming that the string's
    # encoding is CP-1252 or ISO-8859-1.
    def tidy_bytes(force = false)

      if force
        return unpack("C*").map do |b|
          tidy_byte(b)
        end.flatten.compact.pack("C*").unpack("U*").pack("U*")
      end

      bytes = unpack("C*")
      conts_expected = 0
      last_lead = 0

      bytes.each_index do |i|

        byte = bytes[i]
        _is_ascii = byte < 128
        is_cont = byte > 127 && byte < 192
        is_lead = byte > 191 && byte < 245
        is_unused = byte > 240
        is_restricted = byte > 244

        # Impossible or highly unlikely byte? Clean it.
        if is_unused || is_restricted
          bytes[i] = tidy_byte(byte)
        elsif is_cont
          # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
          conts_expected == 0 ? bytes[i] = tidy_byte(byte) : conts_expected -= 1
        else
          if conts_expected > 0
            # Expected continuation, but got ASCII or leading? Clean backwards up to
            # the leading byte.
            begin
              (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
            rescue NoMethodError
              next
            end
            conts_expected = 0
          end
          if is_lead
            # Final byte is leading? Clean it.
            if i == bytes.length - 1
              bytes[i] = tidy_byte(bytes.last)
            else
              # Valid leading byte? Expect continuations determined by position of
              # first zero bit, with max of 3.
              conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
              last_lead = i
            end
          end
        end
      end
      begin
        bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
      rescue ArgumentError
        nil
      end
    end

    # Tidy bytes in-place.
    def tidy_bytes!(force = false)
      replace tidy_bytes(force)
    end

    private

    def tidy_byte(byte)
      byte < 160 ? TidyBytes::CP1252[byte] : byte < 192 ? [194, byte] : [195, byte - 64]
    end

  end
end

class String
  include TidyBytes::StringMixin
end

--------------------------------------------------------------------------------
/lib/wayback_machine_downloader/to_regex.rb:
--------------------------------------------------------------------------------
module ToRegex
  module StringMixin
    class << self
      def literal?(str)
        REGEXP_DELIMITERS.none? { |s, e| str.start_with?(s) and str =~ /#{e}#{INLINE_OPTIONS}\z/ }
      end
    end

    INLINE_OPTIONS = /[imxnesu]*/
    REGEXP_DELIMITERS = {
      '%r{' => '}',
      '/' => '/',
    }

    # Get a regex back
    #
    # Without :literal or :detect, `"foo".to_regex` will return nil.
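    #
    # Examples (illustrative, not part of the original docs):
    #
    #   "/foo/i".to_regex               #=> /foo/i
    #   "foo".to_regex                  #=> nil
    #   "foo".to_regex(literal: true)   #=> /foo/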
    #
    # @param  [optional, Hash] options
    # @option options [true,false] :literal Treat meta characters and other regexp codes as just text; always return a regexp
    # @option options [true,false] :detect If string starts and ends with valid regexp delimiters, treat it as a regexp; otherwise, interpret it literally
    # @option options [true,false] :ignore_case /foo/i
    # @option options [true,false] :multiline /foo/m
    # @option options [true,false] :extended /foo/x
    # @option options [true,false] :lang /foo/[nesu]
    def to_regex(options = {})
      if args = as_regexp(options)
        ::Regexp.new(*args)
      end
    end

    # Return arguments that can be passed to `Regexp.new`
    # @see to_regex
    def as_regexp(options = {})
      unless options.is_a?(::Hash)
        raise ::ArgumentError, "[to_regex] Options must be a Hash"
      end
      str = self

      return if options[:detect] and str == ''

      if options[:literal] or (options[:detect] and ToRegex::StringMixin.literal?(str))
        content = ::Regexp.escape str
      elsif delim_set = REGEXP_DELIMITERS.detect { |k, _| str.start_with?(k) }
        delim_start, delim_end = delim_set
        /\A#{delim_start}(.*)#{delim_end}(#{INLINE_OPTIONS})\z/u =~ str
        content = $1
        inline_options = $2
        return unless content.is_a?(::String)
        content.gsub! '\\/', '/'
        if inline_options
          options[:ignore_case] = true if inline_options.include?('i')
          options[:multiline] = true if inline_options.include?('m')
          options[:extended] = true if inline_options.include?('x')
          # 'n', 'N' = none, 'e', 'E' = EUC, 's', 'S' = SJIS, 'u', 'U' = UTF-8
          options[:lang] = inline_options.scan(/[nesu]/i).join.downcase
        end
      else
        return
      end

      ignore_case = options[:ignore_case] ? ::Regexp::IGNORECASE : 0
      multiline = options[:multiline] ? ::Regexp::MULTILINE : 0
      extended = options[:extended] ? ::Regexp::EXTENDED : 0
      lang = options[:lang] || ''
      if ::RUBY_VERSION > '1.9' and lang.include?('u')
        lang = lang.delete 'u'
      end

      if lang.empty?
        [ content, (ignore_case|multiline|extended) ]
      else
        [ content, (ignore_case|multiline|extended), lang ]
      end
    end
  end
end

class String
  include ToRegex::StringMixin
end

--------------------------------------------------------------------------------
/test/test_wayback_machine_downloader.rb:
--------------------------------------------------------------------------------
require 'minitest/autorun'
require 'wayback_machine_downloader'

class WaybackMachineDownloaderTest < Minitest::Test

  def setup
    @wayback_machine_downloader = WaybackMachineDownloader.new(
      base_url: 'http://www.onlyfreegames.net')
    $stdout = StringIO.new
  end

  def teardown
    FileUtils.rm_rf(@wayback_machine_downloader.backup_path)
  end

  def test_base_url_being_set
    assert_equal 'http://www.onlyfreegames.net', @wayback_machine_downloader.base_url
  end

  def test_backup_name_being_set
    assert_equal 'www.onlyfreegames.net', @wayback_machine_downloader.backup_name
  end

  def test_backup_name_being_set_when_base_url_is_domain
    @wayback_machine_downloader.base_url = 'www.onlyfreegames.net'
    assert_equal 'www.onlyfreegames.net', @wayback_machine_downloader.backup_name
  end

  def test_file_list_curated
    assert_equal 20060711191226, @wayback_machine_downloader.get_file_list_curated["linux.htm"][:timestamp]
  end

  def test_file_list_by_timestamp
    file_expected = {
      file_url: "http://www.onlyfreegames.net:80/strat.html",
      timestamp: 20060111084756,
      file_id: "strat.html"
    }
    assert_equal file_expected, @wayback_machine_downloader.get_file_list_by_timestamp[-2]
  end

  def test_without_exact_url
    @wayback_machine_downloader.exact_url = false
    assert @wayback_machine_downloader.get_file_list_curated.size > 1
  end

  def test_exact_url
    @wayback_machine_downloader.exact_url = true
    assert_equal 1, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_only_filter_without_matches
    @wayback_machine_downloader.only_filter = 'abc123'
    assert_equal 0, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_only_filter_with_1_match
    @wayback_machine_downloader.only_filter = 'menu.html'
    assert_equal 1, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_only_filter_with_a_regex
    @wayback_machine_downloader.only_filter = '/\.(gif|je?pg|bmp)$/i'
    assert_equal 37, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_exclude_filter_without_matches
    @wayback_machine_downloader.exclude_filter = 'abc123'
    assert_equal 68, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_exclude_filter_with_1_match
    @wayback_machine_downloader.exclude_filter = 'menu.html'
    assert_equal 67, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_list_exclude_filter_with_a_regex
    @wayback_machine_downloader.exclude_filter = '/\.(gif|je?pg|bmp)$/i'
    assert_equal 31, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_file_download
    @wayback_machine_downloader.download_files
    linux_page = open 'websites/www.onlyfreegames.net/linux.htm'
    assert_includes linux_page.read, "Linux Games"
  end

  def test_all_timestamps_being_respected
    @wayback_machine_downloader.all_timestamps = true
    assert_equal 68, @wayback_machine_downloader.get_file_list_curated.size
  end

  def test_from_timestamp_being_respected
    @wayback_machine_downloader.from_timestamp = 20050716231334
    file_url = @wayback_machine_downloader.get_file_list_curated["linux.htm"][:file_url]
    assert_equal "http://www.onlyfreegames.net:80/linux.htm", file_url
  end

  def test_to_timestamp_being_respected
    @wayback_machine_downloader.to_timestamp = 20050716231334
    assert_nil @wayback_machine_downloader.get_file_list_curated["linux.htm"]
  end

  def test_all_get_file_list_curated_size
    @wayback_machine_downloader.all = true
    assert_equal 69, @wayback_machine_downloader.get_file_list_curated.size
  end

  # Testing encoding conflicts needs a different base_url
  def test_nonascii_suburls_download
    @wayback_machine_downloader = WaybackMachineDownloader.new(
      base_url: 'https://en.wikipedia.org/wiki/%C3%84')
    # Once just for the downloading...
    @wayback_machine_downloader.download_files
  end

  def test_nonascii_suburls_already_present
    @wayback_machine_downloader = WaybackMachineDownloader.new(
      base_url: 'https://en.wikipedia.org/wiki/%C3%84')
    # ... twice to test the "is already present" case
    @wayback_machine_downloader.download_files
    @wayback_machine_downloader.download_files
  end

end

--------------------------------------------------------------------------------
/wayback_machine_downloader.gemspec:
--------------------------------------------------------------------------------
require './lib/wayback_machine_downloader'

Gem::Specification.new do |s|
  s.name = "wayback_machine_downloader"
  s.version = WaybackMachineDownloader::VERSION
  s.executables << "wayback_machine_downloader"
  s.summary = "Download an entire website from the Wayback Machine."
  s.description = "Download an entire website from the Wayback Machine. Wayback Machine by Internet Archive (archive.org) is an awesome tool to view any website at any point in time but lacks an export feature. Wayback Machine Downloader brings exactly this."
  s.authors = ["hartator"]
  s.email = "hartator@gmail.com"
  s.files = ["lib/wayback_machine_downloader.rb", "lib/wayback_machine_downloader/tidy_bytes.rb", "lib/wayback_machine_downloader/to_regex.rb", "lib/wayback_machine_downloader/archive_api.rb"]
  s.homepage = "https://github.com/hartator/wayback-machine-downloader"
  s.license = "MIT"
  s.required_ruby_version = '>= 1.9.2'
  s.add_development_dependency 'rake', '~> 10.2'
  s.add_development_dependency 'minitest', '~> 5.2'
end
--------------------------------------------------------------------------------