├── spec
│   ├── data
│   │   ├── robots.txt
│   │   ├── test_gzip.gz
│   │   ├── sitemap.xml
│   │   ├── sitemap_index.xml
│   │   └── sitemap_index_with_duplicate_url.xml
│   ├── support
│   │   └── test_logger.rb
│   ├── wayback_archiver
│   │   ├── null_logger_spec.rb
│   │   ├── archive_result_spec.rb
│   │   ├── thread_pool_spec.rb
│   │   ├── http_code_spec.rb
│   │   ├── url_collector_spec.rb
│   │   ├── adapters
│   │   │   └── wayback_machine_spec.rb
│   │   ├── archive_spec.rb
│   │   ├── sitemap_spec.rb
│   │   ├── sitemapper_spec.rb
│   │   └── request_spec.rb
│   ├── spec_helper.rb
│   └── wayback_archiver_spec.rb
├── lib
│   ├── wayback_archiver
│   │   ├── version.rb
│   │   ├── null_logger.rb
│   │   ├── response.rb
│   │   ├── archive_result.rb
│   │   ├── thread_pool.rb
│   │   ├── adapters
│   │   │   └── wayback_machine.rb
│   │   ├── http_code.rb
│   │   ├── url_collector.rb
│   │   ├── sitemap.rb
│   │   ├── sitemapper.rb
│   │   ├── archive.rb
│   │   └── request.rb
│   ├── robots.rb
│   └── wayback_archiver.rb
├── Gemfile
├── .travis.yml
├── .github
│   └── dependabot.yml
├── .gitignore
├── Rakefile
├── LICENSE
├── CHANGELOG.md
├── wayback_archiver.gemspec
├── bin
│   └── wayback_archiver
└── README.md
/spec/data/robots.txt:
--------------------------------------------------------------------------------
1 | Sitemap: http://www.example.com/sitemap.xml
--------------------------------------------------------------------------------
/spec/data/test_gzip.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buren/wayback_archiver/HEAD/spec/data/test_gzip.gz
--------------------------------------------------------------------------------
/lib/wayback_archiver/version.rb:
--------------------------------------------------------------------------------
1 | module WaybackArchiver
2 |   # Gem version
3 |   VERSION = '1.5.0'.freeze
4 | end
5 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 |
3 | # Specify your gem's dependencies in wayback_archiver.gemspec
4 | gemspec
5 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: false
2 | language: ruby
3 | rvm:
4 | - 3.0.0
5 | - 3.1.0
6 | - 3.2.0
7 | - 3.3.0
8 | before_install: gem install bundler
9 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   - package-ecosystem: bundler
4 |     directory: "/"
5 |     schedule:
6 |       interval: monthly
7 |       time: "04:00"
8 |     open-pull-requests-limit: 10
9 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.gem
2 | *.rbc
3 | .bundle
4 | .config
5 | .yardoc
6 | Gemfile.lock
7 | InstalledFiles
8 | _yardoc
9 | coverage
10 | doc/
11 | lib/bundler/man
12 | pkg
13 | rdoc
14 | spec/reports
15 | test/tmp
16 | test/version_tmp
17 | tmp
18 | TODO.md
19 | .byebug_history
20 |
--------------------------------------------------------------------------------
/lib/wayback_archiver/null_logger.rb:
--------------------------------------------------------------------------------
1 | require 'logger'
2 |
3 | module WaybackArchiver
4 |   # Don't log anything / Send the logs to the abyss
5 |   class NullLogger < Logger
6 |     # Allow any and all params
7 |     def initialize(*args); end
8 | 
9 |     # Allow any and all params and don't do anything
10 |     def add(*args, &block); end
11 |   end
12 | end
13 |
--------------------------------------------------------------------------------
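A minimal usage sketch for NullLogger, assuming `require 'wayback_archiver'` loads the gem and that the `WaybackArchiver.logger=` setter (used in spec_helper.rb) is available:

    require 'wayback_archiver'

    # Route all of the gem's log output into the abyss
    WaybackArchiver.logger = WaybackArchiver::NullLogger.new
    WaybackArchiver.logger.info('never written anywhere') # => nil
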
/spec/data/sitemap.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
3 |   <url>
4 |     <loc>http://www.example.com/</loc>
5 |     <lastmod>2005-01-01</lastmod>
6 |     <changefreq>monthly</changefreq>
7 |     <priority>0.8</priority>
8 |   </url>
9 | </urlset>
10 | 
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | require 'bundler/gem_tasks'
2 |
3 | task default: :spec
4 |
5 | task :console do
6 |   require 'bundler/setup'
7 |   require 'irb'
8 |   require 'wayback_archiver'
9 |   ARGV.clear
10 |   IRB.start
11 | end
12 | 
13 | task :spec do
14 |   begin
15 |     require 'rspec/core/rake_task'
16 |     RSpec::Core::RakeTask.new(:spec)
17 |   rescue LoadError
18 |     puts 'Could *not* load rspec'
19 |   end
20 | end
21 |
--------------------------------------------------------------------------------
/lib/wayback_archiver/response.rb:
--------------------------------------------------------------------------------
1 | module WaybackArchiver
2 |   # Response data struct
3 |   Response = Struct.new(:code, :message, :body, :uri, :error)
4 |   class Response
5 |     # Returns true if the response was successful
6 |     # @example check if Response was successful
7 |     #   response = Response.new('200', 'OK', 'buren', 'http://example.com')
8 |     #   response.success? # => true
9 |     def success?
10 |       HTTPCode.success?(code)
11 |     end
12 |   end
13 | end
14 |
--------------------------------------------------------------------------------
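A short sketch of the Response struct in use, mirroring the @example above (assumes the gem is loaded so HTTPCode is defined):

    response = WaybackArchiver::Response.new('200', 'OK', 'buren', 'http://example.com')
    response.success? # => true, delegates to HTTPCode.success?('200')
    response.error    # => nil, the last Struct member was not set
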
/spec/data/sitemap_index.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
3 |   <sitemap>
4 |     <loc>http://www.example.com/sitemap1.xml.gz</loc>
5 |     <lastmod>2004-10-01T18:23:17+00:00</lastmod>
6 |   </sitemap>
7 |   <sitemap>
8 |     <loc>http://www.example.com/sitemap2.xml.gz</loc>
9 |     <lastmod>2005-01-01</lastmod>
10 |   </sitemap>
11 | </sitemapindex>
12 | 
--------------------------------------------------------------------------------
/spec/support/test_logger.rb:
--------------------------------------------------------------------------------
1 | require 'logger'
2 |
3 | # Test logger
4 | class TestLogger < Logger
5 |   attr_reader :info_log, :debug_log, :error_log
6 | 
7 |   def initialize(*_args)
8 |     @info_log = []
9 |     @debug_log = []
10 |     @error_log = []
11 |   end
12 | 
13 |   def add(*args)
14 |     log_type, _, log_string = args
15 |     case log_type
16 |     when 0 then @debug_log
17 |     when 1 then @info_log
18 |     when 3 then @error_log
19 |     end << log_string
20 |   end
21 | end
22 |
--------------------------------------------------------------------------------
/spec/data/sitemap_index_with_duplicate_url.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
3 |   <sitemap>
4 |     <loc>http://www.example.com/sitemap1.xml.gz</loc>
5 |     <lastmod>2004-10-01T18:23:17+00:00</lastmod>
6 |   </sitemap>
7 |   <sitemap>
8 |     <loc>http://www.example.com/sitemap1.xml.gz</loc>
9 |     <lastmod>2005-01-01</lastmod>
10 |   </sitemap>
11 | </sitemapindex>
12 | 
--------------------------------------------------------------------------------
/spec/wayback_archiver/null_logger_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe WaybackArchiver::NullLogger do
4 | it 'inherits from Logger' do
5 | expect(described_class.ancestors).to include(Logger)
6 | end
7 |
8 | it 'can be initialized with arguments' do
9 | logger = described_class.new('buren')
10 | expect(logger.is_a?(described_class)).to eq(true)
11 | end
12 |
13 | it 'has #add method that can receive args and a block' do
14 | logger = described_class.new('buren')
15 | expect(logger.add('buren', &:nil?)).to be_nil
16 | end
17 | end
18 |
--------------------------------------------------------------------------------
/spec/wayback_archiver/archive_result_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe WaybackArchiver::ArchiveResult do
4 | describe '#archived_url' do
5 | it 'returns the uri' do
6 | expect(described_class.new('buren').archived_url).to eq('buren')
7 | end
8 | end
9 |
10 | describe '#errored?' do
11 | it 'returns true if errored' do
12 | expect(described_class.new(nil, error: true).errored?).to eq(true)
13 | end
14 | end
15 |
16 | describe '#success?' do
17 | it 'returns true if success' do
18 | expect(described_class.new(nil, error: nil).success?).to eq(true)
19 | end
20 | end
21 | end
22 |
--------------------------------------------------------------------------------
/lib/wayback_archiver/archive_result.rb:
--------------------------------------------------------------------------------
1 | module WaybackArchiver
2 |   # Result data for posting URL to archive
3 |   class ArchiveResult
4 |     attr_reader :uri, :code, :request_url, :response_error, :error
5 | 
6 |     def initialize(uri, code: nil, request_url: nil, response_error: nil, error: nil)
7 |       @uri = uri
8 |       @code = code
9 |       @request_url = request_url
10 |       @response_error = response_error
11 |       @error = error
12 |     end
13 | 
14 |     # @return [String] the URL that was archived
15 |     def archived_url
16 |       uri
17 |     end
18 | 
19 |     # @return [Boolean] true if success
20 |     def success?
21 |       !errored?
22 |     end
23 | 
24 |     # @return [Boolean] true if errored
25 |     def errored?
26 |       !!error
27 |     end
28 |   end
29 | end
30 |
--------------------------------------------------------------------------------
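A sketch of how ArchiveResult behaves for a successful and a failed archiving attempt (values are illustrative):

    ok = WaybackArchiver::ArchiveResult.new('http://example.com', code: '200')
    ok.success?      # => true, since no error was passed
    ok.archived_url  # => 'http://example.com'

    failed = WaybackArchiver::ArchiveResult.new('http://example.com', error: StandardError.new('boom'))
    failed.errored?  # => true, error is truthy
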
/spec/wayback_archiver/thread_pool_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe WaybackArchiver::ThreadPool do
4 | context 'with concurrency less than 1' do
5 | it 'raises ArgumentError' do
6 | expect { described_class.build(0) }.to raise_error(ArgumentError)
7 | end
8 | end
9 |
10 | context 'with concurrency 1' do
11 | it 'returns a Concurrent::ImmediateExecutor' do
12 | thread_pool = described_class.build(1)
13 | expect(thread_pool).to be_an_instance_of(Concurrent::ImmediateExecutor)
14 | end
15 | end
16 |
17 | context 'with concurrency greater than 1' do
18 | it 'returns a Concurrent::FixedThreadPool' do
19 | thread_pool = described_class.build(2)
20 | expect(thread_pool).to be_an_instance_of(Concurrent::FixedThreadPool)
21 | end
22 | end
23 | end
24 |
--------------------------------------------------------------------------------
/spec/spec_helper.rb:
--------------------------------------------------------------------------------
1 | require 'simplecov'
2 | require 'coveralls'
3 |
4 | formatters = [
5 | SimpleCov::Formatter::HTMLFormatter,
6 | Coveralls::SimpleCov::Formatter
7 | ]
8 | SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new(formatters)
9 | SimpleCov.start
10 |
11 | Dir['./spec/support/**/*.rb'].each { |file| require file }
12 |
13 | require 'wayback_archiver'
14 | require 'webmock/rspec'
15 | require 'byebug'
16 |
17 | WebMock.disable_net_connect!
18 |
19 | RSpec.configure do |config|
20 | config.order = 'random'
21 | config.run_all_when_everything_filtered = false
22 |
23 | config.before(:each) do
24 | WaybackArchiver.logger = TestLogger.new
25 |
26 | # Set default concurrency to 1, so we don't have to deal with concurrency
27 | # issues in WebMock and rspec-mocks
28 | WaybackArchiver.concurrency = 1
29 |
30 | WaybackArchiver.max_limit = WaybackArchiver::DEFAULT_MAX_LIMIT
31 | end
32 | end
33 |
--------------------------------------------------------------------------------
/lib/wayback_archiver/thread_pool.rb:
--------------------------------------------------------------------------------
1 | require 'concurrent'
2 |
3 | module WaybackArchiver
4 |   # Thread pool
5 |   class ThreadPool
6 |     # Build a thread pool
7 |     # @return [Concurrent::FixedThreadPool, Concurrent::ImmediateExecutor] an instance of a concurrent thread pool
8 |     # @param [Integer] concurrency the desired concurrency
9 |     # @example Build a thread pool with 10 as the desired concurrency
10 |     #   pool = ThreadPool.build(10) # => Concurrent::FixedThreadPool
11 |     #   pool.post { some_work }
12 |     # @example Build a thread pool with 1 as the desired concurrency
13 |     #   pool = ThreadPool.build(1) # => Concurrent::ImmediateExecutor
14 |     #   pool.post { some_work }
15 |     # @see https://github.com/ruby-concurrency/concurrent-ruby/blob/master/doc/thread_pools.md
16 |     def self.build(concurrency)
17 |       if concurrency == 1
18 |         Concurrent::ImmediateExecutor.new
19 |       elsif concurrency > 1
20 |         Concurrent::FixedThreadPool.new(concurrency)
21 |       else
22 |         raise ArgumentError, 'concurrency must be one or greater'
23 |       end
24 |     end
25 |   end
26 | end
27 |
--------------------------------------------------------------------------------
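A usage sketch for ThreadPool, following the same shutdown sequence Archive.post uses (assumes concurrent-ruby is installed):

    require 'wayback_archiver'

    pool = WaybackArchiver::ThreadPool.build(2) # => Concurrent::FixedThreadPool
    5.times { |i| pool.post { puts "job #{i}" } }

    # Wait for queued work to finish before exiting
    pool.shutdown
    pool.wait_for_termination
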
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2014 Jacob Burenstam Linder
2 |
3 | MIT License
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining
6 | a copy of this software and associated documentation files (the
7 | "Software"), to deal in the Software without restriction, including
8 | without limitation the rights to use, copy, modify, merge, publish,
9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 |
--------------------------------------------------------------------------------
/lib/wayback_archiver/adapters/wayback_machine.rb:
--------------------------------------------------------------------------------
1 | require 'wayback_archiver/archive_result'
2 | require 'wayback_archiver/request'
3 |
4 | module WaybackArchiver
5 |   # WaybackMachine adapter
6 |   class WaybackMachine
7 |     # Wayback Machine base URL.
8 |     BASE_URL = 'https://web.archive.org/save/'.freeze
9 | 
10 |     # Send URL to Wayback Machine.
11 |     # @return [ArchiveResult] the result for the sent URL.
12 |     # @param [String] url to send.
13 |     # @example Archive example.com, with default options
14 |     #   WaybackMachine.call('http://example.com')
15 |     def self.call(url)
16 |       request_url = "#{BASE_URL}#{url&.strip}"
17 |       response = Request.get(request_url, follow_redirects: false)
18 |       WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
19 |       ArchiveResult.new(
20 |         url,
21 |         code: response.code,
22 |         request_url: response.uri,
23 |         response_error: response.error
24 |       )
25 |     rescue Request::Error => e
26 |       WaybackArchiver.logger.error "Failed to archive #{url}: #{e.class}, #{e.message}"
27 |       ArchiveResult.new(url, error: e)
28 |     end
29 |   end
30 | end
31 |
--------------------------------------------------------------------------------
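A sketch of calling the adapter directly (note: this performs a real HTTP request against web.archive.org, so the result code depends on the live service; the specs stub it with a 301):

    result = WaybackArchiver::WaybackMachine.call('http://example.com')
    result.success?    # => true unless a Request::Error was rescued
    result.request_url # the https://web.archive.org/save/... URI that was requested
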
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Change Log
2 |
3 | ## HEAD
4 |
5 | ## v1.5.0
6 |
7 | - Strip URLs found in Sitemaps
8 | - Inline `robots` dependency, closes [#51](https://github.com/buren/wayback_archiver/issues/51)
9 | - Update Sitemap XML parsing to work better with newer versions of REXML
10 | - Fix issue calling `Spidr` with option hash (i.e. use double splat operator)
11 |
12 | ## v1.4.0
13 |
14 | * Don't respect robots.txt file by default, [PR#41](https://github.com/buren/wayback_archiver/pull/41)
15 | * Add `WaybackArchiver::respect_robots_txt=` configuration option, to control whether to respect robots.txt file or not
16 | * Update `spidr` gem, resolves [issue#25](https://github.com/buren/wayback_archiver/issues/25)
17 | * Set default concurrency to `1` due to harsher rate limiting on Wayback Machine
18 | * Support for crawling multiple hosts, for example www.example.com, example.com and app.example.com [PR#27](https://github.com/buren/wayback_archiver/pull/27)
19 |
20 | ## v1.3.0
21 |
22 | * Archive every page found, not only HTML pages - [#24](https://github.com/buren/wayback_archiver/pull/24) thanks [@chlorophyll-zz](https://github.com/chlorophyll-zz).
23 |
24 | ## v1.2.1
25 |
26 | * Track which URLs have been visited in sitemapper and don't visit them twice
27 | * Protect against duplicate URLs in sitemap indexes
28 |
29 | ## v1.2.0
30 |
31 | Is history...
32 |
--------------------------------------------------------------------------------
/lib/wayback_archiver/http_code.rb:
--------------------------------------------------------------------------------
1 | module WaybackArchiver
2 |   # Convenience class for HTTP response codes
3 |   class HTTPCode
4 |     # Type of code as symbol
5 |     # @return [Symbol] code type
6 |     # @param [String/Integer] code the response code
7 |     # @example
8 |     #   HTTPCode.type('200') # => :success
9 |     def self.type(code)
10 |       code = code.to_s
11 |       return :success if success?(code)
12 |       return :redirect if redirect?(code)
13 |       return :error if error?(code)
14 | 
15 |       :unknown
16 |     end
17 | 
18 |     # Whether the code is a success type
19 |     # @return [Boolean] is success or not
20 |     # @param [String] code the response code
21 |     # @example
22 |     #   HTTPCode.success?('200') # => true
23 |     # @example
24 |     #   HTTPCode.success?(200) # => true
25 |     # @example
26 |     #   HTTPCode.success?(nil) # => false
27 |     def self.success?(code)
28 |       !!code.to_s.match(/2\d\d/)
29 |     end
30 | 
31 |     # Whether the code is a redirect type
32 |     # @return [Boolean] is redirect or not
33 |     # @param [String] code the response code
34 |     # @example
35 |     #   HTTPCode.redirect?('301') # => true
36 |     def self.redirect?(code)
37 |       !!code.to_s.match(/3\d\d/)
38 |     end
39 | 
40 |     # Whether the code is an error type
41 |     # @return [Boolean] is error or not
42 |     # @param [String] code the response code
43 |     # @example
44 |     #   HTTPCode.error?('404') # => true
45 |     def self.error?(code)
46 |       !!code.to_s.match(/4\d\d/) || !!code.to_s.match(/5\d\d/)
47 |     end
48 |   end
49 | end
50 |
--------------------------------------------------------------------------------
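A few illustrative calls, matching the cases exercised in http_code_spec.rb:

    WaybackArchiver::HTTPCode.type('200')    # => :success
    WaybackArchiver::HTTPCode.type(301)      # => :redirect, Integers are accepted too
    WaybackArchiver::HTTPCode.error?('503')  # => true
    WaybackArchiver::HTTPCode.type('999')    # => :unknown
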
/wayback_archiver.gemspec:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | lib = File.expand_path('../lib', __FILE__)
4 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5 | require 'wayback_archiver/version'
6 |
7 | Gem::Specification.new do |spec|
8 | spec.name = 'wayback_archiver'
9 | spec.version = WaybackArchiver::VERSION
10 | spec.authors = ['Jacob Burenstam']
11 | spec.email = ['burenstam@gmail.com']
12 |
13 | spec.summary = 'Post URLs to Wayback Machine (Internet Archive)'
14 | spec.description = 'Post URLs to Wayback Machine (Internet Archive), using a crawler, from Sitemap(s) or a list of URLs.'
15 | spec.homepage = 'https://github.com/buren/wayback_archiver'
16 | spec.license = 'MIT'
17 |
18 | spec.files = Dir.glob('{bin,lib}/**/*')
19 | spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
20 | spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
21 | spec.require_paths = ['lib']
22 |
23 | spec.required_ruby_version = '>= 2.0.0'
24 |
25 | spec.add_runtime_dependency 'spidr', '~> 0.7.1' # Crawl sites
26 | spec.add_runtime_dependency 'concurrent-ruby', '~> 1.3' # Concurrency primitives
27 | spec.add_runtime_dependency 'rexml', '~> 3.3.9'
28 |
29 | spec.add_development_dependency 'bundler', '~> 2.1'
30 | spec.add_development_dependency 'rake', '~> 12.3'
31 | spec.add_development_dependency 'rspec', '~> 3.1'
32 | spec.add_development_dependency 'yard', '~> 0.9'
33 | spec.add_development_dependency 'simplecov', '~> 0.14.1'
34 | spec.add_development_dependency 'coveralls', '~> 0.8'
35 | spec.add_development_dependency 'redcarpet', '~> 3.2'
36 | spec.add_development_dependency 'webmock', '~> 3.0'
37 | spec.add_development_dependency 'byebug', '~> 11.1.3'
38 | end
39 |
--------------------------------------------------------------------------------
/spec/wayback_archiver/http_code_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe WaybackArchiver::HTTPCode do
4 | describe '::type' do
5 | [
6 | # argument, expected
7 | [200, :success],
8 | ['200', :success],
9 | ['301', :redirect],
10 | ['302', :redirect],
11 | ['400', :error],
12 | ['404', :error],
13 | ['500', :error],
14 | ['503', :error],
15 | ['999', :unknown]
16 | ].each do |data|
17 | code, expected = data
18 |
19 | it "returns #{expected} for #{code} code" do
20 | expect(described_class.type(code)).to eq(expected)
21 | end
22 | end
23 | end
24 |
25 | describe '::success?' do
26 | it 'returns true when code is success' do
27 | code = '200'
28 | expect(described_class.success?(code)).to eq(true)
29 | end
30 |
31 | it 'returns false when code is not success' do
32 | code = '300'
33 | expect(described_class.success?(code)).to eq(false)
34 | end
35 | end
36 |
37 | describe '::error?' do
38 | it 'returns true when code is 400 error' do
39 | code = '400'
40 | expect(described_class.error?(code)).to eq(true)
41 | end
42 |
43 | it 'returns true when code is 500 error' do
44 | code = '500'
45 | expect(described_class.error?(code)).to eq(true)
46 | end
47 |
48 | it 'returns false when code is not error' do
49 | code = '200'
50 | expect(described_class.error?(code)).to eq(false)
51 | end
52 | end
53 |
54 | describe '::redirect?' do
55 | it 'returns true when code is redirect' do
56 | code = '300'
57 | expect(described_class.redirect?(code)).to eq(true)
58 | end
59 |
60 | it 'returns false when code is not redirect' do
61 | code = '200'
62 | expect(described_class.redirect?(code)).to eq(false)
63 | end
64 | end
65 | end
66 |
--------------------------------------------------------------------------------
/spec/wayback_archiver/url_collector_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe WaybackArchiver::URLCollector do
4 | describe '::sitemap' do
5 | it 'calls Sitemapper::urls' do
6 | expected = %w[http://example.com]
7 | allow(WaybackArchiver::Sitemapper).to receive(:urls).and_return(expected)
8 | expect(described_class.sitemap('http://example.com')).to eq(expected)
9 | end
10 | end
11 |
12 | describe '::crawl' do
13 | let(:headers) do
14 | {
15 | 'Accept' => '*/*',
16 | 'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3',
17 | 'User-Agent' => WaybackArchiver.user_agent
18 | }
19 | end
20 |
21 | it 'can crawl' do
22 | html_page = <<-HTML
23 | <html>
24 | <head>
25 | <title>Testing</title>
26 | </head>
27 | <body>
28 | <p>
29 | <a href="/found">An URL</a>
30 | </p>
31 | </body>
32 | </html>
33 | HTML
34 |
35 | response_headers = { 'Content-Type' => 'text/html; charset=utf-8' }
36 |
37 | stub_request(:get, 'http://example.com/robots.txt')
38 | .with(headers: headers)
39 | .to_return(status: 200, body: '', headers: {})
40 |
41 | stub_request(:get, 'http://example.com/')
42 | .with(headers: headers)
43 | .to_return(status: 200, body: html_page, headers: response_headers)
44 |
45 | stub_request(:get, 'http://example.com/found')
46 | .with(headers: headers)
47 | .to_return(status: 200, body: '', headers: response_headers)
48 |
49 | expected_urls = %w[http://example.com http://example.com/found]
50 | expected_urls_dup = expected_urls.dup
51 | found_urls = described_class.crawl('http://example.com') do |url|
52 | expect(url).to eq(expected_urls.shift)
53 | end
54 |
55 | expect(found_urls).to eq(expected_urls_dup)
56 | end
57 | end
58 | end
59 |
--------------------------------------------------------------------------------
/spec/wayback_archiver/adapters/wayback_machine_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe WaybackArchiver::WaybackMachine do
4 | let(:headers) do
5 | {
6 | 'Accept' => '*/*',
7 | 'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3',
8 | 'User-Agent' => WaybackArchiver.user_agent
9 | }
10 | end
11 |
12 | describe '::call' do
13 | it 'posts URL to the Wayback Machine' do
14 | url = 'https://example.com'
15 | expected_request_url = "https://web.archive.org/save/#{url}"
16 |
17 | stub_request(:get, expected_request_url)
18 | .with(headers: headers)
19 | .to_return(status: 301, body: 'buren', headers: {})
20 |
21 | result = described_class.call(url)
22 |
23 | expect(result.uri).to eq(url)
24 | expect(result.code).to eq('301')
25 | expect(WaybackArchiver.logger.debug_log.first).to include(expected_request_url)
26 | expect(WaybackArchiver.logger.info_log.last).to include(url)
27 | end
28 |
29 | it 'rescues and logs Request::MaxRedirectError' do
30 | allow(WaybackArchiver::Request).to receive(:get)
31 | .and_raise(WaybackArchiver::Request::MaxRedirectError, 'too many redirects')
32 |
33 | url = 'https://example.com'
34 | expected_request_url = "https://web.archive.org/save/#{url}"
35 |
36 | stub_request(:get, expected_request_url)
37 | .with(headers: headers)
38 | .to_return(status: 301, body: 'buren', headers: {})
39 |
40 | result = described_class.call(url)
41 |
42 | expect(result.uri).to eq(url)
43 | expect(result.response_error).to be_nil
44 | expect(result.request_url).to be_nil
45 | expect(result.error).to be_a(WaybackArchiver::Request::MaxRedirectError)
46 |
47 | last_error_log = WaybackArchiver.logger.error_log.last
48 | expect(last_error_log).to include(url)
49 | expect(last_error_log).to include('MaxRedirectError')
50 | expect(last_error_log).to include('too many redirects')
51 | end
52 | end
53 | end
54 |
--------------------------------------------------------------------------------
/lib/wayback_archiver/url_collector.rb:
--------------------------------------------------------------------------------
1 | require 'spidr'
2 | require 'robots'
3 |
4 | require 'wayback_archiver/sitemapper'
5 | require 'wayback_archiver/request'
6 |
7 | module WaybackArchiver
8 |   # Retrieve URLs from different sources
9 |   class URLCollector
10 |     # Retrieve URLs from Sitemap.
11 |     # @return [Array] of URLs defined in Sitemap.
12 |     # @param [String] url domain to retrieve Sitemap from.
13 |     # @example Get URLs defined in Sitemap for google.com
14 |     #   URLCollector.sitemap('https://google.com/sitemap.xml')
15 |     def self.sitemap(url)
16 |       Sitemapper.urls(url: Request.build_uri(url))
17 |     end
18 | 
19 |     # Retrieve URLs by crawling.
20 |     # @return [Array] of URLs found during crawl.
21 |     # @param [String] url domain to crawl URLs from.
22 |     # @param [Array] hosts to crawl.
23 |     # @example Crawl URLs defined on example.com
24 |     #   URLCollector.crawl('http://example.com')
25 |     # @example Crawl URLs defined on example.com and limit the number of visited pages to 100
26 |     #   URLCollector.crawl('http://example.com', limit: 100)
27 |     # @example Crawl URLs defined on example.com and explicitly set no upper limit on the number of visited pages
28 |     #   URLCollector.crawl('http://example.com', limit: -1)
29 |     # @example Crawl multiple hosts
30 |     #   URLCollector.crawl(
31 |     #     'http://example.com',
32 |     #     hosts: [
33 |     #       'example.com',
34 |     #       /host[\d]+\.example\.com/
35 |     #     ]
36 |     #   )
37 |     def self.crawl(url, hosts: [], limit: WaybackArchiver.max_limit)
38 |       urls = []
39 |       start_at_url = Request.build_uri(url).to_s
40 |       options = {
41 |         robots: WaybackArchiver.respect_robots_txt,
42 |         hosts: hosts,
43 |         user_agent: WaybackArchiver.user_agent
44 |       }
45 |       options[:limit] = limit unless limit == -1
46 | 
47 |       Spidr.site(start_at_url, **options) do |spider|
48 |         spider.every_page do |page|
49 |           page_url = page.url.to_s
50 |           urls << page_url
51 |           WaybackArchiver.logger.debug "Found: #{page_url}"
52 |           yield(page_url) if block_given?
53 |         end
54 |       end
55 |       urls
56 |     end
57 |   end
58 | end
59 |
--------------------------------------------------------------------------------
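A crawl sketch (requires network access; spidr fetches the pages):

    require 'wayback_archiver'

    # Print each URL as it is found, visiting at most 10 pages
    urls = WaybackArchiver::URLCollector.crawl('http://example.com', limit: 10) do |url|
      puts "Found: #{url}"
    end
    urls.length # => number of pages visited (at most 10)
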
/bin/wayback_archiver:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | require 'optparse'
4 | require 'wayback_archiver'
5 |
6 | # Default values
7 | urls = nil
8 | strategy = 'auto'
9 | log = STDOUT
10 | log_level = Logger::INFO
11 | concurrency = WaybackArchiver.concurrency
12 | limit = WaybackArchiver.max_limit
13 | hosts = []
14 |
15 | optparse = OptionParser.new do |parser|
16 |   parser.banner = 'Usage: wayback_archiver <url> [options]'
17 | 
18 |   parser.on('--auto', 'Auto (default)') do |value|
19 |     strategy = 'auto'
20 |   end
21 | 
22 |   parser.on('--crawl', 'Crawl') do |value|
23 |     strategy = 'crawl'
24 |   end
25 | 
26 |   parser.on('--sitemap', 'Sitemap') do |value|
27 |     strategy = 'sitemap'
28 |   end
29 | 
30 |   parser.on('--urls', '--url', 'URL(s)') do |value|
31 |     strategy = 'urls'
32 |   end
33 | 
34 |   parser.on('--hosts=[example.com]', Array, 'Only spider links on certain hosts') do |value|
35 |     hosts = value.map { |v| Regexp.new(v) } if value
36 |   end
37 | 
38 |   parser.on('--concurrency=1', Integer, 'Concurrency') do |value|
39 |     concurrency = value
40 |   end
41 | 
42 |   parser.on('--limit=5', Integer, 'Max number of URLs to archive') do |value|
43 |     limit = value
44 |   end
45 | 
46 |   parser.on('--log=output.log', String, 'Path to desired log file (if no argument is given it defaults to STDOUT)') do |path|
47 |     log = path
48 |   end
49 | 
50 |   parser.on('--[no-]verbose', 'Verbose logs') do |value|
51 |     log_level = value ? Logger::DEBUG : Logger::WARN
52 |   end
53 | 
54 |   parser.on('-h', '--help', 'How to use') do
55 |     puts parser
56 |     exit
57 |   end
58 | 
59 |   # No argument, shows at tail. This will print an options summary.
60 |   parser.on_tail('-h', '--help', 'Show this message') do
61 |     puts parser
62 |     exit
63 |   end
64 | 
65 |   parser.on_tail('--version', 'Show version') do
66 |     puts "WaybackArchiver version #{WaybackArchiver::VERSION}"
67 |     exit
68 |   end
69 | end
70 | 
71 | optparse.parse!
72 | 
73 | urls = ARGV.map(&:strip).reject(&:empty?)
74 | if urls.empty?
75 |   puts optparse.help
76 |   raise ArgumentError, "<url> is required"
77 | end
78 | 
79 | WaybackArchiver.logger = Logger.new(log).tap do |logger|
80 |   logger.progname = 'WaybackArchiver'
81 |   logger.level = log_level
82 | end
83 | 
84 | # If no strategy has explicitly been given, then default to 'auto'
85 | strategy ||= 'auto'
86 | urls.each do |url|
87 |   WaybackArchiver.archive(
88 |     url,
89 |     hosts: hosts,
90 |     strategy: strategy,
91 |     concurrency: concurrency,
92 |     limit: limit
93 |   )
94 | end
95 |
--------------------------------------------------------------------------------
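Example invocations implied by the option parser above (the strategy flags map to WaybackArchiver.archive strategies):

    $ wayback_archiver https://example.com --crawl --concurrency=1 --limit=100
    $ wayback_archiver https://example.com/sitemap.xml --sitemap --log=output.log --verbose
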
/lib/wayback_archiver/sitemap.rb:
--------------------------------------------------------------------------------
1 | require 'uri'
2 | require 'rexml/document'
3 |
4 | module WaybackArchiver
5 |   # Parse Sitemaps, https://www.sitemaps.org
6 |   class Sitemap
7 |     attr_reader :document
8 | 
9 |     def initialize(xml_or_string, strict: false)
10 |       @contents = xml_or_string
11 |       @document = REXML::Document.new(xml_or_string)
12 |     rescue REXML::ParseException => _e
13 |       raise if strict
14 | 
15 |       @document = REXML::Document.new('')
16 |     end
17 | 
18 |     # Return all URLs defined in Sitemap.
19 |     # @return [Array] of URLs defined in Sitemap.
20 |     # @example Get URLs defined in Sitemap
21 |     #   sitemap = Sitemap.new(xml)
22 |     #   sitemap.urls
23 |     def urls
24 |       @urls ||= extract_urls('url')
25 |     end
26 | 
27 |     # Return all sitemap URLs defined in Sitemap.
28 |     # @return [Array] of Sitemap URLs defined in Sitemap.
29 |     # @example Get Sitemap URLs defined in Sitemap
30 |     #   sitemap = Sitemap.new(xml)
31 |     #   sitemap.sitemaps
32 |     def sitemaps
33 |       @sitemaps ||= extract_urls('sitemap')
34 |     end
35 | 
36 |     # Check if sitemap is a plain file
37 |     # @return [Boolean] whether document is plain
38 |     def plain_document?
39 |       document.elements.empty?
40 |     end
41 | 
42 |     # Return the name of the document root (if there is one)
43 |     # @return [String] the document root name
44 |     def root_name
45 |       return unless document.root
46 | 
47 |       document.root.name
48 |     end
49 | 
50 |     # Returns true if Sitemap is a Sitemap index
51 |     # @return [Boolean] whether the Sitemap is a Sitemap index or not
52 |     # @example Check if Sitemap is a sitemap index
53 |     #   sitemap = Sitemap.new(xml)
54 |     #   sitemap.sitemap_index?
55 |     def sitemap_index?
56 |       root_name == 'sitemapindex'
57 |     end
58 | 
59 |     # Returns true if Sitemap lists regular URLs
60 |     # @return [Boolean] whether the Sitemap is a regular URL list
61 |     # @example Check if Sitemap is a regular URL list
62 |     #   sitemap = Sitemap.new(xml)
63 |     #   sitemap.urlset?
64 |     def urlset?
65 |       root_name == 'urlset'
66 |     end
67 | 
68 |     private
69 | 
70 |     def valid_url?(url)
71 |       uri = URI.parse(url)
72 |       uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
73 |     rescue URI::InvalidURIError
74 |       false
75 |     end
76 | 
77 |     # Extract URLs from Sitemap
78 |     def extract_urls(node_name)
79 |       if plain_document?
80 |         return @contents.to_s
81 |                         .each_line.map(&:strip)
82 |                         .select(&method(:valid_url?))
83 |       end
84 | 
85 |       urls = []
86 |       document.root.elements.each("#{node_name}/loc") do |element|
87 |         urls << element.text
88 |       end
89 |       urls
90 |     end
91 |   end
92 | end
93 |
--------------------------------------------------------------------------------
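A parsing sketch using the fixtures from spec/data (paths assume the repository root as working directory):

    xml = File.read('spec/data/sitemap.xml')
    sitemap = WaybackArchiver::Sitemap.new(xml)
    sitemap.urlset? # => true
    sitemap.urls    # => ['http://www.example.com/']

    # Plain text sitemaps, one URL per line, are handled via plain_document?
    plain = WaybackArchiver::Sitemap.new("http://example.com/\nhttp://example.com/about")
    plain.urls # => ['http://example.com/', 'http://example.com/about']
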
/spec/wayback_archiver/archive_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe WaybackArchiver::Archive do
4 | let(:headers) do
5 | {
6 | 'Accept' => '*/*',
7 | 'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3',
8 | 'User-Agent' => WaybackArchiver.user_agent
9 | }
10 | end
11 |
12 | describe '::post' do
13 | it 'calls ::post_url for each URL' do
14 | allow(described_class).to receive(:post_url).and_return(WaybackArchiver::ArchiveResult.new(nil))
15 |
16 | result = described_class.post(%w[https://example.com https://example.com/path])
17 |
18 | expect(described_class).to have_received(:post_url).twice
19 | end
20 |
21 | it 'calls ::post_url for each URL with support for a max limit' do
22 | allow(described_class).to receive(:post_url).and_return(WaybackArchiver::ArchiveResult.new(nil))
23 |
24 | result = described_class.post(%w[https://example.com https://example.com/path], limit: 1)
25 |
26 | expect(described_class).to have_received(:post_url).once
27 | end
28 | end
29 |
30 | describe '::crawl' do
31 | it 'calls URLCollector::crawl and ::post_url' do
32 | url = 'https://example.com'
33 |
34 | allow(WaybackArchiver::URLCollector).to receive(:crawl)
35 | .and_yield(url)
36 | .and_return([url])
37 |
38 | allow(described_class).to receive(:post_url).and_return(WaybackArchiver::ArchiveResult.new(url))
39 |
40 | expect(described_class.crawl(url)[0].uri).to eq(url)
41 | end
42 | end
43 |
44 | describe '::post_url' do
45 | it 'posts URL to the Wayback Machine' do
46 | url = 'https://example.com'
47 | expected_request_url = "https://web.archive.org/save/#{url}"
48 |
49 | stub_request(:get, expected_request_url)
50 | .with(headers: headers)
51 | .to_return(status: 301, body: 'buren', headers: {})
52 |
53 | result = described_class.post_url(url)
54 |
55 | expect(result.uri).to eq(url)
56 | expect(result.code).to eq('301')
57 | expect(WaybackArchiver.logger.debug_log.first).to include(expected_request_url)
58 | expect(WaybackArchiver.logger.info_log.last).to include(url)
59 | end
60 |
61 | it 'rescues and logs Request::MaxRedirectError' do
62 | allow(WaybackArchiver::Request).to receive(:get)
63 | .and_raise(WaybackArchiver::Request::MaxRedirectError, 'too many redirects')
64 |
65 | url = 'https://example.com'
66 | expected_request_url = "https://web.archive.org/save/#{url}"
67 |
68 | stub_request(:get, expected_request_url)
69 | .with(headers: headers)
70 | .to_return(status: 301, body: 'buren', headers: {})
71 |
72 | result = described_class.post_url(url)
73 |
74 | expect(result.uri).to eq(url)
75 | expect(result.response_error).to be_nil
76 | expect(result.request_url).to be_nil
77 | expect(result.error).to be_a(WaybackArchiver::Request::MaxRedirectError)
78 |
79 | last_error_log = WaybackArchiver.logger.error_log.last
80 | expect(last_error_log).to include(url)
81 | expect(last_error_log).to include('MaxRedirectError')
82 | expect(last_error_log).to include('too many redirects')
83 | end
84 | end
85 | end
86 |
--------------------------------------------------------------------------------
/lib/wayback_archiver/sitemapper.rb:
--------------------------------------------------------------------------------
1 | require 'set'
2 | require 'robots'
3 |
4 | require 'wayback_archiver/sitemap'
5 | require 'wayback_archiver/request'
6 |
7 | module WaybackArchiver
8 |   # Fetch and parse sitemaps recursively
9 |   class Sitemapper
10 |     # Common locations for Sitemap(s)
11 |     COMMON_SITEMAP_LOCATIONS = %w[
12 |       sitemap_index.xml.gz
13 |       sitemap-index.xml.gz
14 |       sitemap_index.xml
15 |       sitemap-index.xml
16 |       sitemap.xml.gz
17 |       sitemap.xml
18 |     ].freeze
19 | 
20 |     # Autodiscover the location of the Sitemap, then fetch and parse recursively.
21 |     # First it tries /robots.txt, then common locations for Sitemap and finally the supplied URL.
22 |     # @return [Array] of URLs defined in Sitemap(s).
23 |     # @param [URI] url to domain.
24 |     # @example Get URLs defined in Sitemap for google.com
25 |     #   Sitemapper.autodiscover('https://google.com/')
26 |     # @see http://www.sitemaps.org
27 |     def self.autodiscover(url)
28 |       WaybackArchiver.logger.info 'Looking for Sitemap(s) in /robots.txt'
29 |       robots = Robots.new(WaybackArchiver.user_agent)
30 |       sitemaps = robots.other_values(url)['Sitemap']
31 | 
32 |       if sitemaps
33 |         return sitemaps.flat_map do |sitemap|
34 |           WaybackArchiver.logger.info "Fetching Sitemap at #{sitemap}"
35 |           urls(url: sitemap)
36 |         end
37 |       end
38 | 
39 |       COMMON_SITEMAP_LOCATIONS.each do |path|
40 |         WaybackArchiver.logger.info "Looking for Sitemap at #{path}"
41 |         sitemap_url = [url, path].join(url.end_with?('/') ? '' : '/')
42 |         response = Request.get(sitemap_url, raise_on_http_error: false)
43 | 
44 |         if response.success?
45 |           WaybackArchiver.logger.info "Sitemap found at #{sitemap_url}"
46 |           return urls(xml: response.body)
47 |         end
48 |       end
49 | 
50 |       WaybackArchiver.logger.info "Looking for Sitemap at #{url}"
51 |       urls(url: url)
52 |     rescue Request::Error => e
53 |       WaybackArchiver.logger.error "Error raised when requesting #{url}, #{e.class}, #{e.message}"
54 |       []
55 |     end
56 | 
57 |     # Fetch and parse sitemaps recursively.
58 |     # @return [Array] of URLs defined in Sitemap(s).
59 |     # @param url [String] URL to Sitemap.
60 |     # @param xml [String] Sitemap XML.
61 |     # @example Get URLs defined in Sitemap for google.com
62 |     #   Sitemapper.urls(url: 'https://google.com/sitemap.xml')
63 |     # @example Get URLs defined in Sitemap
64 |     #   Sitemapper.urls(xml: xml)
65 |     # @see http://www.sitemaps.org
66 |     def self.urls(url: nil, xml: nil, visited: Set.new)
67 |       if visited.include?(url)
68 |         WaybackArchiver.logger.debug "Already visited #{url} skipping.."
69 |         return []
70 |       end
71 | 
72 |       visited << url if url
73 | 
74 |       xml = Request.get(url).body unless xml
75 |       sitemap = Sitemap.new(xml)
76 | 
77 |       if sitemap.sitemap_index?
78 |         sitemap.sitemaps.flat_map do |sitemap_url|
79 |           urls(url: sitemap_url, visited: visited)
80 |         end
81 |       else
82 |         sitemap.urls.map { |url| url&.strip }
83 |       end
84 |     rescue Request::Error => e
85 |       WaybackArchiver.logger.error "Error raised when requesting #{url}, #{e.class}, #{e.message}"
86 | 
87 |       []
88 |     end
89 |   end
90 | end
91 |
--------------------------------------------------------------------------------
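A fetch-and-flatten sketch (performs real HTTP requests; sitemap indexes are followed recursively and duplicates are skipped via the visited set):

    # Starting from a known Sitemap URL
    urls = WaybackArchiver::Sitemapper.urls(url: 'http://www.example.com/sitemap.xml')

    # Or let autodiscover try robots.txt and common locations first
    urls = WaybackArchiver::Sitemapper.autodiscover('http://www.example.com')
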
/spec/wayback_archiver/sitemap_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe WaybackArchiver::Sitemap do
4 | describe '#new' do
5 | it 'raises error REXML::ParseException when strict mode is true' do
6 | expect do
7 | described_class.new('<sitemapindex><sitemap>', strict: true)
8 | end.to raise_error(REXML::ParseException)
9 | end
10 | 
11 | it 'swallows XML errors when strict mode is false' do
12 | sitemap = described_class.new('<sitemapindex><sitemap>')
13 | expect(sitemap.urls).to be_empty
14 | end
15 | end
16 |
17 | describe '#urls' do
18 | it 'returns URLs in XML sitemap' do
19 | sitemap = described_class.new(File.read('spec/data/sitemap.xml'))
20 | expect(sitemap.urls).to eq(%w[http://www.example.com/])
21 | end
22 |
23 | it 'returns URLs in plain text sitemap' do
24 | file = "http://www.example.com/\nhttp://www.example.com/path"
25 | sitemap = described_class.new(file)
26 | expected = %w[
27 | http://www.example.com/
28 | http://www.example.com/path
29 | ]
30 | expect(sitemap.urls).to eq(expected)
31 | end
32 |
33 | it 'returns empty array when passed empty document' do
34 | sitemap = described_class.new('')
35 | expect(sitemap.urls).to be_empty
36 | end
37 | end
38 |
39 | describe '#sitemaps' do
40 | it 'returns sitemap URLs in sitemap' do
41 | sitemap = described_class.new(File.read('spec/data/sitemap_index.xml'))
42 | expected = %w[
43 | http://www.example.com/sitemap1.xml.gz
44 | http://www.example.com/sitemap2.xml.gz
45 | ]
46 | expect(sitemap.sitemaps).to eq(expected)
47 | end
48 |
49 | it 'returns empty array when passed empty document' do
50 | sitemap = described_class.new('')
51 | expect(sitemap.sitemaps).to be_empty
52 | end
53 | end
54 |
55 | describe '#plain_document?' do
56 | it 'returns true when passed non-XML document' do
57 | sitemap = described_class.new('')
58 | expect(sitemap.plain_document?).to eq(true)
59 | end
60 |
61 | it 'returns false when passed XML document' do
62 | sitemap = described_class.new('<urlset></urlset>')
63 | expect(sitemap.plain_document?).to eq(false)
64 | end
65 | end
66 |
67 | describe '#root_name' do
68 | it 'returns nil when passed non-XML document' do
69 | sitemap = described_class.new('')
70 | expect(sitemap.root_name).to be_nil
71 | end
72 |
73 | it 'returns root name when passed XML document' do
74 | sitemap = described_class.new('<buren></buren>')
75 | expect(sitemap.root_name).to eq('buren')
76 | end
77 | end
78 |
79 | describe '#sitemap_index?' do
80 | it 'returns true if document is a sitemap index' do
81 | sitemap = described_class.new(File.read('spec/data/sitemap_index.xml'))
82 | expect(sitemap.sitemap_index?).to eq(true)
83 | end
84 |
85 | it 'returns false if document sitemap' do
86 | sitemap = described_class.new(File.read('spec/data/sitemap.xml'))
87 | expect(sitemap.sitemap_index?).to eq(false)
88 | end
89 | end
90 |
91 | describe '#urlset?' do
92 | it 'returns true if document is a sitemap' do
93 | sitemap = described_class.new(File.read('spec/data/sitemap.xml'))
94 | expect(sitemap.urlset?).to eq(true)
95 | end
96 |
97 | it 'returns false if document is a sitemap index' do
98 | sitemap = described_class.new(File.read('spec/data/sitemap_index.xml'))
99 | expect(sitemap.urlset?).to eq(false)
100 | end
101 | end
102 | end
103 |
--------------------------------------------------------------------------------
/lib/wayback_archiver/archive.rb:
--------------------------------------------------------------------------------
1 | require 'concurrent'
2 |
3 | require 'wayback_archiver/thread_pool'
4 | require 'wayback_archiver/adapters/wayback_machine'
5 |
6 | module WaybackArchiver
7 |   # Post URL(s) to Wayback Machine
8 |   class Archive
9 |     # Send URLs to Wayback Machine.
10 |     # @return [Array] with sent URLs.
11 |     # @param [Array] urls to send to the Wayback Machine.
12 |     # @param concurrency [Integer] the default is 1
13 |     # @yield [archive_result] If a block is given, each result will be yielded
14 |     # @yieldparam [ArchiveResult] archive_result
15 |     # @example Archive urls, asynchronously
16 |     #   Archive.post(['http://example.com'])
17 |     #   Archive.post(['http://example.com']) do |result|
18 |     #     puts [result.code || 'error', result.archived_url] # print response status and URL
19 |     #   end
20 |     # @example Archive urls, using only 1 thread
21 |     #   Archive.post(['http://example.com'], concurrency: 1)
22 |     # @example Stop after archiving 100 links
23 |     #   Archive.post(['http://example.com'], limit: 100)
24 |     # @example Explicitly set no limit on how many links are posted
25 |     #   Archive.post(['http://example.com'], limit: -1)
26 |     def self.post(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
27 |       WaybackArchiver.logger.info "Total URLs to be sent: #{urls.length}"
28 |       WaybackArchiver.logger.info "Requests are sent with up to #{concurrency} parallel threads"
29 | 
30 |       urls_queue = if limit == -1
31 |                      urls
32 |                    else
33 |                      urls[0...limit]
34 |                    end
35 | 
36 |       posted_urls = Concurrent::Array.new
37 |       pool = ThreadPool.build(concurrency)
38 | 
39 |       urls_queue.each do |url|
40 |         pool.post do
41 |           result = post_url(url)
42 |           yield(result) if block_given?
43 |           posted_urls << result unless result.errored?
44 |         end
45 |       end
46 | 
47 |       pool.shutdown
48 |       pool.wait_for_termination
49 | 
50 |       WaybackArchiver.logger.info "#{posted_urls.length} URL(s) posted to Wayback Machine"
51 |       posted_urls
52 |     end
53 | 
54 |     # Send URLs to Wayback Machine by crawling the site.
55 |     # @return [Array] with URLs sent to the Wayback Machine.
56 |     # @param [String] source for URL to crawl.
57 |     # @param concurrency [Integer] the default is 1
58 |     # @param [Array] hosts to crawl
59 |     # @yield [archive_result] If a block is given, each result will be yielded
60 |     # @yieldparam [ArchiveResult] archive_result
61 |     # @example Crawl example.com and send all URLs of the same domain
62 |     #   Archive.crawl('example.com')
63 |     #   Archive.crawl('example.com') do |result|
64 |     #     puts [result.code || 'error', result.archived_url] # print response status and URL
65 |     #   end
66 |     # @example Crawl example.com and send all URLs of the same domain with low concurrency
67 |     #   Archive.crawl('example.com', concurrency: 1)
68 |     # @example Stop after archiving 100 links
69 |     #   Archive.crawl('example.com', limit: 100)
70 |     # @example Crawl multiple hosts
71 |     #   Archive.crawl(
72 |     #     'http://example.com',
73 |     #     hosts: [
74 |     #       'example.com',
75 |     #       /host[\d]+\.example\.com/
76 |     #     ]
77 |     #   )
78 |     def self.crawl(source, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
79 |       WaybackArchiver.logger.info "Requests are sent with up to #{concurrency} parallel threads"
80 | 
81 |       posted_urls = Concurrent::Array.new
82 |       pool = ThreadPool.build(concurrency)
83 | 
84 |       found_urls = URLCollector.crawl(source, hosts: hosts, limit: limit) do |url|
85 |         pool.post do
86 |           result = post_url(url)
87 |           yield(result) if block_given?
88 |           posted_urls << result unless result.errored?
89 |         end
90 |       end
91 |       WaybackArchiver.logger.info "Crawling of #{source} finished, found #{found_urls.length} URL(s)"
92 |       pool.shutdown
93 |       pool.wait_for_termination
94 | 
95 |       WaybackArchiver.logger.info "#{posted_urls.length} URL(s) posted to Wayback Machine"
96 |       posted_urls
97 |     end
98 | 
99 |     # Send URL to Wayback Machine.
100 |     # @return [ArchiveResult] the result for the sent URL.
101 |     # @param [String] url to send.
102 |     # @example Archive example.com, with default options
103 |     #   Archive.post_url('http://example.com')
104 |     def self.post_url(url)
105 |       WaybackArchiver.adapter.call(url)
106 |     end
107 |   end
108 | end
109 |
--------------------------------------------------------------------------------
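A posting sketch (performs real requests to web.archive.org unless stubbed; the block form mirrors the @example docs above):

    results = WaybackArchiver::Archive.post(%w[http://example.com], limit: 1) do |result|
      puts [result.code || 'error', result.archived_url].join(' ')
    end
    results.length # => number of URLs posted without error
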
/lib/robots.rb:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2008 Kyle Maxwell, contributors
3 | #
4 | # Permission is hereby granted, free of charge, to any person
5 | # obtaining a copy of this software and associated documentation
6 | # files (the "Software"), to deal in the Software without
7 | # restriction, including without limitation the rights to use,
8 | # copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the
10 | # Software is furnished to do so, subject to the following
11 | # conditions:
12 | #
13 | # The above copyright notice and this permission notice shall be
14 | # included in all copies or substantial portions of the Software.
15 | #
16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
18 | # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
20 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
21 | # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23 | # OTHER DEALINGS IN THE SOFTWARE.
24 | #
25 |
26 | require "open-uri"
27 | require "uri"
28 | require "rubygems"
29 | require "timeout"
30 |
31 | class Robots
32 | 
33 |   DEFAULT_TIMEOUT = 3
34 | 
35 |   class ParsedRobots
36 | 
37 |     def initialize(uri, user_agent)
38 |       @last_accessed = Time.at(1)
39 | 
40 |       io = Robots.get_robots_txt(uri, user_agent)
41 | 
42 |       if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
43 |         io = StringIO.new("User-agent: *\nAllow: /\n")
44 |       end
45 | 
46 |       @other = {}
47 |       @disallows = {}
48 |       @allows = {}
49 |       @delays = {} # added delays to make it work
50 |       agent = /.*/
51 |       io.each do |line|
52 |         next if line =~ /^\s*(#.*|$)/
53 |         arr = line.split(":")
54 |         key = arr.shift
55 |         value = arr.join(":").strip
56 |         value.strip!
57 |         case key
58 |         when "User-agent"
59 |           agent = to_regex(value)
60 |         when "Allow"
61 |           @allows[agent] ||= []
62 |           @allows[agent] << to_regex(value)
63 |         when "Disallow"
64 |           @disallows[agent] ||= []
65 |           @disallows[agent] << to_regex(value)
66 |         when "Crawl-delay"
67 |           @delays[agent] = value.to_i
68 |         else
69 |           @other[key] ||= []
70 |           @other[key] << value
71 |         end
72 |       end
73 | 
74 |       @parsed = true
75 |     end
76 | 
77 |     def allowed?(uri, user_agent)
78 |       return true unless @parsed
79 |       allowed = true
80 |       path = uri.request_uri
81 | 
82 |       @disallows.each do |key, value|
83 |         if user_agent =~ key
84 |           value.each do |rule|
85 |             if path =~ rule
86 |               allowed = false
87 |             end
88 |           end
89 |         end
90 |       end
91 | 
92 |       @allows.each do |key, value|
93 |         unless allowed
94 |           if user_agent =~ key
95 |             value.each do |rule|
96 |               if path =~ rule
97 |                 allowed = true
98 |               end
99 |             end
100 |           end
101 |         end
102 |       end
103 | 
104 |       if allowed && @delays[user_agent]
105 |         sleep [@delays[user_agent] - (Time.now - @last_accessed), 0].max # never sleep a negative duration
106 |         @last_accessed = Time.now
107 |       end
108 | 
109 |       return allowed
110 |     end
111 | 
112 |     def other_values
113 |       @other
114 |     end
115 | 
116 |     protected
117 | 
118 |     def to_regex(pattern)
119 |       return /should-not-match-anything-123456789/ if pattern.strip.empty?
120 |       pattern = Regexp.escape(pattern)
121 |       pattern.gsub!(Regexp.escape("*"), ".*")
122 |       Regexp.compile("^#{pattern}")
123 |     end
124 |   end
125 | 
126 |   def self.get_robots_txt(uri, user_agent)
127 |     begin
128 |       Timeout::timeout(Robots.timeout) do
129 |         io = URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
130 |       end
131 |     rescue Timeout::Error
132 |       STDERR.puts "robots.txt request timed out"
133 |     end
134 |   end
135 | 
136 |   def self.timeout=(t)
137 |     @timeout = t
138 |   end
139 | 
140 |   def self.timeout
141 |     @timeout || DEFAULT_TIMEOUT
142 |   end
143 | 
144 |   def initialize(user_agent)
145 |     @user_agent = user_agent
146 |     @parsed = {}
147 |   end
148 | 
149 |   def allowed?(uri)
150 |     uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
151 |     host = uri.host
152 |     @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
153 |     @parsed[host].allowed?(uri, @user_agent)
154 |   end
155 | 
156 |   def other_values(uri)
157 |     uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
158 |     host = uri.host
159 |     @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
160 |     @parsed[host].other_values
161 |   end
162 | end
163 |
--------------------------------------------------------------------------------
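A sketch of the inlined Robots class as Sitemapper uses it (fetches /robots.txt over the network on first call per host):

    robots = Robots.new(WaybackArchiver.user_agent)
    robots.allowed?('http://www.example.com/some/path')       # honours Disallow/Allow rules
    robots.other_values('http://www.example.com')['Sitemap']  # => sitemap URLs, if declared
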
/spec/wayback_archiver/sitemapper_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe WaybackArchiver::Sitemapper do
4 | let(:headers) do
5 | {
6 | 'Accept' => '*/*',
7 | 'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3',
8 | 'User-Agent' => WaybackArchiver.user_agent
9 | }
10 | end
11 |
12 | let(:robots_txt) { File.read('spec/data/robots.txt') }
13 | let(:sitemap_index_xml) { File.read('spec/data/sitemap_index.xml') }
14 | let(:sitemap_index_with_duplicate_url_xml) { File.read('spec/data/sitemap_index_with_duplicate_url.xml') }
15 | let(:sitemap_xml) { File.read('spec/data/sitemap.xml') }
16 |
17 | describe '::autodiscover' do
18 | context 'with found Sitemap location in robots.txt' do
19 | it 'fetches those Sitemap(s) and returns all present URLs' do
20 | # The robots gem doesn't play nice with the WebMock so we can't test this until
21 | # https://github.com/fizx/robots/pull/9 is merged.
22 | # Until then we're gonna use rspec-mocks
23 | # stub_request(:get, 'http://www.example.com/robots.txt').
24 | # with(headers: headers).
25 | # to_return(status: 200, body: robots_txt, headers: {})
26 | allow_any_instance_of(Robots).to receive(:other_values).and_return('Sitemap' => %w[http://www.example.com/sitemap.xml])
27 |
28 | stub_request(:get, 'http://www.example.com/sitemap.xml')
29 | .with(headers: headers)
30 | .to_return(status: 200, body: sitemap_xml, headers: {})
31 |
32 | expect(described_class.autodiscover('http://www.example.com')).to eq(%w[http://www.example.com/])
33 | end
34 |
35 | it 'returns empty list on request error' do
36 | allow_any_instance_of(Robots).to receive(:other_values).and_raise(WaybackArchiver::Request::Error)
37 |
38 | expect(described_class.autodiscover('http://www.example.com')).to be_empty
39 | end
40 | end
41 |
42 | context 'with found Sitemap location among common Sitemap locations' do
43 | it 'returns all present URLs if a Sitemap is found' do
44 | base_url = 'http://www.example.com'
45 | stub_request(:get, "#{base_url}/robots.txt")
46 | .with(headers: headers)
47 | .to_return(status: 200, body: robots_txt, headers: {})
48 |
49 | sitemap_path = WaybackArchiver::Sitemapper::COMMON_SITEMAP_LOCATIONS.first
50 |
51 | stub_request(:get, [base_url, sitemap_path].join('/'))
52 | .with(headers: headers)
53 | .to_return(status: 200, body: sitemap_xml, headers: {})
54 |
55 | expect(described_class.autodiscover('http://www.example.com')).to eq(%w[http://www.example.com/])
56 | end
57 | end
58 |
59 | context 'at the provided URL' do
60 | it 'returns all present URLs if a Sitemap is found' do
61 | base_url = 'http://www.example.com'
62 | stub_request(:get, "#{base_url}/robots.txt")
63 | .with(headers: headers)
64 | .to_return(status: 200, body: robots_txt, headers: {})
65 |
66 | WaybackArchiver::Sitemapper::COMMON_SITEMAP_LOCATIONS.each do |sitemap_path|
67 | stub_request(:get, [base_url, sitemap_path].join('/'))
68 | .with(headers: headers)
69 | .to_return(status: 404, body: '', headers: {})
70 | end
71 |
72 | stub_request(:get, base_url)
73 | .with(headers: headers)
74 | .to_return(status: 200, body: sitemap_xml, headers: {})
75 |
76 | expect(described_class.autodiscover(base_url)).to eq(%w[http://www.example.com/])
77 | end
78 | end
79 | end
80 |
81 | describe '::urls' do
82 | it 'can start with xml argument' do
83 | expect(described_class.urls(xml: sitemap_xml)).to eq(%w[http://www.example.com/])
84 | end
85 |
86 | it 'returns empty array if url already has been visited' do
87 | start_url = 'http://www.example.com/sitemap_index.xml'
88 |
89 | stub_request(:get, start_url)
90 | .with(headers: headers)
91 | .to_return(status: 200, body: sitemap_index_with_duplicate_url_xml, headers: {})
92 |
93 | %w[http://www.example.com/sitemap1.xml.gz].each do |url|
94 | stub_request(:get, url)
95 | .with(headers: headers)
96 | .to_return(status: 200, body: sitemap_xml, headers: {})
97 | end
98 |
99 | result = described_class.urls(url: start_url)
100 | expect(WaybackArchiver.logger.debug_log).to include("Already visited http://www.example.com/sitemap1.xml.gz skipping..")
101 | expect(result).to eq(%w[http://www.example.com/])
102 | end
103 |
104 | context 'with url argument and returned sitemap index' do
105 | it 'follows the index and returns all URLs in the sitemap(s)' do
106 | start_url = 'http://www.example.com/sitemap_index.xml'
107 |
108 | stub_request(:get, start_url)
109 | .with(headers: headers)
110 | .to_return(status: 200, body: sitemap_index_xml, headers: {})
111 |
112 | %w[http://www.example.com/sitemap1.xml.gz http://www.example.com/sitemap2.xml.gz].each do |url|
113 | stub_request(:get, url)
114 | .with(headers: headers)
115 | .to_return(status: 200, body: sitemap_xml, headers: {})
116 | end
117 |
118 | result = described_class.urls(url: start_url)
119 | expect(result).to eq(%w[http://www.example.com/ http://www.example.com/])
120 | end
121 | end
122 |
123 | context 'with url argument and returned sitemap' do
124 | it 'returns all URLs in sitemap' do
125 | stub_request(:get, 'http://www.example.com/sitemap.xml')
126 | .with(headers: headers)
127 | .to_return(status: 200, body: sitemap_xml, headers: {})
128 |
129 | result = described_class.urls(url: 'http://www.example.com/sitemap.xml')
130 | expect(result).to eq(%w[http://www.example.com/])
131 | end
132 | end
133 |
134 | it 'returns empty list on request error' do
135 | allow(WaybackArchiver::Request).to receive(:get).and_raise(WaybackArchiver::Request::Error)
136 |
137 | expect(described_class.urls(url: 'http://www.example.com')).to be_empty
138 | end
139 | end
140 | end
141 |
--------------------------------------------------------------------------------
/spec/wayback_archiver_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe WaybackArchiver do
4 | describe '::archive' do
5 | it 'raises ArgumentError when passed unknown strategy' do
6 | expect do
7 | described_class.archive('http://example.com', strategy: :watman_strategy)
8 | end.to raise_error(ArgumentError)
9 | end
10 |
11 | it 'calls ::auto when no strategy is given' do
12 | allow(described_class).to receive(:auto).and_return([])
13 | described_class.archive('http://example.com')
14 | expect(described_class).to have_received(:auto).once
15 | end
16 |
17 | it 'calls ::auto when passed auto as strategy' do
18 | allow(described_class).to receive(:auto).and_return([])
19 | described_class.archive('http://example.com', strategy: :auto)
20 | expect(described_class).to have_received(:auto).once
21 | end
22 |
23 | it 'calls ::crawl when passed crawl as strategy' do
24 | allow(described_class).to receive(:crawl).and_return([])
25 | described_class.archive('http://example.com', strategy: :crawl)
26 | expect(described_class).to have_received(:crawl).once
27 | end
28 |
29 | it 'calls ::urls when passed urls as strategy' do
30 | allow(described_class).to receive(:urls).and_return([])
31 | described_class.archive('http://example.com', strategy: :urls)
32 | expect(described_class).to have_received(:urls).once
33 | end
34 |
35 | it 'calls ::urls when passed url as strategy' do
36 | allow(described_class).to receive(:urls).and_return([])
37 | described_class.archive('http://example.com', strategy: :url)
38 | expect(described_class).to have_received(:urls).once
39 | end
40 |
41 | it 'calls ::sitemap when passed sitemap as strategy' do
42 | allow(described_class).to receive(:sitemap).and_return([])
43 | described_class.archive('http://example.com', strategy: :sitemap)
44 | expect(described_class).to have_received(:sitemap).once
45 | end
46 |
47 | context 'legacy strategy param' do
48 | it 'raises ArgumentError when passed unknown strategy' do
49 | expect do
50 | described_class.archive('http://example.com', :watman_strategy)
51 | end.to raise_error(ArgumentError)
52 | end
53 |
54 | it 'calls ::auto when passed auto as strategy' do
55 | allow(described_class).to receive(:auto).and_return([])
56 | described_class.archive('http://example.com', :auto)
57 | expect(described_class).to have_received(:auto).once
58 | end
59 |
60 | it 'calls ::crawl when passed crawl as strategy' do
61 | allow(described_class).to receive(:crawl).and_return([])
62 | described_class.archive('http://example.com', :crawl)
63 | expect(described_class).to have_received(:crawl).once
64 | end
65 |
66 | it 'calls ::urls when passed urls as strategy' do
67 | allow(described_class).to receive(:urls).and_return([])
68 | described_class.archive('http://example.com', :urls)
69 | expect(described_class).to have_received(:urls).once
70 | end
71 |
72 | it 'calls ::urls when passed url as strategy' do
73 | allow(described_class).to receive(:urls).and_return([])
74 | described_class.archive('http://example.com', :url)
75 | expect(described_class).to have_received(:urls).once
76 | end
77 |
78 | it 'calls ::sitemap when passed sitemap as strategy' do
79 | allow(described_class).to receive(:sitemap).and_return([])
80 | described_class.archive('http://example.com', :sitemap)
81 | expect(described_class).to have_received(:sitemap).once
82 | end
83 | end
84 | end
85 |
86 | describe '::auto' do
87 | it 'calls Sitemapper::autodiscover and ::crawl if Sitemapper returned empty result' do
88 | allow(described_class::Sitemapper).to receive(:autodiscover).and_return([])
89 | allow(described_class).to receive(:crawl).and_return([])
90 |
91 | described_class.auto('http://example.com')
92 |
93 | expect(described_class::Sitemapper).to have_received(:autodiscover).once
94 | expect(described_class).to have_received(:crawl).once
95 | end
96 |
97 | it 'calls Sitemapper::autodiscover and ::urls if Sitemapper returned non-empty result' do
98 | allow(described_class::Sitemapper).to receive(:autodiscover).and_return(['url'])
99 | allow(described_class).to receive(:urls).and_return([])
100 |
101 | described_class.auto('http://example.com')
102 |
103 | expect(described_class::Sitemapper).to have_received(:autodiscover).once
104 | expect(described_class).to have_received(:urls).once
105 | end
106 | end
107 |
108 | describe '::crawl' do
109 | it 'calls Archive::crawl' do
110 | allow(described_class::Archive).to receive(:crawl).and_return([])
111 |
112 | described_class.crawl('http://example.com')
113 |
114 | expect(described_class::Archive).to have_received(:crawl).once
115 | end
116 | end
117 |
118 | describe '::urls' do
119 | it 'calls Archive::post' do
120 | allow(described_class::Archive).to receive(:post).and_return([])
121 |
122 | described_class.urls('http://example.com')
123 |
124 | expect(described_class::Archive).to have_received(:post).once
125 | end
126 | end
127 |
128 | describe '::sitemap' do
129 | it 'calls URLCollector::sitemap and Archive::post' do
130 | allow(described_class::URLCollector).to receive(:sitemap).and_return([])
131 | allow(described_class::Archive).to receive(:post).and_return([])
132 |
133 | described_class.sitemap('http://example.com')
134 |
135 | expect(described_class::URLCollector).to have_received(:sitemap).once
136 | expect(described_class::Archive).to have_received(:post).once
137 | end
138 | end
139 |
140 | describe '::default_logger!' do
141 | it 'has NullLogger as the default logger' do
142 | described_class.default_logger!
143 | expect(described_class.logger.class).to eq(described_class::NullLogger)
144 | end
145 | end
146 |
147 | describe '::logger=' do
148 | it 'can set logger' do
149 | MyLogger = Struct.new(:name).new('buren')
150 | described_class.logger = MyLogger
151 | expect(described_class.logger).to eq(MyLogger)
152 | end
153 | end
154 |
155 | describe '::user_agent=' do
156 | it 'can set user_agent' do
157 | described_class.user_agent = 'buren'
158 | expect(described_class.user_agent).to eq('buren')
159 | end
160 | end
161 |
162 | describe '::concurrency=' do
163 | it 'can set concurrency' do
164 | described_class.concurrency = 1
165 | expect(described_class.concurrency).to eq(1)
166 | end
167 | end
168 |
169 | describe '::max_limit=' do
170 | it 'can set max_limit' do
171 | described_class.max_limit = 1
172 | expect(described_class.max_limit).to eq(1)
173 | end
174 | end
175 |
176 | describe '::adapter=' do
177 | it 'can set adapter' do
178 | adapter = WaybackArchiver::WaybackMachine
179 | described_class.adapter = adapter
180 | expect(described_class.adapter).to match(adapter)
181 | end
182 |
183 | it 'raises error unless all adapter respond to #call' do
184 | expect { described_class.adapter = 1 }.to raise_error(ArgumentError)
185 | end
186 | end
187 | end
188 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # WaybackArchiver
2 |
3 | Post URLs to the [Wayback Machine](https://archive.org/web/) (Internet Archive) using a crawler, [Sitemap(s)](http://www.sitemaps.org), or a list of URLs.
4 |
5 | > The Wayback Machine is a digital archive of the World Wide Web [...]
6 | > The service enables users to see archived versions of web pages across time ...
7 | > \- [Wikipedia](https://en.wikipedia.org/wiki/Wayback_Machine)
8 |
9 | [](https://travis-ci.org/buren/wayback_archiver) [](https://codeclimate.com/github/buren/wayback_archiver) [](http://www.rubydoc.info/github/buren/wayback_archiver/master) [](http://badge.fury.io/rb/wayback_archiver)
10 |
11 | __Index__
12 |
13 | * [Installation](#installation)
14 | * [Usage](#usage)
15 | - [Ruby](#ruby)
16 | - [CLI](#cli)
17 | * [Configuration](#configuration)
18 | * [RubyDoc](#docs)
19 | * [Contributing](#contributing)
20 | * [MIT License](#license)
21 | * [References](#references)
22 |
23 | ## Installation
24 |
25 | Install the gem:
26 | ```
27 | $ gem install wayback_archiver
28 | ```
29 |
30 | Or add this line to your application's Gemfile:
31 |
32 | ```ruby
33 | gem 'wayback_archiver'
34 | ```
35 |
36 | And then execute:
37 |
38 | ```
39 | $ bundle
40 | ```
41 |
42 | ## Usage
43 |
44 | * [Ruby](#ruby)
45 | * [CLI](#cli)
46 |
47 | __Strategies__:
48 |
49 | * `auto` (the default) - Will try to
50 | 1. Find Sitemap(s) defined in `/robots.txt`
51 | 2. Then look in common sitemap locations, `/sitemap-index.xml`, `/sitemap.xml` etc.
52 | 3. Fall back to crawling (using the excellent [spidr](https://github.com/postmodern/spidr/) gem)
53 | * `crawl` - Crawl the site and post all URLs found on the same domain
54 | * `sitemap` - Parse Sitemap(s), supports [index files](https://www.sitemaps.org/protocol.html#index) (and gzip)
55 | * `urls` (or `url`) - Post the given URL(s)
55 |
56 | ## Ruby
57 |
58 | First require the gem
59 |
60 | ```ruby
61 | require 'wayback_archiver'
62 | ```
63 |
64 | _Examples_:
65 |
66 | Auto
67 |
68 | ```ruby
69 | # auto is the default
70 | WaybackArchiver.archive('example.com')
71 |
72 | # or explicitly
73 | WaybackArchiver.archive('example.com', strategy: :auto)
74 | ```
75 |
76 | Crawl
77 |
78 | ```ruby
79 | WaybackArchiver.archive('example.com', strategy: :crawl)
80 | ```
81 |
82 | Send a single URL
83 |
84 | ```ruby
85 | WaybackArchiver.archive('example.com', strategy: :url)
86 | ```
87 |
88 | Send multiple URLs
89 |
90 | ```ruby
91 | WaybackArchiver.archive(%w[example.com www.example.com], strategy: :urls)
92 | ```
93 |
94 | Send all URL(s) found in Sitemap
95 |
96 | ```ruby
97 | WaybackArchiver.archive('example.com/sitemap.xml', strategy: :sitemap)
98 |
99 | # works with Sitemap index files too
100 | WaybackArchiver.archive('example.com/sitemap-index.xml.gz', strategy: :sitemap)
101 | ```
102 |
103 | Specify concurrency
104 |
105 | ```ruby
106 | WaybackArchiver.archive('example.com', strategy: :auto, concurrency: 10)
107 | ```
108 |
109 | Specify max number of URLs to be archived
110 |
111 | ```ruby
112 | WaybackArchiver.archive('example.com', strategy: :auto, limit: 10)
113 | ```
114 |
115 | Each archive strategy can receive a block that will be called for each URL
116 |
117 | ```ruby
118 | WaybackArchiver.archive('example.com', strategy: :auto) do |result|
119 | if result.success?
120 | puts "Successfully archived: #{result.archived_url}"
121 | else
122 | puts "Error (HTTP #{result.code}) when archiving: #{result.archived_url}"
123 | end
124 | end
125 | ```
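126 |
127 | For example, to collect the URLs that could not be archived (a small sketch building on the result object shown above):
128 |
129 | ```ruby
130 | failed = []
131 | WaybackArchiver.archive('example.com') do |result|
132 | failed << result.archived_url unless result.success?
133 | end
134 |
135 | puts "#{failed.size} URL(s) could not be archived"
136 | ```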
126 |
127 | Use your own adapter for posting found URLs
128 |
129 | ```ruby
130 | WaybackArchiver.adapter = ->(url) { puts url } # anything that responds to #call
131 | ```
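132 |
133 | Anything that responds to `#call` works, so the adapter can also be a class. A minimal sketch (`LoggingAdapter` is a made-up name, not part of the gem):
134 |
135 | ```ruby
136 | # Hypothetical adapter that only logs, instead of posting to the Wayback Machine
137 | class LoggingAdapter
138 | def self.call(url)
139 | WaybackArchiver.logger.info "Would archive: #{url}"
140 | end
141 | end
142 |
143 | WaybackArchiver.adapter = LoggingAdapter
144 | ```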
132 |
133 | ## CLI
134 |
135 | __Usage__:
136 |
137 | ```
138 | wayback_archiver <url> [options]
139 | ```
140 |
141 | Print full usage instructions
142 |
143 | ```
144 | wayback_archiver --help
145 | ```
146 |
147 | _Examples_:
148 |
149 | Auto
150 |
151 | ```bash
152 | # auto is the default
153 | wayback_archiver example.com
154 |
155 | # or explicitly
156 | wayback_archiver example.com --auto
157 | ```
158 |
159 | Crawl
160 |
161 | ```bash
162 | wayback_archiver example.com --crawl
163 | ```
164 |
165 | Send a single URL
166 |
167 | ```bash
168 | wayback_archiver example.com --url
169 | ```
170 |
171 | Send multiple URLs
172 |
173 | ```bash
174 | wayback_archiver example.com www.example.com --urls
175 | ```
176 |
177 | Crawl multiple URLs
178 |
179 | ```bash
180 | wayback_archiver example.com www.example.com --crawl
181 | ```
182 |
183 | Send all URL(s) found in Sitemap
184 |
185 | ```bash
186 | wayback_archiver example.com/sitemap.xml
187 |
188 | # works with Sitemap index files too
189 | wayback_archiver example.com/sitemap-index.xml.gz
190 | ```
191 |
192 | Most options
193 |
194 | ```bash
195 | wayback_archiver example.com www.example.com --auto --concurrency=10 --limit=100 --log=output.log --verbose
196 | ```
197 |
198 | View archive: [https://web.archive.org/web/*/http://example.com](https://web.archive.org/web/*/http://example.com) (replace `http://example.com` with your desired domain).
199 |
200 | ## Configuration
201 |
202 | :information_source: By default `wayback_archiver` doesn't respect robots.txt files, see [this Internet Archive blog post](https://blog.archive.org/2017/04/17/robots-txt-meant-for-search-engines-dont-work-well-for-web-archives/) for more information.
203 |
204 | Configuration (the below values are the defaults)
205 |
206 | ```ruby
207 | WaybackArchiver.concurrency = 1
208 | WaybackArchiver.user_agent = WaybackArchiver::USER_AGENT
209 | WaybackArchiver.respect_robots_txt = WaybackArchiver::DEFAULT_RESPECT_ROBOTS_TXT
210 | WaybackArchiver.logger = WaybackArchiver::NullLogger.new # silent by default
211 | WaybackArchiver.max_limit = -1 # unlimited
212 | WaybackArchiver.adapter = WaybackArchiver::WaybackMachine # must implement #call(url)
213 | ```
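214 |
215 | `concurrency` and `limit` can also be passed per call, in which case they take precedence over the global configuration:
216 |
217 | ```ruby
218 | WaybackArchiver.concurrency = 5
219 | WaybackArchiver.archive('example.com', concurrency: 10, limit: 100) # the per-call values win
220 | ```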
214 |
215 | For a more verbose log you can configure `WaybackArchiver` as such:
216 |
217 | ```ruby
218 | WaybackArchiver.logger = Logger.new(STDOUT).tap do |logger|
219 | logger.progname = 'WaybackArchiver'
220 | logger.level = Logger::DEBUG
221 | end
222 | ```
223 |
224 | _Pro tip_: If you're using the gem in a Rails app you can set `WaybackArchiver.logger = Rails.logger`.
225 |
226 | ## Docs
227 |
228 | You can find the docs online on [RubyDoc](http://www.rubydoc.info/github/buren/wayback_archiver/master).
229 |
230 | This gem is documented using `yard`. Generate the documentation by running (from the root of this repository):
231 |
232 | ```bash
233 | yard # Generates documentation to doc/
234 | ```
235 |
236 | ## Contributing
237 |
238 | Contributions, feedback and suggestions are very welcome.
239 |
240 | 1. Fork it
241 | 2. Create your feature branch (`git checkout -b my-new-feature`)
242 | 3. Commit your changes (`git commit -am 'Add some feature'`)
243 | 4. Push to the branch (`git push origin my-new-feature`)
244 | 5. Create new Pull Request
245 |
246 | ## License
247 |
248 | [MIT License](LICENSE)
249 |
250 | ## References
251 |
252 | * Don't know what the Wayback Machine (Internet Archive) is? [Wayback Machine](https://archive.org/web/)
253 | * Don't know what a Sitemap is? [sitemaps.org](http://www.sitemaps.org)
254 | * Don't know what robots.txt is? [www.robotstxt.org](http://www.robotstxt.org/robotstxt.html)
255 |
--------------------------------------------------------------------------------
/spec/wayback_archiver/request_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe WaybackArchiver::Request do
4 | describe '::get' do
5 | let(:headers) do
6 | {
7 | 'Accept' => '*/*',
8 | 'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3',
9 | 'User-Agent' => WaybackArchiver.user_agent
10 | }
11 | end
12 |
13 | [
14 | [described_class::ServerError, Timeout::Error],
15 | [described_class::ServerError, OpenSSL::SSL::SSLError],
16 | [described_class::ServerError, Net::HTTPBadResponse],
17 | [described_class::ServerError, Zlib::Error],
18 | # For some reason the below line causes an ArgumentError exception to be raised instead
19 | # [described_class::ClientError, SystemCallError],
20 | [described_class::ClientError, SocketError],
21 | [described_class::ClientError, IOError]
22 | ].each do |test_data|
23 | error_klass, raised_error_klass = test_data
24 |
25 | it "raises #{error_klass} on #{raised_error_klass}" do
26 | allow_any_instance_of(Net::HTTP).to receive(:request).and_raise(raised_error_klass)
27 |
28 | expect { described_class.get('https://example.com') }.to raise_error(error_klass)
29 | end
30 | end
31 |
32 | it 'returns response when server responds with HTTP 200' do
33 | stub_request(:get, 'https://example.com/')
34 | .with(headers: headers)
35 | .to_return(status: 200, body: 'buren', headers: {})
36 |
37 | result = described_class.get('https://example.com')
38 | expect(result.code).to eq('200')
39 | end
40 |
41 | it 'follows redirect when server responds with HTTP 3XX' do
42 | response_headers = { 'location' => '/redirect-path' }
43 | stub_request(:get, 'https://example.com/')
44 | .with(headers: headers)
45 | .to_return(status: 301, body: 'buren', headers: response_headers)
46 |
47 | stub_request(:get, 'https://example.com/redirect-path')
48 | .with(headers: headers)
49 | .to_return(status: 200, body: 'buren', headers: {})
50 |
51 | result = described_class.get('https://example.com', max_redirects: 1)
52 | expect(result.code).to eq('200')
53 | expect(result.uri).to eq('https://example.com/redirect-path')
54 | end
55 |
56 | it 'raises MaxRedirectError if max redirects is reached' do
57 | response_headers = { 'location' => '/redirect-path' }
58 | stub_request(:get, 'https://example.com/')
59 | .with(headers: headers)
60 | .to_return(status: 301, body: 'buren', headers: response_headers)
61 |
62 | expect do
63 | described_class.get('https://example.com', max_redirects: 0)
64 | end.to raise_error(described_class::MaxRedirectError)
65 | end
66 |
67 | it 'raises UnknownResponseCodeError if server responds with unknown HTTP code' do
68 | stub_request(:get, 'https://example.com/')
69 | .with(headers: headers)
70 | .to_return(status: 100, body: 'buren', headers: {})
71 |
72 | expect do
73 | described_class.get('https://example.com')
74 | end.to raise_error(described_class::UnknownResponseCodeError)
75 | end
76 |
77 | it 'raises ResponseError if server responded with an error and raise_on_http_error is true' do
78 | stub_request(:get, 'https://example.com/')
79 | .with(headers: headers)
80 | .to_return(status: 400, body: 'buren', headers: {})
81 |
82 | expect do
83 | described_class.get('https://example.com', raise_on_http_error: true)
84 | end.to raise_error(described_class::ResponseError)
85 | end
86 |
87 | it 'returns response if server responds with an error and raise_on_http_error is false' do
88 | stub_request(:get, 'https://example.com/')
89 | .with(headers: headers)
90 | .to_return(status: 400, body: 'buren', headers: {})
91 |
92 | result = described_class.get('https://example.com', raise_on_http_error: false)
93 |
94 | expect(result.code).to eq('400')
95 | end
96 | end
97 |
98 | describe '::build_response' do
99 | it 'builds a Response object' do
100 | expected = WaybackArchiver::Response.new('200', 'OK', 'buren', 'http://example.com')
101 | response = described_class.build_response(
102 | 'http://example.com',
103 | Struct.new(:code, :message, :body).new('200', 'OK', 'buren')
104 | )
105 |
106 | expect(response).to eq(expected)
107 | end
108 |
109 | it 'builds a response object that has a #success? method' do
110 | response = described_class.build_response(
111 | 'http://example.com',
112 | Struct.new(:code, :message, :body).new('200', 'OK', 'buren')
113 | )
114 |
115 | expect(response.success?).to eq(true)
116 | end
117 | end
118 |
119 | describe '::build_redirect_uri' do
120 | it 'raises InvalidRedirectError if no location header is found' do
121 | response = Struct.new(:header).new({ location: nil })
122 | redirect_error = WaybackArchiver::Request::InvalidRedirectError
123 |
124 | expect do
125 | described_class.build_redirect_uri('', response)
126 | end.to raise_error(redirect_error)
127 | end
128 |
129 | it 'adds base URI if location header is relative' do
130 | base_uri = 'http://example.com'
131 | response = Struct.new(:header).new({ 'location' => '/path' })
132 | result = described_class.build_redirect_uri(base_uri, response)
133 |
134 | expect(result).to eq(URI.parse('http://example.com/path'))
135 | end
136 |
137 | it 'returns location header' do
138 | base_uri = 'http://example.com'
139 | response = Struct.new(:header).new({ 'location' => 'https://example.com/path' })
140 | result = described_class.build_redirect_uri(base_uri, response)
141 |
142 | expect(result).to eq(URI.parse('https://example.com/path'))
143 | end
144 | end
145 |
146 | describe '::build_uri' do
147 | it 'returns URI untouched if passed an instance of URI' do
148 | uri = URI.parse('http://example.com')
149 | expect(described_class.build_uri(uri)).to eq(uri)
150 | end
151 |
152 | it 'returns URI instance if passed string with http protocol' do
153 | uri = URI.parse('http://example.com')
154 | expect(described_class.build_uri('http://example.com')).to eq(uri)
155 | end
156 |
157 | it 'returns URI instance if passed string with https protocol' do
158 | uri = URI.parse('https://example.com')
159 | expect(described_class.build_uri('https://example.com')).to eq(uri)
160 | end
161 |
162 | it 'returns URI instance with protocol if passed string without protocol' do
163 | uri = URI.parse('http://example.com')
164 | expect(described_class.build_uri('example.com')).to eq(uri)
165 | end
166 | end
167 |
168 | describe '::parse_body' do
169 | it 'returns empty string if passed nil' do
170 | expect(described_class.parse_body(nil)).to eq('')
171 | end
172 |
173 | it 'returns string untouched if passed a regular string' do
174 | expect(described_class.parse_body('buren')).to eq('buren')
175 | end
176 |
177 | it 'returns uncompressed string if passed a gzipped string' do
178 | gzipped_string = File.read('spec/data/test_gzip.gz')
179 | expect(described_class.parse_body(gzipped_string)).to eq("buren\n")
180 | end
181 | end
182 |
183 | describe '::blank?' do
184 | it 'returns true if passed nil' do
185 | expect(described_class.blank?(nil)).to eq(true)
186 | end
187 |
188 | it 'returns true if passed empty string' do
189 | expect(described_class.blank?('')).to eq(true)
190 | end
191 |
192 | it 'returns true if passed string with only spaces' do
193 | expect(described_class.blank?(' ')).to eq(true)
194 | end
195 |
196 | it 'returns false if passed a non-empty string' do
197 | expect(described_class.blank?('buren')).to eq(false)
198 | end
199 | end
200 | end
201 |
--------------------------------------------------------------------------------
/lib/wayback_archiver/request.rb:
--------------------------------------------------------------------------------
1 | require 'net/http'
2 | require 'openssl'
3 | require 'timeout'
4 | require 'uri'
5 | require 'zlib'
6 |
7 | require 'wayback_archiver/http_code'
8 | require 'wayback_archiver/response'
9 |
10 | module WaybackArchiver
11 | # Make HTTP requests
12 | class Request
13 | # General error, something went wrong
14 | class Error < StandardError; end
15 | # Client error, something went wrong on the local machine
16 | class ClientError < Error; end
17 | # Server error, the remote server did something wrong
18 | class ServerError < Error; end
19 | # Remote server responded with a HTTP error
20 | class HTTPError < ServerError; end
21 | # Remote server error
22 | class ResponseError < ServerError; end
23 | # Max redirects reached error
24 | class MaxRedirectError < ServerError; end
25 | # Remote server responded with an invalid redirect
26 | class InvalidRedirectError < ServerError; end
27 | # Remote server responded with an unknown HTTP code
28 | class UnknownResponseCodeError < ServerError; end
29 |
30 | # GET response wrapper
31 | GETStruct = Struct.new(:response, :error)
32 |
33 | # Max number of redirects before an error is raised
34 | MAX_REDIRECTS = 10
35 |
36 | # Known request errors
37 | REQUEST_ERRORS = {
38 | # server
39 | Timeout::Error => ServerError,
40 | OpenSSL::SSL::SSLError => ServerError,
41 | Net::HTTPBadResponse => ServerError,
42 | Zlib::Error => ServerError,
43 | # client
44 | SystemCallError => ClientError,
45 | SocketError => ClientError,
46 | IOError => ClientError
47 | }.freeze
48 |
49 | # Get response.
50 | # @return [Response] the http response representation.
51 | # @param [String, URI] uri to retrieve.
52 | # @param max_redirects [Integer] max redirects (default: 10).
53 | # @param follow_redirects [Boolean] follow redirects (default: true).
54 | # @example Get example.com
55 | # Request.get('example.com')
56 | # @example Get http://example.com and follow max 3 redirects
57 | # Request.get('http://example.com', max_redirects: 3)
58 | # @example Get http://example.com and don't follow redirects
59 | # Request.get('http://example.com', follow_redirects: false)
60 | # @raise [Error] super class of all exceptions that this method can raise
61 | # @raise [ServerError] all server errors
62 | # @raise [ClientError] all client errors
63 | # @raise [HTTPError] all HTTP errors
64 | # @raise [MaxRedirectError] too many redirects, subclass of ServerError
65 | # @raise [ResponseError] server responded with a 4xx or 5xx HTTP status code, subclass of ServerError (only raised if the raise_on_http_error flag is true)
66 | # @raise [UnknownResponseCodeError] server responded with an unknown HTTP status code, subclass of ServerError
67 | # @raise [InvalidRedirectError] server responded with an invalid redirect, subclass of ServerError
68 | def self.get(
69 | uri,
70 | max_redirects: MAX_REDIRECTS,
71 | raise_on_http_error: false,
72 | follow_redirects: true
73 | )
74 | uri = build_uri(uri)
75 |
76 | redirect_count = 0
77 | until redirect_count > max_redirects
78 | WaybackArchiver.logger.debug "Requesting #{uri}"
79 |
80 | http = Net::HTTP.new(uri.host, uri.port)
81 | if uri.scheme == 'https'
82 | http.use_ssl = true
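83 | # NOTE: TLS certificate verification is disabled, requests won't fail on invalid or expired certs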
83 | http.verify_mode = OpenSSL::SSL::VERIFY_NONE
84 | end
85 |
86 | request = Net::HTTP::Get.new(uri.request_uri)
87 | request['User-Agent'] = WaybackArchiver.user_agent
88 |
89 | result = perform_request(uri, http, request)
90 | response = result.response
91 | error = result.error
92 |
93 | raise error if error
94 |
95 | code = response.code
96 | WaybackArchiver.logger.debug "[#{code}, #{response.message}] Requested #{uri}"
97 |
98 | case HTTPCode.type(code)
99 | when :success
100 | return build_response(uri, response)
101 | when :redirect
102 | return build_response(uri, response) unless follow_redirects
103 |
104 | uri = build_redirect_uri(uri, response)
105 | redirect_count += 1
106 | next
107 | when :error
108 | if raise_on_http_error
109 | raise ResponseError, "Failed with response code: #{code} when requesting #{uri}"
110 | end
111 |
112 | return build_response(uri, response)
113 | else
114 | raise UnknownResponseCodeError, "Unknown HTTP response code #{code} when requesting #{uri}"
115 | end
116 | end
117 |
118 | raise MaxRedirectError, "Redirected too many times when requesting #{uri}"
119 | end
120 |
121 | # Builds a Response object.
122 | # @return [Response]
123 | # @param [URI] uri that was requested.
124 | # @param [Net::HTTPResponse] response the server response.
125 | # @example Build Response object for example.com
126 | # Request.build_response(uri, net_http_response)
127 | def self.build_response(uri, response)
128 | Response.new(
129 | response.code,
130 | response.message,
131 | parse_body(response.body),
132 | uri.to_s
133 | )
134 | end
135 |
136 | # Builds a URI for a redirect response.
137 | # @return [URI] to redirect to.
138 | # @param [URI] uri that was requested.
139 | # @param [Net::HTTPResponse] response the server response.
140 | # @example Build redirect URI for example.com (let's pretend it redirects)
141 | # Request.build_redirect_uri('http://example.com', net_http_response)
142 | def self.build_redirect_uri(uri, response)
143 | location_header = response.header.fetch('location') do
144 | raise InvalidRedirectError, "No location header found on redirect when requesting #{uri}"
145 | end
146 |
147 | location = URI.parse(location_header)
148 | return build_uri(uri) + location_header if location.relative?
149 |
150 | location
151 | end
152 |
153 | # Build URI.
154 | # @return [URI] the built URI.
155 | # @param [URI, String] uri to build.
156 | # @example Build URI for example.com
157 | # Request.build_uri('http://example.com')
158 | # @example Build URI from an existing URI instance
159 | # uri = URI.parse('http://example.com')
160 | # Request.build_uri(uri)
161 | def self.build_uri(uri)
162 | return uri if uri.is_a?(URI)
163 |
164 | uri = "http://#{uri}" unless uri =~ %r{^https?://}
165 | URI.parse(uri)
166 | end
167 |
168 | # Parse response body, handles regular and gzipped response bodies.
169 | # @return [String] the response body.
170 | # @param [String] response_body the server response body.
171 | # @example Return response body for a response.
172 | # Request.parse_body(net_http_response.body)
173 | def self.parse_body(response_body)
174 | return '' unless response_body
175 |
176 | Zlib::GzipReader.new(StringIO.new(response_body)).read
177 | rescue Zlib::GzipFile::Error => _e
178 | response_body
179 | end
180 |
181 | # Return whether a value is blank or not.
182 | # @return [Boolean] whether the value is blank or not.
183 | # @param [Object] value the value to check if it's blank or not.
184 | # @example Returns true for nil.
185 | # Request.blank?(nil)
186 | # @example Returns true for empty string.
187 | # Request.blank?('')
188 | # @example Returns true for string with only spaces.
189 | # Request.blank?(' ')
190 | def self.blank?(value)
191 | return true unless value
192 | return true if value.strip.empty?
193 |
194 | false
195 | end
196 |
197 | # NOTE: `private` has no effect on methods defined with `def self.`, they are made private via `private_class_method` below
198 |
199 | def self.perform_request(uri, http, request)
200 | # TODO: Consider retrying on certain HTTP response codes, i.e 429, 503
201 | response = http.request(request)
202 | GETStruct.new(response)
203 | rescue *REQUEST_ERRORS.keys => e
204 | build_request_error(uri, e, REQUEST_ERRORS.fetch(e.class))
205 | end
206 |
207 | def self.build_request_error(uri, error, error_wrapper_klass)
208 | WaybackArchiver.logger.error "Request to #{uri} failed: #{error_wrapper_klass}, #{error.class}, #{error.message}"
209 |
210 | GETStruct.new(
211 | Response.new,
212 | error_wrapper_klass.new("#{error.class}, #{error.message}")
213 | )
214 | end
215 |
216 | private_class_method :perform_request, :build_request_error
217 | end
216 | end
217 |
--------------------------------------------------------------------------------
/lib/wayback_archiver.rb:
--------------------------------------------------------------------------------
1 | require 'wayback_archiver/thread_pool'
2 | require 'wayback_archiver/null_logger'
3 | require 'wayback_archiver/version'
4 | require 'wayback_archiver/url_collector'
5 | require 'wayback_archiver/archive'
6 | require 'wayback_archiver/sitemapper'
7 |
8 | # WaybackArchiver sends URLs to the Wayback Machine, by crawling, from Sitemap(s), or from a list of URLs.
9 | module WaybackArchiver
10 | # Link to gem on rubygems.org, part of the sent User-Agent
11 | INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'.freeze
12 | # WaybackArchiver User-Agent
13 | USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})".freeze
14 | # Default for whether to respect robots txt files
15 | DEFAULT_RESPECT_ROBOTS_TXT = false
16 |
17 | # Default concurrency for archiving URLs
18 | DEFAULT_CONCURRENCY = 1
19 |
20 | # Maximum number of links posted (-1 means no limit)
21 | DEFAULT_MAX_LIMIT = -1
22 |
23 | # Send URLs to Wayback Machine.
24 | # @return [Array] of URLs sent to the Wayback Machine.
25 | # @param [String/Array] source for URL(s).
26 | # @param [String/Symbol] strategy of source. Supported strategies: crawl, sitemap, url, urls, auto.
27 | # @param [Array] hosts to crawl.
28 | # @example Auto archive example.com (find Sitemaps, with crawling as fallback)
29 | # WaybackArchiver.archive('example.com') # Default strategy is :auto
30 | # WaybackArchiver.archive('example.com', strategy: :auto)
31 | # WaybackArchiver.archive('example.com', strategy: :auto, concurrency: 10)
32 | # WaybackArchiver.archive('example.com', strategy: :auto, limit: 100) # send max 100 URLs
33 | # WaybackArchiver.archive('example.com', :auto)
34 | # @example Crawl example.com and send all URLs of the same domain
35 | # WaybackArchiver.archive('example.com', strategy: :crawl)
36 | # WaybackArchiver.archive('example.com', strategy: :crawl, concurrency: 10)
37 | # WaybackArchiver.archive('example.com', strategy: :crawl, limit: 100) # send max 100 URLs
38 | # WaybackArchiver.archive('example.com', :crawl)
39 | # @example Send example.com Sitemap URLs
40 | # WaybackArchiver.archive('example.com', strategy: :sitemap)
41 | # WaybackArchiver.archive('example.com', strategy: :sitemap, concurrency: 10)
42 | # WaybackArchiver.archive('example.com', strategy: :sitemap, limit: 100) # send max 100 URLs
43 | # WaybackArchiver.archive('example.com', :sitemap)
44 | # @example Send only example.com
45 | # WaybackArchiver.archive('example.com', strategy: :url)
46 | # WaybackArchiver.archive('example.com', strategy: :url, concurrency: 10)
47 | # WaybackArchiver.archive('example.com', strategy: :url, limit: 100) # send max 100 URLs
48 | # WaybackArchiver.archive('example.com', :url)
49 | # @example Crawl multiple hosts
50 | # WaybackArchiver.archive(
51 | # 'http://example.com',
52 | # hosts: [
53 | # 'example.com',
54 | # /host[\d]+\.example\.com/
55 | # ]
56 | # )
57 | def self.archive(source, legacy_strategy = nil, strategy: :auto, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
58 | strategy = legacy_strategy || strategy
59 |
60 | case strategy.to_s
61 | when 'crawl' then crawl(source, concurrency: concurrency, limit: limit, hosts: hosts, &block)
62 | when 'auto' then auto(source, concurrency: concurrency, limit: limit, &block)
63 | when 'sitemap' then sitemap(source, concurrency: concurrency, limit: limit, &block)
64 | when 'urls' then urls(source, concurrency: concurrency, limit: limit, &block)
65 | when 'url' then urls(source, concurrency: concurrency, limit: limit, &block)
66 | else
67 | raise ArgumentError, "Unknown strategy: '#{strategy}'. Allowed strategies: sitemap, urls, url, crawl"
68 | end
69 | end
70 |
71 | # Look for Sitemap(s) and if nothing is found fallback to crawling.
72 | # Then send found URLs to the Wayback Machine.
73 | # @return [Array] of URLs sent to the Wayback Machine.
74 | # @param [String] source (must be a valid URL).
75 | # @param concurrency [Integer]
76 | # @example Auto archive example.com
77 | # WaybackArchiver.auto('example.com') # Default concurrency is 1
78 | # @example Auto archive example.com with low concurrency
79 | # WaybackArchiver.auto('example.com', concurrency: 1)
80 | # @example Auto archive example.com and archive max 100 URLs
81 | # WaybackArchiver.auto('example.com', limit: 100)
82 | # @see http://www.sitemaps.org
83 | def self.auto(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
84 | urls = Sitemapper.autodiscover(source)
85 | return urls(urls, concurrency: concurrency, limit: limit, &block) if urls.any?
86 |
87 | crawl(source, concurrency: concurrency, limit: limit, &block)
88 | end
89 |
90 | # Crawl site for URLs to send to the Wayback Machine.
91 | # @return [Array] of URLs sent to the Wayback Machine.
92 | # @param [String] url to start crawling from.
93 | # @param [Array] hosts to crawl
94 | # @param concurrency [Integer]
95 | # @example Crawl example.com and send all URLs of the same domain
96 | # WaybackArchiver.crawl('example.com') # Default concurrency is 1
97 | # @example Crawl example.com and send all URLs of the same domain with low concurrency
98 | # WaybackArchiver.crawl('example.com', concurrency: 1)
99 | # @example Crawl example.com and archive max 100 URLs
100 | # WaybackArchiver.crawl('example.com', limit: 100)
101 | # @example Crawl multiple hosts
102 | # WaybackArchiver.crawl(
103 | # 'http://example.com',
104 | # hosts: [
105 | # 'example.com',
106 | # /host[\d]+\.example\.com/
107 | # ]
108 | # )
109 | def self.crawl(url, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
110 | WaybackArchiver.logger.info "Crawling #{url}"
111 | Archive.crawl(url, hosts: hosts, concurrency: concurrency, limit: limit, &block)
112 | end
113 |
114 | # Get URLs from sitemap and send found URLs to the Wayback Machine.
115 | # @return [Array] of URLs sent to the Wayback Machine.
116 | # @param [String] url to the sitemap.
117 | # @param concurrency [Integer]
118 | # @example Get example.com sitemap and archive all found URLs
119 | # WaybackArchiver.sitemap('example.com/sitemap.xml') # Default concurrency is 1
120 | # @example Get example.com sitemap and archive all found URLs with low concurrency
121 | # WaybackArchiver.sitemap('example.com/sitemap.xml', concurrency: 1)
122 | # @example Get example.com sitemap archive max 100 URLs
123 | # WaybackArchiver.sitemap('example.com/sitemap.xml', limit: 100)
124 | # @see http://www.sitemaps.org
125 | def self.sitemap(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
126 | WaybackArchiver.logger.info "Fetching Sitemap"
127 | Archive.post(URLCollector.sitemap(url), concurrency: concurrency, limit: limit, &block)
128 | end
129 |
130 | # Send URL to the Wayback Machine.
131 | # @return [Array] of URLs sent to the Wayback Machine.
132 | # @param [Array/String] urls or url.
133 | # @param concurrency [Integer]
134 | # @example Archive example.com
135 | # WaybackArchiver.urls('example.com')
136 | # @example Archive example.com and google.com
137 | # WaybackArchiver.urls(%w(example.com google.com))
138 | # @example Archive example.com, max 100 URLs
139 | # WaybackArchiver.urls(%w(example.com www.example.com), limit: 100)
140 | def self.urls(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
141 | Archive.post(Array(urls), concurrency: concurrency, limit: limit, &block)
142 | end
143 |
144 | # Set logger
145 | # @return [Object] the set logger
146 | # @param [Object] logger an object that quacks like a Logger (duck typing)
147 | # @example set a logger that prints to standard out (STDOUT)
148 | # WaybackArchiver.logger = Logger.new(STDOUT)
149 | def self.logger=(logger)
150 | @logger = logger
151 | end
152 |
153 | # Returns the current logger
154 | # @return [Object] the current logger instance
155 | def self.logger
156 | @logger ||= NullLogger.new
157 | end
158 |
159 | # Resets the logger to the default
160 | # @return [NullLogger] a new instance of NullLogger
161 | def self.default_logger!
162 | @logger = NullLogger.new
163 | end
164 |
165 | # Sets the user agent
166 | # @return [String] the configured user agent
167 | # @param [String] user_agent the desired user agent
168 | def self.user_agent=(user_agent)
169 | @user_agent = user_agent
170 | end
171 |
172 | # Returns the configured user agent
173 | # @return [String] the configured or the default user agent
174 | def self.user_agent
175 | @user_agent ||= USER_AGENT
176 | end
177 |
178 | # Sets the default respect_robots_txt
179 | # @return [Boolean] the desired default for respect_robots_txt
180 | # @param [Boolean] respect_robots_txt the desired default
181 | def self.respect_robots_txt=(respect_robots_txt)
182 | @respect_robots_txt = respect_robots_txt
183 | end
184 |
185 | # Returns the default respect_robots_txt
186 | # @return [Boolean] the configured or the default respect_robots_txt
187 | def self.respect_robots_txt
188 | # don't use ||= here, an explicitly configured `false` must be respected
189 | return @respect_robots_txt unless @respect_robots_txt.nil?
190 |
191 | DEFAULT_RESPECT_ROBOTS_TXT
192 | end
190 |
191 | # Sets the default concurrency
192 | # @return [Integer] the desired default concurrency
193 | # @param [Integer] concurrency the desired default concurrency
194 | def self.concurrency=(concurrency)
195 | @concurrency = concurrency
196 | end
197 |
198 | # Returns the default concurrency
199 | # @return [Integer] the configured or the default concurrency
200 | def self.concurrency
201 | @concurrency ||= DEFAULT_CONCURRENCY
202 | end
203 |
204 | # Sets the default max_limit
205 | # @return [Integer] the desired default max_limit
206 | # @param [Integer] max_limit the desired default max_limit
207 | def self.max_limit=(max_limit)
208 | @max_limit = max_limit
209 | end
210 |
211 | # Returns the default max_limit
212 | # @return [Integer] the configured or the default max_limit
213 | def self.max_limit
214 | @max_limit ||= DEFAULT_MAX_LIMIT
215 | end
216 |
217 | # Sets the adapter
218 | # @return [Object] the configured adapter, must respond to #call
219 | # @param [Object] adapter the adapter, must respond to #call
220 | def self.adapter=(adapter)
221 | unless adapter.respond_to?(:call)
222 | raise(ArgumentError, 'adapter must implement #call')
223 | end
224 |
225 | @adapter = adapter
226 | end
227 |
228 | # Returns the configured adapter
229 | # @return [Object] the configured or the default adapter
230 | def self.adapter
231 | @adapter ||= WaybackMachine
232 | end
233 | end
234 |
--------------------------------------------------------------------------------