├── spec ├── data │ ├── robots.txt │ ├── test_gzip.gz │ ├── sitemap.xml │ ├── sitemap_index.xml │ └── sitemap_index_with_duplicate_url.xml ├── support │ └── test_logger.rb ├── wayback_archiver │ ├── null_logger_spec.rb │ ├── archive_result_spec.rb │ ├── thread_pool_spec.rb │ ├── http_code_spec.rb │ ├── url_collector_spec.rb │ ├── adapters │ │ └── wayback_machine_spec.rb │ ├── archive_spec.rb │ ├── sitemap_spec.rb │ ├── sitemapper_spec.rb │ └── request_spec.rb ├── spec_helper.rb └── wayback_archiver_spec.rb ├── lib ├── wayback_archiver │ ├── version.rb │ ├── null_logger.rb │ ├── response.rb │ ├── archive_result.rb │ ├── thread_pool.rb │ ├── adapters │ │ └── wayback_machine.rb │ ├── http_code.rb │ ├── url_collector.rb │ ├── sitemap.rb │ ├── sitemapper.rb │ ├── archive.rb │ └── request.rb ├── robots.rb └── wayback_archiver.rb ├── Gemfile ├── .travis.yml ├── .github └── dependabot.yml ├── .gitignore ├── Rakefile ├── LICENSE ├── CHANGELOG.md ├── wayback_archiver.gemspec ├── bin └── wayback_archiver └── README.md /spec/data/robots.txt: -------------------------------------------------------------------------------- 1 | Sitemap: http://www.example.com/sitemap.xml -------------------------------------------------------------------------------- /spec/data/test_gzip.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buren/wayback_archiver/HEAD/spec/data/test_gzip.gz -------------------------------------------------------------------------------- /lib/wayback_archiver/version.rb: -------------------------------------------------------------------------------- 1 | module WaybackArchiver 2 | # Gem version 3 | VERSION = '1.5.0'.freeze 4 | end 5 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | # Specify your gem's dependencies in wayback_archiver.gemspec 4 | gemspec 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: ruby 3 | rvm: 4 | - 3.0.0 5 | - 3.1.0 6 | - 3.2.0 7 | - 3.3.0 8 | before_install: gem install bundler 9 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: bundler 4 | directory: "/" 5 | schedule: 6 | interval: monthly 7 | time: "04:00" 8 | open-pull-requests-limit: 10 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | .bundle 4 | .config 5 | .yardoc 6 | Gemfile.lock 7 | InstalledFiles 8 | _yardoc 9 | coverage 10 | doc/ 11 | lib/bundler/man 12 | pkg 13 | rdoc 14 | spec/reports 15 | test/tmp 16 | test/version_tmp 17 | tmp 18 | TODO.md 19 | .byebug_history 20 | -------------------------------------------------------------------------------- /lib/wayback_archiver/null_logger.rb: -------------------------------------------------------------------------------- 1 | require 'logger' 2 | 3 | module WaybackArchiver 4 | # Don't log anything / Send the logs to the abyss 5 | class NullLogger < Logger 6 | # Allow any and all params 7 | def initialize(*args); end 8 | 9 | # Allow any
and all params and don't do anything 10 | def add(*args, &block); end 11 | end 12 | end 13 | -------------------------------------------------------------------------------- /spec/data/sitemap.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> 3 | <url> 4 | <loc>http://www.example.com/</loc> 5 | <lastmod>2005-01-01</lastmod> 6 | <changefreq>monthly</changefreq> 7 | <priority>0.8</priority> 8 | </url> 9 | </urlset> 10 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'bundler/gem_tasks' 2 | 3 | task default: :spec 4 | 5 | task :console do 6 | require 'bundler/setup' 7 | require 'irb' 8 | require 'wayback_archiver' 9 | ARGV.clear 10 | IRB.start 11 | end 12 | 13 | task :spec do 14 | begin 15 | require 'rspec/core/rake_task' 16 | RSpec::Core::RakeTask.new(:spec) 17 | rescue LoadError 18 | puts 'Could *not* load rspec' 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /lib/wayback_archiver/response.rb: -------------------------------------------------------------------------------- 1 | module WaybackArchiver 2 | # Response data struct 3 | Response = Struct.new(:code, :message, :body, :uri, :error) 4 | class Response 5 | # Returns true if the response was successful 6 | # @example Check if Response was successful 7 | # response = Response.new('200', 'OK', 'buren', 'http://example.com') 8 | # response.success? # => true 9 | def success? 10 | HTTPCode.success?(code) 11 | end 12 | end 13 | end 14 | -------------------------------------------------------------------------------- /spec/data/sitemap_index.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> 3 | <sitemap> 4 | <loc>http://www.example.com/sitemap1.xml.gz</loc> 5 | <lastmod>2004-10-01T18:23:17+00:00</lastmod> 6 | </sitemap> 7 | <sitemap> 8 | <loc>http://www.example.com/sitemap2.xml.gz</loc> 9 | <lastmod>2005-01-01</lastmod> 10 | </sitemap> 11 | </sitemapindex> 12 | -------------------------------------------------------------------------------- /spec/support/test_logger.rb: -------------------------------------------------------------------------------- 1 | require 'logger' 2 | 3 | # Test logger 4 | class TestLogger < Logger 5 | attr_reader :info_log, :debug_log, :error_log 6 | 7 | def initialize(*_args) 8 | @info_log = [] 9 | @debug_log = [] 10 | @error_log = [] 11 | end 12 | 13 | def add(*args) 14 | log_type, _, log_string = args 15 | case log_type 16 | when 0 then @debug_log 17 | when 1 then @info_log 18 | when 3 then @error_log 19 | end << log_string 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /spec/data/sitemap_index_with_duplicate_url.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> 3 | <sitemap> 4 | <loc>http://www.example.com/sitemap1.xml.gz</loc> 5 | <lastmod>2004-10-01T18:23:17+00:00</lastmod> 6 | </sitemap> 7 | <sitemap> 8 | <loc>http://www.example.com/sitemap1.xml.gz</loc> 9 | <lastmod>2005-01-01</lastmod> 10 | </sitemap> 11 | </sitemapindex> 12 | -------------------------------------------------------------------------------- /spec/wayback_archiver/null_logger_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe WaybackArchiver::NullLogger do 4 | it 'inherits from Logger' do 5 | expect(described_class.ancestors).to include(Logger) 6 | end 7 | 8 | it 'can be initialized with arguments' do 9 | logger = described_class.new('buren') 10 |
expect(logger.is_a?(described_class)).to eq(true) 11 | end 12 | 13 | it 'has #add method that can receive args and a block' do 14 | logger = described_class.new('buren') 15 | expect(logger.add('buren', &:nil?)).to be_nil 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /spec/wayback_archiver/archive_result_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe WaybackArchiver::ArchiveResult do 4 | describe '#archived_url' do 5 | it 'returns the uri' do 6 | expect(described_class.new('buren').archived_url).to eq('buren') 7 | end 8 | end 9 | 10 | describe '#errored?' do 11 | it 'returns true if errored' do 12 | expect(described_class.new(nil, error: true).errored?).to eq(true) 13 | end 14 | end 15 | 16 | describe '#success?' do 17 | it 'returns true if success' do 18 | expect(described_class.new(nil, error: nil).success?).to eq(true) 19 | end 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /lib/wayback_archiver/archive_result.rb: -------------------------------------------------------------------------------- 1 | module WaybackArchiver 2 | # Result data for posting URL to archive 3 | class ArchiveResult 4 | attr_reader :uri, :code, :request_url, :response_error, :error 5 | 6 | def initialize(uri, code: nil, request_url: nil, response_error: nil, error: nil) 7 | @uri = uri 8 | @code = code 9 | @request_url = request_url 10 | @response_error = response_error 11 | @error = error 12 | end 13 | 14 | # @return [String] the URL that was archived 15 | def archived_url 16 | uri 17 | end 18 | 19 | # @return [Boolean] true if success 20 | def success? 21 | !errored? 22 | end 23 | 24 | # @return [Boolean] true if errored 25 | def errored? 26 | !!error 27 | end 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /spec/wayback_archiver/thread_pool_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe WaybackArchiver::ThreadPool do 4 | context 'with concurrency less than 1' do 5 | it 'raises ArgumentError' do 6 | expect { described_class.build(0) }.to raise_error(ArgumentError) 7 | end 8 | end 9 | 10 | context 'with concurrency 1' do 11 | it 'returns a Concurrent::ImmediateExecutor' do 12 | thread_pool = described_class.build(1) 13 | expect(thread_pool).to be_an_instance_of(Concurrent::ImmediateExecutor) 14 | end 15 | end 16 | 17 | context 'with concurrency greater than 1' do 18 | it 'returns a Concurrent::FixedThreadPool' do 19 | thread_pool = described_class.build(2) 20 | expect(thread_pool).to be_an_instance_of(Concurrent::FixedThreadPool) 21 | end 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | require 'simplecov' 2 | require 'coveralls' 3 | 4 | formatters = [ 5 | SimpleCov::Formatter::HTMLFormatter, 6 | Coveralls::SimpleCov::Formatter 7 | ] 8 | SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new(formatters) 9 | SimpleCov.start 10 | 11 | Dir['./spec/support/**/*.rb'].each { |file| require file } 12 | 13 | require 'wayback_archiver' 14 | require 'webmock/rspec' 15 | require 'byebug' 16 | 17 | WebMock.disable_net_connect!
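# Blocks all real HTTP during the suite. A hedged aside: if a local test
# service were ever needed, WebMock supports whitelisting, e.g.
#   WebMock.disable_net_connect!(allow_localhost: true)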
18 | 19 | RSpec.configure do |config| 20 | config.order = 'random' 21 | config.run_all_when_everything_filtered = false 22 | 23 | config.before(:each) do 24 | WaybackArchiver.logger = TestLogger.new 25 | 26 | # Set default concurrency to 1, so we don't have to deal with concurrency 27 | # issues in WebMock and rspec-mocks 28 | WaybackArchiver.concurrency = 1 29 | 30 | WaybackArchiver.max_limit = WaybackArchiver::DEFAULT_MAX_LIMIT 31 | end 32 | end 33 | -------------------------------------------------------------------------------- /lib/wayback_archiver/thread_pool.rb: -------------------------------------------------------------------------------- 1 | require 'concurrent' 2 | 3 | module WaybackArchiver 4 | # Thread pool 5 | class ThreadPool 6 | # Build a thread pool 7 | # @return [Concurrent::FixedThreadPool/Concurrent::ImmediateExecutor] an instance of a concurrent thread pool 8 | # @param [Integer] concurrency the desired concurrency 9 | # @example Build a thread pool with 10 as the desired concurrency 10 | # pool = ThreadPool.build(10) # Returns a Concurrent::FixedThreadPool 11 | # pool.post { some_work } 12 | # @example Build a thread pool with 1 as the desired concurrency 13 | # pool = ThreadPool.build(1) # Returns a Concurrent::ImmediateExecutor 14 | # pool.post { some_work } 15 | # @see https://github.com/ruby-concurrency/concurrent-ruby/blob/master/doc/thread_pools.md 16 | def self.build(concurrency) 17 | if concurrency == 1 18 | Concurrent::ImmediateExecutor.new 19 | elsif concurrency > 1 20 | Concurrent::FixedThreadPool.new(concurrency) 21 | else 22 | raise ArgumentError, 'concurrency must be one or greater' 23 | end 24 | end 25 | end 26 | end 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 Jacob Burenstam Linder 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /lib/wayback_archiver/adapters/wayback_machine.rb: -------------------------------------------------------------------------------- 1 | require 'wayback_archiver/archive_result' 2 | require 'wayback_archiver/request' 3 | 4 | module WaybackArchiver 5 | # WaybackMachine adapter 6 | class WaybackMachine 7 | # Wayback Machine base URL. 8 | BASE_URL = 'https://web.archive.org/save/'.freeze 9 | 10 | # Send URL to Wayback Machine.
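# Request failures are rescued below and returned as an errored ArchiveResult
# instead of being raised, so a batch run can continue past a bad URL.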
11 | # @return [ArchiveResult] the sent URL. 12 | # @param [String] url to send. 13 | # @example Archive example.com, with default options 14 | # WaybackMachine.call('http://example.com') 15 | def self.call(url) 16 | request_url = "#{BASE_URL}#{url&.strip}" 17 | response = Request.get(request_url, follow_redirects: false) 18 | WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}" 19 | ArchiveResult.new( 20 | url, 21 | code: response.code, 22 | request_url: response.uri, 23 | response_error: response.error 24 | ) 25 | rescue Request::Error => e 26 | WaybackArchiver.logger.error "Failed to archive #{url}: #{e.class}, #{e.message}" 27 | ArchiveResult.new(url, error: e) 28 | end 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | 3 | ## HEAD 4 | 5 | ## v1.5.0 6 | 7 | - Strip URLs found in Sitemaps 8 | - Inline `robots` dependency, closes [#51](https://github.com/buren/wayback_archiver/issues/51) 9 | - Update Sitemap XML parsing to work better with newer versions of REXML 10 | - Fix issue calling `Spidr` with option hash (i.e. use the double splat operator) 11 | 12 | ## v1.4.0 13 | 14 | * Don't respect robots.txt file by default, [PR#41](https://github.com/buren/wayback_archiver/pull/41) 15 | * Add `WaybackArchiver::respect_robots_txt=` configuration option, to control whether to respect robots.txt file or not 16 | * Update `spidr` gem, resolves [issue#25](https://github.com/buren/wayback_archiver/issues/25) 17 | * Set default concurrency to `1` due to harsher rate limiting on Wayback Machine 18 | * Support for crawling multiple hosts, for example www.example.com, example.com and app.example.com [PR#27](https://github.com/buren/wayback_archiver/pull/27) 19 | 20 | ## v1.3.0 21 | 22 | * Archive every page found, not only HTML pages - [#24](https://github.com/buren/wayback_archiver/pull/24) thanks [@chlorophyll-zz](https://github.com/chlorophyll-zz). 23 | 24 | ## v1.2.1 25 | 26 | * Track which URLs have been visited in sitemapper and don't visit them twice 27 | * Protect against sitemap index duplicates 28 | 29 | ## v1.2.0 30 | 31 | Is history...
32 | -------------------------------------------------------------------------------- /lib/wayback_archiver/http_code.rb: -------------------------------------------------------------------------------- 1 | module WaybackArchiver 2 | # Convenience class for HTTP response codes 3 | class HTTPCode 4 | # Type of code as symbol 5 | # @return [Symbol] code type 6 | # @param [String/Integer] code the response code 7 | # @example 8 | # HTTPCode.type('200') 9 | def self.type(code) 10 | code = code.to_s 11 | return :success if success?(code) 12 | return :redirect if redirect?(code) 13 | return :error if error?(code) 14 | 15 | :unknown 16 | end 17 | 18 | # Whether the code is a success type 19 | # @return [Boolean] is success or not 20 | # @param [String] code the response code 21 | # @example 22 | # HTTPCode.success?('200') # => true 23 | # @example 24 | # HTTPCode.success?(200) # => true 25 | # @example 26 | # HTTPCode.success?(nil) # => false 27 | def self.success?(code) 28 | !!code.to_s.match(/2\d\d/) 29 | end 30 | 31 | # Whether the code is a redirect type 32 | # @return [Boolean] is redirect or not 33 | # @param [String] code the response code 34 | # @example 35 | # HTTPCode.redirect?('301') 36 | def self.redirect?(code) 37 | !!code.to_s.match(/3\d\d/) 38 | end 39 | 40 | # Whether the code is an error type 41 | # @return [Boolean] is error or not 42 | # @param [String] code the response code 43 | # @example 44 | # HTTPCode.error?('500') 45 | def self.error?(code) 46 | !!code.to_s.match(/4\d\d/) || !!code.to_s.match(/5\d\d/) 47 | end 48 | end 49 | end 50 | -------------------------------------------------------------------------------- /wayback_archiver.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | lib = File.expand_path('../lib', __FILE__) 4 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 5 | require 'wayback_archiver/version' 6 | 7 | Gem::Specification.new do |spec| 8 | spec.name = 'wayback_archiver' 9 | spec.version = WaybackArchiver::VERSION 10 | spec.authors = ['Jacob Burenstam'] 11 | spec.email = ['burenstam@gmail.com'] 12 | 13 | spec.summary = 'Post URLs to Wayback Machine (Internet Archive)' 14 | spec.description = 'Post URLs to Wayback Machine (Internet Archive), using a crawler, from Sitemap(s) or a list of URLs.'
15 | spec.homepage = 'https://github.com/buren/wayback_archiver' 16 | spec.license = 'MIT' 17 | 18 | spec.files = Dir.glob('{bin,lib}/**/*') 19 | spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } 20 | spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) 21 | spec.require_paths = ['lib'] 22 | 23 | spec.required_ruby_version = '>= 2.0.0' 24 | 25 | spec.add_runtime_dependency 'spidr', '~> 0.7.1' # Crawl sites 26 | spec.add_runtime_dependency 'concurrent-ruby', '~> 1.3' # Concurrency primitives 27 | spec.add_runtime_dependency 'rexml', '~> 3.3.9' 28 | 29 | spec.add_development_dependency 'bundler', '~> 2.1' 30 | spec.add_development_dependency 'rake', '~> 12.3' 31 | spec.add_development_dependency 'rspec', '~> 3.1' 32 | spec.add_development_dependency 'yard', '~> 0.9' 33 | spec.add_development_dependency 'simplecov', '~> 0.14.1' 34 | spec.add_development_dependency 'coveralls', '~> 0.8' 35 | spec.add_development_dependency 'redcarpet', '~> 3.2' 36 | spec.add_development_dependency 'webmock', '~> 3.0' 37 | spec.add_development_dependency 'byebug', '~> 11.1.3' 38 | end 39 | -------------------------------------------------------------------------------- /spec/wayback_archiver/http_code_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe WaybackArchiver::HTTPCode do 4 | describe '::type' do 5 | [ 6 | # argument, expected 7 | [200, :success], 8 | ['200', :success], 9 | ['301', :redirect], 10 | ['302', :redirect], 11 | ['400', :error], 12 | ['404', :error], 13 | ['500', :error], 14 | ['503', :error], 15 | ['999', :unknown] 16 | ].each do |data| 17 | code, expected = data 18 | 19 | it "returns #{expected} for #{code} code" do 20 | expect(described_class.type(code)).to eq(expected) 21 | end 22 | end 23 | end 24 | 25 | describe '::success?' do 26 | it 'returns true when code is success' do 27 | code = '200' 28 | expect(described_class.success?(code)).to eq(true) 29 | end 30 | 31 | it 'returns false when code is not success' do 32 | code = '300' 33 | expect(described_class.success?(code)).to eq(false) 34 | end 35 | end 36 | 37 | describe '::error?' do 38 | it 'returns true when code is 400 error' do 39 | code = '400' 40 | expect(described_class.error?(code)).to eq(true) 41 | end 42 | 43 | it 'returns true when code is 500 error' do 44 | code = '500' 45 | expect(described_class.error?(code)).to eq(true) 46 | end 47 | 48 | it 'returns false when code is not error' do 49 | code = '200' 50 | expect(described_class.error?(code)).to eq(false) 51 | end 52 | end 53 | 54 | describe '::redirect?'
do 55 | it 'returns true when code is redirect' do 56 | code = '300' 57 | expect(described_class.redirect?(code)).to eq(true) 58 | end 59 | 60 | it 'returns false when code is not redirect' do 61 | code = '200' 62 | expect(described_class.redirect?(code)).to eq(false) 63 | end 64 | end 65 | end 66 | -------------------------------------------------------------------------------- /spec/wayback_archiver/url_collector_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe WaybackArchiver::URLCollector do 4 | describe '::sitemap' do 5 | it 'calls Sitemapper::urls' do 6 | expected = %w[http://example.com] 7 | allow(WaybackArchiver::Sitemapper).to receive(:urls).and_return(expected) 8 | expect(described_class.sitemap('http://example.com')).to eq(expected) 9 | end 10 | end 11 | 12 | describe '::crawl' do 13 | let(:headers) do 14 | { 15 | 'Accept' => '*/*', 16 | 'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3', 17 | 'User-Agent' => WaybackArchiver.user_agent 18 | } 19 | end 20 | 21 | it 'can crawl' do 22 | html_page = <<-HTML 23 | <html> 24 | <head> 25 | <title>Testing</title> 26 | </head> 27 | <body> 28 | <a href="/found">An URL</a> 29 | </body> 30 | </html> 31 | HTML 32 | 33 | response_headers = { 'Content-Type' => 'text/html; charset=utf-8' } 34 | 35 | stub_request(:get, 'http://example.com/robots.txt') 36 | .with(headers: headers) 37 | .to_return(status: 200, body: '', headers: {}) 38 | 39 | stub_request(:get, 'http://example.com/') 40 | .with(headers: headers) 41 | .to_return(status: 200, body: html_page, headers: response_headers) 42 | 43 | stub_request(:get, 'http://example.com/found') 44 | .with(headers: headers) 45 | .to_return(status: 200, body: '', headers: response_headers) 46 | 47 | expected_urls = %w[http://example.com http://example.com/found] 48 | expected_urls_dup = expected_urls.dup 49 | found_urls = described_class.crawl('http://example.com') do |url| 50 | expect(url).to eq(expected_urls.shift) 51 | end 52 | 53 | expect(found_urls).to eq(expected_urls_dup) 54 | end 55 | end 56 | end 57 | -------------------------------------------------------------------------------- /spec/wayback_archiver/adapters/wayback_machine_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe WaybackArchiver::WaybackMachine do 4 | let(:headers) do 5 | { 6 | 'Accept' => '*/*', 7 | 'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3', 8 | 'User-Agent' => WaybackArchiver.user_agent 9 | } 10 | end 11 | 12 | describe '::call' do 13 | it 'posts URL to the Wayback Machine' do 14 | url = 'https://example.com' 15 | expected_request_url = "https://web.archive.org/save/#{url}" 16 | 17 | stub_request(:get, expected_request_url) 18 | .with(headers: headers) 19 | .to_return(status: 301, body: 'buren', headers: {}) 20 | 21 | result = described_class.call(url) 22 | 23 | expect(result.uri).to eq(url) 24 | expect(result.code).to eq('301') 25 | expect(WaybackArchiver.logger.debug_log.first).to include(expected_request_url) 26 | expect(WaybackArchiver.logger.info_log.last).to include(url) 27 | end 28 | 29 | it 'rescues and logs Request::MaxRedirectError' do 30 | allow(WaybackArchiver::Request).to receive(:get) 31 | .and_raise(WaybackArchiver::Request::MaxRedirectError, 'too many redirects') 32 | 33 | url = 'https://example.com' 34 | expected_request_url = "https://web.archive.org/save/#{url}" 35 | 36 | stub_request(:get, expected_request_url) 37 | .with(headers: headers) 38 | .to_return(status: 301, body:
'buren', headers: {}) 39 | 40 | result = described_class.call(url) 41 | 42 | expect(result.uri).to eq(url) 43 | expect(result.response_error).to be_nil 44 | expect(result.request_url).to be_nil 45 | expect(result.error).to be_a(WaybackArchiver::Request::MaxRedirectError) 46 | 47 | last_error_log = WaybackArchiver.logger.error_log.last 48 | expect(last_error_log).to include(url) 49 | expect(last_error_log).to include('MaxRedirectError') 50 | expect(last_error_log).to include('too many redirects') 51 | end 52 | end 53 | end 54 | -------------------------------------------------------------------------------- /lib/wayback_archiver/url_collector.rb: -------------------------------------------------------------------------------- 1 | require 'spidr' 2 | require 'robots' 3 | 4 | require 'wayback_archiver/sitemapper' 5 | require 'wayback_archiver/request' 6 | 7 | module WaybackArchiver 8 | # Retrieve URLs from different sources 9 | class URLCollector 10 | # Retrieve URLs from Sitemap. 11 | # @return [Array] of URLs defined in Sitemap. 12 | # @param [String] url domain to retrieve Sitemap from. 13 | # @example Get URLs defined in Sitemap for google.com 14 | # URLCollector.sitemap('https://google.com/sitemap.xml') 15 | def self.sitemap(url) 16 | Sitemapper.urls(url: Request.build_uri(url)) 17 | end 18 | 19 | # Retrieve URLs by crawling. 20 | # @return [Array] of URLs found during crawl. 21 | # @param [String] url domain to crawl URLs from. 22 | # @param [Array] hosts to crawl. 23 | # @example Crawl URLs defined on example.com 24 | # URLCollector.crawl('http://example.com') 25 | # @example Crawl URLs defined on example.com and limit the number of visited pages to 100 26 | # URLCollector.crawl('http://example.com', limit: 100) 27 | # @example Crawl URLs defined on example.com and explicitly set no upper limit on the number of visited pages 28 | # URLCollector.crawl('http://example.com', limit: -1) 29 | # @example Crawl multiple hosts 30 | # URLCollector.crawl( 31 | # 'http://example.com', 32 | # hosts: [ 33 | # 'example.com', 34 | # /host[\d]+\.example\.com/ 35 | # ] 36 | # ) 37 | def self.crawl(url, hosts: [], limit: WaybackArchiver.max_limit) 38 | urls = [] 39 | start_at_url = Request.build_uri(url).to_s 40 | options = { 41 | robots: WaybackArchiver.respect_robots_txt, 42 | hosts: hosts, 43 | user_agent: WaybackArchiver.user_agent 44 | } 45 | options[:limit] = limit unless limit == -1 46 | 47 | Spidr.site(start_at_url, **options) do |spider| 48 | spider.every_page do |page| 49 | page_url = page.url.to_s 50 | urls << page_url 51 | WaybackArchiver.logger.debug "Found: #{page_url}" 52 | yield(page_url) if block_given?
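# Yielding each URL as Spidr finds it lets callers (e.g. Archive.crawl)
# start posting pages to the Wayback Machine while the crawl is still running.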
53 | end 54 | end 55 | urls 56 | end 57 | end 58 | end 59 | -------------------------------------------------------------------------------- /bin/wayback_archiver: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'optparse' 4 | require 'wayback_archiver' 5 | 6 | # Default values 7 | urls = nil 8 | strategy = 'auto' 9 | log = STDOUT 10 | log_level = Logger::INFO 11 | concurrency = WaybackArchiver.concurrency 12 | limit = WaybackArchiver.max_limit 13 | hosts = [] 14 | 15 | optparse = OptionParser.new do |parser| 16 | parser.banner = 'Usage: wayback_archiver [<url>] [options]' 17 | 18 | parser.on('--auto', 'Auto (default)') do |value| 19 | strategy = 'auto' 20 | end 21 | 22 | parser.on('--crawl', 'Crawl') do |value| 23 | strategy = 'crawl' 24 | end 25 | 26 | parser.on('--sitemap', 'Sitemap') do |value| 27 | strategy = 'sitemap' 28 | end 29 | 30 | parser.on('--urls', '--url', 'URL(s)') do |value| 31 | strategy = 'urls' 32 | end 33 | 34 | parser.on('--hosts=[example.com]', Array, 'Only spider links on certain hosts') do |value| 35 | hosts = value.map { |v| Regexp.new(v) } if value 36 | end 37 | 38 | parser.on('--concurrency=1', Integer, 'Concurrency') do |value| 39 | concurrency = value 40 | end 41 | 42 | parser.on('--limit=5', Integer, 'Max number of URLs to archive') do |value| 43 | limit = value 44 | end 45 | 46 | parser.on('--log=output.log', String, 'Path to desired log file (if no argument is given it defaults to STDOUT)') do |path| 47 | log = path 48 | end 49 | 50 | parser.on('--[no-]verbose', 'Verbose logs') do |value| 51 | log_level = value ? Logger::DEBUG : Logger::WARN 52 | end 53 | 54 | parser.on('-h', '--help', 'How to use') do 55 | puts parser 56 | exit 57 | end 58 | 59 | # No argument, shows at tail. This will print an options summary. 60 | parser.on_tail('-h', '--help', 'Show this message') do 61 | puts parser 62 | exit 63 | end 64 | 65 | parser.on_tail('--version', 'Show version') do 66 | puts "WaybackArchiver version #{WaybackArchiver::VERSION}" 67 | exit 68 | end 69 | end 70 | 71 | optparse.parse! 72 | 73 | urls = ARGV.map(&:strip).reject(&:empty?) 74 | if urls.empty? 75 | puts optparse.help 76 | raise ArgumentError, "[<url>] is required" 77 | end 78 | 79 | WaybackArchiver.logger = Logger.new(log).tap do |logger| 80 | logger.progname = 'WaybackArchiver' 81 | logger.level = log_level 82 | end 83 | 84 | # If no strategy has explicitly been given, then default to 'auto' 85 | strategy ||= 'auto' 86 | urls.each do |url| 87 | WaybackArchiver.archive( 88 | url, 89 | hosts: hosts, 90 | strategy: strategy, 91 | concurrency: concurrency, 92 | limit: limit 93 | ) 94 | end 95 | -------------------------------------------------------------------------------- /lib/wayback_archiver/sitemap.rb: -------------------------------------------------------------------------------- 1 | require 'uri' 2 | require 'rexml/document' 3 | 4 | module WaybackArchiver 5 | # Parse Sitemaps, https://www.sitemaps.org 6 | class Sitemap 7 | attr_reader :document 8 | 9 | def initialize(xml_or_string, strict: false) 10 | @contents = xml_or_string 11 | @document = REXML::Document.new(xml_or_string) 12 | rescue REXML::ParseException => _e 13 | raise if strict 14 | 15 | @document = REXML::Document.new('') 16 | end 17 | 18 | # Return all URLs defined in Sitemap. 19 | # @return [Array] of URLs defined in Sitemap.
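# Handles both XML sitemaps and plain text sitemaps (one URL per line).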
20 | # @example Get URLs defined in Sitemap 21 | # sitemap = Sitemap.new(xml) 22 | # sitemap.urls 23 | def urls 24 | @urls ||= extract_urls('url') 25 | end 26 | 27 | # Return all sitemap URLs defined in Sitemap. 28 | # @return [Array] of Sitemap URLs defined in Sitemap. 29 | # @example Get Sitemap URLs defined in Sitemap 30 | # sitemap = Sitemap.new(xml) 31 | # sitemap.sitemaps 32 | def sitemaps 33 | @sitemaps ||= extract_urls('sitemap') 34 | end 35 | 36 | # Check if sitemap is a plain file 37 | # @return [Boolean] whether document is plain 38 | def plain_document? 39 | document.elements.empty? 40 | end 41 | 42 | # Return the name of the document (if there is one) 43 | # @return [String] the document root name 44 | def root_name 45 | return unless document.root 46 | 47 | document.root.name 48 | end 49 | 50 | # Returns true if Sitemap is a Sitemap index 51 | # @return [Boolean] whether the Sitemap is a Sitemap index or not 52 | # @example Check if Sitemap is a sitemap index 53 | # sitemap = Sitemap.new(xml) 54 | # sitemap.sitemap_index? 55 | def sitemap_index? 56 | root_name == 'sitemapindex' 57 | end 58 | 59 | # Returns true if Sitemap lists regular URLs 60 | # @return [Boolean] whether the Sitemap is a regular URL list 61 | # @example Check if Sitemap is a regular URL list 62 | # sitemap = Sitemap.new(xml) 63 | # sitemap.urlset? 64 | def urlset? 65 | root_name == 'urlset' 66 | end 67 | 68 | private 69 | 70 | def valid_url?(url) 71 | uri = URI.parse(url) 72 | uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS) 73 | rescue URI::InvalidURIError 74 | false 75 | end 76 | 77 | # Extract URLs from Sitemap 78 | def extract_urls(node_name) 79 | if plain_document? 80 | return @contents.to_s 81 | .each_line.map(&:strip) 82 | .select(&method(:valid_url?)) 83 | end 84 | 85 | urls = [] 86 | document.root.elements.each("#{node_name}/loc") do |element| 87 | urls << element.text 88 | end 89 | urls 90 | end 91 | end 92 | end 93 | -------------------------------------------------------------------------------- /spec/wayback_archiver/archive_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe WaybackArchiver::Archive do 4 | let(:headers) do 5 | { 6 | 'Accept' => '*/*', 7 | 'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3', 8 | 'User-Agent' => WaybackArchiver.user_agent 9 | } 10 | end 11 | 12 | describe '::post' do 13 | it 'calls ::post_url for each URL' do 14 | allow(described_class).to receive(:post_url).and_return(WaybackArchiver::ArchiveResult.new(nil)) 15 | 16 | result = described_class.post(%w[https://example.com https://example.com/path]) 17 | 18 | expect(described_class).to have_received(:post_url).twice 19 | end 20 | 21 | it 'calls ::post_url for each URL with support for a max limit' do 22 | allow(described_class).to receive(:post_url).and_return(WaybackArchiver::ArchiveResult.new(nil)) 23 | 24 | result = described_class.post(%w[https://example.com https://example.com/path], limit: 1) 25 | 26 | expect(described_class).to have_received(:post_url).once 27 | end 28 | end 29 | 30 | describe '::crawl' do 31 | it 'calls URLCollector::crawl and ::post_url' do 32 | url = 'https://example.com' 33 | 34 | allow(WaybackArchiver::URLCollector).to receive(:crawl) 35 | .and_yield(url) 36 | .and_return([url]) 37 | 38 | allow(described_class).to receive(:post_url).and_return(WaybackArchiver::ArchiveResult.new(url)) 39 | 40 | expect(described_class.crawl(url)[0].uri).to eq(url) 41 | end 42 | end 43 | 44 | describe
'::post_url' do 45 | it 'posts URL to the Wayback Machine' do 46 | url = 'https://example.com' 47 | expected_request_url = "https://web.archive.org/save/#{url}" 48 | 49 | stub_request(:get, expected_request_url) 50 | .with(headers: headers) 51 | .to_return(status: 301, body: 'buren', headers: {}) 52 | 53 | result = described_class.post_url(url) 54 | 55 | expect(result.uri).to eq(url) 56 | expect(result.code).to eq('301') 57 | expect(WaybackArchiver.logger.debug_log.first).to include(expected_request_url) 58 | expect(WaybackArchiver.logger.info_log.last).to include(url) 59 | end 60 | 61 | it 'rescues and logs Request::MaxRedirectError' do 62 | allow(WaybackArchiver::Request).to receive(:get) 63 | .and_raise(WaybackArchiver::Request::MaxRedirectError, 'too many redirects') 64 | 65 | url = 'https://example.com' 66 | expected_request_url = "https://web.archive.org/save/#{url}" 67 | 68 | stub_request(:get, expected_request_url) 69 | .with(headers: headers) 70 | .to_return(status: 301, body: 'buren', headers: {}) 71 | 72 | result = described_class.post_url(url) 73 | 74 | expect(result.uri).to eq(url) 75 | expect(result.response_error).to be_nil 76 | expect(result.request_url).to be_nil 77 | expect(result.error).to be_a(WaybackArchiver::Request::MaxRedirectError) 78 | 79 | last_error_log = WaybackArchiver.logger.error_log.last 80 | expect(last_error_log).to include(url) 81 | expect(last_error_log).to include('MaxRedirectError') 82 | expect(last_error_log).to include('too many redirects') 83 | end 84 | end 85 | end 86 | -------------------------------------------------------------------------------- /lib/wayback_archiver/sitemapper.rb: -------------------------------------------------------------------------------- 1 | require 'set' 2 | require 'robots' 3 | 4 | require 'wayback_archiver/sitemap' 5 | require 'wayback_archiver/request' 6 | 7 | module WaybackArchiver 8 | # Fetch and parse sitemaps recursively 9 | class Sitemapper 10 | # Common locations for Sitemap(s) 11 | COMMON_SITEMAP_LOCATIONS = %w[ 12 | sitemap_index.xml.gz 13 | sitemap-index.xml.gz 14 | sitemap_index.xml 15 | sitemap-index.xml 16 | sitemap.xml.gz 17 | sitemap.xml 18 | ].freeze 19 | 20 | # Autodiscover the location of the Sitemap, then fetch and parse recursively. 21 | # First it tries /robots.txt, then common locations for Sitemap and finally the supplied URL. 22 | # @return [Array] of URLs defined in Sitemap(s). 23 | # @param [URI] url to domain. 24 | # @example Get URLs defined in Sitemap for google.com 25 | # Sitemapper.autodiscover('https://google.com/') 26 | # @see http://www.sitemaps.org 27 | def self.autodiscover(url) 28 | WaybackArchiver.logger.info 'Looking for Sitemap(s) in /robots.txt' 29 | robots = Robots.new(WaybackArchiver.user_agent) 30 | sitemaps = robots.other_values(url)['Sitemap'] 31 | 32 | if sitemaps 33 | return sitemaps.flat_map do |sitemap| 34 | WaybackArchiver.logger.info "Fetching Sitemap at #{sitemap}" 35 | urls(url: sitemap) 36 | end 37 | end 38 | 39 | COMMON_SITEMAP_LOCATIONS.each do |path| 40 | WaybackArchiver.logger.info "Looking for Sitemap at #{path}" 41 | sitemap_url = [url, path].join(url.end_with?('/') ? '' : '/') 42 | response = Request.get(sitemap_url, raise_on_http_error: false) 43 | 44 | if response.success?
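# A well-known path responded 2XX: parse the body we already fetched
# instead of issuing a second request for the same sitemap.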
45 | WaybackArchiver.logger.info "Sitemap found at #{sitemap_url}" 46 | return urls(xml: response.body) 47 | end 48 | end 49 | 50 | WaybackArchiver.logger.info "Looking for Sitemap at #{url}" 51 | urls(url: url) 52 | rescue Request::Error => e 53 | WaybackArchiver.logger.error "Error raised when requesting #{url}, #{e.class}, #{e.message}" 54 | [] 55 | end 56 | 57 | # Fetch and parse sitemaps recursively. 58 | # @return [Array] of URLs defined in Sitemap(s). 59 | # @param url [String] URL to Sitemap. 60 | # @param xml [String] Sitemap XML. 61 | # @example Get URLs defined in Sitemap for google.com 62 | # Sitemapper.urls(url: 'https://google.com/sitemap.xml') 63 | # @example Get URLs defined in Sitemap 64 | # Sitemapper.urls(xml: xml) 65 | # @see http://www.sitemaps.org 66 | def self.urls(url: nil, xml: nil, visited: Set.new) 67 | if visited.include?(url) 68 | WaybackArchiver.logger.debug "Already visited #{url} skipping.." 69 | return [] 70 | end 71 | 72 | visited << url if url 73 | 74 | xml = Request.get(url).body unless xml 75 | sitemap = Sitemap.new(xml) 76 | 77 | if sitemap.sitemap_index? 78 | sitemap.sitemaps.flat_map do |sitemap_url| 79 | urls(url: sitemap_url, visited: visited) 80 | end 81 | else 82 | sitemap.urls.map { |url| url&.strip } 83 | end 84 | rescue Request::Error => e 85 | WaybackArchiver.logger.error "Error raised when requesting #{url}, #{e.class}, #{e.message}" 86 | 87 | [] 88 | end 89 | end 90 | end 91 | -------------------------------------------------------------------------------- /spec/wayback_archiver/sitemap_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe WaybackArchiver::Sitemap do 4 | describe '#new' do 5 | it 'raises error REXML::ParseException when strict mode is true' do 6 | expect do 7 | described_class.new('<invalid', strict: true) 8 | end.to raise_error(REXML::ParseException) 9 | end 10 | 11 | it 'swallows XML errors when strict mode is false' do 12 | sitemap = described_class.new('<invalid') 13 | expect(sitemap.urls).to be_empty 14 | end 15 | end 16 | 17 | describe '#urls' do 18 | it 'returns URLs in XML sitemap' do 19 | sitemap = described_class.new(File.read('spec/data/sitemap.xml')) 20 | expect(sitemap.urls).to eq(%w[http://www.example.com/]) 21 | end 22 | 23 | it 'returns URLs in plain text sitemap' do 24 | file = "http://www.example.com/\nhttp://www.example.com/path" 25 | sitemap = described_class.new(file) 26 | expected = %w[ 27 | http://www.example.com/ 28 | http://www.example.com/path 29 | ] 30 | expect(sitemap.urls).to eq(expected) 31 | end 32 | 33 | it 'returns empty array when passed empty document' do 34 | sitemap = described_class.new('') 35 | expect(sitemap.urls).to be_empty 36 | end 37 | end 38 | 39 | describe '#sitemaps' do 40 | it 'returns sitemap URLs in sitemap' do 41 | sitemap = described_class.new(File.read('spec/data/sitemap_index.xml')) 42 | expected = %w[ 43 | http://www.example.com/sitemap1.xml.gz 44 | http://www.example.com/sitemap2.xml.gz 45 | ] 46 | expect(sitemap.sitemaps).to eq(expected) 47 | end 48 | 49 | it 'returns empty array when passed empty document' do 50 | sitemap = described_class.new('') 51 | expect(sitemap.sitemaps).to be_empty 52 | end 53 | end 54 | 55 | describe '#plain_document?'
do 56 | it 'returns true when passed non-XML document' do 57 | sitemap = described_class.new('') 58 | expect(sitemap.plain_document?).to eq(true) 59 | end 60 | 61 | it 'returns false when passed XML document' do 62 | sitemap = described_class.new('<buren/>') 63 | expect(sitemap.plain_document?).to eq(false) 64 | end 65 | end 66 | 67 | describe '#root_name' do 68 | it 'returns nil when passed non-XML document' do 69 | sitemap = described_class.new('') 70 | expect(sitemap.root_name).to be_nil 71 | end 72 | 73 | it 'returns root name when passed XML document' do 74 | sitemap = described_class.new('<buren/>') 75 | expect(sitemap.root_name).to eq('buren') 76 | end 77 | end 78 | 79 | describe '#sitemap_index?' do 80 | it 'returns true if document is a sitemap index' do 81 | sitemap = described_class.new(File.read('spec/data/sitemap_index.xml')) 82 | expect(sitemap.sitemap_index?).to eq(true) 83 | end 84 | 85 | it 'returns false if document is a sitemap' do 86 | sitemap = described_class.new(File.read('spec/data/sitemap.xml')) 87 | expect(sitemap.sitemap_index?).to eq(false) 88 | end 89 | end 90 | 91 | describe '#urlset?' do 92 | it 'returns true if document is a sitemap' do 93 | sitemap = described_class.new(File.read('spec/data/sitemap.xml')) 94 | expect(sitemap.urlset?).to eq(true) 95 | end 96 | 97 | it 'returns false if document is a sitemap index' do 98 | sitemap = described_class.new(File.read('spec/data/sitemap_index.xml')) 99 | expect(sitemap.urlset?).to eq(false) 100 | end 101 | end 102 | end 103 | -------------------------------------------------------------------------------- /lib/wayback_archiver/archive.rb: -------------------------------------------------------------------------------- 1 | require 'concurrent' 2 | 3 | require 'wayback_archiver/thread_pool' 4 | require 'wayback_archiver/adapters/wayback_machine' 5 | 6 | module WaybackArchiver 7 | # Post URL(s) to Wayback Machine 8 | class Archive 9 | # Send URLs to Wayback Machine. 10 | # @return [Array] with sent URLs. 11 | # @param [Array] urls to send to the Wayback Machine. 12 | # @param concurrency [Integer] the default is 1 13 | # @yield [archive_result] If a block is given, each result will be yielded 14 | # @yieldparam [ArchiveResult] archive_result 15 | # @example Archive URLs, asynchronously 16 | # Archive.post(['http://example.com']) 17 | # Archive.post(['http://example.com']) do |result| 18 | # puts [result.code || 'error', result.uri] # print response status and URL 19 | # end 20 | # @example Archive URLs, using only 1 thread 21 | # Archive.post(['http://example.com'], concurrency: 1) 22 | # @example Stop after archiving 100 links 23 | # Archive.post(['http://example.com'], limit: 100) 24 | # @example Explicitly set no limit on how many links are posted 25 | # Archive.post(['http://example.com'], limit: -1) 26 | def self.post(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit) 27 | WaybackArchiver.logger.info "Total URLs to be sent: #{urls.length}" 28 | WaybackArchiver.logger.info "Requests are sent with up to #{concurrency} parallel threads" 29 | 30 | urls_queue = if limit == -1 31 | urls 32 | else 33 | urls[0...limit] 34 | end 35 | 36 | posted_urls = Concurrent::Array.new 37 | pool = ThreadPool.build(concurrency) 38 | 39 | urls_queue.each do |url| 40 | pool.post do 41 | result = post_url(url) 42 | yield(result) if block_given? 43 | posted_urls << result unless result.errored?
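# (posted_urls is a Concurrent::Array, so appends from pool threads are safe)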
44 | end 45 | end 46 | 47 | pool.shutdown 48 | pool.wait_for_termination 49 | 50 | WaybackArchiver.logger.info "#{posted_urls.length} URL(s) posted to Wayback Machine" 51 | posted_urls 52 | end 53 | 54 | # Send URLs to Wayback Machine by crawling the site. 55 | # @return [Array] with URLs sent to the Wayback Machine. 56 | # @param [String] source URL to crawl. 57 | # @param concurrency [Integer] the default is 1 58 | # @param [Array] hosts to crawl 59 | # @yield [archive_result] If a block is given, each result will be yielded 60 | # @yieldparam [ArchiveResult] archive_result 61 | # @example Crawl example.com and send all URLs of the same domain 62 | # Archive.crawl('example.com') 63 | # Archive.crawl('example.com') do |result| 64 | # puts [result.code || 'error', result.uri] # print response status and URL 65 | # end 66 | # @example Crawl example.com and send all URLs of the same domain with low concurrency 67 | # Archive.crawl('example.com', concurrency: 1) 68 | # @example Stop after archiving 100 links 69 | # Archive.crawl('example.com', limit: 100) 70 | # @example Crawl multiple hosts 71 | # Archive.crawl( 72 | # 'http://example.com', 73 | # hosts: [ 74 | # 'example.com', 75 | # /host[\d]+\.example\.com/ 76 | # ] 77 | # ) 78 | def self.crawl(source, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit) 79 | WaybackArchiver.logger.info "Requests are sent with up to #{concurrency} parallel threads" 80 | 81 | posted_urls = Concurrent::Array.new 82 | pool = ThreadPool.build(concurrency) 83 | 84 | found_urls = URLCollector.crawl(source, hosts: hosts, limit: limit) do |url| 85 | pool.post do 86 | result = post_url(url) 87 | yield(result) if block_given? 88 | posted_urls << result unless result.errored? 89 | end 90 | end 91 | WaybackArchiver.logger.info "Crawling of #{source} finished, found #{found_urls.length} URL(s)" 92 | pool.shutdown 93 | pool.wait_for_termination 94 | 95 | WaybackArchiver.logger.info "#{posted_urls.length} URL(s) posted to Wayback Machine" 96 | posted_urls 97 | end 98 | 99 | # Send URL to Wayback Machine. 100 | # @return [ArchiveResult] the sent URL. 101 | # @param [String] url to send. 102 | # @example Archive example.com, with default options 103 | # Archive.post_url('http://example.com') 104 | def self.post_url(url) 105 | WaybackArchiver.adapter.call(url) 106 | end 107 | end 108 | end 109 | -------------------------------------------------------------------------------- /lib/robots.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2008 Kyle Maxwell, contributors 3 | # 4 | # Permission is hereby granted, free of charge, to any person 5 | # obtaining a copy of this software and associated documentation 6 | # files (the "Software"), to deal in the Software without 7 | # restriction, including without limitation the rights to use, 8 | # copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the 10 | # Software is furnished to do so, subject to the following 11 | # conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be 14 | # included in all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 18 | # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | # NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 20 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 21 | # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 22 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 23 | # OTHER DEALINGS IN THE SOFTWARE. 24 | # 25 | 26 | require "open-uri" 27 | require "uri" 28 | require "rubygems" 29 | require "timeout" 30 | 31 | class Robots 32 | 33 | DEFAULT_TIMEOUT = 3 34 | 35 | class ParsedRobots 36 | 37 | def initialize(uri, user_agent) 38 | @last_accessed = Time.at(1) 39 | 40 | io = Robots.get_robots_txt(uri, user_agent) 41 | 42 | if !io || io.content_type != "text/plain" || io.status != ["200", "OK"] 43 | io = StringIO.new("User-agent: *\nAllow: /\n") 44 | end 45 | 46 | @other = {} 47 | @disallows = {} 48 | @allows = {} 49 | @delays = {} # added delays to make it work 50 | agent = /.*/ 51 | io.each do |line| 52 | next if line =~ /^\s*(#.*|$)/ 53 | arr = line.split(":") 54 | key = arr.shift 55 | value = arr.join(":").strip 56 | value.strip! 57 | case key 58 | when "User-agent" 59 | agent = to_regex(value) 60 | when "Allow" 61 | @allows[agent] ||= [] 62 | @allows[agent] << to_regex(value) 63 | when "Disallow" 64 | @disallows[agent] ||= [] 65 | @disallows[agent] << to_regex(value) 66 | when "Crawl-delay" 67 | @delays[agent] = value.to_i 68 | else 69 | @other[key] ||= [] 70 | @other[key] << value 71 | end 72 | end 73 | 74 | @parsed = true 75 | end 76 | 77 | def allowed?(uri, user_agent) 78 | return true unless @parsed 79 | allowed = true 80 | path = uri.request_uri 81 | 82 | @disallows.each do |key, value| 83 | if user_agent =~ key 84 | value.each do |rule| 85 | if path =~ rule 86 | allowed = false 87 | end 88 | end 89 | end 90 | end 91 | 92 | @allows.each do |key, value| 93 | unless allowed 94 | if user_agent =~ key 95 | value.each do |rule| 96 | if path =~ rule 97 | allowed = true 98 | end 99 | end 100 | end 101 | end 102 | end 103 | 104 | if allowed && @delays[user_agent] 105 | sleep @delays[user_agent] - (Time.now - @last_accessed) 106 | @last_accessed = Time.now 107 | end 108 | 109 | return allowed 110 | end 111 | 112 | def other_values 113 | @other 114 | end 115 | 116 | protected 117 | 118 | def to_regex(pattern) 119 | return /should-not-match-anything-123456789/ if pattern.strip.empty? 
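# robots.txt rules are simple globs: escape regex metacharacters, then
# restore '*' as '.*' and anchor the rule at the start of the request path.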
120 | pattern = Regexp.escape(pattern) 121 | pattern.gsub!(Regexp.escape("*"), ".*") 122 | Regexp.compile("^#{pattern}") 123 | end 124 | end 125 | 126 | def self.get_robots_txt(uri, user_agent) 127 | begin 128 | Timeout::timeout(Robots.timeout) do 129 | io = URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil 130 | end 131 | rescue Timeout::Error 132 | STDERR.puts "robots.txt request timed out" 133 | end 134 | end 135 | 136 | def self.timeout=(t) 137 | @timeout = t 138 | end 139 | 140 | def self.timeout 141 | @timeout || DEFAULT_TIMEOUT 142 | end 143 | 144 | def initialize(user_agent) 145 | @user_agent = user_agent 146 | @parsed = {} 147 | end 148 | 149 | def allowed?(uri) 150 | uri = URI.parse(uri.to_s) unless uri.is_a?(URI) 151 | host = uri.host 152 | @parsed[host] ||= ParsedRobots.new(uri, @user_agent) 153 | @parsed[host].allowed?(uri, @user_agent) 154 | end 155 | 156 | def other_values(uri) 157 | uri = URI.parse(uri.to_s) unless uri.is_a?(URI) 158 | host = uri.host 159 | @parsed[host] ||= ParsedRobots.new(uri, @user_agent) 160 | @parsed[host].other_values 161 | end 162 | end 163 | -------------------------------------------------------------------------------- /spec/wayback_archiver/sitemapper_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe WaybackArchiver::Sitemapper do 4 | let(:headers) do 5 | { 6 | 'Accept' => '*/*', 7 | 'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3', 8 | 'User-Agent' => WaybackArchiver.user_agent 9 | } 10 | end 11 | 12 | let(:robots_txt) { File.read('spec/data/robots.txt') } 13 | let(:sitemap_index_xml) { File.read('spec/data/sitemap_index.xml') } 14 | let(:sitemap_index_with_duplicate_url_xml) { File.read('spec/data/sitemap_index_with_duplicate_url.xml') } 15 | let(:sitemap_xml) { File.read('spec/data/sitemap.xml') } 16 | 17 | describe '::autodiscover' do 18 | context 'with found Sitemap location in robots.txt' do 19 | it 'fetches those Sitemap(s) and returns all present URLs' do 20 | # The robots gem doesn't play nice with the WebMock so we can't test this until 21 | # https://github.com/fizx/robots/pull/9 is merged. 22 | # Until then we're gonna use rspec-mocks 23 | # stub_request(:get, 'http://www.example.com/robots.txt'). 24 | # with(headers: headers). 
25 | # to_return(status: 200, body: robots_txt, headers: {}) 26 | allow_any_instance_of(Robots).to receive(:other_values).and_return('Sitemap' => %w[http://www.example.com/sitemap.xml]) 27 | 28 | stub_request(:get, 'http://www.example.com/sitemap.xml') 29 | .with(headers: headers) 30 | .to_return(status: 200, body: sitemap_xml, headers: {}) 31 | 32 | expect(described_class.autodiscover('http://www.example.com')).to eq(%w[http://www.example.com/]) 33 | end 34 | 35 | it 'returns empty list on request error' do 36 | allow_any_instance_of(Robots).to receive(:other_values).and_raise(WaybackArchiver::Request::Error) 37 | 38 | expect(described_class.autodiscover('http://www.example.com')).to be_empty 39 | end 40 | end 41 | 42 | context 'with found Sitemap location among common Sitemap locations' do 43 | it 'returns all present URLs if a Sitemap is found' do 44 | base_url = 'http://www.example.com' 45 | stub_request(:get, "#{base_url}/robots.txt") 46 | .with(headers: headers) 47 | .to_return(status: 200, body: robots_txt, headers: {}) 48 | 49 | sitemap_path = WaybackArchiver::Sitemapper::COMMON_SITEMAP_LOCATIONS.first 50 | 51 | stub_request(:get, [base_url, sitemap_path].join('/')) 52 | .with(headers: headers) 53 | .to_return(status: 200, body: sitemap_xml, headers: {}) 54 | 55 | expect(described_class.autodiscover('http://www.example.com')).to eq(%w[http://www.example.com/]) 56 | end 57 | end 58 | 59 | context 'at the provided URL' do 60 | it 'returns all present URLs if a Sitemap is found' do 61 | base_url = 'http://www.example.com' 62 | stub_request(:get, "#{base_url}/robots.txt") 63 | .with(headers: headers) 64 | .to_return(status: 200, body: robots_txt, headers: {}) 65 | 66 | WaybackArchiver::Sitemapper::COMMON_SITEMAP_LOCATIONS.each do |sitemap_path| 67 | stub_request(:get, [base_url, sitemap_path].join('/')) 68 | .with(headers: headers) 69 | .to_return(status: 404, body: '', headers: {}) 70 | end 71 | 72 | stub_request(:get, base_url) 73 | .with(headers: headers) 74 | .to_return(status: 200, body: sitemap_xml, headers: {}) 75 | 76 | expect(described_class.autodiscover(base_url)).to eq(%w[http://www.example.com/]) 77 | end 78 | end 79 | end 80 | 81 | describe '::urls' do 82 | it 'can start with xml argument' do 83 | expect(described_class.urls(xml: sitemap_xml)).to eq(%w[http://www.example.com/]) 84 | end 85 | 86 | it 'returns empty array if url already has been visited' do 87 | start_url = 'http://www.example.com/sitemap_index.xml' 88 | 89 | stub_request(:get, start_url) 90 | .with(headers: headers) 91 | .to_return(status: 200, body: sitemap_index_with_duplicate_url_xml, headers: {}) 92 | 93 | %w[http://www.example.com/sitemap1.xml.gz].each do |url| 94 | stub_request(:get, url) 95 | .with(headers: headers) 96 | .to_return(status: 200, body: sitemap_xml, headers: {}) 97 | end 98 | 99 | result = described_class.urls(url: start_url) 100 | expect(WaybackArchiver.logger.debug_log).to include("Already visited http://www.example.com/sitemap1.xml.gz skipping..") 101 | expect(result).to eq(%w[http://www.example.com/]) 102 | end 103 | 104 | context 'with url argument and returned sitemap index' do 105 | it 'follows the index and returns all URLs sitemap(s)' do 106 | start_url = 'http://www.example.com/sitemap_index.xml' 107 | 108 | stub_request(:get, start_url) 109 | .with(headers: headers) 110 | .to_return(status: 200, body: sitemap_index_xml, headers: {}) 111 | 112 | %w[http://www.example.com/sitemap1.xml.gz http://www.example.com/sitemap2.xml.gz].each do |url| 113 | stub_request(:get, url) 114 | 
.with(headers: headers) 115 | .to_return(status: 200, body: sitemap_xml, headers: {}) 116 | end 117 | 118 | result = described_class.urls(url: start_url) 119 | expect(result).to eq(%w[http://www.example.com/ http://www.example.com/]) 120 | end 121 | end 122 | 123 | context 'with url argument and returned sitemap' do 124 | it 'returns all URLs in sitemap' do 125 | stub_request(:get, 'http://www.example.com/sitemap.xml') 126 | .with(headers: headers) 127 | .to_return(status: 200, body: sitemap_xml, headers: {}) 128 | 129 | result = described_class.urls(url: 'http://www.example.com/sitemap.xml') 130 | expect(result).to eq(%w[http://www.example.com/]) 131 | end 132 | end 133 | 134 | it 'returns empty list on request error' do 135 | allow(WaybackArchiver::Request).to receive(:get).and_raise(WaybackArchiver::Request::Error) 136 | 137 | expect(described_class.urls(url: 'http://www.example.com')).to be_empty 138 | end 139 | end 140 | end 141 | -------------------------------------------------------------------------------- /spec/wayback_archiver_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe WaybackArchiver do 4 | describe '::archive' do 5 | it 'raises ArgumentError when passed unknown strategy' do 6 | expect do 7 | described_class.archive('http://example.com', strategy: :watman_strategy) 8 | end.to raise_error(ArgumentError) 9 | end 10 | 11 | it 'calls ::auto when no strategy is given' do 12 | allow(described_class).to receive(:auto).and_return([]) 13 | described_class.archive('http://example.com') 14 | expect(described_class).to have_received(:auto).once 15 | end 16 | 17 | it 'calls ::auto when passed auto as strategy' do 18 | allow(described_class).to receive(:auto).and_return([]) 19 | described_class.archive('http://example.com', strategy: :auto) 20 | expect(described_class).to have_received(:auto).once 21 | end 22 | 23 | it 'calls ::crawl when passed crawl as strategy' do 24 | allow(described_class).to receive(:crawl).and_return([]) 25 | described_class.archive('http://example.com', strategy: :crawl) 26 | expect(described_class).to have_received(:crawl).once 27 | end 28 | 29 | it 'calls ::urls when passed urls as strategy' do 30 | allow(described_class).to receive(:urls).and_return([]) 31 | described_class.archive('http://example.com', strategy: :urls) 32 | expect(described_class).to have_received(:urls).once 33 | end 34 | 35 | it 'calls ::urls when passed url as strategy' do 36 | allow(described_class).to receive(:urls).and_return([]) 37 | described_class.archive('http://example.com', strategy: :url) 38 | expect(described_class).to have_received(:urls).once 39 | end 40 | 41 | it 'calls ::sitemap when passed sitemap as strategy' do 42 | allow(described_class).to receive(:sitemap).and_return([]) 43 | described_class.archive('http://example.com', strategy: :sitemap) 44 | expect(described_class).to have_received(:sitemap).once 45 | end 46 | 47 | context 'legacy strategy param' do 48 | it 'raises ArgumentError when passed unknown strategy' do 49 | expect do 50 | described_class.archive('http://example.com', :watman_strategy) 51 | end.to raise_error(ArgumentError) 52 | end 53 | 54 | it 'calls ::auto when passed auto as strategy' do 55 | allow(described_class).to receive(:auto).and_return([]) 56 | described_class.archive('http://example.com', :auto) 57 | expect(described_class).to have_received(:auto).once 58 | end 59 | 60 | it 'calls ::crawl when passed crawl as strategy' do 61 | allow(described_class).to 
receive(:crawl).and_return([]) 62 | described_class.archive('http://example.com', :crawl) 63 | expect(described_class).to have_received(:crawl).once 64 | end 65 | 66 | it 'calls ::urls when passed urls as strategy' do 67 | allow(described_class).to receive(:urls).and_return([]) 68 | described_class.archive('http://example.com', :urls) 69 | expect(described_class).to have_received(:urls).once 70 | end 71 | 72 | it 'calls ::urls when passed url as strategy' do 73 | allow(described_class).to receive(:urls).and_return([]) 74 | described_class.archive('http://example.com', :url) 75 | expect(described_class).to have_received(:urls).once 76 | end 77 | 78 | it 'calls ::sitemap when passed sitemap as strategy' do 79 | allow(described_class).to receive(:sitemap).and_return([]) 80 | described_class.archive('http://example.com', :sitemap) 81 | expect(described_class).to have_received(:sitemap).once 82 | end 83 | end 84 | end 85 | 86 | describe '::auto' do 87 | it 'calls Sitemapper::autodiscover and ::crawl if Sitemapper returned empty result' do 88 | allow(described_class::Sitemapper).to receive(:autodiscover).and_return([]) 89 | allow(described_class).to receive(:crawl).and_return([]) 90 | 91 | described_class.auto('http://example.com') 92 | 93 | expect(described_class::Sitemapper).to have_received(:autodiscover).once 94 | expect(described_class).to have_received(:crawl).once 95 | end 96 | 97 | it 'calls Sitemapper::autodiscover and ::urls if Sitemapper returned non-empty result' do 98 | allow(described_class::Sitemapper).to receive(:autodiscover).and_return(['url']) 99 | allow(described_class).to receive(:urls).and_return([]) 100 | 101 | described_class.auto('http://example.com') 102 | 103 | expect(described_class::Sitemapper).to have_received(:autodiscover).once 104 | expect(described_class).to have_received(:urls).once 105 | end 106 | end 107 | 108 | describe '::crawl' do 109 | it 'calls Archive::crawl' do 110 | allow(described_class::Archive).to receive(:crawl).and_return([]) 111 | 112 | described_class.crawl('http://example.com') 113 | 114 | expect(described_class::Archive).to have_received(:crawl).once 115 | end 116 | end 117 | 118 | describe '::urls' do 119 | it 'calls Archive::post' do 120 | allow(described_class::Archive).to receive(:post).and_return([]) 121 | 122 | described_class.urls('http://example.com') 123 | 124 | expect(described_class::Archive).to have_received(:post).once 125 | end 126 | end 127 | 128 | describe '::sitemap' do 129 | it 'calls URLCollector::sitemap and Archive::post' do 130 | allow(described_class::URLCollector).to receive(:sitemap).and_return([]) 131 | allow(described_class::Archive).to receive(:post).and_return([]) 132 | 133 | described_class.sitemap('http://example.com') 134 | 135 | expect(described_class::URLCollector).to have_received(:sitemap).once 136 | expect(described_class::Archive).to have_received(:post).once 137 | end 138 | end 139 | 140 | describe '::default_logger!' do 141 | it 'has NullLogger as the default logger' do 142 | described_class.default_logger! 
143 | expect(described_class.logger.class).to eq(described_class::NullLogger) 144 | end 145 | end 146 | 147 | describe '::logger=' do 148 | it 'can set logger' do 149 | MyLogger = Struct.new(:name).new('buren') 150 | described_class.logger = MyLogger 151 | expect(described_class.logger).to eq(MyLogger) 152 | end 153 | end 154 | 155 | describe '::user_agent=' do 156 | it 'can set user_agent' do 157 | described_class.user_agent = 'buren' 158 | expect(described_class.user_agent).to eq('buren') 159 | end 160 | end 161 | 162 | describe '::concurrency=' do 163 | it 'can set concurrency' do 164 | described_class.concurrency = 1 165 | expect(described_class.concurrency).to eq(1) 166 | end 167 | end 168 | 169 | describe '::max_limit=' do 170 | it 'can set max_limit' do 171 | described_class.max_limit = 1 172 | expect(described_class.max_limit).to eq(1) 173 | end 174 | end 175 | 176 | describe '::adapter=' do 177 | it 'can set adapter' do 178 | adapter = WaybackArchiver::WaybackMachine 179 | described_class.adapter = adapter 180 | expect(described_class.adapter).to match(adapter) 181 | end 182 | 183 | it 'raises error unless the adapter responds to #call' do 184 | expect { described_class.adapter = 1 }.to raise_error(ArgumentError) 185 | end 186 | end 187 | end 188 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # WaybackArchiver 2 | 3 | Post URLs to the [Wayback Machine](https://archive.org/web/) (Internet Archive) using a crawler, from [Sitemap(s)](http://www.sitemaps.org), or from a list of URLs. 4 | 5 | > The Wayback Machine is a digital archive of the World Wide Web [...] 6 | > The service enables users to see archived versions of web pages across time ... 7 | > \- [Wikipedia](https://en.wikipedia.org/wiki/Wayback_Machine) 8 | 9 | [![Build Status](https://travis-ci.org/buren/wayback_archiver.svg?branch=master)](https://travis-ci.org/buren/wayback_archiver) [![Code Climate](https://codeclimate.com/github/buren/wayback_archiver.png)](https://codeclimate.com/github/buren/wayback_archiver) [![Docs badge](https://inch-ci.org/github/buren/wayback_archiver.svg?branch=master)](http://www.rubydoc.info/github/buren/wayback_archiver/master) [![Gem Version](https://badge.fury.io/rb/wayback_archiver.svg)](http://badge.fury.io/rb/wayback_archiver) 10 | 11 | __Index__ 12 | 13 | * [Installation](#installation) 14 | * [Usage](#usage) 15 | - [Ruby](#ruby) 16 | - [CLI](#cli) 17 | * [Configuration](#configuration) 18 | * [RubyDoc](#docs) 19 | * [Contributing](#contributing) 20 | * [MIT License](#license) 21 | * [References](#references) 22 | 23 | ## Installation 24 | 25 | Install the gem: 26 | ``` 27 | $ gem install wayback_archiver 28 | ``` 29 | 30 | Or add this line to your application's Gemfile: 31 | 32 | ```ruby 33 | gem 'wayback_archiver' 34 | ``` 35 | 36 | And then execute: 37 | 38 | ``` 39 | $ bundle 40 | ``` 41 | 42 | ## Usage 43 | 44 | * [Ruby](#ruby) 45 | * [CLI](#cli) 46 | 47 | __Strategies__: 48 | 49 | * `auto` (the default) - Will try to 50 | 1. Find Sitemap(s) defined in `/robots.txt` 51 | 2. Then in common sitemap locations `/sitemap-index.xml`, `/sitemap.xml` etc. 52 | 3. 
Fallback to crawling (using the excellent [spidr](https://github.com/postmodern/spidr/) gem) 53 | * `sitemap` - Parse Sitemap(s), supports [index files](https://www.sitemaps.org/protocol.html#index) (and gzip) 54 | * `urls` (or `url`) - Post the given URL(s) 55 | 56 | ## Ruby 57 | 58 | First require the gem 59 | 60 | ```ruby 61 | require 'wayback_archiver' 62 | ``` 63 | 64 | _Examples_: 65 | 66 | Auto 67 | 68 | ```ruby 69 | # auto is the default 70 | WaybackArchiver.archive('example.com') 71 | 72 | # or explicitly 73 | WaybackArchiver.archive('example.com', strategy: :auto) 74 | ``` 75 | 76 | Crawl 77 | 78 | ```ruby 79 | WaybackArchiver.archive('example.com', strategy: :crawl) 80 | ``` 81 | 82 | Send only a single URL 83 | 84 | ```ruby 85 | WaybackArchiver.archive('example.com', strategy: :url) 86 | ``` 87 | 88 | Send multiple URLs 89 | 90 | ```ruby 91 | WaybackArchiver.archive(%w[example.com www.example.com], strategy: :urls) 92 | ``` 93 | 94 | Send all URL(s) found in Sitemap 95 | 96 | ```ruby 97 | WaybackArchiver.archive('example.com/sitemap.xml', strategy: :sitemap) 98 | 99 | # works with Sitemap index files too 100 | WaybackArchiver.archive('example.com/sitemap-index.xml.gz', strategy: :sitemap) 101 | ``` 102 | 103 | Specify concurrency 104 | 105 | ```ruby 106 | WaybackArchiver.archive('example.com', strategy: :auto, concurrency: 10) 107 | ``` 108 | 109 | Specify max number of URLs to be archived 110 | 111 | ```ruby 112 | WaybackArchiver.archive('example.com', strategy: :auto, limit: 10) 113 | ``` 114 | 115 | Each archive strategy can receive a block that will be called for each URL 116 | 117 | ```ruby 118 | WaybackArchiver.archive('example.com', strategy: :auto) do |result| 119 | if result.success? 120 | puts "Successfully archived: #{result.archived_url}" 121 | else 122 | puts "Error (HTTP #{result.code}) when archiving: #{result.archived_url}" 123 | end 124 | end 125 | ``` 126 | 127 | Use your own adapter for posting found URLs 128 | 129 | ```ruby 130 | WaybackArchiver.adapter = ->(url) { puts url } # anything that responds to #call 131 | ``` 132 | 133 | ## CLI 134 | 135 | __Usage__: 136 | 137 | ``` 138 | wayback_archiver [<url>] [options] 139 | ``` 140 | 141 | Print full usage instructions 142 | 143 | ``` 144 | wayback_archiver --help 145 | ``` 146 | 147 | _Examples_: 148 | 149 | Auto 150 | 151 | ``` 152 | # auto is the default 153 | wayback_archiver example.com 154 | 155 | # or explicitly 156 | wayback_archiver example.com --auto 157 | ``` 158 | 159 | Crawl 160 | 161 | ```bash 162 | wayback_archiver example.com --crawl 163 | ``` 164 | 165 | Send only a single URL 166 | 167 | ```bash 168 | wayback_archiver example.com --url 169 | ``` 170 | 171 | Send multiple URLs 172 | 173 | ```bash 174 | wayback_archiver example.com www.example.com --urls 175 | ``` 176 | 177 | Crawl multiple URLs 178 | 179 | ```bash 180 | wayback_archiver example.com www.example.com --crawl 181 | ``` 182 | 183 | Send all URL(s) found in Sitemap 184 | 185 | ```bash 186 | wayback_archiver example.com/sitemap.xml 187 | 188 | # works with Sitemap index files too 189 | wayback_archiver example.com/sitemap-index.xml.gz 190 | ``` 191 | 192 | Most options combined 193 | 194 | ```bash 195 | wayback_archiver example.com www.example.com --auto --concurrency=10 --limit=100 --log=output.log --verbose 196 | ``` 197 | 198 | View archive: [https://web.archive.org/web/*/http://example.com](https://web.archive.org/web/*/http://example.com) (replace `http://example.com` with your desired domain). 
199 | 200 | ## Configuration 201 | 202 | :information_source: By default `wayback_archiver` doesn't respect robots.txt files, see [this Internet Archive blog post](https://blog.archive.org/2017/04/17/robots-txt-meant-for-search-engines-dont-work-well-for-web-archives/) for more information. 203 | 204 | Configuration (the values below are the defaults) 205 | 206 | ```ruby 207 | WaybackArchiver.concurrency = 1 208 | WaybackArchiver.user_agent = WaybackArchiver::USER_AGENT 209 | WaybackArchiver.respect_robots_txt = WaybackArchiver::DEFAULT_RESPECT_ROBOTS_TXT 210 | WaybackArchiver.logger = WaybackArchiver::NullLogger.new # silent by default, set e.g. Logger.new(STDOUT) for output 211 | WaybackArchiver.max_limit = -1 # unlimited 212 | WaybackArchiver.adapter = WaybackArchiver::WaybackMachine # must implement #call(url) 213 | ``` 214 | 215 | For a more verbose log you can configure `WaybackArchiver` like this: 216 | 217 | ```ruby 218 | WaybackArchiver.logger = Logger.new(STDOUT).tap do |logger| 219 | logger.progname = 'WaybackArchiver' 220 | logger.level = Logger::DEBUG 221 | end 222 | ``` 223 | 224 | _Pro tip_: If you're using the gem in a Rails app you can set `WaybackArchiver.logger = Rails.logger`. 
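225 | 226 | The adapter can be any object that responds to `#call(url)`. A minimal sketch of a custom adapter (the `PrintAdapter` name is hypothetical, not part of the gem): 227 | 228 | ```ruby 229 | # Hypothetical adapter: prints each URL instead of posting it to the Wayback Machine. 230 | class PrintAdapter 231 |   def self.call(url) 232 |     puts "Would archive: #{url}" 233 |   end 234 | end 235 | 236 | WaybackArchiver.adapter = PrintAdapter 237 | ```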
238 | 239 | ## Docs 240 | 241 | You can find the docs online on [RubyDoc](http://www.rubydoc.info/github/buren/wayback_archiver/master). 242 | 243 | This gem is documented using `yard` (run from the root of this repository). 244 | 245 | ```bash 246 | yard # Generates documentation to doc/ 247 | ``` 248 | 249 | ## Contributing 250 | 251 | Contributions, feedback and suggestions are very welcome. 252 | 253 | 1. Fork it 254 | 2. Create your feature branch (`git checkout -b my-new-feature`) 255 | 3. Commit your changes (`git commit -am 'Add some feature'`) 256 | 4. Push to the branch (`git push origin my-new-feature`) 257 | 5. Create a new Pull Request 258 | 259 | ## License 260 | 261 | [MIT License](LICENSE) 262 | 263 | ## References 264 | 265 | * Don't know what the Wayback Machine (Internet Archive) is? [Wayback Machine](https://archive.org/web/) 266 | * Don't know what a Sitemap is? [sitemaps.org](http://www.sitemaps.org) 267 | * Don't know what robots.txt is? [www.robotstxt.org](http://www.robotstxt.org/robotstxt.html) 268 | -------------------------------------------------------------------------------- /spec/wayback_archiver/request_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe WaybackArchiver::Request do 4 | describe '::get' do 5 | let(:headers) do 6 | { 7 | 'Accept' => '*/*', 8 | 'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3', 9 | 'User-Agent' => WaybackArchiver.user_agent 10 | } 11 | end 12 | 13 | [ 14 | [described_class::ServerError, Timeout::Error], 15 | [described_class::ServerError, OpenSSL::SSL::SSLError], 16 | [described_class::ServerError, Net::HTTPBadResponse], 17 | [described_class::ServerError, Zlib::Error], 18 | # For some reason the below line causes an ArgumentError exception to be raised instead 19 | # [described_class::ClientError, SystemCallError], 20 | [described_class::ClientError, SocketError], 21 | [described_class::ClientError, IOError] 22 | ].each do |test_data| 23 | error_klass, raised_error_klass = test_data 24 | 25 | it "raises #{error_klass} on #{raised_error_klass}" do 26 | allow_any_instance_of(Net::HTTP).to receive(:request).and_raise(raised_error_klass) 27 | 28 | expect { described_class.get('https://example.com') }.to raise_error(error_klass) 29 | end 30 | end 31 | 32 | it 'returns response when server responds with HTTP 200' do 33 | stub_request(:get, 'https://example.com/') 34 | .with(headers: headers) 35 | .to_return(status: 200, body: 'buren', headers: {}) 36 | 37 | result = described_class.get('https://example.com') 38 | expect(result.code).to eq('200') 39 | end 40 | 41 | it 'follows redirect when server responds with HTTP 3XX' do 42 | response_headers = { 'location' => '/redirect-path' } 43 | stub_request(:get, 'https://example.com/') 44 | .with(headers: headers) 45 | .to_return(status: 301, body: 'buren', headers: response_headers) 46 | 47 | stub_request(:get, 'https://example.com/redirect-path') 48 | .with(headers: headers) 49 | .to_return(status: 200, body: 'buren', headers: {}) 50 | 51 | result = described_class.get('https://example.com', max_redirects: 1) 52 | expect(result.code).to eq('200') 53 | expect(result.uri).to eq('https://example.com/redirect-path') 54 | end 55 | 56 | it 'raises MaxRedirectError if max redirects is reached' do 57 | response_headers = { 'location' => '/redirect-path' } 58 | stub_request(:get, 'https://example.com/') 59 | .with(headers: headers) 60 | .to_return(status: 301, body: 'buren', headers: response_headers) 61 | 62 | expect do 63 | described_class.get('https://example.com', max_redirects: 0) 64 | end.to raise_error(described_class::MaxRedirectError) 65 | end 66 | 67 | it 'raises UnknownResponseCodeError if server responds with unknown HTTP code' do 68 | stub_request(:get, 'https://example.com/') 69 | .with(headers: headers) 70 | .to_return(status: 100, body: 'buren', headers: {}) 71 | 72 | expect do 73 | described_class.get('https://example.com') 74 | end.to raise_error(described_class::UnknownResponseCodeError) 75 | end 76 | 77 | it 'raises ResponseError if server responded with an error and raise_on_http_error is true' do 78 | stub_request(:get, 'https://example.com/') 79 | .with(headers: headers) 80 | .to_return(status: 400, body: 'buren', headers: {}) 81 | 82 | expect do 83 | described_class.get('https://example.com', raise_on_http_error: true) 84 | end.to raise_error(described_class::ResponseError) 85 | end 86 | 87 | it 'returns response if server 
responds with an error and raise_on_http_error is false' do 88 | stub_request(:get, 'https://example.com/') 89 | .with(headers: headers) 90 | .to_return(status: 400, body: 'buren', headers: {}) 91 | 92 | result = described_class.get('https://example.com', raise_on_http_error: false) 93 | 94 | expect(result.code).to eq('400') 95 | end 96 | end 97 | 98 | describe '::build_response' do 99 | it 'builds a Response object' do 100 | expected = WaybackArchiver::Response.new('200', 'OK', 'buren', 'http://example.com') 101 | response = described_class.build_response( 102 | 'http://example.com', 103 | Struct.new(:code, :message, :body).new('200', 'OK', 'buren') 104 | ) 105 | 106 | expect(response).to eq(expected) 107 | end 108 | 109 | it 'builds a response object that has a #success? method' do 110 | response = described_class.build_response( 111 | 'http://example.com', 112 | Struct.new(:code, :message, :body).new('200', 'OK', 'buren') 113 | ) 114 | 115 | expect(response.success?).to eq(true) 116 | end 117 | end 118 | 119 | describe '::build_redirect_uri' do 120 | it 'raises InvalidRedirectError if no location header is found' do 121 | response = Struct.new(:header).new({ location: nil }) 122 | redirect_error = WaybackArchiver::Request::InvalidRedirectError 123 | 124 | expect do 125 | described_class.build_redirect_uri('', response) 126 | end.to raise_error(redirect_error) 127 | end 128 | 129 | it 'adds base URI if location header is relative' do 130 | base_uri = 'http://example.com' 131 | response = Struct.new(:header).new({ 'location' => '/path' }) 132 | result = described_class.build_redirect_uri(base_uri, response) 133 | 134 | expect(result).to eq(URI.parse('http://example.com/path')) 135 | end 136 | 137 | it 'returns location header' do 138 | base_uri = 'http://example.com' 139 | response = Struct.new(:header).new({ 'location' => 'https://example.com/path' }) 140 | result = described_class.build_redirect_uri(base_uri, response) 141 | 142 | expect(result).to eq(URI.parse('https://example.com/path')) 143 | end 144 | end 145 | 146 | describe '::build_uri' do 147 | it 'returns URI untouched if passed an instance of URI' do 148 | uri = URI.parse('http://example.com') 149 | expect(described_class.build_uri(uri)).to eq(uri) 150 | end 151 | 152 | it 'returns URI instance if passed string with http protocol' do 153 | uri = URI.parse('http://example.com') 154 | expect(described_class.build_uri('http://example.com')).to eq(uri) 155 | end 156 | 157 | it 'returns URI instance if passed string with https protocol' do 158 | uri = URI.parse('https://example.com') 159 | expect(described_class.build_uri('https://example.com')).to eq(uri) 160 | end 161 | 162 | it 'returns URI instance with protocol if passed string without protocol' do 163 | uri = URI.parse('http://example.com') 164 | expect(described_class.build_uri('example.com')).to eq(uri) 165 | end 166 | end 167 | 168 | describe '::parse_body' do 169 | it 'returns empty string if passed nil' do 170 | expect(described_class.parse_body(nil)).to eq('') 171 | end 172 | 173 | it 'returns string untouched if passed a regular string' do 174 | expect(described_class.parse_body('buren')).to eq('buren') 175 | end 176 | 177 | it 'returns uncompressed string if passed a gzipped string' do 178 | gzipped_string = File.read('spec/data/test_gzip.gz') 179 | expect(described_class.parse_body(gzipped_string)).to eq("buren\n") 180 | end 181 | end 182 | 183 | describe '::blank?' 
do 184 | it 'returns true if passed nil' do 185 | expect(described_class.blank?(nil)).to eq(true) 186 | end 187 | 188 | it 'returns true if passed empty string' do 189 | expect(described_class.blank?('')).to eq(true) 190 | end 191 | 192 | it 'returns true if passed string with only spaces' do 193 | expect(described_class.blank?(' ')).to eq(true) 194 | end 195 | 196 | it 'returns false if passed a non-empty string' do 197 | expect(described_class.blank?('buren')).to eq(false) 198 | end 199 | end 200 | end 201 | -------------------------------------------------------------------------------- /lib/wayback_archiver/request.rb: -------------------------------------------------------------------------------- 1 | require 'net/http' 2 | require 'openssl' 3 | require 'timeout' 4 | require 'uri' 5 | require 'zlib' 6 | 7 | require 'wayback_archiver/http_code' 8 | require 'wayback_archiver/response' 9 | 10 | module WaybackArchiver 11 | # Make HTTP requests 12 | class Request 13 | # General error, something went wrong 14 | class Error < StandardError; end 15 | # Client error, something went wrong on the local machine 16 | class ClientError < Error; end 17 | # Server error, the remote server did something wrong 18 | class ServerError < Error; end 19 | # Remote server responded with an HTTP error 20 | class HTTPError < ServerError; end 21 | # Remote server responded with an error HTTP status code 22 | class ResponseError < ServerError; end 23 | # Max redirects reached error 24 | class MaxRedirectError < ServerError; end 25 | # Remote server responded with an invalid redirect 26 | class InvalidRedirectError < ServerError; end 27 | # Remote server responded with an unknown HTTP code 28 | class UnknownResponseCodeError < ServerError; end 29 | 30 | # GET response wrapper 31 | GETStruct = Struct.new(:response, :error) 32 | 33 | # Max number of redirects before an error is raised 34 | MAX_REDIRECTS = 10 35 | 36 | # Known request errors 37 | REQUEST_ERRORS = { 38 | # server 39 | Timeout::Error => ServerError, 40 | OpenSSL::SSL::SSLError => ServerError, 41 | Net::HTTPBadResponse => ServerError, 42 | Zlib::Error => ServerError, 43 | # client 44 | SystemCallError => ClientError, 45 | SocketError => ClientError, 46 | IOError => ClientError 47 | }.freeze 48 | 49 | # Get response. 50 | # @return [Response] the http response representation. 51 | # @param [String, URI] uri to retrieve. 52 | # @param max_redirects [Integer] max redirects (default: 10). 53 | # @param follow_redirects [Boolean] follow redirects (default: true). 
54 | # @example Get example.com 55 | # Request.get('example.com') 56 | # @example Get http://example.com and follow max 3 redirects 57 | # Request.get('http://example.com', max_redirects: 3) 58 | # @example Get http://example.com and don't follow redirects 59 | # Request.get('http://example.com', follow_redirects: false) 60 | # @raise [Error] super class of all exceptions that this method can raise 61 | # @raise [ServerError] all server errors 62 | # @raise [ClientError] all client errors 63 | # @raise [HTTPError] all HTTP errors 64 | # @raise [MaxRedirectError] too many redirects, subclass of ServerError 65 | # @raise [ResponseError] server responded with a 4xx or 5xx HTTP status code, subclass of ServerError (only raised if the raise_on_http_error flag is true) 66 | # @raise [UnknownResponseCodeError] server responded with an unknown HTTP status code, subclass of ServerError 67 | # @raise [InvalidRedirectError] server responded with an invalid redirect, subclass of ServerError 68 | def self.get( 69 | uri, 70 | max_redirects: MAX_REDIRECTS, 71 | raise_on_http_error: false, 72 | follow_redirects: true 73 | ) 74 | uri = build_uri(uri) 75 | 76 | redirect_count = 0 77 | until redirect_count > max_redirects 78 | WaybackArchiver.logger.debug "Requesting #{uri}" 79 | 80 | http = Net::HTTP.new(uri.host, uri.port) 81 | if uri.scheme == 'https' 82 | http.use_ssl = true 83 | http.verify_mode = OpenSSL::SSL::VERIFY_NONE 84 | end 85 | 86 | request = Net::HTTP::Get.new(uri.request_uri) 87 | request['User-Agent'] = WaybackArchiver.user_agent 88 | 89 | result = perform_request(uri, http, request) 90 | response = result.response 91 | error = result.error 92 | 93 | raise error if error 94 | 95 | code = response.code 96 | WaybackArchiver.logger.debug "[#{code}, #{response.message}] Requested #{uri}" 97 | 98 | case HTTPCode.type(code) 99 | when :success 100 | return build_response(uri, response) 101 | when :redirect 102 | return build_response(uri, response) unless follow_redirects 103 | 104 | uri = build_redirect_uri(uri, response) 105 | redirect_count += 1 106 | next 107 | when :error 108 | if raise_on_http_error 109 | raise ResponseError, "Failed with response code: #{code} when requesting #{uri}" 110 | end 111 | 112 | return build_response(uri, response) 113 | else 114 | raise UnknownResponseCodeError, "Unknown HTTP response code #{code} when requesting #{uri}" 115 | end 116 | end 117 | 118 | raise MaxRedirectError, "Redirected too many times when requesting #{uri}" 119 | end 120 | 121 | # Builds a Response object. 122 | # @return [Response] 123 | # @param [URI] uri that was requested. 124 | # @param [Net::HTTPResponse] response the server response. 125 | # @example Build Response object for example.com 126 | # Request.build_response(uri, net_http_response) 127 | def self.build_response(uri, response) 128 | Response.new( 129 | response.code, 130 | response.message, 131 | parse_body(response.body), 132 | uri.to_s 133 | ) 134 | end 135 | 136 | # Builds a URI for a redirect response. 137 | # @return [URI] to redirect to. 138 | # @param [URI] uri that was requested. 139 | # @param [Net::HTTPResponse] response the server response. 140 | # @example Build redirect URI for example.com (let's pretend it will redirect) 
141 | # Request.build_redirect_uri('http://example.com', net_http_response) 142 | def self.build_redirect_uri(uri, response) 143 | location_header = response.header.fetch('location') do 144 | raise InvalidRedirectError, "No location header found on redirect when requesting #{uri}" 145 | end 146 | 147 | location = URI.parse(location_header) 148 | return build_uri(uri) + location_header if location.relative? 149 | 150 | location 151 | end 152 | 153 | # Build URI. 154 | # @return [URI] the parsed URI. 155 | # @param [URI, String] uri to build. 156 | # @example Build URI for example.com 157 | # Request.build_uri('http://example.com') 158 | # @example Build URI from an existing URI instance 159 | # uri = URI.parse('http://example.com') 160 | # Request.build_uri(uri) 161 | def self.build_uri(uri) 162 | return uri if uri.is_a?(URI) 163 | 164 | uri = "http://#{uri}" unless uri =~ %r{^https?://} 165 | URI.parse(uri) 166 | end 167 | 168 | # Parse response body, handles regular and gzipped response bodies. 169 | # @return [String] the response body. 170 | # @param [String] response_body the server response body. 171 | # @example Return response body for a response. 172 | # Request.parse_body(net_http_response.body) 173 | def self.parse_body(response_body) 174 | return '' unless response_body 175 | 176 | Zlib::GzipReader.new(StringIO.new(response_body)).read 177 | rescue Zlib::GzipFile::Error => _e 178 | response_body 179 | end 180 | 181 | # Return whether a value is blank or not. 182 | # @return [Boolean] whether the value is blank or not. 183 | # @param [Object] value the value to check if it's blank or not. 184 | # @example Returns true for nil. 185 | # Request.blank?(nil) 186 | # @example Returns true for empty string. 187 | # Request.blank?('') 188 | # @example Returns true for string with only spaces. 189 | # Request.blank?(' ') 190 | def self.blank?(value) 191 | return true unless value 192 | return true if value.strip.empty? 193 | 194 | false 195 | end 196 | 197 | private 198 | 199 | def self.perform_request(uri, http, request) 200 | # TODO: Consider retrying on certain HTTP response codes, e.g. 429, 503 201 | response = http.request(request) 202 | GETStruct.new(response) 203 | rescue *REQUEST_ERRORS.keys => e 204 | build_request_error(uri, e, REQUEST_ERRORS.fetch(e.class)) 205 | end 206 | 207 | def self.build_request_error(uri, error, error_wrapper_klass) 208 | WaybackArchiver.logger.error "Request to #{uri} failed: #{error_wrapper_klass}, #{error.class}, #{error.message}" 209 | 210 | GETStruct.new( 211 | Response.new, 212 | error_wrapper_klass.new("#{error.class}, #{error.message}") 213 | ) 214 | end 215 | end 216 | end 217 | 
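218 | # Usage sketch (illustrative, based on the API above): 219 | # 220 | #   response = WaybackArchiver::Request.get('https://example.com', max_redirects: 3) 221 | #   puts response.code if response.success?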
-------------------------------------------------------------------------------- /lib/wayback_archiver.rb: -------------------------------------------------------------------------------- 1 | require 'wayback_archiver/thread_pool' 2 | require 'wayback_archiver/null_logger' 3 | require 'wayback_archiver/version' 4 | require 'wayback_archiver/url_collector' 5 | require 'wayback_archiver/archive' 6 | require 'wayback_archiver/sitemapper' 7 | 8 | # WaybackArchiver, send URLs to the Wayback Machine. By crawling, from a Sitemap, or from a list of URLs. 9 | module WaybackArchiver 10 | # Link to gem on rubygems.org, part of the sent User-Agent 11 | INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'.freeze 12 | # WaybackArchiver User-Agent 13 | USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})".freeze 14 | # Default for whether to respect robots.txt files 15 | DEFAULT_RESPECT_ROBOTS_TXT = false 16 | 17 | # Default concurrency for archiving URLs 18 | DEFAULT_CONCURRENCY = 1 19 | 20 | # Maximum number of links posted (-1 is no limit) 21 | DEFAULT_MAX_LIMIT = -1 22 | 23 | # Send URLs to Wayback Machine. 24 | # @return [Array] of URLs sent to the Wayback Machine. 25 | # @param [String/Array] source for URL(s). 26 | # @param [String/Symbol] strategy of source. Supported strategies: crawl, sitemap, url, urls, auto. 27 | # @param [Array] hosts to crawl. 28 | # @example Crawl example.com and send all URLs of the same domain 29 | # WaybackArchiver.archive('example.com') # Default strategy is :auto 30 | # WaybackArchiver.archive('example.com', strategy: :auto) 31 | # WaybackArchiver.archive('example.com', strategy: :auto, concurrency: 10) 32 | # WaybackArchiver.archive('example.com', strategy: :auto, limit: 100) # send max 100 URLs 33 | # WaybackArchiver.archive('example.com', :auto) 34 | # @example Crawl example.com and send all URLs of the same domain 35 | # WaybackArchiver.archive('example.com', strategy: :crawl) 36 | # WaybackArchiver.archive('example.com', strategy: :crawl, concurrency: 10) 37 | # WaybackArchiver.archive('example.com', strategy: :crawl, limit: 100) # send max 100 URLs 38 | # WaybackArchiver.archive('example.com', :crawl) 39 | # @example Send example.com Sitemap URLs 40 | # WaybackArchiver.archive('example.com', strategy: :sitemap) 41 | # WaybackArchiver.archive('example.com', strategy: :sitemap, concurrency: 10) 42 | # WaybackArchiver.archive('example.com', strategy: :sitemap, limit: 100) # send max 100 URLs 43 | # WaybackArchiver.archive('example.com', :sitemap) 44 | # @example Send only example.com 45 | # WaybackArchiver.archive('example.com', strategy: :url) 46 | # WaybackArchiver.archive('example.com', strategy: :url, concurrency: 10) 47 | # WaybackArchiver.archive('example.com', strategy: :url, limit: 100) # send max 100 URLs 48 | # WaybackArchiver.archive('example.com', :url) 49 | # @example Crawl multiple hosts 50 | # WaybackArchiver.archive( 51 | # 'http://example.com', 52 | # hosts: [ 53 | # 'example.com', 54 | # /host[\d]+\.example\.com/ 55 | # ] 56 | # ) 57 | def self.archive(source, legacy_strategy = nil, strategy: :auto, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block) 58 | strategy = legacy_strategy || strategy 59 | 60 | case strategy.to_s 61 | when 'crawl' then crawl(source, concurrency: concurrency, limit: limit, hosts: hosts, &block) 62 | when 'auto' then auto(source, concurrency: concurrency, limit: limit, &block) 63 | when 'sitemap' then sitemap(source, concurrency: concurrency, limit: limit, &block) 64 | when 'urls' then urls(source, concurrency: concurrency, limit: limit, &block) 65 | when 'url' then urls(source, concurrency: concurrency, limit: limit, &block) 66 | else 67 | raise ArgumentError, "Unknown strategy: '#{strategy}'. Allowed strategies: auto, sitemap, urls, url, crawl" 68 | end 69 | end 70 | 71 | # Look for Sitemap(s) and if nothing is found fall back to crawling. 72 | # Then send found URLs to the Wayback Machine. 73 | # @return [Array] of URLs sent to the Wayback Machine. 74 | # @param [String] source (must be a valid URL). 
75 | # @param concurrency [Integer] 76 | # @example Auto archive example.com 77 | # WaybackArchiver.auto('example.com') # Default concurrency is 1 78 | # @example Auto archive example.com with low concurrency 79 | # WaybackArchiver.auto('example.com', concurrency: 1) 80 | # @example Auto archive example.com and archive max 100 URLs 81 | # WaybackArchiver.auto('example.com', limit: 100) 82 | # @see http://www.sitemaps.org 83 | def self.auto(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block) 84 | urls = Sitemapper.autodiscover(source) 85 | return urls(urls, concurrency: concurrency, limit: limit, &block) if urls.any? 86 | 87 | crawl(source, concurrency: concurrency, limit: limit, &block) 88 | end 89 | 90 | # Crawl site for URLs to send to the Wayback Machine. 91 | # @return [Array] of URLs sent to the Wayback Machine. 92 | # @param [String] url to start crawling from. 93 | # @param [Array] hosts to crawl 94 | # @param concurrency [Integer] 95 | # @example Crawl example.com and send all URLs of the same domain 96 | # WaybackArchiver.crawl('example.com') # Default concurrency is 1 97 | # @example Crawl example.com and send all URLs of the same domain with low concurrency 98 | # WaybackArchiver.crawl('example.com', concurrency: 1) 99 | # @example Crawl example.com and archive max 100 URLs 100 | # WaybackArchiver.crawl('example.com', limit: 100) 101 | # @example Crawl multiple hosts 102 | # WaybackArchiver.crawl( 103 | # 'http://example.com', 104 | # hosts: [ 105 | # 'example.com', 106 | # /host[\d]+\.example\.com/ 107 | # ] 108 | # ) 109 | def self.crawl(url, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block) 110 | WaybackArchiver.logger.info "Crawling #{url}" 111 | Archive.crawl(url, hosts: hosts, concurrency: concurrency, limit: limit, &block) 112 | end 113 | 114 | # Get URLs from sitemap and send found URLs to the Wayback Machine. 115 | # @return [Array] of URLs sent to the Wayback Machine. 116 | # @param [String] url to the sitemap. 117 | # @param concurrency [Integer] 118 | # @example Get example.com sitemap and archive all found URLs 119 | # WaybackArchiver.sitemap('example.com/sitemap.xml') # Default concurrency is 1 120 | # @example Get example.com sitemap and archive all found URLs with low concurrency 121 | # WaybackArchiver.sitemap('example.com/sitemap.xml', concurrency: 1) 122 | # @example Get example.com sitemap and archive max 100 URLs 123 | # WaybackArchiver.sitemap('example.com/sitemap.xml', limit: 100) 124 | # @see http://www.sitemaps.org 125 | def self.sitemap(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block) 126 | WaybackArchiver.logger.info "Fetching Sitemap" 127 | Archive.post(URLCollector.sitemap(url), concurrency: concurrency, limit: limit, &block) 128 | end 129 | 130 | # Send URL to the Wayback Machine. 131 | # @return [Array] of URLs sent to the Wayback Machine. 132 | # @param [Array/String] urls or url. 
133 | # @param concurrency [Integer] 134 | # @example Archive example.com 135 | # WaybackArchiver.urls('example.com') 136 | # @example Archive example.com and google.com 137 | # WaybackArchiver.urls(%w(example.com google.com)) 138 | # @example Archive example.com, max 100 URLs 139 | # WaybackArchiver.urls(%w(example.com www.example.com), limit: 100) 140 | def self.urls(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block) 141 | Archive.post(Array(urls), concurrency: concurrency, limit: limit, &block) 142 | end 143 | 144 | # Set logger 145 | # @return [Object] the set logger 146 | # @param [Object] logger an object that quacks like a Logger (i.e. responds to the standard Logger methods) 147 | # @example set a logger that prints to standard out (STDOUT) 148 | # WaybackArchiver.logger = Logger.new(STDOUT) 149 | def self.logger=(logger) 150 | @logger = logger 151 | end 152 | 153 | # Returns the current logger 154 | # @return [Object] the current logger instance 155 | def self.logger 156 | @logger ||= NullLogger.new 157 | end 158 | 159 | # Resets the logger to the default 160 | # @return [NullLogger] a new instance of NullLogger 161 | def self.default_logger! 162 | @logger = NullLogger.new 163 | end 164 | 165 | # Sets the user agent 166 | # @return [String] the configured user agent 167 | # @param [String] user_agent the desired user agent 168 | def self.user_agent=(user_agent) 169 | @user_agent = user_agent 170 | end 171 | 172 | # Returns the configured user agent 173 | # @return [String] the configured or the default user agent 174 | def self.user_agent 175 | @user_agent ||= USER_AGENT 176 | end 177 | 178 | # Sets the default respect_robots_txt 179 | # @return [Boolean] the desired default for respect_robots_txt 180 | # @param [Boolean] respect_robots_txt the desired default 181 | def self.respect_robots_txt=(respect_robots_txt) 182 | @respect_robots_txt = respect_robots_txt 183 | end 184 | 185 | # Returns the default respect_robots_txt 186 | # @return [Boolean] the configured or the default respect_robots_txt 187 | def self.respect_robots_txt 188 | @respect_robots_txt ||= DEFAULT_RESPECT_ROBOTS_TXT 189 | end 190 | 191 | # Sets the default concurrency 192 | # @return [Integer] the desired default concurrency 193 | # @param [Integer] concurrency the desired default concurrency 194 | def self.concurrency=(concurrency) 195 | @concurrency = concurrency 196 | end 197 | 198 | # Returns the default concurrency 199 | # @return [Integer] the configured or the default concurrency 200 | def self.concurrency 201 | @concurrency ||= DEFAULT_CONCURRENCY 202 | end 203 | 204 | # Sets the default max_limit 205 | # @return [Integer] the desired default max_limit 206 | # @param [Integer] max_limit the desired default max_limit 207 | def self.max_limit=(max_limit) 208 | @max_limit = max_limit 209 | end 210 | 211 | # Returns the default max_limit 212 | # @return [Integer] the configured or the default max_limit 213 | def self.max_limit 214 | @max_limit ||= DEFAULT_MAX_LIMIT 215 | end 216 | 217 | # Sets the adapter 218 | # @return [#call] the configured adapter 219 | # @param [#call] adapter the adapter, must respond to #call 220 | def self.adapter=(adapter) 221 | unless adapter.respond_to?(:call) 222 | raise(ArgumentError, 'adapter must implement #call') 223 | end 224 | 225 | @adapter = adapter 226 | end 227 | 228 | # Returns the configured adapter 229 | # @return [#call] the configured or the default adapter 230 | def self.adapter 231 | @adapter ||= WaybackMachine 232 | end 233 | end 234 | 
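235 | # Usage sketch (illustrative, mirrors the README examples): 236 | # 237 | #   WaybackArchiver.concurrency = 2 238 | #   WaybackArchiver.archive('example.com', strategy: :auto, limit: 50) do |result| 239 | #     puts result.archived_url if result.success? 240 | #   end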
--------------------------------------------------------------------------------