├── spec
│   ├── data
│   │   ├── robots.txt
│   │   ├── test_gzip.gz
│   │   ├── sitemap.xml
│   │   ├── sitemap_index.xml
│   │   └── sitemap_index_with_duplicate_url.xml
│   ├── support
│   │   └── test_logger.rb
│   ├── wayback_archiver
│   │   ├── null_logger_spec.rb
│   │   ├── archive_result_spec.rb
│   │   ├── thread_pool_spec.rb
│   │   ├── http_code_spec.rb
│   │   ├── url_collector_spec.rb
│   │   ├── adapters
│   │   │   └── wayback_machine_spec.rb
│   │   ├── archive_spec.rb
│   │   ├── sitemap_spec.rb
│   │   ├── sitemapper_spec.rb
│   │   └── request_spec.rb
│   ├── spec_helper.rb
│   └── wayback_archiver_spec.rb
├── lib
│   ├── wayback_archiver
│   │   ├── version.rb
│   │   ├── null_logger.rb
│   │   ├── response.rb
│   │   ├── archive_result.rb
│   │   ├── thread_pool.rb
│   │   ├── adapters
│   │   │   └── wayback_machine.rb
│   │   ├── http_code.rb
│   │   ├── url_collector.rb
│   │   ├── sitemap.rb
│   │   ├── sitemapper.rb
│   │   ├── archive.rb
│   │   └── request.rb
│   ├── robots.rb
│   └── wayback_archiver.rb
├── Gemfile
├── .travis.yml
├── .github
│   └── dependabot.yml
├── .gitignore
├── Rakefile
├── LICENSE
├── CHANGELOG.md
├── wayback_archiver.gemspec
├── bin
│   └── wayback_archiver
└── README.md
/spec/data/robots.txt:
--------------------------------------------------------------------------------
1 | Sitemap: http://www.example.com/sitemap.xml
--------------------------------------------------------------------------------
/spec/data/test_gzip.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buren/wayback_archiver/HEAD/spec/data/test_gzip.gz
--------------------------------------------------------------------------------
/lib/wayback_archiver/version.rb:
--------------------------------------------------------------------------------
1 | module WaybackArchiver
2 |   # Gem version
3 |   VERSION = '1.5.0'.freeze
4 | end
5 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 |
3 | # Specify your gem's dependencies in wayback_archiver.gemspec
4 | gemspec
5 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: false
2 | language: ruby
3 | rvm:
4 | - 3.0.0
5 | - 3.1.0
6 | - 3.2.0
7 | - 3.3.0
8 | before_install: gem install bundler
9 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   - package-ecosystem: bundler
4 |     directory: "/"
5 |     schedule:
6 |       interval: monthly
7 |       time: "04:00"
8 |     open-pull-requests-limit: 10
9 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.gem
2 | *.rbc
3 | .bundle
4 | .config
5 | .yardoc
6 | Gemfile.lock
7 | InstalledFiles
8 | _yardoc
9 | coverage
10 | doc/
11 | lib/bundler/man
12 | pkg
13 | rdoc
14 | spec/reports
15 | test/tmp
16 | test/version_tmp
17 | tmp
18 | TODO.md
19 | .byebug_history
20 |
--------------------------------------------------------------------------------
/lib/wayback_archiver/null_logger.rb:
--------------------------------------------------------------------------------
1 | require 'logger'
2 |
3 | module WaybackArchiver
4 |   # Don't log anything / Send the logs to the abyss
5 |   class NullLogger < Logger
6 |     # Allow any and all params
7 |     def initialize(*args); end
8 | 
9 |     # Allow any and all params and don't do anything
10 |     def add(*args, &block); end
11 |   end
12 | end
13 |
--------------------------------------------------------------------------------
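A minimal usage sketch for NullLogger, assuming `require 'wayback_archiver'` loads the gem and that the `WaybackArchiver.logger=` setter (used in spec_helper.rb) is available:

    require 'wayback_archiver'

    # Route all of the gem's log output into the abyss
    WaybackArchiver.logger = WaybackArchiver::NullLogger.new
    WaybackArchiver.logger.info('never written anywhere') # => nil
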
/spec/data/sitemap.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
3 |   <url>
4 |     <loc>http://www.example.com/</loc>
5 |     <lastmod>2005-01-01</lastmod>
6 |     <changefreq>monthly</changefreq>
7 |     <priority>0.8</priority>
8 |   </url>
9 | </urlset>
10 | 
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | require 'bundler/gem_tasks'
2 |
3 | task default: :spec
4 |
5 | task :console do
6 |   require 'bundler/setup'
7 |   require 'irb'
8 |   require 'wayback_archiver'
9 |   ARGV.clear
10 |   IRB.start
11 | end
12 | 
13 | task :spec do
14 |   begin
15 |     require 'rspec/core/rake_task'
16 |     RSpec::Core::RakeTask.new(:spec)
17 |   rescue LoadError
18 |     puts 'Could *not* load rspec'
19 |   end
20 | end
21 |
--------------------------------------------------------------------------------
/lib/wayback_archiver/response.rb:
--------------------------------------------------------------------------------
1 | module WaybackArchiver
2 |   # Response data struct
3 |   Response = Struct.new(:code, :message, :body, :uri, :error)
4 |   class Response
5 |     # Returns true if the response was successful
6 |     # @example check if Response was successful
7 |     #   response = Response.new('200', 'OK', 'buren', 'http://example.com')
8 |     #   response.success? # => true
9 |     def success?
10 |       HTTPCode.success?(code)
11 |     end
12 |   end
13 | end
14 |
--------------------------------------------------------------------------------
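A short sketch of the Response struct in use, mirroring the @example above (assumes the gem is loaded so HTTPCode is defined):

    response = WaybackArchiver::Response.new('200', 'OK', 'buren', 'http://example.com')
    response.success? # => true, delegates to HTTPCode.success?('200')
    response.error    # => nil, the last Struct member was not set
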
/spec/data/sitemap_index.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
3 |   <sitemap>
4 |     <loc>http://www.example.com/sitemap1.xml.gz</loc>
5 |     <lastmod>2004-10-01T18:23:17+00:00</lastmod>
6 |   </sitemap>
7 |   <sitemap>
8 |     <loc>http://www.example.com/sitemap2.xml.gz</loc>
9 |     <lastmod>2005-01-01</lastmod>
10 |   </sitemap>
11 | </sitemapindex>
12 | 
--------------------------------------------------------------------------------
/spec/support/test_logger.rb:
--------------------------------------------------------------------------------
1 | require 'logger'
2 |
3 | # Test logger
4 | class TestLogger < Logger
5 |   attr_reader :info_log, :debug_log, :error_log
6 | 
7 |   def initialize(*_args)
8 |     @info_log = []
9 |     @debug_log = []
10 |     @error_log = []
11 |   end
12 | 
13 |   def add(*args)
14 |     log_type, _, log_string = args
15 |     case log_type
16 |     when 0 then @debug_log
17 |     when 1 then @info_log
18 |     when 3 then @error_log
19 |     end << log_string
20 |   end
21 | end
22 |
--------------------------------------------------------------------------------
/spec/data/sitemap_index_with_duplicate_url.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
3 |   <sitemap>
4 |     <loc>http://www.example.com/sitemap1.xml.gz</loc>
5 |     <lastmod>2004-10-01T18:23:17+00:00</lastmod>
6 |   </sitemap>
7 |   <sitemap>
8 |     <loc>http://www.example.com/sitemap1.xml.gz</loc>
9 |     <lastmod>2005-01-01</lastmod>
10 |   </sitemap>
11 | </sitemapindex>
12 | 
--------------------------------------------------------------------------------
/spec/wayback_archiver/null_logger_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe WaybackArchiver::NullLogger do
4 | it 'inherits from Logger' do
5 | expect(described_class.ancestors).to include(Logger)
6 | end
7 |
8 | it 'can be initialized with arguments' do
9 | logger = described_class.new('buren')
10 | expect(logger.is_a?(described_class)).to eq(true)
11 | end
12 |
13 | it 'has #add method that can receive args and a block' do
14 | logger = described_class.new('buren')
15 | expect(logger.add('buren', &:nil?)).to be_nil
16 | end
17 | end
18 |
--------------------------------------------------------------------------------
/spec/wayback_archiver/archive_result_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe WaybackArchiver::ArchiveResult do
4 | describe '#archived_url' do
5 | it 'returns the uri' do
6 | expect(described_class.new('buren').archived_url).to eq('buren')
7 | end
8 | end
9 |
10 | describe '#errored?' do
11 | it 'returns true if errored' do
12 | expect(described_class.new(nil, error: true).errored?).to eq(true)
13 | end
14 | end
15 |
16 | describe '#success?' do
17 | it 'returns true if success' do
18 | expect(described_class.new(nil, error: nil).success?).to eq(true)
19 | end
20 | end
21 | end
22 |
--------------------------------------------------------------------------------
/lib/wayback_archiver/archive_result.rb:
--------------------------------------------------------------------------------
1 | module WaybackArchiver
2 |   # Result data for posting URL to archive
3 |   class ArchiveResult
4 |     attr_reader :uri, :code, :request_url, :response_error, :error
5 | 
6 |     def initialize(uri, code: nil, request_url: nil, response_error: nil, error: nil)
7 |       @uri = uri
8 |       @code = code
9 |       @request_url = request_url
10 |       @response_error = response_error
11 |       @error = error
12 |     end
13 | 
14 |     # @return [String] the URL that was archived
15 |     def archived_url
16 |       uri
17 |     end
18 | 
19 |     # @return [Boolean] true if success
20 |     def success?
21 |       !errored?
22 |     end
23 | 
24 |     # @return [Boolean] true if errored
25 |     def errored?
26 |       !!error
27 |     end
28 |   end
29 | end
30 |
--------------------------------------------------------------------------------
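A sketch of how ArchiveResult behaves for a successful and a failed archiving attempt (values are illustrative):

    ok = WaybackArchiver::ArchiveResult.new('http://example.com', code: '200')
    ok.success?      # => true, since no error was passed
    ok.archived_url  # => 'http://example.com'

    failed = WaybackArchiver::ArchiveResult.new('http://example.com', error: StandardError.new('boom'))
    failed.errored?  # => true, error is truthy
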
/spec/wayback_archiver/thread_pool_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe WaybackArchiver::ThreadPool do
4 | context 'with concurrency less than 1' do
5 | it 'raises ArgumentError' do
6 | expect { described_class.build(0) }.to raise_error(ArgumentError)
7 | end
8 | end
9 |
10 | context 'with concurrency 1' do
11 | it 'returns a Concurrent::ImmediateExecutor' do
12 | thread_pool = described_class.build(1)
13 | expect(thread_pool).to be_an_instance_of(Concurrent::ImmediateExecutor)
14 | end
15 | end
16 |
17 | context 'with concurrency greater than 1' do
18 | it 'returns a Concurrent::FixedThreadPool' do
19 | thread_pool = described_class.build(2)
20 | expect(thread_pool).to be_an_instance_of(Concurrent::FixedThreadPool)
21 | end
22 | end
23 | end
24 |
--------------------------------------------------------------------------------
/spec/spec_helper.rb:
--------------------------------------------------------------------------------
1 | require 'simplecov'
2 | require 'coveralls'
3 |
4 | formatters = [
5 | SimpleCov::Formatter::HTMLFormatter,
6 | Coveralls::SimpleCov::Formatter
7 | ]
8 | SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new(formatters)
9 | SimpleCov.start
10 |
11 | Dir['./spec/support/**/*.rb'].each { |file| require file }
12 |
13 | require 'wayback_archiver'
14 | require 'webmock/rspec'
15 | require 'byebug'
16 |
17 | WebMock.disable_net_connect!
18 |
19 | RSpec.configure do |config|
20 | config.order = 'random'
21 | config.run_all_when_everything_filtered = false
22 |
23 | config.before(:each) do
24 | WaybackArchiver.logger = TestLogger.new
25 |
26 | # Set default concurrency to 1, so we don't have to deal with concurrency
27 | # issues in WebMock and rspec-mocks
28 | WaybackArchiver.concurrency = 1
29 |
30 | WaybackArchiver.max_limit = WaybackArchiver::DEFAULT_MAX_LIMIT
31 | end
32 | end
33 |
--------------------------------------------------------------------------------
/lib/wayback_archiver/thread_pool.rb:
--------------------------------------------------------------------------------
1 | require 'concurrent'
2 |
3 | module WaybackArchiver
4 |   # Thread pool
5 |   class ThreadPool
6 |     # Build a thread pool
7 |     # @return [Concurrent::FixedThreadPool, Concurrent::ImmediateExecutor] an instance of a concurrent thread pool
8 |     # @param [Integer] concurrency the desired concurrency
9 |     # @example Build a thread pool with 10 as the desired concurrency
10 |     #   pool = ThreadPool.build(10) # => Concurrent::FixedThreadPool
11 |     #   pool.post { some_work }
12 |     # @example Build a thread pool with 1 as the desired concurrency
13 |     #   pool = ThreadPool.build(1) # => Concurrent::ImmediateExecutor
14 |     #   pool.post { some_work }
15 |     # @see https://github.com/ruby-concurrency/concurrent-ruby/blob/master/doc/thread_pools.md
16 |     def self.build(concurrency)
17 |       if concurrency == 1
18 |         Concurrent::ImmediateExecutor.new
19 |       elsif concurrency > 1
20 |         Concurrent::FixedThreadPool.new(concurrency)
21 |       else
22 |         raise ArgumentError, 'concurrency must be one or greater'
23 |       end
24 |     end
25 |   end
26 | end
27 |
--------------------------------------------------------------------------------
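A usage sketch for ThreadPool, following the same shutdown sequence Archive.post uses (assumes concurrent-ruby is installed):

    require 'wayback_archiver'

    pool = WaybackArchiver::ThreadPool.build(2) # => Concurrent::FixedThreadPool
    5.times { |i| pool.post { puts "job #{i}" } }

    # Wait for queued work to finish before exiting
    pool.shutdown
    pool.wait_for_termination
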
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2014 Jacob Burenstam Linder
2 |
3 | MIT License
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining
6 | a copy of this software and associated documentation files (the
7 | "Software"), to deal in the Software without restriction, including
8 | without limitation the rights to use, copy, modify, merge, publish,
9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 |
--------------------------------------------------------------------------------
/lib/wayback_archiver/adapters/wayback_machine.rb:
--------------------------------------------------------------------------------
1 | require 'wayback_archiver/archive_result'
2 | require 'wayback_archiver/request'
3 |
4 | module WaybackArchiver
5 |   # WaybackMachine adapter
6 |   class WaybackMachine
7 |     # Wayback Machine base URL.
8 |     BASE_URL = 'https://web.archive.org/save/'.freeze
9 | 
10 |     # Send URL to Wayback Machine.
11 |     # @return [ArchiveResult] the result for the sent URL.
12 |     # @param [String] url to send.
13 |     # @example Archive example.com, with default options
14 |     #   WaybackMachine.call('http://example.com')
15 |     def self.call(url)
16 |       request_url = "#{BASE_URL}#{url&.strip}"
17 |       response = Request.get(request_url, follow_redirects: false)
18 |       WaybackArchiver.logger.info "Posted [#{response.code}, #{response.message}] #{url}"
19 |       ArchiveResult.new(
20 |         url,
21 |         code: response.code,
22 |         request_url: response.uri,
23 |         response_error: response.error
24 |       )
25 |     rescue Request::Error => e
26 |       WaybackArchiver.logger.error "Failed to archive #{url}: #{e.class}, #{e.message}"
27 |       ArchiveResult.new(url, error: e)
28 |     end
29 |   end
30 | end
31 |
--------------------------------------------------------------------------------
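A sketch of calling the adapter directly (note: this performs a real HTTP request against web.archive.org, so the result code depends on the live service; the specs stub it with a 301):

    result = WaybackArchiver::WaybackMachine.call('http://example.com')
    result.success?    # => true unless a Request::Error was rescued
    result.request_url # the https://web.archive.org/save/... URI that was requested
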
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Change Log
2 |
3 | ## HEAD
4 |
5 | ## v1.5.0
6 |
7 | - Strip URLs found in Sitemaps
8 | - Inline `robots` dependency, closes [#51](https://github.com/buren/wayback_archiver/issues/51)
9 | - Update Sitemap XML parsing to work better with newer versions of REXML
10 | - Fix issue calling `Spidr` with option hash (i.e. use double splat operator)
11 |
12 | ## v1.4.0
13 |
14 | * Don't respect robots.txt file by default, [PR#41](https://github.com/buren/wayback_archiver/pull/41)
15 | * Add `WaybackArchiver::respect_robots_txt=` configuration option, to control whether to respect robots.txt file or not
16 | * Update `spidr` gem, resolves [issue#25](https://github.com/buren/wayback_archiver/issues/25)
17 | * Set default concurrency to `1` due to harsher rate limiting on Wayback Machine
18 | * Support for crawling multiple hosts, for example www.example.com, example.com and app.example.com [PR#27](https://github.com/buren/wayback_archiver/pull/27)
19 |
20 | ## v1.3.0
21 |
22 | * Archive every page found, not only HTML pages - [#24](https://github.com/buren/wayback_archiver/pull/24) thanks [@chlorophyll-zz](https://github.com/chlorophyll-zz).
23 |
24 | ## v1.2.1
25 |
26 | * Track which URLs have been visited in sitemapper and don't visit them twice
27 | * Protect against duplicate URLs in sitemap indexes
28 |
29 | ## v1.2.0
30 |
31 | Is history...
32 |
--------------------------------------------------------------------------------
/lib/wayback_archiver/http_code.rb:
--------------------------------------------------------------------------------
1 | module WaybackArchiver
2 |   # Convenience class for HTTP response codes
3 |   class HTTPCode
4 |     # Type of code as symbol
5 |     # @return [Symbol] code type
6 |     # @param [String/Integer] code the response code
7 |     # @example
8 |     #   HTTPCode.type('200') # => :success
9 |     def self.type(code)
10 |       code = code.to_s
11 |       return :success if success?(code)
12 |       return :redirect if redirect?(code)
13 |       return :error if error?(code)
14 | 
15 |       :unknown
16 |     end
17 | 
18 |     # Whether the code is a success type
19 |     # @return [Boolean] is success or not
20 |     # @param [String] code the response code
21 |     # @example
22 |     #   HTTPCode.success?('200') # => true
23 |     # @example
24 |     #   HTTPCode.success?(200) # => true
25 |     # @example
26 |     #   HTTPCode.success?(nil) # => false
27 |     def self.success?(code)
28 |       !!code.to_s.match(/2\d\d/)
29 |     end
30 | 
31 |     # Whether the code is a redirect type
32 |     # @return [Boolean] is redirect or not
33 |     # @param [String] code the response code
34 |     # @example
35 |     #   HTTPCode.redirect?('301') # => true
36 |     def self.redirect?(code)
37 |       !!code.to_s.match(/3\d\d/)
38 |     end
39 | 
40 |     # Whether the code is an error type
41 |     # @return [Boolean] is error or not
42 |     # @param [String] code the response code
43 |     # @example
44 |     #   HTTPCode.error?('404') # => true
45 |     def self.error?(code)
46 |       !!code.to_s.match(/4\d\d/) || !!code.to_s.match(/5\d\d/)
47 |     end
48 |   end
49 | end
50 |
--------------------------------------------------------------------------------
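A few illustrative calls, matching the cases exercised in http_code_spec.rb:

    WaybackArchiver::HTTPCode.type('200')    # => :success
    WaybackArchiver::HTTPCode.type(301)      # => :redirect, Integers are accepted too
    WaybackArchiver::HTTPCode.error?('503')  # => true
    WaybackArchiver::HTTPCode.type('999')    # => :unknown
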
/wayback_archiver.gemspec:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | lib = File.expand_path('../lib', __FILE__)
4 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5 | require 'wayback_archiver/version'
6 |
7 | Gem::Specification.new do |spec|
8 | spec.name = 'wayback_archiver'
9 | spec.version = WaybackArchiver::VERSION
10 | spec.authors = ['Jacob Burenstam']
11 | spec.email = ['burenstam@gmail.com']
12 |
13 | spec.summary = 'Post URLs to Wayback Machine (Internet Archive)'
14 | spec.description = 'Post URLs to Wayback Machine (Internet Archive), using a crawler, from Sitemap(s) or a list of URLs.'
15 | spec.homepage = 'https://github.com/buren/wayback_archiver'
16 | spec.license = 'MIT'
17 |
18 | spec.files = Dir.glob('{bin,lib}/**/*')
19 | spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
20 | spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
21 | spec.require_paths = ['lib']
22 |
23 | spec.required_ruby_version = '>= 2.0.0'
24 |
25 | spec.add_runtime_dependency 'spidr', '~> 0.7.1' # Crawl sites
26 | spec.add_runtime_dependency 'concurrent-ruby', '~> 1.3' # Concurrency primitives
27 | spec.add_runtime_dependency 'rexml', '~> 3.3.9'
28 |
29 | spec.add_development_dependency 'bundler', '~> 2.1'
30 | spec.add_development_dependency 'rake', '~> 12.3'
31 | spec.add_development_dependency 'rspec', '~> 3.1'
32 | spec.add_development_dependency 'yard', '~> 0.9'
33 | spec.add_development_dependency 'simplecov', '~> 0.14.1'
34 | spec.add_development_dependency 'coveralls', '~> 0.8'
35 | spec.add_development_dependency 'redcarpet', '~> 3.2'
36 | spec.add_development_dependency 'webmock', '~> 3.0'
37 | spec.add_development_dependency 'byebug', '~> 11.1.3'
38 | end
39 |
--------------------------------------------------------------------------------
/spec/wayback_archiver/http_code_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe WaybackArchiver::HTTPCode do
4 | describe '::type' do
5 | [
6 | # argument, expected
7 | [200, :success],
8 | ['200', :success],
9 | ['301', :redirect],
10 | ['302', :redirect],
11 | ['400', :error],
12 | ['404', :error],
13 | ['500', :error],
14 | ['503', :error],
15 | ['999', :unknown]
16 | ].each do |data|
17 | code, expected = data
18 |
19 | it "returns #{expected} for #{code} code" do
20 | expect(described_class.type(code)).to eq(expected)
21 | end
22 | end
23 | end
24 |
25 | describe '::success?' do
26 | it 'returns true when code is success' do
27 | code = '200'
28 | expect(described_class.success?(code)).to eq(true)
29 | end
30 |
31 | it 'returns false when code is not success' do
32 | code = '300'
33 | expect(described_class.success?(code)).to eq(false)
34 | end
35 | end
36 |
37 | describe '::error?' do
38 | it 'returns true when code is 400 error' do
39 | code = '400'
40 | expect(described_class.error?(code)).to eq(true)
41 | end
42 |
43 | it 'returns true when code is 500 error' do
44 | code = '500'
45 | expect(described_class.error?(code)).to eq(true)
46 | end
47 |
48 | it 'returns false when code is not error' do
49 | code = '200'
50 | expect(described_class.error?(code)).to eq(false)
51 | end
52 | end
53 |
54 | describe '::redirect?' do
55 | it 'returns true when code is redirect' do
56 | code = '300'
57 | expect(described_class.redirect?(code)).to eq(true)
58 | end
59 |
60 | it 'returns false when code is not redirect' do
61 | code = '200'
62 | expect(described_class.redirect?(code)).to eq(false)
63 | end
64 | end
65 | end
66 |
--------------------------------------------------------------------------------
/spec/wayback_archiver/url_collector_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe WaybackArchiver::URLCollector do
4 | describe '::sitemap' do
5 | it 'calls Sitemapper::urls' do
6 | expected = %w[http://example.com]
7 | allow(WaybackArchiver::Sitemapper).to receive(:urls).and_return(expected)
8 | expect(described_class.sitemap('http://example.com')).to eq(expected)
9 | end
10 | end
11 |
12 | describe '::crawl' do
13 | let(:headers) do
14 | {
15 | 'Accept' => '*/*',
16 | 'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3',
17 | 'User-Agent' => WaybackArchiver.user_agent
18 | }
19 | end
20 |
21 | it 'can crawl' do
22 | html_page = <<-HTML
23 | <html>
24 | <head>
25 | <title>Testing</title>
26 | </head>
27 | <body>
28 | <p>
29 | <a href="/found">An URL</a>
30 | </p>
31 | </body>
32 | </html>
33 | HTML
34 |
35 | response_headers = { 'Content-Type' => 'text/html; charset=utf-8' }
36 |
37 | stub_request(:get, 'http://example.com/robots.txt')
38 | .with(headers: headers)
39 | .to_return(status: 200, body: '', headers: {})
40 |
41 | stub_request(:get, 'http://example.com/')
42 | .with(headers: headers)
43 | .to_return(status: 200, body: html_page, headers: response_headers)
44 |
45 | stub_request(:get, 'http://example.com/found')
46 | .with(headers: headers)
47 | .to_return(status: 200, body: '', headers: response_headers)
48 |
49 | expected_urls = %w[http://example.com http://example.com/found]
50 | expected_urls_dup = expected_urls.dup
51 | found_urls = described_class.crawl('http://example.com') do |url|
52 | expect(url).to eq(expected_urls.shift)
53 | end
54 |
55 | expect(found_urls).to eq(expected_urls_dup)
56 | end
57 | end
58 | end
59 |
--------------------------------------------------------------------------------
/spec/wayback_archiver/adapters/wayback_machine_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe WaybackArchiver::WaybackMachine do
4 | let(:headers) do
5 | {
6 | 'Accept' => '*/*',
7 | 'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3',
8 | 'User-Agent' => WaybackArchiver.user_agent
9 | }
10 | end
11 |
12 | describe '::call' do
13 | it 'posts URL to the Wayback Machine' do
14 | url = 'https://example.com'
15 | expected_request_url = "https://web.archive.org/save/#{url}"
16 |
17 | stub_request(:get, expected_request_url)
18 | .with(headers: headers)
19 | .to_return(status: 301, body: 'buren', headers: {})
20 |
21 | result = described_class.call(url)
22 |
23 | expect(result.uri).to eq(url)
24 | expect(result.code).to eq('301')
25 | expect(WaybackArchiver.logger.debug_log.first).to include(expected_request_url)
26 | expect(WaybackArchiver.logger.info_log.last).to include(url)
27 | end
28 |
29 | it 'rescues and logs Request::MaxRedirectError' do
30 | allow(WaybackArchiver::Request).to receive(:get)
31 | .and_raise(WaybackArchiver::Request::MaxRedirectError, 'too many redirects')
32 |
33 | url = 'https://example.com'
34 | expected_request_url = "https://web.archive.org/save/#{url}"
35 |
36 | stub_request(:get, expected_request_url)
37 | .with(headers: headers)
38 | .to_return(status: 301, body: 'buren', headers: {})
39 |
40 | result = described_class.call(url)
41 |
42 | expect(result.uri).to eq(url)
43 | expect(result.response_error).to be_nil
44 | expect(result.request_url).to be_nil
45 | expect(result.error).to be_a(WaybackArchiver::Request::MaxRedirectError)
46 |
47 | last_error_log = WaybackArchiver.logger.error_log.last
48 | expect(last_error_log).to include(url)
49 | expect(last_error_log).to include('MaxRedirectError')
50 | expect(last_error_log).to include('too many redirects')
51 | end
52 | end
53 | end
54 |
--------------------------------------------------------------------------------
/lib/wayback_archiver/url_collector.rb:
--------------------------------------------------------------------------------
1 | require 'spidr'
2 | require 'robots'
3 |
4 | require 'wayback_archiver/sitemapper'
5 | require 'wayback_archiver/request'
6 |
7 | module WaybackArchiver
8 |   # Retrieve URLs from different sources
9 |   class URLCollector
10 |     # Retrieve URLs from Sitemap.
11 |     # @return [Array] of URLs defined in Sitemap.
12 |     # @param [String] url domain to retrieve Sitemap from.
13 |     # @example Get URLs defined in Sitemap for google.com
14 |     #   URLCollector.sitemap('https://google.com/sitemap.xml')
15 |     def self.sitemap(url)
16 |       Sitemapper.urls(url: Request.build_uri(url))
17 |     end
18 | 
19 |     # Retrieve URLs by crawling.
20 |     # @return [Array] of URLs found during crawl.
21 |     # @param [String] url domain to crawl URLs from.
22 |     # @param [Array] hosts to crawl.
23 |     # @example Crawl URLs defined on example.com
24 |     #   URLCollector.crawl('http://example.com')
25 |     # @example Crawl URLs defined on example.com and limit the number of visited pages to 100
26 |     #   URLCollector.crawl('http://example.com', limit: 100)
27 |     # @example Crawl URLs defined on example.com and explicitly set no upper limit on the number of visited pages
28 |     #   URLCollector.crawl('http://example.com', limit: -1)
29 |     # @example Crawl multiple hosts
30 |     #   URLCollector.crawl(
31 |     #     'http://example.com',
32 |     #     hosts: [
33 |     #       'example.com',
34 |     #       /host[\d]+\.example\.com/
35 |     #     ]
36 |     #   )
37 |     def self.crawl(url, hosts: [], limit: WaybackArchiver.max_limit)
38 |       urls = []
39 |       start_at_url = Request.build_uri(url).to_s
40 |       options = {
41 |         robots: WaybackArchiver.respect_robots_txt,
42 |         hosts: hosts,
43 |         user_agent: WaybackArchiver.user_agent
44 |       }
45 |       options[:limit] = limit unless limit == -1
46 | 
47 |       Spidr.site(start_at_url, **options) do |spider|
48 |         spider.every_page do |page|
49 |           page_url = page.url.to_s
50 |           urls << page_url
51 |           WaybackArchiver.logger.debug "Found: #{page_url}"
52 |           yield(page_url) if block_given?
53 |         end
54 |       end
55 |       urls
56 |     end
57 |   end
58 | end
59 |
--------------------------------------------------------------------------------
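A crawl sketch (requires network access; spidr fetches the pages):

    require 'wayback_archiver'

    # Print each URL as it is found, visiting at most 10 pages
    urls = WaybackArchiver::URLCollector.crawl('http://example.com', limit: 10) do |url|
      puts "Found: #{url}"
    end
    urls.length # => number of pages visited (at most 10)
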
/bin/wayback_archiver:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | require 'optparse'
4 | require 'wayback_archiver'
5 |
6 | # Default values
7 | urls = nil
8 | strategy = 'auto'
9 | log = STDOUT
10 | log_level = Logger::INFO
11 | concurrency = WaybackArchiver.concurrency
12 | limit = WaybackArchiver.max_limit
13 | hosts = []
14 |
15 | optparse = OptionParser.new do |parser|
16 |   parser.banner = 'Usage: wayback_archiver <url> [options]'
17 | 
18 |   parser.on('--auto', 'Auto (default)') do |value|
19 |     strategy = 'auto'
20 |   end
21 | 
22 |   parser.on('--crawl', 'Crawl') do |value|
23 |     strategy = 'crawl'
24 |   end
25 | 
26 |   parser.on('--sitemap', 'Sitemap') do |value|
27 |     strategy = 'sitemap'
28 |   end
29 | 
30 |   parser.on('--urls', '--url', 'URL(s)') do |value|
31 |     strategy = 'urls'
32 |   end
33 | 
34 |   parser.on('--hosts=[example.com]', Array, 'Only spider links on certain hosts') do |value|
35 |     hosts = value.map { |v| Regexp.new(v) } if value
36 |   end
37 | 
38 |   parser.on('--concurrency=1', Integer, 'Concurrency') do |value|
39 |     concurrency = value
40 |   end
41 | 
42 |   parser.on('--limit=5', Integer, 'Max number of URLs to archive') do |value|
43 |     limit = value
44 |   end
45 | 
46 |   parser.on('--log=output.log', String, 'Path to desired log file (if no argument is given it defaults to STDOUT)') do |path|
47 |     log = path
48 |   end
49 | 
50 |   parser.on('--[no-]verbose', 'Verbose logs') do |value|
51 |     log_level = value ? Logger::DEBUG : Logger::WARN
52 |   end
53 | 
54 |   parser.on('-h', '--help', 'How to use') do
55 |     puts parser
56 |     exit
57 |   end
58 | 
59 |   # No argument, shows at tail. This will print an options summary.
60 |   parser.on_tail('-h', '--help', 'Show this message') do
61 |     puts parser
62 |     exit
63 |   end
64 | 
65 |   parser.on_tail('--version', 'Show version') do
66 |     puts "WaybackArchiver version #{WaybackArchiver::VERSION}"
67 |     exit
68 |   end
69 | end
70 | 
71 | optparse.parse!
72 | 
73 | urls = ARGV.map(&:strip).reject(&:empty?)
74 | if urls.empty?
75 |   puts optparse.help
76 |   raise ArgumentError, "<url> is required"
77 | end
78 | 
79 | WaybackArchiver.logger = Logger.new(log).tap do |logger|
80 |   logger.progname = 'WaybackArchiver'
81 |   logger.level = log_level
82 | end
83 | 
84 | # If no strategy has explicitly been given, then default to 'auto'
85 | strategy ||= 'auto'
86 | urls.each do |url|
87 |   WaybackArchiver.archive(
88 |     url,
89 |     hosts: hosts,
90 |     strategy: strategy,
91 |     concurrency: concurrency,
92 |     limit: limit
93 |   )
94 | end
95 |
--------------------------------------------------------------------------------
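Example invocations implied by the option parser above (the strategy flags map to WaybackArchiver.archive strategies):

    $ wayback_archiver https://example.com --crawl --concurrency=1 --limit=100
    $ wayback_archiver https://example.com/sitemap.xml --sitemap --log=output.log --verbose
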
/lib/wayback_archiver/sitemap.rb:
--------------------------------------------------------------------------------
1 | require 'uri'
2 | require 'rexml/document'
3 |
4 | module WaybackArchiver
5 |   # Parse Sitemaps, https://www.sitemaps.org
6 |   class Sitemap
7 |     attr_reader :document
8 | 
9 |     def initialize(xml_or_string, strict: false)
10 |       @contents = xml_or_string
11 |       @document = REXML::Document.new(xml_or_string)
12 |     rescue REXML::ParseException => _e
13 |       raise if strict
14 | 
15 |       @document = REXML::Document.new('')
16 |     end
17 | 
18 |     # Return all URLs defined in Sitemap.
19 |     # @return [Array] of URLs defined in Sitemap.
20 |     # @example Get URLs defined in Sitemap
21 |     #   sitemap = Sitemap.new(xml)
22 |     #   sitemap.urls
23 |     def urls
24 |       @urls ||= extract_urls('url')
25 |     end
26 | 
27 |     # Return all sitemap URLs defined in Sitemap.
28 |     # @return [Array] of Sitemap URLs defined in Sitemap.
29 |     # @example Get Sitemap URLs defined in Sitemap
30 |     #   sitemap = Sitemap.new(xml)
31 |     #   sitemap.sitemaps
32 |     def sitemaps
33 |       @sitemaps ||= extract_urls('sitemap')
34 |     end
35 | 
36 |     # Check if sitemap is a plain file
37 |     # @return [Boolean] whether document is plain
38 |     def plain_document?
39 |       document.elements.empty?
40 |     end
41 | 
42 |     # Return the name of the document root (if there is one)
43 |     # @return [String] the document root name
44 |     def root_name
45 |       return unless document.root
46 | 
47 |       document.root.name
48 |     end
49 | 
50 |     # Returns true if Sitemap is a Sitemap index
51 |     # @return [Boolean] whether the Sitemap is a Sitemap index or not
52 |     # @example Check if Sitemap is a sitemap index
53 |     #   sitemap = Sitemap.new(xml)
54 |     #   sitemap.sitemap_index?
55 |     def sitemap_index?
56 |       root_name == 'sitemapindex'
57 |     end
58 | 
59 |     # Returns true if Sitemap lists regular URLs
60 |     # @return [Boolean] whether the Sitemap is a regular URL list
61 |     # @example Check if Sitemap is a regular URL list
62 |     #   sitemap = Sitemap.new(xml)
63 |     #   sitemap.urlset?
64 |     def urlset?
65 |       root_name == 'urlset'
66 |     end
67 | 
68 |     private
69 | 
70 |     def valid_url?(url)
71 |       uri = URI.parse(url)
72 |       uri.is_a?(URI::HTTP) || uri.is_a?(URI::HTTPS)
73 |     rescue URI::InvalidURIError
74 |       false
75 |     end
76 | 
77 |     # Extract URLs from Sitemap
78 |     def extract_urls(node_name)
79 |       if plain_document?
80 |         return @contents.to_s
81 |                         .each_line.map(&:strip)
82 |                         .select(&method(:valid_url?))
83 |       end
84 | 
85 |       urls = []
86 |       document.root.elements.each("#{node_name}/loc") do |element|
87 |         urls << element.text
88 |       end
89 |       urls
90 |     end
91 |   end
92 | end
93 |
--------------------------------------------------------------------------------
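A parsing sketch using the fixtures from spec/data (paths assume the repository root as working directory):

    xml = File.read('spec/data/sitemap.xml')
    sitemap = WaybackArchiver::Sitemap.new(xml)
    sitemap.urlset? # => true
    sitemap.urls    # => ['http://www.example.com/']

    # Plain text sitemaps, one URL per line, are handled via plain_document?
    plain = WaybackArchiver::Sitemap.new("http://example.com/\nhttp://example.com/about")
    plain.urls # => ['http://example.com/', 'http://example.com/about']
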
/spec/wayback_archiver/archive_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe WaybackArchiver::Archive do
4 | let(:headers) do
5 | {
6 | 'Accept' => '*/*',
7 | 'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3',
8 | 'User-Agent' => WaybackArchiver.user_agent
9 | }
10 | end
11 |
12 | describe '::post' do
13 | it 'calls ::post_url for each URL' do
14 | allow(described_class).to receive(:post_url).and_return(WaybackArchiver::ArchiveResult.new(nil))
15 |
16 | result = described_class.post(%w[https://example.com https://example.com/path])
17 |
18 | expect(described_class).to have_received(:post_url).twice
19 | end
20 |
21 | it 'calls ::post_url for each URL with support for a max limit' do
22 | allow(described_class).to receive(:post_url).and_return(WaybackArchiver::ArchiveResult.new(nil))
23 |
24 | result = described_class.post(%w[https://example.com https://example.com/path], limit: 1)
25 |
26 | expect(described_class).to have_received(:post_url).once
27 | end
28 | end
29 |
30 | describe '::crawl' do
31 | it 'calls URLCollector::crawl and ::post_url' do
32 | url = 'https://example.com'
33 |
34 | allow(WaybackArchiver::URLCollector).to receive(:crawl)
35 | .and_yield(url)
36 | .and_return([url])
37 |
38 | allow(described_class).to receive(:post_url).and_return(WaybackArchiver::ArchiveResult.new(url))
39 |
40 | expect(described_class.crawl(url)[0].uri).to eq(url)
41 | end
42 | end
43 |
44 | describe '::post_url' do
45 | it 'posts URL to the Wayback Machine' do
46 | url = 'https://example.com'
47 | expected_request_url = "https://web.archive.org/save/#{url}"
48 |
49 | stub_request(:get, expected_request_url)
50 | .with(headers: headers)
51 | .to_return(status: 301, body: 'buren', headers: {})
52 |
53 | result = described_class.post_url(url)
54 |
55 | expect(result.uri).to eq(url)
56 | expect(result.code).to eq('301')
57 | expect(WaybackArchiver.logger.debug_log.first).to include(expected_request_url)
58 | expect(WaybackArchiver.logger.info_log.last).to include(url)
59 | end
60 |
61 | it 'rescues and logs Request::MaxRedirectError' do
62 | allow(WaybackArchiver::Request).to receive(:get)
63 | .and_raise(WaybackArchiver::Request::MaxRedirectError, 'too many redirects')
64 |
65 | url = 'https://example.com'
66 | expected_request_url = "https://web.archive.org/save/#{url}"
67 |
68 | stub_request(:get, expected_request_url)
69 | .with(headers: headers)
70 | .to_return(status: 301, body: 'buren', headers: {})
71 |
72 | result = described_class.post_url(url)
73 |
74 | expect(result.uri).to eq(url)
75 | expect(result.response_error).to be_nil
76 | expect(result.request_url).to be_nil
77 | expect(result.error).to be_a(WaybackArchiver::Request::MaxRedirectError)
78 |
79 | last_error_log = WaybackArchiver.logger.error_log.last
80 | expect(last_error_log).to include(url)
81 | expect(last_error_log).to include('MaxRedirectError')
82 | expect(last_error_log).to include('too many redirects')
83 | end
84 | end
85 | end
86 |
--------------------------------------------------------------------------------
/lib/wayback_archiver/sitemapper.rb:
--------------------------------------------------------------------------------
1 | require 'set'
2 | require 'robots'
3 |
4 | require 'wayback_archiver/sitemap'
5 | require 'wayback_archiver/request'
6 |
7 | module WaybackArchiver
8 |   # Fetch and parse sitemaps recursively
9 |   class Sitemapper
10 |     # Common locations for Sitemap(s)
11 |     COMMON_SITEMAP_LOCATIONS = %w[
12 |       sitemap_index.xml.gz
13 |       sitemap-index.xml.gz
14 |       sitemap_index.xml
15 |       sitemap-index.xml
16 |       sitemap.xml.gz
17 |       sitemap.xml
18 |     ].freeze
19 | 
20 |     # Autodiscover the location of the Sitemap, then fetch and parse recursively.
21 |     # First it tries /robots.txt, then common locations for Sitemap and finally the supplied URL.
22 |     # @return [Array] of URLs defined in Sitemap(s).
23 |     # @param [URI] url to domain.
24 |     # @example Get URLs defined in Sitemap for google.com
25 |     #   Sitemapper.autodiscover('https://google.com/')
26 |     # @see http://www.sitemaps.org
27 |     def self.autodiscover(url)
28 |       WaybackArchiver.logger.info 'Looking for Sitemap(s) in /robots.txt'
29 |       robots = Robots.new(WaybackArchiver.user_agent)
30 |       sitemaps = robots.other_values(url)['Sitemap']
31 | 
32 |       if sitemaps
33 |         return sitemaps.flat_map do |sitemap|
34 |           WaybackArchiver.logger.info "Fetching Sitemap at #{sitemap}"
35 |           urls(url: sitemap)
36 |         end
37 |       end
38 | 
39 |       COMMON_SITEMAP_LOCATIONS.each do |path|
40 |         WaybackArchiver.logger.info "Looking for Sitemap at #{path}"
41 |         sitemap_url = [url, path].join(url.end_with?('/') ? '' : '/')
42 |         response = Request.get(sitemap_url, raise_on_http_error: false)
43 | 
44 |         if response.success?
45 |           WaybackArchiver.logger.info "Sitemap found at #{sitemap_url}"
46 |           return urls(xml: response.body)
47 |         end
48 |       end
49 | 
50 |       WaybackArchiver.logger.info "Looking for Sitemap at #{url}"
51 |       urls(url: url)
52 |     rescue Request::Error => e
53 |       WaybackArchiver.logger.error "Error raised when requesting #{url}, #{e.class}, #{e.message}"
54 |       []
55 |     end
56 | 
57 |     # Fetch and parse sitemaps recursively.
58 |     # @return [Array] of URLs defined in Sitemap(s).
59 |     # @param url [String] URL to Sitemap.
60 |     # @param xml [String] Sitemap XML.
61 |     # @example Get URLs defined in Sitemap for google.com
62 |     #   Sitemapper.urls(url: 'https://google.com/sitemap.xml')
63 |     # @example Get URLs defined in Sitemap
64 |     #   Sitemapper.urls(xml: xml)
65 |     # @see http://www.sitemaps.org
66 |     def self.urls(url: nil, xml: nil, visited: Set.new)
67 |       if visited.include?(url)
68 |         WaybackArchiver.logger.debug "Already visited #{url} skipping.."
69 |         return []
70 |       end
71 | 
72 |       visited << url if url
73 | 
74 |       xml = Request.get(url).body unless xml
75 |       sitemap = Sitemap.new(xml)
76 | 
77 |       if sitemap.sitemap_index?
78 |         sitemap.sitemaps.flat_map do |sitemap_url|
79 |           urls(url: sitemap_url, visited: visited)
80 |         end
81 |       else
82 |         sitemap.urls.map { |url| url&.strip }
83 |       end
84 |     rescue Request::Error => e
85 |       WaybackArchiver.logger.error "Error raised when requesting #{url}, #{e.class}, #{e.message}"
86 | 
87 |       []
88 |     end
89 |   end
90 | end
91 |
--------------------------------------------------------------------------------
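A fetch-and-flatten sketch (performs real HTTP requests; sitemap indexes are followed recursively and duplicates are skipped via the visited set):

    # Starting from a known Sitemap URL
    urls = WaybackArchiver::Sitemapper.urls(url: 'http://www.example.com/sitemap.xml')

    # Or let autodiscover try robots.txt and common locations first
    urls = WaybackArchiver::Sitemapper.autodiscover('http://www.example.com')
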
/spec/wayback_archiver/sitemap_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe WaybackArchiver::Sitemap do
4 | describe '#new' do
5 | it 'raises error REXML::ParseException when strict mode is true' do
6 | expect do
7 | described_class.new('<sitemapindex><sitemap>', strict: true)
8 | end.to raise_error(REXML::ParseException)
9 | end
10 | 
11 | it 'swallows XML errors when strict mode is false' do
12 | sitemap = described_class.new('<sitemapindex><sitemap>')
13 | expect(sitemap.urls).to be_empty
14 | end
15 | end
16 |
17 | describe '#urls' do
18 | it 'returns URLs in XML sitemap' do
19 | sitemap = described_class.new(File.read('spec/data/sitemap.xml'))
20 | expect(sitemap.urls).to eq(%w[http://www.example.com/])
21 | end
22 |
23 | it 'returns URLs in plain text sitemap' do
24 | file = "http://www.example.com/\nhttp://www.example.com/path"
25 | sitemap = described_class.new(file)
26 | expected = %w[
27 | http://www.example.com/
28 | http://www.example.com/path
29 | ]
30 | expect(sitemap.urls).to eq(expected)
31 | end
32 |
33 | it 'returns empty array when passed empty document' do
34 | sitemap = described_class.new('')
35 | expect(sitemap.urls).to be_empty
36 | end
37 | end
38 |
39 | describe '#sitemaps' do
40 | it 'returns sitemap URLs in sitemap' do
41 | sitemap = described_class.new(File.read('spec/data/sitemap_index.xml'))
42 | expected = %w[
43 | http://www.example.com/sitemap1.xml.gz
44 | http://www.example.com/sitemap2.xml.gz
45 | ]
46 | expect(sitemap.sitemaps).to eq(expected)
47 | end
48 |
49 | it 'returns empty array when passed empty document' do
50 | sitemap = described_class.new('')
51 | expect(sitemap.sitemaps).to be_empty
52 | end
53 | end
54 |
55 | describe '#plain_document?' do
56 | it 'returns true when passed non-XML document' do
57 | sitemap = described_class.new('')
58 | expect(sitemap.plain_document?).to eq(true)
59 | end
60 |
61 | it 'returns false when passed XML document' do
62 | sitemap = described_class.new('<urlset></urlset>')
63 | expect(sitemap.plain_document?).to eq(false)
64 | end
65 | end
66 |
67 | describe '#root_name' do
68 | it 'returns nil when passed non-XML document' do
69 | sitemap = described_class.new('')
70 | expect(sitemap.root_name).to be_nil
71 | end
72 |
73 | it 'returns root name when passed XML document' do
74 | sitemap = described_class.new('<buren></buren>')
75 | expect(sitemap.root_name).to eq('buren')
76 | end
77 | end
78 |
79 | describe '#sitemap_index?' do
80 | it 'returns true if document is a sitemap index' do
81 | sitemap = described_class.new(File.read('spec/data/sitemap_index.xml'))
82 | expect(sitemap.sitemap_index?).to eq(true)
83 | end
84 |
85 | it 'returns false if document sitemap' do
86 | sitemap = described_class.new(File.read('spec/data/sitemap.xml'))
87 | expect(sitemap.sitemap_index?).to eq(false)
88 | end
89 | end
90 |
91 | describe '#urlset?' do
92 | it 'returns true if document is a sitemap' do
93 | sitemap = described_class.new(File.read('spec/data/sitemap.xml'))
94 | expect(sitemap.urlset?).to eq(true)
95 | end
96 |
97 | it 'returns false if document is a sitemap index' do
98 | sitemap = described_class.new(File.read('spec/data/sitemap_index.xml'))
99 | expect(sitemap.urlset?).to eq(false)
100 | end
101 | end
102 | end
103 |
--------------------------------------------------------------------------------
/lib/wayback_archiver/archive.rb:
--------------------------------------------------------------------------------
1 | require 'concurrent'
2 |
3 | require 'wayback_archiver/thread_pool'
4 | require 'wayback_archiver/adapters/wayback_machine'
5 |
6 | module WaybackArchiver
7 |   # Post URL(s) to Wayback Machine
8 |   class Archive
9 |     # Send URLs to Wayback Machine.
10 |     # @return [Array] with sent URLs.
11 |     # @param [Array] urls to send to the Wayback Machine.
12 |     # @param concurrency [Integer] the default is 1
13 |     # @yield [archive_result] If a block is given, each result will be yielded
14 |     # @yieldparam [ArchiveResult] archive_result
15 |     # @example Archive urls, asynchronously
16 |     #   Archive.post(['http://example.com'])
17 |     #   Archive.post(['http://example.com']) do |result|
18 |     #     puts [result.code || 'error', result.archived_url] # print response status and URL
19 |     #   end
20 |     # @example Archive urls, using only 1 thread
21 |     #   Archive.post(['http://example.com'], concurrency: 1)
22 |     # @example Stop after archiving 100 links
23 |     #   Archive.post(['http://example.com'], limit: 100)
24 |     # @example Explicitly set no limit on how many links are posted
25 |     #   Archive.post(['http://example.com'], limit: -1)
26 |     def self.post(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
27 |       WaybackArchiver.logger.info "Total URLs to be sent: #{urls.length}"
28 |       WaybackArchiver.logger.info "Requests are sent with up to #{concurrency} parallel threads"
29 | 
30 |       urls_queue = if limit == -1
31 |                      urls
32 |                    else
33 |                      urls[0...limit]
34 |                    end
35 | 
36 |       posted_urls = Concurrent::Array.new
37 |       pool = ThreadPool.build(concurrency)
38 | 
39 |       urls_queue.each do |url|
40 |         pool.post do
41 |           result = post_url(url)
42 |           yield(result) if block_given?
43 |           posted_urls << result unless result.errored?
44 |         end
45 |       end
46 | 
47 |       pool.shutdown
48 |       pool.wait_for_termination
49 | 
50 |       WaybackArchiver.logger.info "#{posted_urls.length} URL(s) posted to Wayback Machine"
51 |       posted_urls
52 |     end
53 | 
54 |     # Send URLs to Wayback Machine by crawling the site.
55 |     # @return [Array] with URLs sent to the Wayback Machine.
56 |     # @param [String] source for URL to crawl.
57 |     # @param concurrency [Integer] the default is 1
58 |     # @param [Array] hosts to crawl
59 |     # @yield [archive_result] If a block is given, each result will be yielded
60 |     # @yieldparam [ArchiveResult] archive_result
61 |     # @example Crawl example.com and send all URLs of the same domain
62 |     #   Archive.crawl('example.com')
63 |     #   Archive.crawl('example.com') do |result|
64 |     #     puts [result.code || 'error', result.archived_url] # print response status and URL
65 |     #   end
66 |     # @example Crawl example.com and send all URLs of the same domain with low concurrency
67 |     #   Archive.crawl('example.com', concurrency: 1)
68 |     # @example Stop after archiving 100 links
69 |     #   Archive.crawl('example.com', limit: 100)
70 |     # @example Crawl multiple hosts
71 |     #   Archive.crawl(
72 |     #     'http://example.com',
73 |     #     hosts: [
74 |     #       'example.com',
75 |     #       /host[\d]+\.example\.com/
76 |     #     ]
77 |     #   )
78 |     def self.crawl(source, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit)
79 |       WaybackArchiver.logger.info "Requests are sent with up to #{concurrency} parallel threads"
80 | 
81 |       posted_urls = Concurrent::Array.new
82 |       pool = ThreadPool.build(concurrency)
83 | 
84 |       found_urls = URLCollector.crawl(source, hosts: hosts, limit: limit) do |url|
85 |         pool.post do
86 |           result = post_url(url)
87 |           yield(result) if block_given?
88 |           posted_urls << result unless result.errored?
89 |         end
90 |       end
91 |       WaybackArchiver.logger.info "Crawling of #{source} finished, found #{found_urls.length} URL(s)"
92 |       pool.shutdown
93 |       pool.wait_for_termination
94 | 
95 |       WaybackArchiver.logger.info "#{posted_urls.length} URL(s) posted to Wayback Machine"
96 |       posted_urls
97 |     end
98 | 
99 |     # Send URL to Wayback Machine.
100 |     # @return [ArchiveResult] the result for the sent URL.
101 |     # @param [String] url to send.
102 |     # @example Archive example.com, with default options
103 |     #   Archive.post_url('http://example.com')
104 |     def self.post_url(url)
105 |       WaybackArchiver.adapter.call(url)
106 |     end
107 |   end
108 | end
109 |
--------------------------------------------------------------------------------
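A posting sketch (performs real requests to web.archive.org unless stubbed; the block form mirrors the @example docs above):

    results = WaybackArchiver::Archive.post(%w[http://example.com], limit: 1) do |result|
      puts [result.code || 'error', result.archived_url].join(' ')
    end
    results.length # => number of URLs posted without error
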
/lib/robots.rb:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2008 Kyle Maxwell, contributors
3 | #
4 | # Permission is hereby granted, free of charge, to any person
5 | # obtaining a copy of this software and associated documentation
6 | # files (the "Software"), to deal in the Software without
7 | # restriction, including without limitation the rights to use,
8 | # copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the
10 | # Software is furnished to do so, subject to the following
11 | # conditions:
12 | #
13 | # The above copyright notice and this permission notice shall be
14 | # included in all copies or substantial portions of the Software.
15 | #
16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
18 | # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
20 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
21 | # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23 | # OTHER DEALINGS IN THE SOFTWARE.
24 | #
25 |
26 | require "open-uri"
27 | require "uri"
28 | require "rubygems"
29 | require "timeout"
30 |
31 | class Robots
32 | 
33 |   DEFAULT_TIMEOUT = 3
34 | 
35 |   class ParsedRobots
36 | 
37 |     def initialize(uri, user_agent)
38 |       @last_accessed = Time.at(1)
39 | 
40 |       io = Robots.get_robots_txt(uri, user_agent)
41 | 
42 |       if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
43 |         io = StringIO.new("User-agent: *\nAllow: /\n")
44 |       end
45 | 
46 |       @other = {}
47 |       @disallows = {}
48 |       @allows = {}
49 |       @delays = {} # added delays to make it work
50 |       agent = /.*/
51 |       io.each do |line|
52 |         next if line =~ /^\s*(#.*|$)/
53 |         arr = line.split(":")
54 |         key = arr.shift
55 |         value = arr.join(":").strip
56 |         value.strip!
57 |         case key
58 |         when "User-agent"
59 |           agent = to_regex(value)
60 |         when "Allow"
61 |           @allows[agent] ||= []
62 |           @allows[agent] << to_regex(value)
63 |         when "Disallow"
64 |           @disallows[agent] ||= []
65 |           @disallows[agent] << to_regex(value)
66 |         when "Crawl-delay"
67 |           @delays[agent] = value.to_i
68 |         else
69 |           @other[key] ||= []
70 |           @other[key] << value
71 |         end
72 |       end
73 | 
74 |       @parsed = true
75 |     end
76 | 
77 |     def allowed?(uri, user_agent)
78 |       return true unless @parsed
79 |       allowed = true
80 |       path = uri.request_uri
81 | 
82 |       @disallows.each do |key, value|
83 |         if user_agent =~ key
84 |           value.each do |rule|
85 |             if path =~ rule
86 |               allowed = false
87 |             end
88 |           end
89 |         end
90 |       end
91 | 
92 |       @allows.each do |key, value|
93 |         unless allowed
94 |           if user_agent =~ key
95 |             value.each do |rule|
96 |               if path =~ rule
97 |                 allowed = true
98 |               end
99 |             end
100 |           end
101 |         end
102 |       end
103 | 
104 |       if allowed && @delays[user_agent]
105 |         sleep [@delays[user_agent] - (Time.now - @last_accessed), 0].max # never sleep a negative duration
106 |         @last_accessed = Time.now
107 |       end
108 | 
109 |       return allowed
110 |     end
111 | 
112 |     def other_values
113 |       @other
114 |     end
115 | 
116 |     protected
117 | 
118 |     def to_regex(pattern)
119 |       return /should-not-match-anything-123456789/ if pattern.strip.empty?
120 |       pattern = Regexp.escape(pattern)
121 |       pattern.gsub!(Regexp.escape("*"), ".*")
122 |       Regexp.compile("^#{pattern}")
123 |     end
124 |   end
125 | 
126 |   def self.get_robots_txt(uri, user_agent)
127 |     begin
128 |       Timeout::timeout(Robots.timeout) do
129 |         io = URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
130 |       end
131 |     rescue Timeout::Error
132 |       STDERR.puts "robots.txt request timed out"
133 |     end
134 |   end
135 | 
136 |   def self.timeout=(t)
137 |     @timeout = t
138 |   end
139 | 
140 |   def self.timeout
141 |     @timeout || DEFAULT_TIMEOUT
142 |   end
143 | 
144 |   def initialize(user_agent)
145 |     @user_agent = user_agent
146 |     @parsed = {}
147 |   end
148 | 
149 |   def allowed?(uri)
150 |     uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
151 |     host = uri.host
152 |     @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
153 |     @parsed[host].allowed?(uri, @user_agent)
154 |   end
155 | 
156 |   def other_values(uri)
157 |     uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
158 |     host = uri.host
159 |     @parsed[host] ||= ParsedRobots.new(uri, @user_agent)
160 |     @parsed[host].other_values
161 |   end
162 | end
163 |
--------------------------------------------------------------------------------
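A sketch of the inlined Robots class as Sitemapper uses it (fetches /robots.txt over the network on first call per host):

    robots = Robots.new(WaybackArchiver.user_agent)
    robots.allowed?('http://www.example.com/some/path')       # honours Disallow/Allow rules
    robots.other_values('http://www.example.com')['Sitemap']  # => sitemap URLs, if declared
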
/spec/wayback_archiver/sitemapper_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe WaybackArchiver::Sitemapper do
4 | let(:headers) do
5 | {
6 | 'Accept' => '*/*',
7 | 'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3',
8 | 'User-Agent' => WaybackArchiver.user_agent
9 | }
10 | end
11 |
12 | let(:robots_txt) { File.read('spec/data/robots.txt') }
13 | let(:sitemap_index_xml) { File.read('spec/data/sitemap_index.xml') }
14 | let(:sitemap_index_with_duplicate_url_xml) { File.read('spec/data/sitemap_index_with_duplicate_url.xml') }
15 | let(:sitemap_xml) { File.read('spec/data/sitemap.xml') }
16 |
17 | describe '::autodiscover' do
18 | context 'with found Sitemap location in robots.txt' do
19 | it 'fetches those Sitemap(s) and returns all present URLs' do
20 | # The robots gem doesn't play nice with the WebMock so we can't test this until
21 | # https://github.com/fizx/robots/pull/9 is merged.
22 | # Until then we're gonna use rspec-mocks
23 | # stub_request(:get, 'http://www.example.com/robots.txt').
24 | # with(headers: headers).
25 | # to_return(status: 200, body: robots_txt, headers: {})
26 | allow_any_instance_of(Robots).to receive(:other_values).and_return('Sitemap' => %w[http://www.example.com/sitemap.xml])
27 |
28 | stub_request(:get, 'http://www.example.com/sitemap.xml')
29 | .with(headers: headers)
30 | .to_return(status: 200, body: sitemap_xml, headers: {})
31 |
32 | expect(described_class.autodiscover('http://www.example.com')).to eq(%w[http://www.example.com/])
33 | end
34 |
35 | it 'returns empty list on request error' do
36 | allow_any_instance_of(Robots).to receive(:other_values).and_raise(WaybackArchiver::Request::Error)
37 |
38 | expect(described_class.autodiscover('http://www.example.com')).to be_empty
39 | end
40 | end
41 |
42 | context 'with found Sitemap location among common Sitemap locations' do
43 | it 'returns all present URLs if a Sitemap is found' do
44 | base_url = 'http://www.example.com'
45 | stub_request(:get, "#{base_url}/robots.txt")
46 | .with(headers: headers)
47 | .to_return(status: 200, body: robots_txt, headers: {})
48 |
49 | sitemap_path = WaybackArchiver::Sitemapper::COMMON_SITEMAP_LOCATIONS.first
50 |
51 | stub_request(:get, [base_url, sitemap_path].join('/'))
52 | .with(headers: headers)
53 | .to_return(status: 200, body: sitemap_xml, headers: {})
54 |
55 | expect(described_class.autodiscover('http://www.example.com')).to eq(%w[http://www.example.com/])
56 | end
57 | end
58 |
59 | context 'at the provided URL' do
60 | it 'returns all present URLs if a Sitemap is found' do
61 | base_url = 'http://www.example.com'
62 | stub_request(:get, "#{base_url}/robots.txt")
63 | .with(headers: headers)
64 | .to_return(status: 200, body: robots_txt, headers: {})
65 |
66 | WaybackArchiver::Sitemapper::COMMON_SITEMAP_LOCATIONS.each do |sitemap_path|
67 | stub_request(:get, [base_url, sitemap_path].join('/'))
68 | .with(headers: headers)
69 | .to_return(status: 404, body: '', headers: {})
70 | end
71 |
72 | stub_request(:get, base_url)
73 | .with(headers: headers)
74 | .to_return(status: 200, body: sitemap_xml, headers: {})
75 |
76 | expect(described_class.autodiscover(base_url)).to eq(%w[http://www.example.com/])
77 | end
78 | end
79 | end
80 |
81 | describe '::urls' do
82 | it 'can start with xml argument' do
83 | expect(described_class.urls(xml: sitemap_xml)).to eq(%w[http://www.example.com/])
84 | end
85 |
86 | it 'returns empty array if url already has been visited' do
87 | start_url = 'http://www.example.com/sitemap_index.xml'
88 |
89 | stub_request(:get, start_url)
90 | .with(headers: headers)
91 | .to_return(status: 200, body: sitemap_index_with_duplicate_url_xml, headers: {})
92 |
93 | %w[http://www.example.com/sitemap1.xml.gz].each do |url|
94 | stub_request(:get, url)
95 | .with(headers: headers)
96 | .to_return(status: 200, body: sitemap_xml, headers: {})
97 | end
98 |
99 | result = described_class.urls(url: start_url)
100 | expect(WaybackArchiver.logger.debug_log).to include("Already visited http://www.example.com/sitemap1.xml.gz skipping..")
101 | expect(result).to eq(%w[http://www.example.com/])
102 | end
103 |
104 | context 'with url argument and returned sitemap index' do
105 | it 'follows the index and returns all URLs in the sitemap(s)' do
106 | start_url = 'http://www.example.com/sitemap_index.xml'
107 |
108 | stub_request(:get, start_url)
109 | .with(headers: headers)
110 | .to_return(status: 200, body: sitemap_index_xml, headers: {})
111 |
112 | %w[http://www.example.com/sitemap1.xml.gz http://www.example.com/sitemap2.xml.gz].each do |url|
113 | stub_request(:get, url)
114 | .with(headers: headers)
115 | .to_return(status: 200, body: sitemap_xml, headers: {})
116 | end
117 |
118 | result = described_class.urls(url: start_url)
119 | expect(result).to eq(%w[http://www.example.com/ http://www.example.com/])
120 | end
121 | end
122 |
123 | context 'with url argument and returned sitemap' do
124 | it 'returns all URLs in sitemap' do
125 | stub_request(:get, 'http://www.example.com/sitemap.xml')
126 | .with(headers: headers)
127 | .to_return(status: 200, body: sitemap_xml, headers: {})
128 |
129 | result = described_class.urls(url: 'http://www.example.com/sitemap.xml')
130 | expect(result).to eq(%w[http://www.example.com/])
131 | end
132 | end
133 |
134 | it 'returns empty list on request error' do
135 | allow(WaybackArchiver::Request).to receive(:get).and_raise(WaybackArchiver::Request::Error)
136 |
137 | expect(described_class.urls(url: 'http://www.example.com')).to be_empty
138 | end
139 | end
140 | end
141 |
--------------------------------------------------------------------------------
/spec/wayback_archiver_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe WaybackArchiver do
4 | describe '::archive' do
5 | it 'raises ArgumentError when passed unknown strategy' do
6 | expect do
7 | described_class.archive('http://example.com', strategy: :watman_strategy)
8 | end.to raise_error(ArgumentError)
9 | end
10 |
11 | it 'calls ::auto when no strategy is given' do
12 | allow(described_class).to receive(:auto).and_return([])
13 | described_class.archive('http://example.com')
14 | expect(described_class).to have_received(:auto).once
15 | end
16 |
17 | it 'calls ::auto when passed auto as strategy' do
18 | allow(described_class).to receive(:auto).and_return([])
19 | described_class.archive('http://example.com', strategy: :auto)
20 | expect(described_class).to have_received(:auto).once
21 | end
22 |
23 | it 'calls ::crawl when passed crawl as strategy' do
24 | allow(described_class).to receive(:crawl).and_return([])
25 | described_class.archive('http://example.com', strategy: :crawl)
26 | expect(described_class).to have_received(:crawl).once
27 | end
28 |
29 | it 'calls ::urls when passed urls as strategy' do
30 | allow(described_class).to receive(:urls).and_return([])
31 | described_class.archive('http://example.com', strategy: :urls)
32 | expect(described_class).to have_received(:urls).once
33 | end
34 |
35 | it 'calls ::urls when passed url as strategy' do
36 | allow(described_class).to receive(:urls).and_return([])
37 | described_class.archive('http://example.com', strategy: :url)
38 | expect(described_class).to have_received(:urls).once
39 | end
40 |
41 | it 'calls ::sitemap when passed sitemap as strategy' do
42 | allow(described_class).to receive(:sitemap).and_return([])
43 | described_class.archive('http://example.com', strategy: :sitemap)
44 | expect(described_class).to have_received(:sitemap).once
45 | end
46 |
47 | context 'legacy strategy param' do
48 | it 'raises ArgumentError when passed unknown strategy' do
49 | expect do
50 | described_class.archive('http://example.com', :watman_strategy)
51 | end.to raise_error(ArgumentError)
52 | end
53 |
54 | it 'calls ::auto when passed auto as strategy' do
55 | allow(described_class).to receive(:auto).and_return([])
56 | described_class.archive('http://example.com', :auto)
57 | expect(described_class).to have_received(:auto).once
58 | end
59 |
60 | it 'calls ::crawl when passed crawl as strategy' do
61 | allow(described_class).to receive(:crawl).and_return([])
62 | described_class.archive('http://example.com', :crawl)
63 | expect(described_class).to have_received(:crawl).once
64 | end
65 |
66 | it 'calls ::urls when passed urls as strategy' do
67 | allow(described_class).to receive(:urls).and_return([])
68 | described_class.archive('http://example.com', :urls)
69 | expect(described_class).to have_received(:urls).once
70 | end
71 |
72 | it 'calls ::urls when passed url as strategy' do
73 | allow(described_class).to receive(:urls).and_return([])
74 | described_class.archive('http://example.com', :url)
75 | expect(described_class).to have_received(:urls).once
76 | end
77 |
78 | it 'calls ::sitemap when passed sitemap as strategy' do
79 | allow(described_class).to receive(:sitemap).and_return([])
80 | described_class.archive('http://example.com', :sitemap)
81 | expect(described_class).to have_received(:sitemap).once
82 | end
83 | end
84 | end
85 |
86 | describe '::auto' do
87 | it 'calls Sitemapper::autodiscover and ::crawl if Sitemapper returned empty result' do
88 | allow(described_class::Sitemapper).to receive(:autodiscover).and_return([])
89 | allow(described_class).to receive(:crawl).and_return([])
90 |
91 | described_class.auto('http://example.com')
92 |
93 | expect(described_class::Sitemapper).to have_received(:autodiscover).once
94 | expect(described_class).to have_received(:crawl).once
95 | end
96 |
97 | it 'calls Sitemapper::autodiscover and ::urls if Sitemapper returned non-empty result' do
98 | allow(described_class::Sitemapper).to receive(:autodiscover).and_return(['url'])
99 | allow(described_class).to receive(:urls).and_return([])
100 |
101 | described_class.auto('http://example.com')
102 |
103 | expect(described_class::Sitemapper).to have_received(:autodiscover).once
104 | expect(described_class).to have_received(:urls).once
105 | end
106 | end
107 |
108 | describe '::crawl' do
109 | it 'calls Archive::crawl' do
110 | allow(described_class::Archive).to receive(:crawl).and_return([])
111 |
112 | described_class.crawl('http://example.com')
113 |
114 | expect(described_class::Archive).to have_received(:crawl).once
115 | end
116 | end
117 |
118 | describe '::urls' do
119 | it 'calls Archive::post' do
120 | allow(described_class::Archive).to receive(:post).and_return([])
121 |
122 | described_class.urls('http://example.com')
123 |
124 | expect(described_class::Archive).to have_received(:post).once
125 | end
126 | end
127 |
128 | describe '::sitemap' do
129 | it 'calls URLCollector::sitemap and Archive::post' do
130 | allow(described_class::URLCollector).to receive(:sitemap).and_return([])
131 | allow(described_class::Archive).to receive(:post).and_return([])
132 |
133 | described_class.sitemap('http://example.com')
134 |
135 | expect(described_class::URLCollector).to have_received(:sitemap).once
136 | expect(described_class::Archive).to have_received(:post).once
137 | end
138 | end
139 |
140 | describe '::default_logger!' do
141 | it 'has NullLogger as the default logger' do
142 | described_class.default_logger!
143 | expect(described_class.logger.class).to eq(described_class::NullLogger)
144 | end
145 | end
146 |
147 | describe '::logger=' do
148 | it 'can set logger' do
149 | MyLogger = Struct.new(:name).new('buren')
150 | described_class.logger = MyLogger
151 | expect(described_class.logger).to eq(MyLogger)
152 | end
153 | end
154 |
155 | describe '::user_agent=' do
156 | it 'can set user_agent' do
157 | described_class.user_agent = 'buren'
158 | expect(described_class.user_agent).to eq('buren')
159 | end
160 | end
161 |
162 | describe '::concurrency=' do
163 | it 'can set concurrency' do
164 | described_class.concurrency = 1
165 | expect(described_class.concurrency).to eq(1)
166 | end
167 | end
168 |
169 | describe '::max_limit=' do
170 | it 'can set max_limit' do
171 | described_class.max_limit = 1
172 | expect(described_class.max_limit).to eq(1)
173 | end
174 | end
175 |
176 | describe '::adapter=' do
177 | it 'can set adapter' do
178 | adapter = WaybackArchiver::WaybackMachine
179 | described_class.adapter = adapter
180 | expect(described_class.adapter).to match(adapter)
181 | end
182 |
183 | it 'raises error unless all adapter respond to #call' do
184 | expect { described_class.adapter = 1 }.to raise_error(ArgumentError)
185 | end
186 | end
187 | end
188 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # WaybackArchiver
2 |
3 | Post URLs to the [Wayback Machine](https://archive.org/web/) (Internet Archive) using a crawler, [Sitemap(s)](http://www.sitemaps.org), or a list of URLs.
4 |
5 | > The Wayback Machine is a digital archive of the World Wide Web [...]
6 | > The service enables users to see archived versions of web pages across time ...
7 | > \- [Wikipedia](https://en.wikipedia.org/wiki/Wayback_Machine)
8 |
9 | [](https://travis-ci.org/buren/wayback_archiver) [](https://codeclimate.com/github/buren/wayback_archiver) [](http://www.rubydoc.info/github/buren/wayback_archiver/master) [](http://badge.fury.io/rb/wayback_archiver)
10 |
11 | __Index__
12 |
13 | * [Installation](#installation)
14 | * [Usage](#usage)
15 | - [Ruby](#ruby)
16 | - [CLI](#cli)
17 | * [Configuration](#configuration)
18 | * [RubyDoc](#docs)
19 | * [Contributing](#contributing)
20 | * [MIT License](#license)
21 | * [References](#references)
22 |
23 | ## Installation
24 |
25 | Install the gem:
26 | ```
27 | $ gem install wayback_archiver
28 | ```
29 |
30 | Or add this line to your application's Gemfile:
31 |
32 | ```ruby
33 | gem 'wayback_archiver'
34 | ```
35 |
36 | And then execute:
37 |
38 | ```
39 | $ bundle
40 | ```
41 |
42 | ## Usage
43 |
44 | * [Ruby](#ruby)
45 | * [CLI](#cli)
46 |
47 | __Strategies__:
48 |
49 | * `auto` (the default) - Will try to
50 | 1. Find Sitemap(s) defined in `/robots.txt`
51 | 2. Then look in common sitemap locations, `/sitemap-index.xml`, `/sitemap.xml` etc.
52 | 3. Fall back to crawling (using the excellent [spidr](https://github.com/postmodern/spidr/) gem)
53 | * `crawl` - Crawl the site and post all URLs found on the same domain
54 | * `sitemap` - Parse Sitemap(s), supports [index files](https://www.sitemaps.org/protocol.html#index) (and gzip)
55 | * `urls` (or `url`) - Post the given URL(s)
55 |
56 | ## Ruby
57 |
58 | First require the gem
59 |
60 | ```ruby
61 | require 'wayback_archiver'
62 | ```
63 |
64 | _Examples_:
65 |
66 | Auto
67 |
68 | ```ruby
69 | # auto is the default
70 | WaybackArchiver.archive('example.com')
71 |
72 | # or explicitly
73 | WaybackArchiver.archive('example.com', strategy: :auto)
74 | ```
75 |
76 | Crawl
77 |
78 | ```ruby
79 | WaybackArchiver.archive('example.com', strategy: :crawl)
80 | ```
81 |
82 | Send a single URL
83 |
84 | ```ruby
85 | WaybackArchiver.archive('example.com', strategy: :url)
86 | ```
87 |
88 | Send multiple URLs
89 |
90 | ```ruby
91 | WaybackArchiver.archive(%w[example.com www.example.com], strategy: :urls)
92 | ```
93 |
94 | Send all URL(s) found in Sitemap
95 |
96 | ```ruby
97 | WaybackArchiver.archive('example.com/sitemap.xml', strategy: :sitemap)
98 |
99 | # works with Sitemap index files too
100 | WaybackArchiver.archive('example.com/sitemap-index.xml.gz', strategy: :sitemap)
101 | ```
102 |
103 | Specify concurrency
104 |
105 | ```ruby
106 | WaybackArchiver.archive('example.com', strategy: :auto, concurrency: 10)
107 | ```
108 |
109 | Specify max number of URLs to be archived
110 |
111 | ```ruby
112 | WaybackArchiver.archive('example.com', strategy: :auto, limit: 10)
113 | ```
114 |
115 | Each archive strategy can receive a block that will be called for each URL
116 |
117 | ```ruby
118 | WaybackArchiver.archive('example.com', strategy: :auto) do |result|
119 | if result.success?
120 | puts "Successfully archived: #{result.archived_url}"
121 | else
122 | puts "Error (HTTP #{result.code}) when archiving: #{result.archived_url}"
123 | end
124 | end
125 | ```
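126 |
127 | For example, to collect the URLs that could not be archived (a small sketch building on the result object shown above):
128 |
129 | ```ruby
130 | failed = []
131 | WaybackArchiver.archive('example.com') do |result|
132 | failed << result.archived_url unless result.success?
133 | end
134 |
135 | puts "#{failed.size} URL(s) could not be archived"
136 | ```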
126 |
127 | Use your own adapter for posting found URLs
128 |
129 | ```ruby
130 | WaybackArchiver.adapter = ->(url) { puts url } # anything that responds to #call
131 | ```
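132 |
133 | Anything that responds to `#call` works, so the adapter can also be a class. A minimal sketch (`LoggingAdapter` is a made-up name, not part of the gem):
134 |
135 | ```ruby
136 | # Hypothetical adapter that only logs, instead of posting to the Wayback Machine
137 | class LoggingAdapter
138 | def self.call(url)
139 | WaybackArchiver.logger.info "Would archive: #{url}"
140 | end
141 | end
142 |
143 | WaybackArchiver.adapter = LoggingAdapter
144 | ```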
132 |
133 | ## CLI
134 |
135 | __Usage__:
136 |
137 | ```
138 | wayback_archiver <url> [options]
139 | ```
140 |
141 | Print full usage instructions
142 |
143 | ```
144 | wayback_archiver --help
145 | ```
146 |
147 | _Examples_:
148 |
149 | Auto
150 |
151 | ```bash
152 | # auto is the default
153 | wayback_archiver example.com
154 |
155 | # or explicitly
156 | wayback_archiver example.com --auto
157 | ```
158 |
159 | Crawl
160 |
161 | ```bash
162 | wayback_archiver example.com --crawl
163 | ```
164 |
165 | Send a single URL
166 |
167 | ```bash
168 | wayback_archiver example.com --url
169 | ```
170 |
171 | Send multiple URLs
172 |
173 | ```bash
174 | wayback_archiver example.com www.example.com --urls
175 | ```
176 |
177 | Crawl multiple URLs
178 |
179 | ```bash
180 | wayback_archiver example.com www.example.com --crawl
181 | ```
182 |
183 | Send all URL(s) found in Sitemap
184 |
185 | ```bash
186 | wayback_archiver example.com/sitemap.xml
187 |
188 | # works with Sitemap index files too
189 | wayback_archiver example.com/sitemap-index.xml.gz
190 | ```
191 |
192 | Most options
193 |
194 | ```bash
195 | wayback_archiver example.com www.example.com --auto --concurrency=10 --limit=100 --log=output.log --verbose
196 | ```
197 |
198 | View archive: [https://web.archive.org/web/*/http://example.com](https://web.archive.org/web/*/http://example.com) (replace `http://example.com` with your desired domain).
199 |
200 | ## Configuration
201 |
202 | :information_source: By default `wayback_archiver` doesn't respect robots.txt files, see [this Internet Archive blog post](https://blog.archive.org/2017/04/17/robots-txt-meant-for-search-engines-dont-work-well-for-web-archives/) for more information.
203 |
204 | Configuration (the below values are the defaults)
205 |
206 | ```ruby
207 | WaybackArchiver.concurrency = 1
208 | WaybackArchiver.user_agent = WaybackArchiver::USER_AGENT
209 | WaybackArchiver.respect_robots_txt = WaybackArchiver::DEFAULT_RESPECT_ROBOTS_TXT
210 | WaybackArchiver.logger = WaybackArchiver::NullLogger.new # silent by default
211 | WaybackArchiver.max_limit = -1 # unlimited
212 | WaybackArchiver.adapter = WaybackArchiver::WaybackMachine # must implement #call(url)
213 | ```
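214 |
215 | `concurrency` and `limit` can also be passed per call, in which case they take precedence over the global configuration:
216 |
217 | ```ruby
218 | WaybackArchiver.concurrency = 5
219 | WaybackArchiver.archive('example.com', concurrency: 10, limit: 100) # the per-call values win
220 | ```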
214 |
215 | For a more verbose log you can configure `WaybackArchiver` as such:
216 |
217 | ```ruby
218 | WaybackArchiver.logger = Logger.new(STDOUT).tap do |logger|
219 | logger.progname = 'WaybackArchiver'
220 | logger.level = Logger::DEBUG
221 | end
222 | ```
223 |
224 | _Pro tip_: If you're using the gem in a Rails app you can set `WaybackArchiver.logger = Rails.logger`.
225 |
226 | ## Docs
227 |
228 | You can find the docs online on [RubyDoc](http://www.rubydoc.info/github/buren/wayback_archiver/master).
229 |
230 | This gem is documented using `yard`. Generate the documentation by running (from the root of this repository):
231 |
232 | ```bash
233 | yard # Generates documentation to doc/
234 | ```
235 |
236 | ## Contributing
237 |
238 | Contributions, feedback and suggestions are very welcome.
239 |
240 | 1. Fork it
241 | 2. Create your feature branch (`git checkout -b my-new-feature`)
242 | 3. Commit your changes (`git commit -am 'Add some feature'`)
243 | 4. Push to the branch (`git push origin my-new-feature`)
244 | 5. Create new Pull Request
245 |
246 | ## License
247 |
248 | [MIT License](LICENSE)
249 |
250 | ## References
251 |
252 | * Don't know what the Wayback Machine (Internet Archive) is? [Wayback Machine](https://archive.org/web/)
253 | * Don't know what a Sitemap is? [sitemaps.org](http://www.sitemaps.org)
254 | * Don't know what robots.txt is? [www.robotstxt.org](http://www.robotstxt.org/robotstxt.html)
255 |
--------------------------------------------------------------------------------
/spec/wayback_archiver/request_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe WaybackArchiver::Request do
4 | describe '::get' do
5 | let(:headers) do
6 | {
7 | 'Accept' => '*/*',
8 | 'Accept-Encoding' => 'gzip;q=1.0,deflate;q=0.6,identity;q=0.3',
9 | 'User-Agent' => WaybackArchiver.user_agent
10 | }
11 | end
12 |
13 | [
14 | [described_class::ServerError, Timeout::Error],
15 | [described_class::ServerError, OpenSSL::SSL::SSLError],
16 | [described_class::ServerError, Net::HTTPBadResponse],
17 | [described_class::ServerError, Zlib::Error],
18 | # For some reason the below line causes an ArgumentError exception to be raised instead
19 | # [described_class::ClientError, SystemCallError],
20 | [described_class::ClientError, SocketError],
21 | [described_class::ClientError, IOError]
22 | ].each do |test_data|
23 | error_klass, raised_error_klass = test_data
24 |
25 | it "raises #{error_klass} on #{raised_error_klass}" do
26 | allow_any_instance_of(Net::HTTP).to receive(:request).and_raise(raised_error_klass)
27 |
28 | expect { described_class.get('https://example.com') }.to raise_error(error_klass)
29 | end
30 | end
31 |
32 | it 'returns response when server responds with HTTP 200' do
33 | stub_request(:get, 'https://example.com/')
34 | .with(headers: headers)
35 | .to_return(status: 200, body: 'buren', headers: {})
36 |
37 | result = described_class.get('https://example.com')
38 | expect(result.code).to eq('200')
39 | end
40 |
41 | it 'follows redirect when server responds with HTTP 3XX' do
42 | response_headers = { 'location' => '/redirect-path' }
43 | stub_request(:get, 'https://example.com/')
44 | .with(headers: headers)
45 | .to_return(status: 301, body: 'buren', headers: response_headers)
46 |
47 | stub_request(:get, 'https://example.com/redirect-path')
48 | .with(headers: headers)
49 | .to_return(status: 200, body: 'buren', headers: {})
50 |
51 | result = described_class.get('https://example.com', max_redirects: 1)
52 | expect(result.code).to eq('200')
53 | expect(result.uri).to eq('https://example.com/redirect-path')
54 | end
55 |
56 | it 'raises MaxRedirectError if max redirects is reached' do
57 | response_headers = { 'location' => '/redirect-path' }
58 | stub_request(:get, 'https://example.com/')
59 | .with(headers: headers)
60 | .to_return(status: 301, body: 'buren', headers: response_headers)
61 |
62 | expect do
63 | described_class.get('https://example.com', max_redirects: 0)
64 | end.to raise_error(described_class::MaxRedirectError)
65 | end
66 |
67 | it 'raises UnknownResponseCodeError if server responds with unknown HTTP code' do
68 | stub_request(:get, 'https://example.com/')
69 | .with(headers: headers)
70 | .to_return(status: 100, body: 'buren', headers: {})
71 |
72 | expect do
73 | described_class.get('https://example.com')
74 | end.to raise_error(described_class::UnknownResponseCodeError)
75 | end
76 |
77 | it 'raises ResponseError if server responded with an error and raise_on_http_error is true' do
78 | stub_request(:get, 'https://example.com/')
79 | .with(headers: headers)
80 | .to_return(status: 400, body: 'buren', headers: {})
81 |
82 | expect do
83 | described_class.get('https://example.com', raise_on_http_error: true)
84 | end.to raise_error(described_class::ResponseError)
85 | end
86 |
87 | it 'returns response if server responds with an error and raise_on_http_error is false' do
88 | stub_request(:get, 'https://example.com/')
89 | .with(headers: headers)
90 | .to_return(status: 400, body: 'buren', headers: {})
91 |
92 | result = described_class.get('https://example.com', raise_on_http_error: false)
93 |
94 | expect(result.code).to eq('400')
95 | end
96 | end
97 |
98 | describe '::build_response' do
99 | it 'builds a Response object' do
100 | expected = WaybackArchiver::Response.new('200', 'OK', 'buren', 'http://example.com')
101 | response = described_class.build_response(
102 | 'http://example.com',
103 | Struct.new(:code, :message, :body).new('200', 'OK', 'buren')
104 | )
105 |
106 | expect(response).to eq(expected)
107 | end
108 |
109 | it 'builds a response object that has a #success? method' do
110 | response = described_class.build_response(
111 | 'http://example.com',
112 | Struct.new(:code, :message, :body).new('200', 'OK', 'buren')
113 | )
114 |
115 | expect(response.success?).to eq(true)
116 | end
117 | end
118 |
119 | describe '::build_redirect_uri' do
120 | it 'raises InvalidRedirectError if no location header is found' do
121 | response = Struct.new(:header).new({ location: nil })
122 | redirect_error = WaybackArchiver::Request::InvalidRedirectError
123 |
124 | expect do
125 | described_class.build_redirect_uri('', response)
126 | end.to raise_error(redirect_error)
127 | end
128 |
129 | it 'adds base URI if location header is relative' do
130 | base_uri = 'http://example.com'
131 | response = Struct.new(:header).new({ 'location' => '/path' })
132 | result = described_class.build_redirect_uri(base_uri, response)
133 |
134 | expect(result).to eq(URI.parse('http://example.com/path'))
135 | end
136 |
137 | it 'returns location header' do
138 | base_uri = 'http://example.com'
139 | response = Struct.new(:header).new({ 'location' => 'https://example.com/path' })
140 | result = described_class.build_redirect_uri(base_uri, response)
141 |
142 | expect(result).to eq(URI.parse('https://example.com/path'))
143 | end
144 | end
145 |
146 | describe '::build_uri' do
147 | it 'returns URI untouched if passed an instance of URI' do
148 | uri = URI.parse('http://example.com')
149 | expect(described_class.build_uri(uri)).to eq(uri)
150 | end
151 |
152 | it 'returns URI instance if passed string with http protocol' do
153 | uri = URI.parse('http://example.com')
154 | expect(described_class.build_uri('http://example.com')).to eq(uri)
155 | end
156 |
157 | it 'returns URI instance if passed string with https protocol' do
158 | uri = URI.parse('https://example.com')
159 | expect(described_class.build_uri('https://example.com')).to eq(uri)
160 | end
161 |
162 | it 'returns URI instance with protocol if passed string without protocol' do
163 | uri = URI.parse('http://example.com')
164 | expect(described_class.build_uri('example.com')).to eq(uri)
165 | end
166 | end
167 |
168 | describe '::parse_body' do
169 | it 'returns empty string if passed nil' do
170 | expect(described_class.parse_body(nil)).to eq('')
171 | end
172 |
173 | it 'returns string untouched if passed a regular string' do
174 | expect(described_class.parse_body('buren')).to eq('buren')
175 | end
176 |
177 | it 'returns uncompressed string if passed a gzipped string' do
178 | gzipped_string = File.read('spec/data/test_gzip.gz')
179 | expect(described_class.parse_body(gzipped_string)).to eq("buren\n")
180 | end
181 | end
182 |
183 | describe '::blank?' do
184 | it 'returns true if passed nil' do
185 | expect(described_class.blank?(nil)).to eq(true)
186 | end
187 |
188 | it 'returns true if passed empty string' do
189 | expect(described_class.blank?('')).to eq(true)
190 | end
191 |
192 | it 'returns true if passed string with only spaces' do
193 | expect(described_class.blank?(' ')).to eq(true)
194 | end
195 |
196 | it 'returns false if passed a non-empty string' do
197 | expect(described_class.blank?('buren')).to eq(false)
198 | end
199 | end
200 | end
201 |
--------------------------------------------------------------------------------
/lib/wayback_archiver/request.rb:
--------------------------------------------------------------------------------
1 | require 'net/http'
2 | require 'openssl'
3 | require 'timeout'
4 | require 'uri'
5 | require 'zlib'
6 |
7 | require 'wayback_archiver/http_code'
8 | require 'wayback_archiver/response'
9 |
10 | module WaybackArchiver
11 | # Make HTTP requests
12 | class Request
13 | # General error, something went wrong
14 | class Error < StandardError; end
15 | # Client error, something went wrong on the local machine
16 | class ClientError < Error; end
17 | # Server error, the remote server did something wrong
18 | class ServerError < Error; end
19 | # Remote server responded with a HTTP error
20 | class HTTPError < ServerError; end
21 | # Remote server error
22 | class ResponseError < ServerError; end
23 | # Max redirects reached error
24 | class MaxRedirectError < ServerError; end
25 | # Remote server responded with an invalid redirect
26 | class InvalidRedirectError < ServerError; end
27 | # Remote server responded with an unknown HTTP code
28 | class UnknownResponseCodeError < ServerError; end
29 |
30 | # GET response wrapper
31 | GETStruct = Struct.new(:response, :error)
32 |
33 | # Max number of redirects before an error is raised
34 | MAX_REDIRECTS = 10
35 |
36 | # Known request errors
37 | REQUEST_ERRORS = {
38 | # server
39 | Timeout::Error => ServerError,
40 | OpenSSL::SSL::SSLError => ServerError,
41 | Net::HTTPBadResponse => ServerError,
42 | Zlib::Error => ServerError,
43 | # client
44 | SystemCallError => ClientError,
45 | SocketError => ClientError,
46 | IOError => ClientError
47 | }.freeze
48 |
49 | # Get response.
50 | # @return [Response] the http response representation.
51 | # @param [String, URI] uri to retrieve.
52 | # @param max_redirects [Integer] max redirects (default: 10).
53 | # @param follow_redirects [Boolean] follow redirects (default: true).
54 | # @example Get example.com
55 | # Request.get('example.com')
56 | # @example Get http://example.com and follow max 3 redirects
57 | # Request.get('http://example.com', max_redirects: 3)
58 | # @example Get http://example.com and don't follow redirects
59 | # Request.get('http://example.com', follow_redirects: false)
60 | # @raise [Error] super class of all exceptions that this method can raise
61 | # @raise [ServerError] all server errors
62 | # @raise [ClientError] all client errors
63 | # @raise [HTTPError] all HTTP errors
64 | # @raise [MaxRedirectError] too many redirects, subclass of ServerError
65 | # @raise [ResponseError] server responded with a 4xx or 5xx HTTP status code, subclass of ServerError (only raised if the raise_on_http_error flag is true)
66 | # @raise [UnknownResponseCodeError] server responded with an unknown HTTP status code, subclass of ServerError
67 | # @raise [InvalidRedirectError] server responded with an invalid redirect, subclass of ServerError
68 | def self.get(
69 | uri,
70 | max_redirects: MAX_REDIRECTS,
71 | raise_on_http_error: false,
72 | follow_redirects: true
73 | )
74 | uri = build_uri(uri)
75 |
76 | redirect_count = 0
77 | until redirect_count > max_redirects
78 | WaybackArchiver.logger.debug "Requesting #{uri}"
79 |
80 | http = Net::HTTP.new(uri.host, uri.port)
81 | if uri.scheme == 'https'
82 | http.use_ssl = true
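83 | # NOTE: TLS certificate verification is disabled, requests won't fail on invalid or expired certs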
83 | http.verify_mode = OpenSSL::SSL::VERIFY_NONE
84 | end
85 |
86 | request = Net::HTTP::Get.new(uri.request_uri)
87 | request['User-Agent'] = WaybackArchiver.user_agent
88 |
89 | result = perform_request(uri, http, request)
90 | response = result.response
91 | error = result.error
92 |
93 | raise error if error
94 |
95 | code = response.code
96 | WaybackArchiver.logger.debug "[#{code}, #{response.message}] Requested #{uri}"
97 |
98 | case HTTPCode.type(code)
99 | when :success
100 | return build_response(uri, response)
101 | when :redirect
102 | return build_response(uri, response) unless follow_redirects
103 |
104 | uri = build_redirect_uri(uri, response)
105 | redirect_count += 1
106 | next
107 | when :error
108 | if raise_on_http_error
109 | raise ResponseError, "Failed with response code: #{code} when requesting #{uri}"
110 | end
111 |
112 | return build_response(uri, response)
113 | else
114 | raise UnknownResponseCodeError, "Unknown HTTP response code #{code} when requesting #{uri}"
115 | end
116 | end
117 |
118 | raise MaxRedirectError, "Redirected too many times when requesting #{uri}"
119 | end
120 |
121 | # Builds a Response object.
122 | # @return [Response]
123 | # @param [URI] uri that was requested.
124 | # @param [Net::HTTPResponse] response the server response.
125 | # @example Build Response object for example.com
126 | # Request.build_response(uri, net_http_response)
127 | def self.build_response(uri, response)
128 | Response.new(
129 | response.code,
130 | response.message,
131 | parse_body(response.body),
132 | uri.to_s
133 | )
134 | end
135 |
136 | # Builds a URI for a redirect response.
137 | # @return [URI] to redirect to.
138 | # @param [URI] uri that was requested.
139 | # @param [Net::HTTPResponse] response the server response.
140 | # @example Build redirect URI for example.com (let's pretend it redirects)
141 | # Request.build_redirect_uri('http://example.com', net_http_response)
142 | def self.build_redirect_uri(uri, response)
143 | location_header = response.header.fetch('location') do
144 | raise InvalidRedirectError, "No location header found on redirect when requesting #{uri}"
145 | end
146 |
147 | location = URI.parse(location_header)
148 | return build_uri(uri) + location_header if location.relative?
149 |
150 | location
151 | end
152 |
153 | # Build URI.
154 | # @return [URI] the built URI.
155 | # @param [URI, String] uri to build.
156 | # @example Build URI for example.com
157 | # Request.build_uri('http://example.com')
158 | # @example Build URI from an existing URI instance
159 | # uri = URI.parse('http://example.com')
160 | # Request.build_uri(uri)
161 | def self.build_uri(uri)
162 | return uri if uri.is_a?(URI)
163 |
164 | uri = "http://#{uri}" unless uri =~ %r{^https?://}
165 | URI.parse(uri)
166 | end
167 |
168 | # Parse response body, handles regular and gzipped response bodies.
169 | # @return [String] the response body.
170 | # @param [String] response_body the server response body.
171 | # @example Return response body for a response.
172 | # Request.parse_body(net_http_response.body)
173 | def self.parse_body(response_body)
174 | return '' unless response_body
175 |
176 | Zlib::GzipReader.new(StringIO.new(response_body)).read
177 | rescue Zlib::GzipFile::Error => _e
178 | response_body
179 | end
180 |
181 | # Return whether a value is blank or not.
182 | # @return [Boolean] whether the value is blank or not.
183 | # @param [Object] value the value to check if it's blank or not.
184 | # @example Returns true for nil.
185 | # Request.blank?(nil)
186 | # @example Returns true for empty string.
187 | # Request.blank?('')
188 | # @example Returns true for string with only spaces.
189 | # Request.blank?(' ')
190 | def self.blank?(value)
191 | return true unless value
192 | return true if value.strip.empty?
193 |
194 | false
195 | end
196 |
197 | # NOTE: `private` has no effect on methods defined with `def self.`, they are made private via `private_class_method` below
198 |
199 | def self.perform_request(uri, http, request)
200 | # TODO: Consider retrying on certain HTTP response codes, i.e 429, 503
201 | response = http.request(request)
202 | GETStruct.new(response)
203 | rescue *REQUEST_ERRORS.keys => e
204 | build_request_error(uri, e, REQUEST_ERRORS.fetch(e.class))
205 | end
206 |
207 | def self.build_request_error(uri, error, error_wrapper_klass)
208 | WaybackArchiver.logger.error "Request to #{uri} failed: #{error_wrapper_klass}, #{error.class}, #{error.message}"
209 |
210 | GETStruct.new(
211 | Response.new,
212 | error_wrapper_klass.new("#{error.class}, #{error.message}")
213 | )
214 | end
215 |
216 | private_class_method :perform_request, :build_request_error
217 | end
216 | end
217 |
--------------------------------------------------------------------------------
/lib/wayback_archiver.rb:
--------------------------------------------------------------------------------
1 | require 'wayback_archiver/thread_pool'
2 | require 'wayback_archiver/null_logger'
3 | require 'wayback_archiver/version'
4 | require 'wayback_archiver/url_collector'
5 | require 'wayback_archiver/archive'
6 | require 'wayback_archiver/sitemapper'
7 |
8 | # WaybackArchiver sends URLs to the Wayback Machine, by crawling, from Sitemap(s), or from a list of URLs.
9 | module WaybackArchiver
10 | # Link to gem on rubygems.org, part of the sent User-Agent
11 | INFO_LINK = 'https://rubygems.org/gems/wayback_archiver'.freeze
12 | # WaybackArchiver User-Agent
13 | USER_AGENT = "WaybackArchiver/#{WaybackArchiver::VERSION} (+#{INFO_LINK})".freeze
14 | # Default for whether to respect robots txt files
15 | DEFAULT_RESPECT_ROBOTS_TXT = false
16 |
17 | # Default concurrency for archiving URLs
18 | DEFAULT_CONCURRENCY = 1
19 |
20 | # Maximum number of links posted (-1 means no limit)
21 | DEFAULT_MAX_LIMIT = -1
22 |
23 | # Send URLs to Wayback Machine.
24 | # @return [Array] of URLs sent to the Wayback Machine.
25 | # @param [String/Array] source for URL(s).
26 | # @param [String/Symbol] strategy of source. Supported strategies: crawl, sitemap, url, urls, auto.
27 | # @param [Array] hosts to crawl.
28 | # @example Auto archive example.com (find Sitemaps, with crawling as fallback)
29 | # WaybackArchiver.archive('example.com') # Default strategy is :auto
30 | # WaybackArchiver.archive('example.com', strategy: :auto)
31 | # WaybackArchiver.archive('example.com', strategy: :auto, concurrency: 10)
32 | # WaybackArchiver.archive('example.com', strategy: :auto, limit: 100) # send max 100 URLs
33 | # WaybackArchiver.archive('example.com', :auto)
34 | # @example Crawl example.com and send all URLs of the same domain
35 | # WaybackArchiver.archive('example.com', strategy: :crawl)
36 | # WaybackArchiver.archive('example.com', strategy: :crawl, concurrency: 10)
37 | # WaybackArchiver.archive('example.com', strategy: :crawl, limit: 100) # send max 100 URLs
38 | # WaybackArchiver.archive('example.com', :crawl)
39 | # @example Send example.com Sitemap URLs
40 | # WaybackArchiver.archive('example.com', strategy: :sitemap)
41 | # WaybackArchiver.archive('example.com', strategy: :sitemap, concurrency: 10)
42 | # WaybackArchiver.archive('example.com', strategy: :sitemap, limit: 100) # send max 100 URLs
43 | # WaybackArchiver.archive('example.com', :sitemap)
44 | # @example Send only example.com
45 | # WaybackArchiver.archive('example.com', strategy: :url)
46 | # WaybackArchiver.archive('example.com', strategy: :url, concurrency: 10)
47 | # WaybackArchiver.archive('example.com', strategy: :url, limit: 100) # send max 100 URLs
48 | # WaybackArchiver.archive('example.com', :url)
49 | # @example Crawl multiple hosts
50 | # WaybackArchiver.archive(
51 | # 'http://example.com',
52 | # hosts: [
53 | # 'example.com',
54 | # /host[\d]+\.example\.com/
55 | # ]
56 | # )
57 | def self.archive(source, legacy_strategy = nil, strategy: :auto, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
58 | strategy = legacy_strategy || strategy
59 |
60 | case strategy.to_s
61 | when 'crawl' then crawl(source, concurrency: concurrency, limit: limit, hosts: hosts, &block)
62 | when 'auto' then auto(source, concurrency: concurrency, limit: limit, &block)
63 | when 'sitemap' then sitemap(source, concurrency: concurrency, limit: limit, &block)
64 | when 'urls' then urls(source, concurrency: concurrency, limit: limit, &block)
65 | when 'url' then urls(source, concurrency: concurrency, limit: limit, &block)
66 | else
67 | raise ArgumentError, "Unknown strategy: '#{strategy}'. Allowed strategies: sitemap, urls, url, crawl"
68 | end
69 | end
70 |
71 | # Look for Sitemap(s) and if nothing is found fallback to crawling.
72 | # Then send found URLs to the Wayback Machine.
73 | # @return [Array] of URLs sent to the Wayback Machine.
74 | # @param [String] source (must be a valid URL).
75 | # @param concurrency [Integer]
76 | # @example Auto archive example.com
77 | # WaybackArchiver.auto('example.com') # Default concurrency is 1
78 | # @example Auto archive example.com with low concurrency
79 | # WaybackArchiver.auto('example.com', concurrency: 1)
80 | # @example Auto archive example.com and archive max 100 URLs
81 | # WaybackArchiver.auto('example.com', limit: 100)
82 | # @see http://www.sitemaps.org
83 | def self.auto(source, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
84 | urls = Sitemapper.autodiscover(source)
85 | return urls(urls, concurrency: concurrency, limit: limit, &block) if urls.any?
86 |
87 | crawl(source, concurrency: concurrency, limit: limit, &block)
88 | end
89 |
90 | # Crawl site for URLs to send to the Wayback Machine.
91 | # @return [Array] of URLs sent to the Wayback Machine.
92 | # @param [String] url to start crawling from.
93 | # @param [Array] hosts to crawl
94 | # @param concurrency [Integer]
95 | # @example Crawl example.com and send all URLs of the same domain
96 | # WaybackArchiver.crawl('example.com') # Default concurrency is 1
97 | # @example Crawl example.com and send all URLs of the same domain with low concurrency
98 | # WaybackArchiver.crawl('example.com', concurrency: 1)
99 | # @example Crawl example.com and archive max 100 URLs
100 | # WaybackArchiver.crawl('example.com', limit: 100)
101 | # @example Crawl multiple hosts
102 | # WaybackArchiver.crawl(
103 | # 'http://example.com',
104 | # hosts: [
105 | # 'example.com',
106 | # /host[\d]+\.example\.com/
107 | # ]
108 | # )
109 | def self.crawl(url, hosts: [], concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
110 | WaybackArchiver.logger.info "Crawling #{url}"
111 | Archive.crawl(url, hosts: hosts, concurrency: concurrency, limit: limit, &block)
112 | end
113 |
114 | # Get URLs from sitemap and send found URLs to the Wayback Machine.
115 | # @return [Array] of URLs sent to the Wayback Machine.
116 | # @param [String] url to the sitemap.
117 | # @param concurrency [Integer]
118 | # @example Get example.com sitemap and archive all found URLs
119 | # WaybackArchiver.sitemap('example.com/sitemap.xml') # Default concurrency is 1
120 | # @example Get example.com sitemap and archive all found URLs with low concurrency
121 | # WaybackArchiver.sitemap('example.com/sitemap.xml', concurrency: 1)
122 | # @example Get example.com sitemap archive max 100 URLs
123 | # WaybackArchiver.sitemap('example.com/sitemap.xml', limit: 100)
124 | # @see http://www.sitemaps.org
125 | def self.sitemap(url, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
126 | WaybackArchiver.logger.info "Fetching Sitemap"
127 | Archive.post(URLCollector.sitemap(url), concurrency: concurrency, limit: limit, &block)
128 | end
129 |
130 | # Send URL to the Wayback Machine.
131 | # @return [Array] of URLs sent to the Wayback Machine.
132 | # @param [Array/String] urls or url.
133 | # @param concurrency [Integer]
134 | # @example Archive example.com
135 | # WaybackArchiver.urls('example.com')
136 | # @example Archive example.com and google.com
137 | # WaybackArchiver.urls(%w(example.com google.com))
138 | # @example Archive example.com, max 100 URLs
139 | # WaybackArchiver.urls(%w(example.com www.example.com), limit: 100)
140 | def self.urls(urls, concurrency: WaybackArchiver.concurrency, limit: WaybackArchiver.max_limit, &block)
141 | Archive.post(Array(urls), concurrency: concurrency, limit: limit, &block)
142 | end
143 |
144 | # Set logger
145 | # @return [Object] the set logger
146 | # @param [Object] logger an object that quacks like a Logger (duck typing)
147 | # @example set a logger that prints to standard out (STDOUT)
148 | # WaybackArchiver.logger = Logger.new(STDOUT)
149 | def self.logger=(logger)
150 | @logger = logger
151 | end
152 |
153 | # Returns the current logger
154 | # @return [Object] the current logger instance
155 | def self.logger
156 | @logger ||= NullLogger.new
157 | end
158 |
159 | # Resets the logger to the default
160 | # @return [NullLogger] a new instance of NullLogger
161 | def self.default_logger!
162 | @logger = NullLogger.new
163 | end
164 |
165 | # Sets the user agent
166 | # @return [String] the configured user agent
167 | # @param [String] user_agent the desired user agent
168 | def self.user_agent=(user_agent)
169 | @user_agent = user_agent
170 | end
171 |
172 | # Returns the configured user agent
173 | # @return [String] the configured or the default user agent
174 | def self.user_agent
175 | @user_agent ||= USER_AGENT
176 | end
177 |
178 | # Sets the default respect_robots_txt
179 | # @return [Boolean] the desired default for respect_robots_txt
180 | # @param [Boolean] respect_robots_txt the desired default
181 | def self.respect_robots_txt=(respect_robots_txt)
182 | @respect_robots_txt = respect_robots_txt
183 | end
184 |
185 | # Returns the default respect_robots_txt
186 | # @return [Boolean] the configured or the default respect_robots_txt
187 | def self.respect_robots_txt
188 | # don't use ||= here, an explicitly configured `false` must be respected
189 | return @respect_robots_txt unless @respect_robots_txt.nil?
190 |
191 | DEFAULT_RESPECT_ROBOTS_TXT
192 | end
190 |
191 | # Sets the default concurrency
192 | # @return [Integer] the desired default concurrency
193 | # @param [Integer] concurrency the desired default concurrency
194 | def self.concurrency=(concurrency)
195 | @concurrency = concurrency
196 | end
197 |
198 | # Returns the default concurrency
199 | # @return [Integer] the configured or the default concurrency
200 | def self.concurrency
201 | @concurrency ||= DEFAULT_CONCURRENCY
202 | end
203 |
204 | # Sets the default max_limit
205 | # @return [Integer] the desired default max_limit
206 | # @param [Integer] max_limit the desired default max_limit
207 | def self.max_limit=(max_limit)
208 | @max_limit = max_limit
209 | end
210 |
211 | # Returns the default max_limit
212 | # @return [Integer] the configured or the default max_limit
213 | def self.max_limit
214 | @max_limit ||= DEFAULT_MAX_LIMIT
215 | end
216 |
217 | # Sets the adapter
218 | # @return [Object] the configured adapter, must respond to #call
219 | # @param [Object] adapter the adapter, must respond to #call
220 | def self.adapter=(adapter)
221 | unless adapter.respond_to?(:call)
222 | raise(ArgumentError, 'adapter must implement #call')
223 | end
224 |
225 | @adapter = adapter
226 | end
227 |
228 | # Returns the configured adapter
229 | # @return [Object] the configured or the default adapter
230 | def self.adapter
231 | @adapter ||= WaybackMachine
232 | end
233 | end
234 |
--------------------------------------------------------------------------------