├── Rakefile ├── .rspec ├── lib ├── grell │ ├── version.rb │ ├── grell_logger.rb │ ├── reader.rb │ ├── capybara_driver.rb │ ├── rawpage.rb │ ├── page_collection.rb │ ├── crawler.rb │ ├── crawler_manager.rb │ └── page.rb └── grell.rb ├── Gemfile ├── LICENSE.txt ├── spec ├── lib │ ├── reader_spec.rb │ ├── capybara_driver_spec.rb │ ├── page_collection_spec.rb │ ├── crawler_manager_spec.rb │ ├── crawler_spec.rb │ └── page_spec.rb └── spec_helper.rb ├── .travis.yml ├── grell.gemspec ├── CHANGELOG.md └── README.md /Rakefile: -------------------------------------------------------------------------------- 1 | require 'kender/tasks' 2 | 3 | -------------------------------------------------------------------------------- /.rspec: -------------------------------------------------------------------------------- 1 | --color 2 | --require spec_helper 3 | -------------------------------------------------------------------------------- /lib/grell/version.rb: -------------------------------------------------------------------------------- 1 | module Grell 2 | VERSION = "2.1.2".freeze 3 | end 4 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | # Avoid ruby 2.1 to use Rack > 2.0 which is not compatible 4 | platform :ruby_21 do 5 | gem 'rack', '~> 1.0' 6 | end 7 | gemspec 8 | -------------------------------------------------------------------------------- /lib/grell/grell_logger.rb: -------------------------------------------------------------------------------- 1 | require 'logger' 2 | 3 | #Very simple global logger for our crawler. 4 | module Grell 5 | class << self 6 | attr_accessor :logger 7 | end 8 | end 9 | 10 | Grell.logger = Logger.new(STDOUT) -------------------------------------------------------------------------------- /lib/grell.rb: -------------------------------------------------------------------------------- 1 | require 'capybara/poltergeist' 2 | require 'capybara/dsl' 3 | 4 | require 'grell/grell_logger' 5 | require 'grell/capybara_driver' 6 | require 'grell/crawler_manager' 7 | require 'grell/crawler' 8 | require 'grell/rawpage' 9 | require 'grell/page' 10 | require 'grell/page_collection' 11 | require 'grell/reader' 12 | -------------------------------------------------------------------------------- /lib/grell/reader.rb: -------------------------------------------------------------------------------- 1 | module Grell 2 | # A tooling class, it waits a maximum of max_waiting for an action to finish. If the action is not 3 | # finished by then, it will continue anyway. 
4 | # The wait may be long but we want to finish it as soon as the action has finished 5 | class Reader 6 | def self.wait_for(action, max_waiting, sleeping_time) 7 | time_start = Time.now 8 | action.call() 9 | return if yield 10 | while (Time.now < time_start + max_waiting) 11 | action.call() 12 | break if yield 13 | sleep(sleeping_time) 14 | end 15 | end 16 | 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Medidata Solutions Worldwide 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /spec/lib/reader_spec.rb: -------------------------------------------------------------------------------- 1 | RSpec.describe Grell::Reader do 2 | 3 | context 'Waiting time expired' do 4 | let(:waiting_time) {0} 5 | let(:sleeping_time) {2} 6 | let(:condition) {false} 7 | it 'does not sleep' do 8 | before_time = Time.now 9 | Grell::Reader.wait_for(->{''}, waiting_time, sleeping_time) do 10 | condition 11 | end 12 | expect(Time.now - before_time).to be < 1 13 | end 14 | end 15 | 16 | context 'The condition is true' do 17 | let(:waiting_time) {3} 18 | let(:sleeping_time) {2} 19 | let(:condition) {true} 20 | it 'does not sleep' do 21 | before_time = Time.now 22 | Grell::Reader.wait_for(->{''}, waiting_time, sleeping_time) do 23 | condition 24 | end 25 | expect(Time.now - before_time).to be < 1 26 | end 27 | end 28 | 29 | context 'The condition is false' do 30 | let(:waiting_time) {0.2} 31 | let(:sleeping_time) {0.2} 32 | let(:condition) {false} 33 | 34 | it 'waits the waiting time' do 35 | before_time = Time.now 36 | Grell::Reader.wait_for(->{''}, waiting_time, sleeping_time) do 37 | condition 38 | end 39 | expect(Time.now - before_time).to be > waiting_time 40 | end 41 | 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /spec/lib/capybara_driver_spec.rb: -------------------------------------------------------------------------------- 1 | 2 | RSpec.describe Grell::CapybaraDriver do 3 | let(:ts) { Time.now } 4 | before do 5 | Grell.logger = Logger.new(nil) 6 | end 7 | 8 | describe 'setup_capybara' do 9 | it 'properly registers the poltergeist driver' do 10 | Timecop.freeze(ts) 11 | driver = Grell::CapybaraDriver.new.setup_capybara 12 | 
expect(driver).to be_instance_of(Capybara::Poltergeist::Driver) 13 | end 14 | 15 | it 'raises an exception if the driver cannot be initialized' do 16 | Timecop.freeze(ts + 60) 17 | 18 | # Attempt to register twice with the same driver name 19 | Grell::CapybaraDriver.new.setup_capybara 20 | expect { Grell::CapybaraDriver.new.setup_capybara }. 21 | to raise_error "Poltergeist Driver could not be properly initialized" 22 | end 23 | 24 | it 'can register the poltergeist driver multiple times in a row' do 25 | Timecop.freeze(ts + 120) 26 | driver = Grell::CapybaraDriver.new.setup_capybara 27 | expect(driver).to be_instance_of(Capybara::Poltergeist::Driver) 28 | end 29 | end 30 | 31 | after do 32 | Timecop.return 33 | 34 | # Reset Capybara so future tests can easily stub HTTP requests 35 | Capybara.javascript_driver = :poltergeist_billy 36 | Capybara.default_driver = :poltergeist_billy 37 | end 38 | end 39 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | cache: bundler 3 | 4 | rvm: 5 | - 2.2.4 6 | - 2.3.0 7 | - 2.4.2 8 | 9 | before_install: 10 | - mkdir travis-phantomjs 11 | - wget https://github.com/JordiPolo/phantomjs/blob/master/phantomjs-2.1.1-linux-x86_64.tar.bz2?raw=true 12 | -O $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2 13 | - tar -xvf $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2 -C $PWD/travis-phantomjs 14 | - export PATH=$PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64/bin:$PATH 15 | 16 | install: 17 | - bundle install --jobs=3 --retry=3 18 | 19 | script: 20 | - bundle exec rspec 21 | 22 | deploy: 23 | provider: rubygems 24 | api_key: 25 | secure: czStDI0W6MWL70sDwu53oNNCc8vKtT61pgvii+ZWIC9A41C2p7BzmbtosXsnLk2ApxmpWvFIgtQE0XIH7jkM5mY05cHinXDphtOTkNLFVjck3ZOMkx/cc+QRFW8K4FHkrzFsC+/Xx4t2/Psh35LpzhfJd0XzKKoCstXUVgJsfGcAK3DMpjXHSUbwLXGDZ4lzmsk52OLf0oL+in2447TJfVOvGXtYmfh1PjXRwDxKB0dan7w5mVgajS52b6wUhVPTaMe/JgCbMuV7BaQ1Goq8u7V4aaxU+liPAhzHWfMB6tF4TEW8yu2tvGLdOA0+1jmM8E9Q5saPWtwKiHvBxN8CzRpkiNDzyFAf8ljrWT5yKX3aRQCyPp3NNyhoumWap36b+O/zwZ3HxoAe22Yg0rjz8z8NxMR/ELPvjPYjCiF5zY7fO9PAzmIynMRUrxDnFj+/JGHdzx0ZMo3fEXgHHSaHPNxIzEffVVQk4XLVnFHDjBLY4mVp4sbHbja5qnui20RkdM/H9Yi/fQyl1ODhk+LUPoh45ZneDZq7GPrl+WKK06oEjXIXLU+1iEuqnSqybbmJMTUJlUV+7EJdtq2DgfDB4KXwLm2LLOR/IX63AzEav4NIxx3hIXifSKa9rp6D7nMTzdQwF0FFzIj/Y3qLrAe1WWt0gx3Vxq67pSwOJthk5Fc= 26 | on: 27 | tags: true 28 | rvm: 2.4.2 29 | -------------------------------------------------------------------------------- /grell.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require 'grell/version' 5 | 6 | Gem::Specification.new do |spec| 7 | spec.name = "grell" 8 | spec.version = Grell::VERSION 9 | spec.platform = Gem::Platform::RUBY 10 | spec.authors = ["Jordi Polo Carres"] 11 | spec.email = ["jcarres@mdsol.com"] 12 | spec.summary = %q{Ruby web crawler} 13 | spec.description = %q{Ruby web crawler using PhantomJS} 14 | spec.homepage = "https://github.com/mdsol/grell" 15 | spec.license = 'MIT' 16 | 17 | spec.files = `git ls-files -z`.split("\x0") 18 | spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } 19 | spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) 20 | spec.require_paths = ["lib"] 21 | 22 | spec.required_ruby_version = '>= 2.1.8' 23 | 24 | spec.add_dependency 'capybara', '~> 
2.10' 25 | spec.add_dependency 'poltergeist', '~> 1.11' 26 | 27 | # spec.add_development_dependency 'bundler', '~> 1.6' 28 | spec.add_development_dependency 'byebug', '~> 4.0' 29 | spec.add_development_dependency 'kender', '~> 0.2' 30 | spec.add_development_dependency 'rake', '~> 10.0' 31 | spec.add_development_dependency 'webmock', '~> 1.18' 32 | spec.add_development_dependency 'rspec', '~> 3.5' 33 | spec.add_development_dependency 'puffing-billy', '~> 0.9' 34 | spec.add_development_dependency 'timecop', '~> 0.8' 35 | spec.add_development_dependency 'selenium-webdriver', '~> 2.53.4' 36 | end 37 | -------------------------------------------------------------------------------- /lib/grell/capybara_driver.rb: -------------------------------------------------------------------------------- 1 | module Grell 2 | # This class setups the driver for capybara. Used internally by the CrawlerManager 3 | # It uses Portelgeist to control PhantomJS 4 | class CapybaraDriver 5 | USER_AGENT = "Mozilla/5.0 (Grell Crawler)".freeze 6 | 7 | # Returns a poltergeist driver 8 | def setup_capybara 9 | @poltergeist_driver = nil 10 | 11 | # Capybara will not re-run the block if the driver name already exists, so the driver name 12 | # will have a time integer appended to ensure uniqueness. 13 | driver_name = "poltergeist_crawler_#{Time.now.to_f}".to_sym 14 | Grell.logger.info "GRELL Registering poltergeist driver with name '#{driver_name}'" 15 | 16 | Capybara.register_driver driver_name do |app| 17 | @poltergeist_driver = Capybara::Poltergeist::Driver.new(app, 18 | js_errors: false, 19 | inspector: false, 20 | phantomjs_logger: FakePoltergeistLogger, 21 | phantomjs_options: ['--debug=no', '--load-images=no', '--ignore-ssl-errors=yes', '--ssl-protocol=TLSv1.2']) 22 | end 23 | 24 | Capybara.default_max_wait_time = 3 25 | Capybara.run_server = false 26 | Capybara.default_driver = driver_name 27 | Capybara.current_session.driver.headers = { # The driver gets initialized when modified here 28 | "DNT" => 1, 29 | "User-Agent" => USER_AGENT 30 | } 31 | 32 | raise 'Poltergeist Driver could not be properly initialized' unless @poltergeist_driver 33 | 34 | @poltergeist_driver 35 | end 36 | 37 | # Poltergeist driver needs a class with this signature. The javascript console.log is sent here. 38 | # We just discard that information. 39 | module FakePoltergeistLogger 40 | def self.puts(*) 41 | end 42 | end 43 | end 44 | end 45 | -------------------------------------------------------------------------------- /lib/grell/rawpage.rb: -------------------------------------------------------------------------------- 1 | module Grell 2 | # This class depends heavily on Capybara but contains no logic. 3 | class RawPage 4 | include Capybara::DSL 5 | 6 | def navigate(url) 7 | visit(url) 8 | follow_redirects! 9 | end 10 | 11 | def headers 12 | page.response_headers 13 | end 14 | 15 | def status 16 | page.status_code 17 | end 18 | 19 | def body 20 | page.body 21 | end 22 | 23 | def all_anchors 24 | # Some elements may not be "a" elements but still provide a link. This usually is done for Javascript 25 | # to convert other elements which are not links to be able to be clicked naturally. 26 | # Only return links which are visible. 
27 | all('[href]', visible: true).to_a + all('[data-href]', visible: true).to_a 28 | end 29 | 30 | def host 31 | page.current_host 32 | end 33 | 34 | def has_selector?(selector) 35 | page.has_selector?(selector) 36 | end 37 | 38 | def wait_for_all_ajax_requests(timeout, interval) 39 | Timeout::timeout(timeout) do 40 | (timeout / interval).ceil.times do 41 | jquery_active = page.evaluate_script("typeof jQuery !== 'undefined' && jQuery.active;") 42 | break if (!jquery_active || jquery_active.zero?) 43 | sleep(interval) 44 | end 45 | end 46 | true 47 | end 48 | 49 | private 50 | 51 | def follow_redirects! 52 | # Phantom is very weird, it will follow a redirect to provide the correct body but will not fill the 53 | # status and the headers, if we are in that situation, revisit the page with the correct url this time. 54 | # Note that we will still fail if we have more than 5 redirects on a row 55 | redirects = 0 56 | while(page.status_code == nil && redirects < 5) 57 | visit( CGI.unescape(page.current_url)) 58 | redirects = redirects + 1 59 | end 60 | end 61 | end 62 | end 63 | -------------------------------------------------------------------------------- /lib/grell/page_collection.rb: -------------------------------------------------------------------------------- 1 | module Grell 2 | # Keeps a record of all the pages crawled. 3 | # When a new url is found it is added to this collection, which makes sure it is unique. 4 | # This page is part of the discovered pages. Eventually that page will be navigated to, then 5 | # the page will be part of the visited pages. 6 | class PageCollection 7 | attr_reader :collection 8 | 9 | # A block containing the logic that determines if a new URL should be added 10 | # to the collection or if it is already present will be passed to the initializer. 11 | def initialize(add_match_block) 12 | @collection = [] 13 | @add_match_block = add_match_block || default_add_match 14 | end 15 | 16 | def create_page(url, parent_id) 17 | page_id = next_id 18 | page = Page.new(url, page_id, parent_id) 19 | add(page) 20 | page 21 | end 22 | 23 | def visited_pages 24 | @collection.select {|page| page.visited?} 25 | end 26 | 27 | def discovered_pages 28 | @collection - visited_pages 29 | end 30 | 31 | def next_page 32 | discovered_pages.sort_by{|page| page.parent_id}.first 33 | end 34 | 35 | private 36 | 37 | def next_id 38 | @collection.size 39 | end 40 | 41 | def add(page) 42 | # Although finding unique pages based on URL will add pages with different query parameters, 43 | # in some cases we do link to different pages depending on the query parameters like when using proxies 44 | new_url = @collection.none? 
do |collection_page| 45 | @add_match_block.call(collection_page, page) 46 | end 47 | 48 | if new_url 49 | @collection.push page 50 | end 51 | end 52 | 53 | # If add_match_block is not provided, url matching to determine if a new page should be added 54 | # to the page collection will default to this proc 55 | def default_add_match 56 | Proc.new do |collection_page, page| 57 | collection_page.url.downcase == page.url.downcase 58 | end 59 | end 60 | 61 | end 62 | end 63 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | require 'grell' 2 | require 'byebug' 3 | require 'timecop' 4 | require 'webmock/rspec' 5 | require 'billy/capybara/rspec' 6 | require 'rack' 7 | require 'rack/server' 8 | 9 | # This will trick Puffing-billy into using this logger instead of its own 10 | # Puffing billy is very noisy and we do not want to see that in our output 11 | class Rails 12 | def self.logger 13 | Logger.new(nil) 14 | end 15 | end 16 | 17 | WebMock.disable_net_connect! 18 | 19 | 20 | # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration 21 | RSpec.configure do |config| 22 | 23 | # We do not need to wait for pages to return all the data 24 | config.before do 25 | stub_const("Grell::Page::WAIT_TIME", 0) 26 | allow_any_instance_of(Capybara::Session).to receive(:evaluate_script).and_return(nil) 27 | end 28 | 29 | config.expect_with :rspec do |expectations| 30 | expectations.include_chain_clauses_in_custom_matcher_descriptions = true 31 | end 32 | 33 | config.mock_with :rspec do |mocks| 34 | # Prevents you from mocking or stubbing a method that does not exist on 35 | # a real object. 36 | mocks.verify_partial_doubles = true 37 | end 38 | 39 | # Limits the available syntax to the non-monkey patched syntax that is recommended. 40 | config.disable_monkey_patching! 41 | 42 | # This setting enables warnings. It's recommended, but in some cases may 43 | # be too noisy due to issues in dependencies. 44 | # TODO: Billy puffy has lots of warnings, test this with new versions 45 | # config.warnings = true 46 | 47 | # Many RSpec users commonly either run the entire suite or an individual 48 | # file, and it's useful to allow more verbose output when running an 49 | # individual spec file. 50 | if config.files_to_run.one? 51 | # Use the documentation formatter for detailed output, 52 | # unless a formatter has already been configured 53 | # (e.g. via a command-line flag). 54 | config.default_formatter = 'doc' 55 | end 56 | 57 | config.order = :random 58 | Kernel.srand config.seed 59 | 60 | Capybara.javascript_driver = :poltergeist_billy 61 | Capybara.default_driver = :poltergeist_billy 62 | 63 | # config.profile_examples = 10 64 | end 65 | 66 | 67 | -------------------------------------------------------------------------------- /lib/grell/crawler.rb: -------------------------------------------------------------------------------- 1 | module Grell 2 | # This is the class that starts and controls the crawling 3 | class Crawler 4 | attr_reader :collection, :manager 5 | 6 | # Creates a crawler 7 | # evaluate_in_each_page: javascript block to evaluate in each page we crawl 8 | # add_match_block: block to evaluate to consider if a page is part of the collection 9 | # manager_options: options passed to the manager class 10 | # allowlist: Sets an allowlist filter, allows a regexp, string or array of either to be matched. 
11 | # denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched. 12 | def initialize(evaluate_in_each_page: nil, add_match_block: nil, allowlist: /.*/, denylist: /a^/, **manager_options) 13 | @collection = nil 14 | @manager = CrawlerManager.new(manager_options) 15 | @evaluate_in_each_page = evaluate_in_each_page 16 | @add_match_block = add_match_block 17 | @allowlist_regexp = Regexp.union(allowlist) 18 | @denylist_regexp = Regexp.union(denylist) 19 | end 20 | 21 | # Main method, it starts crawling on the given URL and calls a block for each of the pages found. 22 | def start_crawling(url, &block) 23 | Grell.logger.info "GRELL Started crawling" 24 | @collection = PageCollection.new(@add_match_block) 25 | @collection.create_page(url, nil) 26 | 27 | while !@collection.discovered_pages.empty? 28 | crawl(@collection.next_page, block) 29 | @manager.check_periodic_restart(@collection) 30 | end 31 | 32 | Grell.logger.info "GRELL finished crawling" 33 | end 34 | 35 | def crawl(site, block) 36 | Grell.logger.info "Visiting #{site.url}, visited_links: #{@collection.visited_pages.size}, discovered #{@collection.discovered_pages.size}" 37 | crawl_site(site) 38 | 39 | if block # The user of this block can send us a :retry to retry accessing the page 40 | while crawl_block(block, site) == :retry 41 | Grell.logger.info "Retrying our visit to #{site.url}" 42 | crawl_site(site) 43 | end 44 | end 45 | 46 | site.links.each do |url| 47 | @collection.create_page(url, site.id) 48 | end 49 | end 50 | 51 | private 52 | 53 | def crawl_site(site) 54 | site.navigate 55 | site.rawpage.page.evaluate_script(@evaluate_in_each_page) if @evaluate_in_each_page 56 | filter!(site.links) 57 | add_redirect_url(site) 58 | end 59 | 60 | # Treat any exceptions from the block as an unavailable page 61 | def crawl_block(block, site) 62 | block.call(site) 63 | rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient, 64 | Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::StatusFailError, 65 | Capybara::Poltergeist::TimeoutError, Errno::ECONNRESET, URI::InvalidURIError => e 66 | site.unavailable_page(404, e) 67 | end 68 | 69 | def filter!(links) 70 | links.select! { |link| link =~ @allowlist_regexp } if @allowlist_regexp 71 | links.delete_if { |link| link =~ @denylist_regexp } if @denylist_regexp 72 | end 73 | 74 | # Store the resulting redirected URL along with the original URL 75 | def add_redirect_url(site) 76 | if site.url != site.current_url 77 | @collection.create_page(site.current_url, site.id) 78 | end 79 | end 80 | 81 | end 82 | 83 | end 84 | -------------------------------------------------------------------------------- /lib/grell/crawler_manager.rb: -------------------------------------------------------------------------------- 1 | module Grell 2 | # Manages the state of the process crawling, does not care about individual pages but about logging, 3 | # restarting and quiting the crawler correctly. 4 | class CrawlerManager 5 | # logger: logger to use for Grell's messages 6 | # on_periodic_restart: if set, the driver will restart every :each visits (100 default) and execute the :do block 7 | # driver_options: Any extra options for the Capybara driver 8 | def initialize(logger: nil, on_periodic_restart: {}, driver: nil) 9 | Grell.logger = logger ? 
logger : Logger.new(STDOUT) 10 | @periodic_restart_block = on_periodic_restart[:do] 11 | @periodic_restart_period = on_periodic_restart[:each] || PAGES_TO_RESTART 12 | @driver = driver || CapybaraDriver.new.setup_capybara 13 | if @periodic_restart_period <= 0 14 | Grell.logger.warn "GRELL. Restart option misconfigured with a negative period. Ignoring option." 15 | end 16 | end 17 | 18 | # Restarts the PhantomJS process without modifying the state of visited and discovered pages. 19 | def restart 20 | Grell.logger.info "GRELL. Driver restarting" 21 | @driver.restart 22 | Grell.logger.info "GRELL. Driver restarted" 23 | end 24 | 25 | # Quits the poltergeist driver. 26 | def quit 27 | Grell.logger.info "GRELL. Driver quitting" 28 | @driver.quit 29 | end 30 | 31 | # PhantomJS seems to consume memory increasingly as it crawls, periodic restart allows to restart 32 | # the driver, potentially calling a block. 33 | def check_periodic_restart(collection) 34 | return unless @periodic_restart_block 35 | return unless @periodic_restart_period > 0 36 | return unless (collection.visited_pages.size % @periodic_restart_period).zero? 37 | restart 38 | @periodic_restart_block.call 39 | end 40 | 41 | def self.cleanup_all_processes 42 | PhantomJSManager.new.cleanup_all_processes 43 | end 44 | 45 | private 46 | 47 | PAGES_TO_RESTART = 100 # Default number of pages before we restart the driver. 48 | KILL_TIMEOUT = 2 # Number of seconds we wait till we kill the process. 49 | 50 | # Manages the PhantomJS process 51 | class PhantomJSManager 52 | def cleanup_all_processes 53 | pids = running_phantomjs_pids 54 | return if pids.empty? 55 | Grell.logger.warn "GRELL. Killing PhantomJS processes: #{pids.inspect}" 56 | pids.each do |pid| 57 | Grell.logger.warn "GRELL. Sending KILL to PhantomJS process #{pid}" 58 | kill_process(pid.to_i) 59 | end 60 | end 61 | 62 | def running_phantomjs_pids 63 | list_phantomjs_processes_cmd = "ps -ef | grep -E 'bin/phantomjs' | grep -v grep" 64 | `#{list_phantomjs_processes_cmd} | awk '{print $2;}'`.split("\n") 65 | end 66 | 67 | def kill_process(pid) 68 | Process.kill('TERM', pid) 69 | force_kill(pid) 70 | rescue Errno::ESRCH, Errno::ECHILD 71 | # successfully terminated 72 | rescue => e 73 | Grell.logger.error ["GRELL. PhantomJS process could not be killed", e.message, *e.backtrace].join($/) 74 | end 75 | 76 | def force_kill(pid) 77 | Timeout.timeout(KILL_TIMEOUT) { Process.wait(pid) } 78 | rescue Timeout::Error 79 | Process.kill('KILL', pid) 80 | Process.wait(pid) 81 | end 82 | end 83 | end 84 | end 85 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 2.1.2 2 | * Change white/black lists to allow/deny lists 3 | 4 | # 2.1.1 5 | * Update phantomjs_options to use 'TLSv1.2' 6 | 7 | # 2.1.0 8 | * Delete `driver_options` configuration key as it was never used. 9 | * `cleanup_all_processes` is a self method as intended to. 10 | 11 | # 2.0.0 12 | * New configuration key `on_periodic_restart`. 13 | * CrawlerManager.cleanup_all_processes method destroy all instances of phantomjs in this machine. 14 | 15 | * Breaking changes 16 | - Requires Ruby 2.1 or later. 17 | - Crawler.start_crawling does not accept options anymore, all options are passed to Crawler.new. 18 | - Crawler's methods `restart` and `quit` have been moved to CrawlerManager. 19 | - Crawler gets whitelist and blacklist as configuration options instead of being set in specific methods. 
20 | 21 | # 1.6.11 22 | * Ensure all links are loaded by waiting for Ajax requests to complete 23 | * Add '@evaluate_in_each_page' option to evaluate before extracting links (e.g. $('.dropdown').addClass('open');) 24 | 25 | # 1.6.10 26 | * Avoid following JS href links, add missing dependencies to fix Travis build 27 | 28 | # 1.6.9 29 | * Avoid following links when disabled by CSS (1.6.8 worked only for Javascript) 30 | 31 | # 1.6.8 32 | * Avoid following disabled links 33 | 34 | # 1.6.7 35 | * Increment '@times_visited' first to avoid infinite retries when rescuing errors 36 | 37 | # 1.6.6 38 | * Updated phantomjs_logger not to open '/dev/null' 39 | 40 | # 1.6.5 41 | * Added #quit to Crawler 42 | 43 | # 1.6.4 44 | * Added #quit to Capybara driver 45 | 46 | # 1.6.3 47 | * Only follow visible links 48 | 49 | # 1.6.2 50 | * Reset Capybara driver to Puffing Billy (used to rewrite URL requests in specs) 51 | * Use float timestamp for Poltergeist driver name to support fast test executions 52 | 53 | # 1.6.1 54 | * Use non-static name to support registering Poltergeist crawler multiple times 55 | * More exception handling, store redirected URLs in addition to original URL 56 | 57 | # 1.6 58 | * Support custom URL comparison when adding new pages during crawling 59 | * Don't rescue Timeout error, so that Delayed Job can properly terminate hanging jobs 60 | * Fail early if Capybara doesn't initialize properly 61 | 62 | # 1.5.1 63 | * Fixed deprecation warning (Thanks scott) 64 | * Updated Poltergeist dependency 65 | 66 | # 1.5.0 67 | * Grell will follow redirects. 68 | * Added #followed_redirects? #error? #current_url methods to the Page class 69 | 70 | # 1.4.0 71 | * Added crawler.restart to restart browser process 72 | * The block of code can make grell retry any given page. 73 | 74 | # 1.3.2 75 | * Rescue Timeout error and return an empty page when that happens 76 | 77 | # 1.3.1 78 | * Added whitelisting and blacklisting 79 | * Better info in gemspec 80 | 81 | # 1.3 82 | * The Crawler object allows you to provide an external logger object. 83 | * Clearer semantics when an error happens, special headers are returned so the user can inspect the error 84 | * Caveats: 85 | - The 'debug' option in the crawler does not have any affect anymore. Provide an external logger with 'logger' instead 86 | - The errors provided in the headers by grell has changed from 'grell_status' to 'grellStatus'. 87 | - The 'visited' property in the page was never supposed to be accesible. Use 'visited?' instead. 88 | 89 | # 1.2.1 90 | * Solve bug: URLs are case insensitive 91 | 92 | # 1.2 93 | * Grell now will consider two links to point to the same page only when the whole URL is exactly the same. 94 | Versions previously would only consider two links to be the same when they shared the path. 95 | 96 | # 1.1.2 97 | * Solve bug where we were adding links in heads as if there were normal links in the body 98 | 99 | # 1.1.1 100 | * Solve bug with the new data-href functionality 101 | 102 | # 1.1 103 | * Solve problem with randomly failing spec 104 | * Search for elements with 'href' or 'data-href' to find links 105 | 106 | # 1.0.1 107 | * Rescueing Javascript errors 108 | 109 | # 1.0 110 | * Initial implementation 111 | * Basic support to crawling pages. 
112 | -------------------------------------------------------------------------------- /spec/lib/page_collection_spec.rb: -------------------------------------------------------------------------------- 1 | 2 | RSpec.describe Grell::PageCollection do 3 | let(:add_match_block) do 4 | Proc.new do |collection_page, page| 5 | collection_page.url.downcase == page.url.downcase 6 | end 7 | end 8 | 9 | let(:collection) { Grell::PageCollection.new(add_match_block) } 10 | let(:url) { 'http://www.github.com/SomeUser/dragonlance?search=false' } 11 | let(:url2) { 'http://www.github.com/OtherUser/forgotten?search=false' } 12 | 13 | context 'empty collection' do 14 | 15 | it 'has no visited pages' do 16 | expect(collection.visited_pages).to be_empty 17 | end 18 | 19 | it 'has no discovered pages' do 20 | expect(collection.discovered_pages).to be_empty 21 | end 22 | 23 | it 'next page is nil' do 24 | expect(collection.next_page).to be_nil 25 | end 26 | end 27 | 28 | context 'one unvisited page' do 29 | let(:page) { collection.create_page(url, 0) } 30 | 31 | before do 32 | allow(page).to receive(:visited?).and_return(false) 33 | end 34 | 35 | it 'has no visited pages' do 36 | expect(collection.visited_pages).to be_empty 37 | end 38 | 39 | it 'has one discovered page' do 40 | expect(collection.discovered_pages).to eq([page]) 41 | 42 | end 43 | 44 | it 'next page is the unvisited page' do 45 | expect(collection.next_page).to eq(page) 46 | end 47 | end 48 | 49 | context 'one visited page' do 50 | let(:page) { collection.create_page(url, 0) } 51 | 52 | before do 53 | allow(page).to receive(:visited?).and_return(true) 54 | end 55 | 56 | it 'has one visited page' do 57 | expect(collection.visited_pages).to eq([page]) 58 | end 59 | 60 | it 'has no discovered pages' do 61 | expect(collection.discovered_pages).to be_empty 62 | end 63 | 64 | it 'next page is nil' do 65 | expect(collection.next_page).to be_nil 66 | end 67 | end 68 | 69 | context 'one visited and one unvisited page with the same url' do 70 | let(:page) { collection.create_page(url, 0) } 71 | let(:unvisited) { collection.create_page(url.upcase, 0) } 72 | 73 | before do 74 | allow(page).to receive(:visited?).and_return(true) 75 | allow(unvisited).to receive(:visited?).and_return(false) 76 | end 77 | 78 | it 'first page has id 0' do 79 | expect(page.id).to eq(0) 80 | end 81 | 82 | it 'second page has id 1' do 83 | expect(unvisited.id).to eq(1) 84 | end 85 | 86 | it 'has one visited page' do 87 | expect(collection.visited_pages).to eq([page]) 88 | end 89 | 90 | it 'has no discovered pages' do 91 | expect(collection.discovered_pages).to be_empty 92 | end 93 | 94 | it 'next page is nil' do 95 | expect(collection.next_page).to be_nil 96 | end 97 | end 98 | 99 | context 'one visited and one unvisited page with different URLs' do 100 | let(:page) { collection.create_page(url, 0) } 101 | let(:unvisited) { collection.create_page(url2, 0) } 102 | 103 | before do 104 | allow(page).to receive(:visited?).and_return(true) 105 | allow(unvisited).to receive(:visited?).and_return(false) 106 | end 107 | 108 | it 'has one visited page' do 109 | expect(collection.visited_pages).to eq([page]) 110 | end 111 | 112 | it 'has one discovered page' do 113 | expect(collection.discovered_pages).to eq([unvisited]) 114 | end 115 | 116 | it 'next page is the unvisited page' do 117 | expect(collection.next_page).to eq(unvisited) 118 | end 119 | end 120 | 121 | context 'one visited and one unvisited page with different URLs only different by the query' do 122 | let(:page) { 
collection.create_page(url, 0) } 123 | let(:url3) { 'http://www.github.com/SomeUser/dragonlance?search=true' } 124 | let(:unvisited) { collection.create_page(url3, 0) } 125 | 126 | before do 127 | allow(page).to receive(:visited?).and_return(true) 128 | allow(unvisited).to receive(:visited?).and_return(false) 129 | end 130 | 131 | it 'has one visited page' do 132 | expect(collection.visited_pages).to eq([page]) 133 | end 134 | 135 | it 'has one discovered page' do 136 | expect(collection.discovered_pages).to eq([unvisited]) 137 | end 138 | 139 | it 'next page is the unvisited page' do 140 | expect(collection.next_page).to eq(unvisited) 141 | end 142 | end 143 | 144 | context 'several unvisited pages' do 145 | let(:page) { collection.create_page(url, 2) } 146 | let(:page2) { collection.create_page(url2, 0) } 147 | 148 | before do 149 | allow(page).to receive(:visited?).and_return(true) 150 | allow(page2).to receive(:visited?).and_return(false) 151 | end 152 | 153 | it 'returns the page which has an earlier parent' do 154 | expect(collection.next_page).to eq(page2) 155 | end 156 | 157 | end 158 | 159 | end 160 | -------------------------------------------------------------------------------- /spec/lib/crawler_manager_spec.rb: -------------------------------------------------------------------------------- 1 | RSpec.describe Grell::CrawlerManager do 2 | let(:page) { Grell::Page.new(url, page_id, parent_page_id) } 3 | let(:host) { 'http://www.example.com' } 4 | let(:url) { 'http://www.example.com/test' } 5 | let(:driver) { double(Grell::CapybaraDriver) } 6 | let(:logger) { Logger.new(nil) } 7 | let(:crawler_manager) do 8 | described_class.new(logger: logger, driver: driver) 9 | end 10 | 11 | describe 'initialize' do 12 | context 'provides a logger' do 13 | let(:logger) { 33 } 14 | 15 | it 'sets custom logger' do 16 | crawler_manager 17 | expect(Grell.logger).to eq(33) 18 | Grell.logger = Logger.new(nil) 19 | end 20 | end 21 | 22 | context 'does not provides a logger' do 23 | let(:logger) { nil } 24 | 25 | it 'sets default logger' do 26 | crawler_manager 27 | expect(Grell.logger).to be_instance_of(Logger) 28 | Grell.logger = Logger.new(nil) 29 | end 30 | end 31 | 32 | context 'does not provide a driver' do 33 | let(:driver) { nil } 34 | 35 | it 'setups a new Capybara driver' do 36 | expect_any_instance_of(Grell::CapybaraDriver).to receive(:setup_capybara) 37 | crawler_manager 38 | end 39 | end 40 | end 41 | 42 | describe '#quit' do 43 | let(:driver) { double } 44 | 45 | it 'quits the poltergeist driver' do 46 | expect(logger).to receive(:info).with("GRELL. Driver quitting") 47 | expect(driver).to receive(:quit) 48 | crawler_manager.quit 49 | end 50 | end 51 | 52 | describe '#restart' do 53 | let(:driver) { double } 54 | 55 | it 'restarts the poltergeist driver' do 56 | expect(driver).to receive(:restart) 57 | expect(logger).to receive(:info).with("GRELL. Driver restarted") 58 | expect(logger).to receive(:info).with("GRELL. 
Driver restarting") 59 | crawler_manager.restart 60 | end 61 | end 62 | 63 | describe '#check_periodic_restart' do 64 | let(:collection) { double } 65 | 66 | context 'Periodic restart not setup' do 67 | it 'does not restart' do 68 | allow(collection).to receive_message_chain(:visited_pages, :size) { 100 } 69 | expect(crawler_manager).not_to receive(:restart) 70 | crawler_manager.check_periodic_restart(collection) 71 | end 72 | end 73 | 74 | context 'Periodic restart setup with default period' do 75 | let(:do_something) { proc {} } 76 | let(:crawler_manager) do 77 | Grell::CrawlerManager.new( 78 | logger: logger, 79 | driver: driver, 80 | on_periodic_restart: { do: do_something } 81 | ) 82 | end 83 | 84 | it 'does not restart after visiting 99 pages' do 85 | allow(collection).to receive_message_chain(:visited_pages, :size) { 99 } 86 | expect(crawler_manager).not_to receive(:restart) 87 | crawler_manager.check_periodic_restart(collection) 88 | end 89 | 90 | it 'restarts after visiting 100 pages' do 91 | allow(collection).to receive_message_chain(:visited_pages, :size) { 100 } 92 | expect(crawler_manager).to receive(:restart) 93 | crawler_manager.check_periodic_restart(collection) 94 | end 95 | end 96 | 97 | context 'Periodic restart setup with custom period' do 98 | let(:do_something) { proc {} } 99 | let(:period) { 50 } 100 | let(:crawler_manager) do 101 | Grell::CrawlerManager.new( 102 | logger: logger, 103 | driver: driver, 104 | on_periodic_restart: { do: do_something, each: period } 105 | ) 106 | end 107 | 108 | context 'restart option is not positive' do 109 | let(:period) { 0 } 110 | 111 | it 'logs a warning' do 112 | message = 'GRELL. Restart option misconfigured with a negative period. Ignoring option.' 113 | expect(logger).to receive(:warn).with(message) 114 | crawler_manager 115 | end 116 | end 117 | 118 | it 'does not restart after visiting a number different from custom period pages' do 119 | allow(collection).to receive_message_chain(:visited_pages, :size) { period * 1.2 } 120 | expect(crawler_manager).not_to receive(:restart) 121 | crawler_manager.check_periodic_restart(collection) 122 | end 123 | 124 | it 'restarts after visiting custom period pages' do 125 | allow(collection).to receive_message_chain(:visited_pages, :size) { period } 126 | expect(crawler_manager).to receive(:restart) 127 | crawler_manager.check_periodic_restart(collection) 128 | end 129 | end 130 | end 131 | 132 | describe '.cleanup_all_processes' do 133 | let(:driver) { double } 134 | 135 | context 'There are some phantomjs processes running' do 136 | let(:pids) { [10, 11] } 137 | before do 138 | allow_any_instance_of(Grell::CrawlerManager::PhantomJSManager) 139 | .to receive(:running_phantomjs_pids).and_return(pids) 140 | end 141 | 142 | it 'logs processes pids' do 143 | expect(Grell.logger).to receive(:warn).with('GRELL. Killing PhantomJS processes: [10, 11]') 144 | expect(Grell.logger).to receive(:warn).with('GRELL. Sending KILL to PhantomJS process 10') 145 | expect(Grell.logger).to receive(:warn).with('GRELL. 
Sending KILL to PhantomJS process 11') 146 | described_class.cleanup_all_processes 147 | end 148 | 149 | it 'kills all phantomjs processes' do 150 | expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).to receive(:kill_process).with(10) 151 | expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).to receive(:kill_process).with(11) 152 | described_class.cleanup_all_processes 153 | end 154 | end 155 | 156 | context 'There are no phantomjs processes running' do 157 | let(:pids) { [] } 158 | before do 159 | allow_any_instance_of(Grell::CrawlerManager::PhantomJSManager) 160 | .to receive(:running_phantomjs_pids).and_return(pids) 161 | end 162 | 163 | it 'no warning is logged' do 164 | expect(Grell.logger).not_to receive(:warn) 165 | described_class.cleanup_all_processes 166 | end 167 | 168 | it 'No process is killed' do 169 | expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).not_to receive(:kill_process) 170 | described_class.cleanup_all_processes 171 | end 172 | end 173 | end 174 | end 175 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Grell 2 | 3 | [![Build Status](https://travis-ci.org/mdsol/grell.svg?branch=develop)](https://travis-ci.org/mdsol/grell) 4 | 5 | Grell is a generic crawler for the web written in Ruby. 6 | It can be used to gather data, test pages in a given domain, etc. 7 | 8 | ## Installation 9 | 10 | Add this line to your application's Gemfile: 11 | 12 | ```ruby 13 | gem 'grell' 14 | ``` 15 | 16 | And then execute: 17 | 18 | $ bundle 19 | 20 | Or install it yourself as: 21 | 22 | $ gem install grell 23 | 24 | Grell uses PhantomJS as a browser, so you will need to download and install it on your 25 | system. Check for instructions at http://phantomjs.org/ 26 | Grell has been tested with PhantomJS v2.1.x 27 | 28 | ## Usage 29 | 30 | ### Crawling an entire site 31 | 32 | The main entry point of the library is Grell::Crawler#start_crawling. 33 | Grell will yield to your code with each page it finds: 34 | 35 | ```ruby 36 | require 'grell' 37 | 38 | crawler = Grell::Crawler.new 39 | crawler.start_crawling('http://www.google.com') do |page| 40 | # Grell will keep iterating this block with each unique page it finds 41 | puts "yes we crawled #{page.url}" 42 | puts "status: #{page.status}" 43 | puts "headers: #{page.headers}" 44 | puts "body: #{page.body}" 45 | puts "We crawled it at #{page.timestamp}" 46 | puts "We found #{page.links.size} links" 47 | puts "page id and parent_id #{page.id}, #{page.parent_id}" 48 | end 49 | 50 | ``` 51 | 52 | Grell keeps a list of pages previously crawled and does not visit the same page twice. 53 | This list is indexed by the complete URL, including query parameters. 54 | 55 | ### Re-retrieving a page 56 | If you want Grell to revisit a page and return the data to you again, 57 | return the symbol :retry in your block for the start_crawling method. 58 | For instance: 59 | ```ruby 60 | require 'grell' 61 | crawler = Grell::Crawler.new 62 | crawler.start_crawling('http://www.google.com') do |current_page| 63 | if current_page.status == 500 && current_page.retries == 0 64 | crawler.manager.restart 65 | :retry 66 | end 67 | end 68 | ``` 69 | 70 | ### Pages' id 71 | 72 | Each page has a unique id, accessed by the property `id`. Each page also stores the id of the page from which it was found, accessed by the property `parent_id`. 73 | The page object generated by accessing the first URL passed to start_crawling (the root) has a `parent_id` equal to `nil` and an `id` equal to 0. 74 | Using this information it is possible to construct a directed graph.
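For example, a minimal sketch (the URL and the printed output are purely illustrative) that collects the `id`/`parent_id` pairs while crawling and prints the resulting tree of pages:

```ruby
require 'grell'

pages = {}  # id => url
edges = []  # [parent_id, id] pairs describing which page discovered which

crawler = Grell::Crawler.new
crawler.start_crawling('http://www.example.com') do |page|
  pages[page.id] = page.url
  edges << [page.parent_id, page.id] unless page.parent_id.nil?
end

# The edges can be fed to any graph library; here we simply print them.
edges.each { |parent, child| puts "#{pages[parent]} -> #{pages[child]}" }
```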
75 | 76 | 77 | ### Restart and quit 78 | 79 | Grell can be restarted. The current lists of visited and yet-to-visit pages are not modified when restarting, 80 | but the browser is destroyed and recreated, and all cookies and local storage are lost. After restarting, crawling is resumed with a 81 | new browser. 82 | To destroy the crawler, call the `quit` method. This will free the memory taken in Ruby and destroy the PhantomJS process. 83 | ```ruby 84 | require 'grell' 85 | crawler = Grell::Crawler.new 86 | crawler.manager.restart # restarts the browser 87 | crawler.manager.quit # quits and destroys the crawler 88 | ``` 89 | 90 | ### Options 91 | 92 | The `Grell::Crawler` class can be passed options to customize its behavior: 93 | - `logger`: Sets the logger object, for instance `Rails.logger`. Default: `Logger.new(STDOUT)` 94 | - `on_periodic_restart`: Sets periodic restarts of the crawler after a given number of visits. Default: 100 pages. 95 | - `allowlist`: Sets an allowlist filter for URLs to be visited. Default: all URLs are allowlisted. 96 | - `denylist`: Sets a denylist filter for URLs to be avoided. Default: no URL is denylisted. 97 | - `add_match_block`: Block evaluated to consider if a given page should be part of the pages to be visited. Default: add unique URLs. 98 | - `evaluate_in_each_page`: JavaScript snippet to be evaluated on each page visited. Default: nothing evaluated. 99 | 100 | Grell by default will follow all the links it finds in the site being crawled. 101 | It will never follow links pointing outside your site. 102 | If you want to further limit the number of links crawled, you can use 103 | allowlisting, denylisting or manual filtering. 104 | Further details on these and other options are below. 105 | 106 | 107 | #### Automatically restarting PhantomJS 108 | If you are doing a long crawl, it is possible that PhantomJS gets into an inconsistent state or starts leaking memory. 109 | The crawler can be restarted manually by calling `crawler.manager.restart` or automatically by using the 110 | `on_periodic_restart` configuration key as follows: 111 | 112 | ```ruby 113 | require 'grell' 114 | 115 | crawler = Grell::Crawler.new(on_periodic_restart: { do: my_restart_procedure, each: 200 }) 116 | 117 | crawler.start_crawling('http://www.google.com') do |current_page| 118 | ... 119 | end 120 | ``` 121 | 122 | This code sets up the crawler to be restarted every 200 pages crawled and to call `my_restart_procedure` 123 | between restarts. A restart will destroy the cookies, so for instance this custom block can be used to log in again. 124 | 125 | 126 | #### Allowlisting 127 | 128 | ```ruby 129 | require 'grell' 130 | 131 | crawler = Grell::Crawler.new(allowlist: [/games\/.*/, '/fun']) 132 | crawler.start_crawling('http://www.google.com') 133 | ``` 134 | 135 | Grell here will only follow links to games and '/fun' and ignore all 136 | other links. You can provide a regexp, a string (a link is allowlisted if any part 137 | of it matches the string) or an array with regexps and/or strings. 138 | 139 | #### Denylisting 140 | 141 | ```ruby 142 | require 'grell' 143 | 144 | crawler = Grell::Crawler.new(denylist: /games\/.*/) 145 | crawler.start_crawling('http://www.google.com') 146 | ``` 147 | 148 | Similar to allowlisting, but now Grell will follow every link in 149 | this site which does not go to /games/... 150 | 151 | If you set both allowlist and denylist then both will apply: a link 152 | has to fulfill both conditions to survive. If you do not set any, then 153 | all links on this site will be crawled. Think of these options as 154 | filters. 155 | 156 | #### Manual link filtering 157 | 158 | If you have a more complex use case, you can modify the list of links 159 | manually. 160 | Grell yields the page to you before it adds the links to the list of 161 | links to visit. So, in your block of code, you can add links to or delete links from 162 | `page.links` to control which links Grell will 163 | visit next.
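For example, a minimal sketch (the `.pdf` rule and the extra URL are purely illustrative) that removes some links and queues an extra one from inside the block:

```ruby
require 'grell'

crawler = Grell::Crawler.new
crawler.start_crawling('http://www.example.com') do |page|
  # Links removed here will not be queued for crawling.
  page.links.delete_if { |link| link.end_with?('.pdf') }

  # Links added here are queued as if they had been found on this page.
  page.links.push('http://www.example.com/not-linked-from-anywhere')
end
```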
But now Grell will follow every other link in 149 | this site which does not go to /games/... 150 | 151 | If you call both allowlist and denylist then both will apply, a link 152 | has to fullfill both conditions to survive. If you do not call any, then 153 | all links on this site will be crawled. Think of these methods as 154 | filters. 155 | 156 | #### Manual link filtering 157 | 158 | If you have a more complex use-case, you can modify the list of links 159 | manually. 160 | Grell yields the page to you before it adds the links to the list of 161 | links to visit. So you can modify in your block of code "page.links" to 162 | add and delete links to instruct Grell to add them to the list of links 163 | to visit next. 164 | 165 | #### Custom URL Comparison 166 | By default, Grell will detect new URLs to visit by comparing the full URL 167 | with the URLs of the discovered and visited links. This functionality can 168 | be changed by passing a block of code to Grells `start_crawling` method. 169 | In the below example, the path of the URLs (instead of the full URL) will 170 | be compared. 171 | 172 | ```ruby 173 | require 'grell' 174 | 175 | add_match_block = Proc.new do |collection_page, page| 176 | collection_page.path == page.path 177 | end 178 | 179 | crawler = Grell::Crawler.new(add_match_block: add_match_block) 180 | 181 | crawler.start_crawling('http://www.google.com') do |current_page| 182 | ... 183 | end 184 | ``` 185 | 186 | #### Evaluate script 187 | 188 | You can evalute a JavaScript snippet in each page before extracting links by passing the snippet to the 'evaluate_in_each_page' option: 189 | 190 | ```ruby 191 | require 'grell' 192 | 193 | crawler = Grell::Crawler.new(evaluate_in_each_page: "typeof jQuery !== 'undefined' && $('.dropdown').addClass('open');") 194 | 195 | ``` 196 | 197 | ### Errors 198 | When there is an error in the page or an internal error in the crawler (Javascript crashed the browser, etc). Grell will return with status 404 and the headers will have the following keys: 199 | - grellStatus: 'Error' 200 | - errorClass: The class of the error which broke this page. 201 | - errorMessage: A descriptive message with the information Grell could gather about the error. 202 | 203 | ## Tests 204 | 205 | Run the tests with 206 | ```ruby 207 | bundle exec rake ci 208 | ``` 209 | 210 | ## Contributors 211 | Grell is (c) Medidata Solutions Worldwide and owned by its major contributors: 212 | * [Teruhide Hoshikawa](https://github.com/thoshikawa-mdsol) 213 | * [Jordi Polo Carres](https://github.com/jcarres-mdsol) 214 | -------------------------------------------------------------------------------- /lib/grell/page.rb: -------------------------------------------------------------------------------- 1 | require 'forwardable' 2 | 3 | module Grell 4 | # This class contains the logic related to work with each page we crawl. It is also the interface we use 5 | # To access the information of each page. 6 | # This information comes from result private classes below. 7 | class Page 8 | extend Forwardable 9 | 10 | WAIT_TIME = 10 11 | WAIT_INTERVAL = 0.5 12 | 13 | attr_reader :url, :timestamp, :id, :parent_id, :rawpage 14 | 15 | #Most of the interesting information accessed through this class is accessed by the methods below 16 | def_delegators :@result_page, :headers, :body, :status, :links, :has_selector?, :host, :visited? 
17 | 18 | def initialize( url, id, parent_id) 19 | @rawpage = RawPage.new 20 | @url = url 21 | @id = id 22 | @parent_id = parent_id 23 | @timestamp = nil 24 | @times_visited = 0 25 | @result_page = UnvisitedPage.new 26 | end 27 | 28 | def navigate 29 | # We wait a maximum of WAIT_TIME seconds to get an HTML page. We try our best to workaround inconsistencies on poltergeist 30 | Reader.wait_for(->{@rawpage.navigate(url)}, WAIT_TIME, WAIT_INTERVAL ) do 31 | @rawpage.status && !@rawpage.headers.empty? && 32 | @rawpage.headers["Content-Type"] && @rawpage.headers["Content-Type"].include?('text/html').equal?(true) 33 | end 34 | @rawpage.wait_for_all_ajax_requests(WAIT_TIME, WAIT_INTERVAL) 35 | @result_page = VisitedPage.new(@rawpage) 36 | @timestamp = Time.now 37 | rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient, 38 | Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::StatusFailError, 39 | Capybara::Poltergeist::TimeoutError, Errno::ECONNRESET, URI::InvalidURIError => e 40 | unavailable_page(404, e) 41 | ensure 42 | @times_visited += 1 43 | end 44 | 45 | # Number of times we have retried the current page 46 | def retries 47 | [@times_visited - 1, 0].max 48 | end 49 | 50 | # The current URL, this may be different from the URL we asked for if there was some redirect 51 | def current_url 52 | @rawpage.current_url 53 | end 54 | 55 | # True if we followed a redirect to get the current contents 56 | def followed_redirects? 57 | current_url != @url 58 | end 59 | 60 | # True if there page responded with an error 61 | def error? 62 | !!(status.to_s =~ /[4|5]\d\d/) 63 | end 64 | 65 | # Extracts the path (e.g. /actions/test_action) from the URL 66 | def path 67 | URI.parse(@url).path 68 | rescue URI::InvalidURIError # Invalid URLs will be added and caught when we try to navigate to them 69 | @url 70 | end 71 | 72 | def unavailable_page(status, exception) 73 | Grell.logger.warn "The page with the URL #{@url} was not available. Exception #{exception}" 74 | @result_page = ErroredPage.new(status, exception) 75 | @timestamp = Time.now 76 | end 77 | 78 | private 79 | 80 | # Private class. 81 | # This is a result page when it has not been visited yet. Essentially empty of information 82 | # 83 | class UnvisitedPage 84 | def status 85 | nil 86 | end 87 | 88 | def body 89 | '' 90 | end 91 | 92 | def headers 93 | {grellStatus: 'NotVisited' } 94 | end 95 | 96 | def links 97 | [] 98 | end 99 | 100 | def host 101 | '' 102 | end 103 | 104 | def visited? 105 | false 106 | end 107 | 108 | def has_selector?(selector) 109 | false 110 | end 111 | 112 | end 113 | 114 | # Private class. 115 | # This is a result page when some error happened. It provides some information about the error. 116 | # 117 | class ErroredPage 118 | def initialize(error_code, exception) 119 | @error_code = error_code 120 | @exception = exception 121 | end 122 | 123 | def status 124 | @error_code 125 | end 126 | 127 | def body 128 | '' 129 | end 130 | 131 | def headers 132 | message = begin 133 | @exception.message 134 | rescue StandardError 135 | "Error message can not be accessed" #Poltergeist may try to access a nil object when accessing message 136 | end 137 | 138 | { 139 | grellStatus: 'Error', 140 | errorClass: @exception.class.to_s, 141 | errorMessage: message 142 | } 143 | end 144 | 145 | def links 146 | [] 147 | end 148 | 149 | def host 150 | '' 151 | end 152 | 153 | def visited? 
154 | true 155 | end 156 | 157 | def has_selector?(selector) 158 | false 159 | end 160 | 161 | end 162 | 163 | 164 | # Private class. 165 | # This is a result page when we successfully got some information back after visiting the page. 166 | # It delegates most of the information to the @rawpage capybara page. But any transformation or logic is here 167 | # 168 | class VisitedPage 169 | def initialize(rawpage) 170 | @rawpage = rawpage 171 | end 172 | 173 | def status 174 | @rawpage.status 175 | end 176 | 177 | def body 178 | @rawpage.body 179 | end 180 | 181 | def headers 182 | @rawpage.headers 183 | rescue Capybara::Poltergeist::BrowserError => e #This may happen internally on Poltergeist, they claim is a bug. 184 | { 185 | grellStatus: 'Error', 186 | errorClass: e.class.to_s, 187 | errorMessage: e.message 188 | } 189 | end 190 | 191 | def links 192 | @links ||= all_links 193 | end 194 | 195 | def host 196 | @rawpage.host 197 | end 198 | 199 | def visited? 200 | true 201 | end 202 | 203 | def has_selector?(selector) 204 | @rawpage.has_selector?(selector) 205 | end 206 | 207 | private 208 | def all_links 209 | links = @rawpage.all_anchors.map { |anchor| Link.new(anchor) } 210 | body_enabled_links = links.reject { |link| link.inside_header? || link.disabled? || link.js_href? } 211 | body_enabled_links.map { |link| link.to_url(host) }.uniq.compact 212 | 213 | rescue Capybara::Poltergeist::ObsoleteNode 214 | Grell.logger.warn "We found an obsolete node in #{@url}. Ignoring all links" 215 | # Sometimes Javascript and timing may screw this, we lose these links. 216 | # TODO: Can we do something more intelligent here? 217 | [] 218 | end 219 | 220 | # Private class to group all the methods related to links. 221 | class Link 222 | def initialize(anchor) 223 | @anchor = anchor 224 | end 225 | 226 | # can only be used in the as of: https://developer.mozilla.org/en/docs/Web/HTML/Element/link 227 | def inside_header? 228 | @anchor.tag_name == 'link' 229 | end 230 | 231 | # Is the link disabled by either Javascript or CSS? 232 | def disabled? 233 | @anchor.disabled? || !!@anchor.native.attributes['disabled'] 234 | end 235 | 236 | # Does the href use javascript? 237 | def js_href? 238 | href.start_with?('javascript:') 239 | end 240 | 241 | # Some links may use data-href + javascript to do interesting things 242 | def href 243 | @anchor['href'] || @anchor['data-href'] 244 | end 245 | 246 | # We only accept links in this same host that start with a path 247 | def to_url(host) 248 | uri = URI.parse(href) 249 | if uri.absolute? 250 | if uri.host != URI.parse(host).host 251 | Grell.logger.debug "GRELL does not follow links to external hosts: #{href}" 252 | nil 253 | else 254 | href # Absolute link to our own host 255 | end 256 | else 257 | if uri.path.nil? 
258 | Grell.logger.debug "GRELL does not follow links without a path: #{uri}" 259 | nil 260 | end 261 | if uri.path.start_with?('/') 262 | host + href # convert to full URL 263 | else # links like href="google.com" the browser would go to http://google.com like "http://#{link}" 264 | Grell.logger.debug "GRELL Bad formatted link: #{href}, assuming external" 265 | nil 266 | end 267 | end 268 | rescue URI::InvalidURIError # Invalid links propagating till we navigate to them 269 | href 270 | end 271 | end 272 | 273 | end 274 | end 275 | end 276 | -------------------------------------------------------------------------------- /spec/lib/crawler_spec.rb: -------------------------------------------------------------------------------- 1 | 2 | RSpec.describe Grell::Crawler do 3 | let(:page_id) { rand(10).floor + 10 } 4 | let(:parent_page_id) { rand(10).floor } 5 | let(:page) { Grell::Page.new(url, page_id, parent_page_id) } 6 | let(:host) { 'http://www.example.com' } 7 | let(:url) { 'http://www.example.com/test' } 8 | let(:add_match_block) { nil } 9 | let(:denylist) { /a^/ } 10 | let(:allowlist) { /.*/ } 11 | let(:crawler) do 12 | Grell::Crawler.new( 13 | logger: Logger.new(nil), 14 | driver: double(nil), 15 | evaluate_in_each_page: script, 16 | add_match_block: add_match_block, 17 | denylist: denylist, 18 | allowlist: allowlist) 19 | end 20 | let(:script) { nil } 21 | let(:body) { 'body' } 22 | let(:custom_add_match) do 23 | Proc.new do |collection_page, page| 24 | collection_page.path == page.path 25 | end 26 | end 27 | 28 | before do 29 | proxy.stub(url).and_return(body: body, code: 200) 30 | end 31 | 32 | describe '#crawl' do 33 | before do 34 | crawler.instance_variable_set('@collection', Grell::PageCollection.new(custom_add_match)) 35 | end 36 | 37 | it 'yields the result if a block is given' do 38 | result = [] 39 | block = Proc.new { |n| result.push(n) } 40 | crawler.crawl(page, block) 41 | expect(result.size).to eq(1) 42 | expect(result.first.url).to eq(url) 43 | expect(result.first.visited?).to eq(true) 44 | end 45 | 46 | it 'rescues any specified exceptions raised during the block execution' do 47 | block = Proc.new { |n| raise Capybara::Poltergeist::BrowserError, 'Exception' } 48 | expect{ crawler.crawl(page, block) }.to_not raise_error 49 | expect(page.status).to eq(404) 50 | end 51 | 52 | it 'logs interesting information' do 53 | crawler 54 | expect(Grell.logger).to receive(:info).with(/Visiting #{url}, visited_links: 0, discovered 0/) 55 | crawler.crawl(page, nil) 56 | end 57 | 58 | it 'retries when the block returns :retry' do 59 | counter = 0 60 | times_retrying = 2 61 | block = Proc.new do |n| 62 | if counter < times_retrying 63 | counter += 1 64 | :retry 65 | end 66 | end 67 | crawler.crawl(page, block) 68 | expect(counter).to eq(times_retrying) 69 | end 70 | 71 | it 'handles redirects by adding the current_url to the page collection' do 72 | redirect_url = 'http://www.example.com/test/landing_page' 73 | allow(page).to receive(:current_url).and_return(redirect_url) 74 | expect_any_instance_of(Grell::PageCollection).to receive(:create_page).with(redirect_url, page_id) 75 | crawler.crawl(page, nil) 76 | end 77 | 78 | context 'without script' do 79 | it 'does not evaluate a script' do 80 | expect_any_instance_of(Capybara::Session).not_to receive(:evaluate_script) 81 | crawler.crawl(page, nil) 82 | end 83 | end 84 | 85 | context 'with script' do 86 | let(:script) { "(typeof(jQuery)!='undefined') && $('.dropdown').addClass('open');" } 87 | it 'evaluates a script' do 88 | 
expect_any_instance_of(Capybara::Session).to receive(:evaluate_script).with(script) 89 | crawler.crawl(page, nil) 90 | end 91 | end 92 | end 93 | 94 | context '#start_crawling' do 95 | let(:body) do 96 | <<-EOS 97 | 98 | trusmis 99 | Hello world! 100 | 101 | EOS 102 | end 103 | let(:url_visited) { "http://www.example.com/musmis.html" } 104 | 105 | before do 106 | proxy.stub(url_visited).and_return(body: 'body', code: 200) 107 | end 108 | 109 | it 'calls the block we used to start_crawling' do 110 | result = [] 111 | block = Proc.new { |n| result.push(n) } 112 | crawler.start_crawling(url, &block) 113 | expect(result.size).to eq(2) 114 | expect(result[0].url).to eq(url) 115 | expect(result[1].url).to eq(url_visited) 116 | end 117 | 118 | end 119 | 120 | shared_examples_for 'visits all available pages' do 121 | it 'visits all the pages' do 122 | crawler.start_crawling(url) 123 | expect(crawler.collection.visited_pages.size).to eq(visited_pages_count) 124 | end 125 | 126 | it 'has no more pages to discover' do 127 | crawler.start_crawling(url) 128 | expect(crawler.collection.discovered_pages.size).to eq(0) 129 | end 130 | 131 | it 'contains the allowlisted page and the base page only' do 132 | crawler.start_crawling(url) 133 | expect(crawler.collection.visited_pages.map(&:url)). 134 | to eq(visited_pages) 135 | end 136 | end 137 | 138 | context 'the url has no links' do 139 | let(:body) do 140 | " 141 | Hello world! 142 | " 143 | end 144 | let(:visited_pages_count) { 1 } 145 | let(:visited_pages) { ['http://www.example.com/test'] } 146 | 147 | it_behaves_like 'visits all available pages' 148 | end 149 | 150 | context 'the url has several links' do 151 | let(:visited_pages_count) { 3 } 152 | let(:visited_pages) do 153 | ['http://www.example.com/test', 'http://www.example.com/trusmis.html', 'http://www.example.com/help.html'] 154 | end 155 | let(:body) do 156 | " 157 | trusmis 158 | help 159 | Hello world! 160 | " 161 | end 162 | 163 | before do 164 | proxy.stub('http://www.example.com/trusmis.html').and_return(body: 'body', code: 200) 165 | proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200) 166 | end 167 | 168 | it_behaves_like 'visits all available pages' 169 | end 170 | 171 | describe '#allowlist' do 172 | let(:body) do 173 | " 174 | trusmis 175 | help 176 | Hello world! 
177 | " 178 | end 179 | 180 | before do 181 | proxy.stub('http://www.example.com/trusmis.html').and_return(body: 'body', code: 200) 182 | proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200) 183 | end 184 | 185 | context 'using a single string' do 186 | let(:allowlist) { '/trusmis.html' } 187 | let(:visited_pages_count) { 2 } # my own page + trusmis 188 | let(:visited_pages) do 189 | ['http://www.example.com/test', 'http://www.example.com/trusmis.html'] 190 | end 191 | 192 | it_behaves_like 'visits all available pages' 193 | end 194 | 195 | context 'using an array of strings' do 196 | let(:allowlist) { ['/trusmis.html', '/nothere', 'another.html'] } 197 | let(:visited_pages_count) { 2 } 198 | let(:visited_pages) do 199 | ['http://www.example.com/test', 'http://www.example.com/trusmis.html'] 200 | end 201 | 202 | it_behaves_like 'visits all available pages' 203 | end 204 | 205 | context 'using a regexp' do 206 | let(:allowlist) { /\/trusmis\.html/ } 207 | let(:visited_pages_count) { 2 } 208 | let(:visited_pages) do 209 | ['http://www.example.com/test', 'http://www.example.com/trusmis.html'] 210 | end 211 | 212 | it_behaves_like 'visits all available pages' 213 | end 214 | 215 | context 'using an array of regexps' do 216 | let(:allowlist) { [/\/trusmis\.html/] } 217 | let(:visited_pages_count) { 2 } 218 | let(:visited_pages) do 219 | ['http://www.example.com/test', 'http://www.example.com/trusmis.html'] 220 | end 221 | 222 | it_behaves_like 'visits all available pages' 223 | end 224 | 225 | context 'using an empty array' do 226 | let(:allowlist) { [] } 227 | let(:visited_pages_count) { 1 } # my own page only 228 | let(:visited_pages) do 229 | ['http://www.example.com/test'] 230 | end 231 | 232 | it_behaves_like 'visits all available pages' 233 | end 234 | 235 | context 'adding all links to the allowlist' do 236 | let(:allowlist) { ['/trusmis', '/help'] } 237 | let(:visited_pages_count) { 3 } # all links 238 | let(:visited_pages) do 239 | ['http://www.example.com/test','http://www.example.com/trusmis.html', 'http://www.example.com/help.html'] 240 | end 241 | 242 | it_behaves_like 'visits all available pages' 243 | end 244 | end 245 | 246 | 247 | describe '#denylist' do 248 | let(:body) do 249 | " 250 | trusmis 251 | help 252 | Hello world! 
253 | " 254 | end 255 | 256 | before do 257 | proxy.stub('http://www.example.com/trusmis.html').and_return(body: 'body', code: 200) 258 | proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200) 259 | end 260 | 261 | context 'using a single string' do 262 | let(:denylist) { '/trusmis.html' } 263 | let(:visited_pages_count) {2} 264 | let(:visited_pages) do 265 | ['http://www.example.com/test','http://www.example.com/help.html'] 266 | end 267 | 268 | it_behaves_like 'visits all available pages' 269 | end 270 | 271 | context 'using an array of strings' do 272 | let(:denylist) { ['/trusmis.html', '/nothere', 'another.html'] } 273 | let(:visited_pages_count) {2} 274 | let(:visited_pages) do 275 | ['http://www.example.com/test','http://www.example.com/help.html'] 276 | end 277 | 278 | it_behaves_like 'visits all available pages' 279 | end 280 | 281 | context 'using a regexp' do 282 | let(:denylist) { /\/trusmis\.html/ } 283 | let(:visited_pages_count) {2} 284 | let(:visited_pages) do 285 | ['http://www.example.com/test','http://www.example.com/help.html'] 286 | end 287 | 288 | it_behaves_like 'visits all available pages' 289 | end 290 | 291 | context 'using an array of regexps' do 292 | let(:denylist) { [/\/trusmis\.html/] } 293 | let(:visited_pages_count) {2} 294 | let(:visited_pages) do 295 | ['http://www.example.com/test','http://www.example.com/help.html'] 296 | end 297 | 298 | it_behaves_like 'visits all available pages' 299 | end 300 | 301 | context 'using an empty array' do 302 | let(:denylist) { [] } 303 | let(:visited_pages_count) { 3 } # all links 304 | let(:visited_pages) do 305 | ['http://www.example.com/test','http://www.example.com/trusmis.html', 'http://www.example.com/help.html'] 306 | end 307 | 308 | it_behaves_like 'visits all available pages' 309 | end 310 | 311 | context 'adding all links to the denylist' do 312 | let(:denylist) { ['/trusmis', '/help'] } 313 | let(:visited_pages_count) { 1 } 314 | let(:visited_pages) do 315 | ['http://www.example.com/test'] 316 | end 317 | 318 | it_behaves_like 'visits all available pages' 319 | end 320 | end 321 | 322 | 323 | describe 'allowlisting and denylisting' do 324 | let(:body) do 325 | " 326 | trusmis 327 | help 328 | Hello world! 
329 | " 330 | end 331 | 332 | before do 333 | proxy.stub('http://www.example.com/trusmis.html').and_return(body: 'body', code: 200) 334 | proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200) 335 | end 336 | 337 | context 'we denylist the only allowlisted page' do 338 | let(:allowlist) { '/trusmis.html' } 339 | let(:denylist) { '/trusmis.html' } 340 | let(:visited_pages_count) { 1 } 341 | let(:visited_pages) do 342 | ['http://www.example.com/test'] 343 | end 344 | 345 | it_behaves_like 'visits all available pages' 346 | end 347 | 348 | context 'we denylist none of the allowlisted pages' do 349 | let(:allowlist) { '/trusmis.html' } 350 | let(:denylist) { '/raistlin.html' } 351 | let(:visited_pages_count) { 2 } 352 | let(:visited_pages) do 353 | ['http://www.example.com/test', 'http://www.example.com/trusmis.html'] 354 | end 355 | 356 | it_behaves_like 'visits all available pages' 357 | end 358 | end 359 | 360 | 361 | end 362 | -------------------------------------------------------------------------------- /spec/lib/page_spec.rb: -------------------------------------------------------------------------------- 1 | RSpec.describe Grell::Page do 2 | 3 | let(:page_id) { rand(10).floor + 10 } 4 | let(:parent_page_id) { rand(10).floor } 5 | let(:page) { Grell::Page.new(url, page_id, parent_page_id) } 6 | let(:host) { 'http://www.example.com' } 7 | let(:url) { 'http://www.example.com/test' } 8 | let(:returned_headers) { { 'Other-Header' => 'yes', 'Content-Type' => 'text/html' }} 9 | let(:now) { Time.now } 10 | 11 | before do 12 | allow(Time).to receive(:now).and_return(now) 13 | Grell.logger = Logger.new(nil) # avoids noise in rspec output 14 | end 15 | 16 | it 'gives access to the url' do 17 | expect(page.url).to eq(url) 18 | end 19 | 20 | it 'gives access to the path' do 21 | expect(page.path).to eq('/test') 22 | end 23 | 24 | it 'gives access to the page id' do 25 | expect(page.id).to eq(page_id) 26 | end 27 | 28 | it 'gives access to the parent page id' do 29 | expect(page.parent_id).to eq(parent_page_id) 30 | end 31 | 32 | it 'newly created page does not have status yet' do 33 | expect(page.status).to eq(nil) 34 | end 35 | 36 | shared_examples_for 'a grell page' do 37 | 38 | it 'returns the correct status' do 39 | expect(page.status).to eq(status) 40 | end 41 | 42 | it 'has the correct body' do 43 | expect(page.body).to eq(body) 44 | end 45 | 46 | it 'has correct headers' do 47 | expect(page.headers).to include(expected_headers) 48 | end 49 | 50 | it 'has the correct links' do 51 | expect(page.links.sort).to eq(links.sort) 52 | end 53 | 54 | it '#visited? 
returns the correct value' do 55 | expect(page.visited?).to eq(visited) 56 | end 57 | 58 | it 'has correct timestamp' do 59 | expect(page.timestamp).to eq(now) 60 | end 61 | 62 | end 63 | 64 | describe '#retries' do 65 | context 'page has not been navigated' do 66 | it '#retries return 0' do 67 | expect(page.retries).to eq(0) 68 | end 69 | end 70 | 71 | context 'page has been navigated once' do 72 | before do 73 | proxy.stub(url).and_return(body: '', code: 200, headers: {}) 74 | page.navigate 75 | end 76 | 77 | it '#retries return 0' do 78 | expect(page.retries).to eq(0) 79 | end 80 | end 81 | 82 | context 'page has been navigated twice' do 83 | before do 84 | proxy.stub(url).and_return(body: '', code: 200, headers: {}) 85 | page.navigate 86 | page.navigate 87 | end 88 | 89 | it '#retries return 1' do 90 | expect(page.retries).to eq(1) 91 | end 92 | end 93 | end 94 | 95 | describe '#navigate' do 96 | before do 97 | proxy.stub(url).and_return(body: '', code: 200, headers: {}) 98 | end 99 | 100 | it 'waits for all ajax requests' do 101 | expect_any_instance_of(Grell::RawPage).to receive(:wait_for_all_ajax_requests).with(0, 0.5) 102 | page.navigate 103 | end 104 | end 105 | 106 | shared_examples_for 'an errored grell page' do 107 | it 'returns empty status 404 page after navigating' do 108 | expect(page.status).to eq(404) 109 | expect(page.links).to eq([]) 110 | expect(page.headers).to eq(headers) 111 | expect(page.body).to eq('') 112 | expect(page.has_selector?('html')).to eq(false) 113 | expect(page).to be_visited 114 | expect(page.timestamp).to eq(now) 115 | expect(page.error?).to eq(true) 116 | expect(page.instance_variable_get(:@times_visited)).to eq(1) 117 | end 118 | end 119 | 120 | [ Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::BrowserError, URI::InvalidURIError, 121 | Capybara::Poltergeist::TimeoutError, Capybara::Poltergeist::StatusFailError, 122 | Capybara::Poltergeist::DeadClient, Errno::ECONNRESET ].each do |error_type| 123 | 124 | context "#{error_type}" do 125 | let(:headers) do 126 | { 127 | grellStatus: 'Error', 128 | errorClass: "#{error_type}", 129 | errorMessage: error_message 130 | } 131 | end 132 | let(:error_message) { 'Trusmis broke it again' } 133 | let(:now) { Time.now } 134 | 135 | before do 136 | allow_any_instance_of(Grell::RawPage).to receive(:navigate).and_raise(error_type, 'error') 137 | allow_any_instance_of(error_type).to receive(:message).and_return(error_message) 138 | page.navigate 139 | end 140 | 141 | it_behaves_like 'an errored grell page' 142 | end 143 | end 144 | 145 | 146 | context 'we have not yet navigated to the page' do 147 | let(:visited) { false } 148 | let(:status) { nil } 149 | let(:body) { '' } 150 | let(:links) { [] } 151 | let(:expected_headers) { {} } 152 | let(:now) { nil } 153 | 154 | before do 155 | proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup) 156 | end 157 | 158 | it_behaves_like 'a grell page' 159 | 160 | end 161 | 162 | context 'navigating to the URL we get a 404' do 163 | let(:visited) { true } 164 | let(:status) { 404 } 165 | let(:body) { 'nothing cool' } 166 | let(:links) { [] } 167 | let(:expected_headers) { returned_headers } 168 | 169 | before do 170 | proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup) 171 | page.navigate 172 | end 173 | 174 | it_behaves_like 'a grell page' 175 | 176 | end 177 | 178 | context 'navigating to an URL with redirects, follows them transparently' do 179 | let(:visited) { true } 180 | let(:status) { 200 } 181 | 
let(:body) { 'nothing cool' } 182 | let(:links) { [] } 183 | let(:expected_headers) { returned_headers } 184 | let(:real_url) { 'http://example.com/other' } 185 | 186 | before do 187 | proxy.stub(url).and_return(:redirect_to => real_url) 188 | proxy.stub(real_url).and_return(body: body, code: status, headers: returned_headers.dup) 189 | page.navigate 190 | end 191 | 192 | it_behaves_like 'a grell page' 193 | 194 | it 'followed_redirects? is true' do 195 | expect(page.followed_redirects?).to eq(true) 196 | end 197 | 198 | it 'current_url match the url we were redirected to' do 199 | expect(page.current_url).to eq(real_url) 200 | end 201 | end 202 | 203 | #Here also add examples that may happen for almost all pages (no errors, no redirects) 204 | context 'navigating to the URL we get page with no links' do 205 | let(:visited) { true } 206 | let(:status) { 200 } 207 | let(:body) { 'nothing cool' } 208 | let(:links) { [] } 209 | let(:expected_headers) { returned_headers } 210 | 211 | before do 212 | proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup) 213 | page.navigate 214 | end 215 | 216 | it_behaves_like 'a grell page' 217 | 218 | it 'followed_redirects is false' do 219 | expect(page.followed_redirects?).to eq(false) 220 | end 221 | 222 | it 'current_url is url' do 223 | expect(page.current_url).to eq(url) 224 | end 225 | 226 | it 'does not have errors' do 227 | expect(page.error?).to eq(false) 228 | end 229 | end 230 | 231 | context 'navigating to the URL we get page with links using a elements' do 232 | let(:visited) { true } 233 | let(:status) { 200 } 234 | let(:body) do 235 | " 236 | Hello world! 237 | trusmis 238 | help 239 | help 240 | " 241 | end 242 | let(:links) { ['http://www.example.com/trusmis.html', 'http://www.example.com/help.html'] } 243 | let(:expected_headers) { returned_headers } 244 | 245 | before do 246 | proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup) 247 | page.navigate 248 | end 249 | 250 | it_behaves_like 'a grell page' 251 | 252 | it 'do not return links to external websites' do 253 | expect(page.links).to_not include('http://www.outsidewebsite.com/help.html') 254 | end 255 | end 256 | 257 | 258 | context 'navigating to the URL we get page with disabled links' do 259 | let(:visited) { true } 260 | let(:status) { 200 } 261 | let(:body) do 262 | " 263 | Hello world! 264 | trusmis 265 | help 266 | help 267 | helpdisabled 268 | " 269 | end 270 | let(:links) { ['http://www.example.com/trusmis.html', 'http://www.example.com/help.html'] } 271 | let(:expected_headers) { returned_headers } 272 | 273 | before do 274 | proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup) 275 | page.navigate 276 | end 277 | 278 | it_behaves_like 'a grell page' 279 | end 280 | 281 | context 'navigating to the URL we get page with links with absolute links' do 282 | let(:visited) { true } 283 | let(:status) { 200 } 284 | let(:body) do 285 | " 286 | Hello world! 
287 | trusmis 288 | help 289 | help 290 | " 291 | end 292 | let(:links) { ['http://www.example.com/trusmis.html', 'http://www.example.com/help.html'] } 293 | let(:expected_headers) { returned_headers } 294 | 295 | before do 296 | proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup) 297 | page.navigate 298 | end 299 | 300 | it_behaves_like 'a grell page' 301 | 302 | it 'do not return links to external websites' do 303 | expect(page.links).to_not include('http://www.outsidewebsite.com/help.html') 304 | end 305 | end 306 | 307 | context 'navigating to the URL we get page with links using a mix of elements' do 308 | let(:visited) { true } 309 | let(:status) { 200 } 310 | let(:body) do 311 | " 312 | Hello world! 313 | trusmis 314 | 315 | 316 | 317 | 318 | 319 |
help help 320 | help 321 | help
322 | " 323 | end 324 | let(:links) do 325 | [ 'http://www.example.com/trusmis.html', 'http://www.example.com/help.html', 326 | 'http://www.example.com/more_help.html', 'http://www.example.com/help_me.html' ] 327 | end 328 | let(:expected_headers) { returned_headers } 329 | 330 | before do 331 | proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup) 332 | page.navigate 333 | end 334 | 335 | it_behaves_like 'a grell page' 336 | 337 | describe '#path' do 338 | context 'proper url' do 339 | let(:url) { 'http://www.anyurl.com/path' } 340 | let(:page) { Grell::Page.new(url, page_id, parent_page_id) } 341 | 342 | it 'returns the path' do 343 | expect(page.path).to eq('/path') 344 | end 345 | end 346 | 347 | context 'broken url' do 348 | let(:url) { 'www.an.asda.fasfasf.yurl.com/path' } 349 | let(:page) { Grell::Page.new(url, page_id, parent_page_id) } 350 | 351 | it 'returns the path' do 352 | expect(page.path).to eq(url) 353 | end 354 | end 355 | end 356 | 357 | it 'do not return links to external websites' do 358 | expect(page.links).to_not include('http://www.outsidewebsite.com/help.html') 359 | end 360 | end 361 | 362 | context 'navigating to the URL we get page with links inside the header section of the code' do 363 | let(:visited) { true } 364 | let(:status) { 200 } 365 | let(:css) { '/application.css' } 366 | let(:favicon) { '/favicon.ico' } 367 | let(:body) do 368 | " 369 | mimi 370 | 371 | 372 | 373 | 374 | Hello world! 375 | trusmis 376 | " 377 | end 378 | let(:links) do 379 | ['http://www.example.com/trusmis.html'] 380 | end 381 | let(:expected_headers) { returned_headers } 382 | 383 | before do 384 | proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup) 385 | #We need to stub this or Phantomjs will get stuck trying to retrieve the resources 386 | proxy.stub(host + css).and_return(body: '', code: status) 387 | proxy.stub(host + favicon).and_return(body: '', code: status) 388 | page.navigate 389 | end 390 | 391 | it_behaves_like 'a grell page' 392 | 393 | it 'do not return links to resources in the header' do 394 | expect(page.links).to_not include('http://www.example.com/application.css') 395 | end 396 | 397 | end 398 | 399 | context 'status is never set' do #this may happen when there is nothing comming from the site 400 | before do 401 | stub_const('Grell::Page::WAIT_TIME', 0) 402 | allow_any_instance_of(Grell::RawPage).to receive(:status).and_return(nil) 403 | allow_any_instance_of(Grell::RawPage).to receive(:headers).and_return({}) 404 | allow_any_instance_of(Grell::RawPage).to receive(:body).and_return('') 405 | proxy.stub(url).and_return(body: body, code: nil, headers: {}) 406 | page.navigate 407 | end 408 | 409 | let(:visited) { true } 410 | let(:status) { nil } 411 | let(:body) { '' } 412 | let(:links) { [] } 413 | let(:expected_headers) { {} } 414 | 415 | it_behaves_like 'a grell page' 416 | end 417 | 418 | end 419 | --------------------------------------------------------------------------------
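A minimal standalone sketch (not part of the gem; the normalize_link helper name is ours) of the link-normalization rules that the Link#to_url code and the page specs above exercise: relative paths get the host prepended, absolute links are kept only when they stay on the same host, and bare or malformed links are dropped. Only Ruby's standard URI library is assumed.

    require 'uri'

    # Illustrative sketch only: mirrors the behaviour the specs expect from Link#to_url.
    # Returns a crawlable URL, or nil when the link should be ignored.
    def normalize_link(href, host)
      uri = URI.parse(href)
      if uri.absolute?
        # Keep absolute links only when they point at the crawled host.
        uri.host == URI.parse(host).host ? href : nil
      elsif uri.path.to_s.start_with?('/')
        host + href # relative path, e.g. "/help.html" -> "http://www.example.com/help.html"
      else
        nil # bare links like "google.com" are treated as malformed / external
      end
    rescue URI::InvalidURIError
      href # invalid links are passed through so the failure surfaces on navigation
    end

    normalize_link('/help.html', 'http://www.example.com')
    # => "http://www.example.com/help.html"
    normalize_link('http://www.outsidewebsite.com/help.html', 'http://www.example.com')
    # => nil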