├── Rakefile ├── .rspec ├── lib ├── grell │ ├── version.rb │ ├── grell_logger.rb │ ├── reader.rb │ ├── capybara_driver.rb │ ├── rawpage.rb │ ├── page_collection.rb │ ├── crawler.rb │ ├── crawler_manager.rb │ └── page.rb └── grell.rb ├── Gemfile ├── LICENSE.txt ├── spec ├── lib │ ├── reader_spec.rb │ ├── capybara_driver_spec.rb │ ├── page_collection_spec.rb │ ├── crawler_manager_spec.rb │ ├── crawler_spec.rb │ └── page_spec.rb └── spec_helper.rb ├── .travis.yml ├── grell.gemspec ├── CHANGELOG.md └── README.md /Rakefile: -------------------------------------------------------------------------------- 1 | require 'kender/tasks' 2 | 3 | -------------------------------------------------------------------------------- /.rspec: -------------------------------------------------------------------------------- 1 | --color 2 | --require spec_helper 3 | -------------------------------------------------------------------------------- /lib/grell/version.rb: -------------------------------------------------------------------------------- 1 | module Grell 2 | VERSION = "2.1.2".freeze 3 | end 4 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | # Avoid ruby 2.1 to use Rack > 2.0 which is not compatible 4 | platform :ruby_21 do 5 | gem 'rack', '~> 1.0' 6 | end 7 | gemspec 8 | -------------------------------------------------------------------------------- /lib/grell/grell_logger.rb: -------------------------------------------------------------------------------- 1 | require 'logger' 2 | 3 | #Very simple global logger for our crawler. 4 | module Grell 5 | class << self 6 | attr_accessor :logger 7 | end 8 | end 9 | 10 | Grell.logger = Logger.new(STDOUT) -------------------------------------------------------------------------------- /lib/grell.rb: -------------------------------------------------------------------------------- 1 | require 'capybara/poltergeist' 2 | require 'capybara/dsl' 3 | 4 | require 'grell/grell_logger' 5 | require 'grell/capybara_driver' 6 | require 'grell/crawler_manager' 7 | require 'grell/crawler' 8 | require 'grell/rawpage' 9 | require 'grell/page' 10 | require 'grell/page_collection' 11 | require 'grell/reader' 12 | -------------------------------------------------------------------------------- /lib/grell/reader.rb: -------------------------------------------------------------------------------- 1 | module Grell 2 | # A tooling class, it waits a maximum of max_waiting for an action to finish. If the action is not 3 | # finished by then, it will continue anyway. 
4 | # The wait may be long but we want to finish it as soon as the action has finished 5 | class Reader 6 | def self.wait_for(action, max_waiting, sleeping_time) 7 | time_start = Time.now 8 | action.call() 9 | return if yield 10 | while (Time.now < time_start + max_waiting) 11 | action.call() 12 | break if yield 13 | sleep(sleeping_time) 14 | end 15 | end 16 | 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Medidata Solutions Worldwide 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /spec/lib/reader_spec.rb: -------------------------------------------------------------------------------- 1 | RSpec.describe Grell::Reader do 2 | 3 | context 'Waiting time expired' do 4 | let(:waiting_time) {0} 5 | let(:sleeping_time) {2} 6 | let(:condition) {false} 7 | it 'does not sleep' do 8 | before_time = Time.now 9 | Grell::Reader.wait_for(->{''}, waiting_time, sleeping_time) do 10 | condition 11 | end 12 | expect(Time.now - before_time).to be < 1 13 | end 14 | end 15 | 16 | context 'The condition is true' do 17 | let(:waiting_time) {3} 18 | let(:sleeping_time) {2} 19 | let(:condition) {true} 20 | it 'does not sleep' do 21 | before_time = Time.now 22 | Grell::Reader.wait_for(->{''}, waiting_time, sleeping_time) do 23 | condition 24 | end 25 | expect(Time.now - before_time).to be < 1 26 | end 27 | end 28 | 29 | context 'The condition is false' do 30 | let(:waiting_time) {0.2} 31 | let(:sleeping_time) {0.2} 32 | let(:condition) {false} 33 | 34 | it 'waits the waiting time' do 35 | before_time = Time.now 36 | Grell::Reader.wait_for(->{''}, waiting_time, sleeping_time) do 37 | condition 38 | end 39 | expect(Time.now - before_time).to be > waiting_time 40 | end 41 | 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /spec/lib/capybara_driver_spec.rb: -------------------------------------------------------------------------------- 1 | 2 | RSpec.describe Grell::CapybaraDriver do 3 | let(:ts) { Time.now } 4 | before do 5 | Grell.logger = Logger.new(nil) 6 | end 7 | 8 | describe 'setup_capybara' do 9 | it 'properly registers the poltergeist driver' do 10 | Timecop.freeze(ts) 11 | driver = Grell::CapybaraDriver.new.setup_capybara 12 | 
expect(driver).to be_instance_of(Capybara::Poltergeist::Driver) 13 | end 14 | 15 | it 'raises an exception if the driver cannot be initialized' do 16 | Timecop.freeze(ts + 60) 17 | 18 | # Attempt to register twice with the same driver name 19 | Grell::CapybaraDriver.new.setup_capybara 20 | expect { Grell::CapybaraDriver.new.setup_capybara }. 21 | to raise_error "Poltergeist Driver could not be properly initialized" 22 | end 23 | 24 | it 'can register the poltergeist driver multiple times in a row' do 25 | Timecop.freeze(ts + 120) 26 | driver = Grell::CapybaraDriver.new.setup_capybara 27 | expect(driver).to be_instance_of(Capybara::Poltergeist::Driver) 28 | end 29 | end 30 | 31 | after do 32 | Timecop.return 33 | 34 | # Reset Capybara so future tests can easily stub HTTP requests 35 | Capybara.javascript_driver = :poltergeist_billy 36 | Capybara.default_driver = :poltergeist_billy 37 | end 38 | end 39 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | cache: bundler 3 | 4 | rvm: 5 | - 2.2.4 6 | - 2.3.0 7 | - 2.4.2 8 | 9 | before_install: 10 | - mkdir travis-phantomjs 11 | - wget https://github.com/JordiPolo/phantomjs/blob/master/phantomjs-2.1.1-linux-x86_64.tar.bz2?raw=true 12 | -O $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2 13 | - tar -xvf $PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64.tar.bz2 -C $PWD/travis-phantomjs 14 | - export PATH=$PWD/travis-phantomjs/phantomjs-2.1.1-linux-x86_64/bin:$PATH 15 | 16 | install: 17 | - bundle install --jobs=3 --retry=3 18 | 19 | script: 20 | - bundle exec rspec 21 | 22 | deploy: 23 | provider: rubygems 24 | api_key: 25 | secure: czStDI0W6MWL70sDwu53oNNCc8vKtT61pgvii+ZWIC9A41C2p7BzmbtosXsnLk2ApxmpWvFIgtQE0XIH7jkM5mY05cHinXDphtOTkNLFVjck3ZOMkx/cc+QRFW8K4FHkrzFsC+/Xx4t2/Psh35LpzhfJd0XzKKoCstXUVgJsfGcAK3DMpjXHSUbwLXGDZ4lzmsk52OLf0oL+in2447TJfVOvGXtYmfh1PjXRwDxKB0dan7w5mVgajS52b6wUhVPTaMe/JgCbMuV7BaQ1Goq8u7V4aaxU+liPAhzHWfMB6tF4TEW8yu2tvGLdOA0+1jmM8E9Q5saPWtwKiHvBxN8CzRpkiNDzyFAf8ljrWT5yKX3aRQCyPp3NNyhoumWap36b+O/zwZ3HxoAe22Yg0rjz8z8NxMR/ELPvjPYjCiF5zY7fO9PAzmIynMRUrxDnFj+/JGHdzx0ZMo3fEXgHHSaHPNxIzEffVVQk4XLVnFHDjBLY4mVp4sbHbja5qnui20RkdM/H9Yi/fQyl1ODhk+LUPoh45ZneDZq7GPrl+WKK06oEjXIXLU+1iEuqnSqybbmJMTUJlUV+7EJdtq2DgfDB4KXwLm2LLOR/IX63AzEav4NIxx3hIXifSKa9rp6D7nMTzdQwF0FFzIj/Y3qLrAe1WWt0gx3Vxq67pSwOJthk5Fc= 26 | on: 27 | tags: true 28 | rvm: 2.4.2 29 | -------------------------------------------------------------------------------- /grell.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require 'grell/version' 5 | 6 | Gem::Specification.new do |spec| 7 | spec.name = "grell" 8 | spec.version = Grell::VERSION 9 | spec.platform = Gem::Platform::RUBY 10 | spec.authors = ["Jordi Polo Carres"] 11 | spec.email = ["jcarres@mdsol.com"] 12 | spec.summary = %q{Ruby web crawler} 13 | spec.description = %q{Ruby web crawler using PhantomJS} 14 | spec.homepage = "https://github.com/mdsol/grell" 15 | spec.license = 'MIT' 16 | 17 | spec.files = `git ls-files -z`.split("\x0") 18 | spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } 19 | spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) 20 | spec.require_paths = ["lib"] 21 | 22 | spec.required_ruby_version = '>= 2.1.8' 23 | 24 | spec.add_dependency 'capybara', '~> 
2.10' 25 | spec.add_dependency 'poltergeist', '~> 1.11' 26 | 27 | # spec.add_development_dependency 'bundler', '~> 1.6' 28 | spec.add_development_dependency 'byebug', '~> 4.0' 29 | spec.add_development_dependency 'kender', '~> 0.2' 30 | spec.add_development_dependency 'rake', '~> 10.0' 31 | spec.add_development_dependency 'webmock', '~> 1.18' 32 | spec.add_development_dependency 'rspec', '~> 3.5' 33 | spec.add_development_dependency 'puffing-billy', '~> 0.9' 34 | spec.add_development_dependency 'timecop', '~> 0.8' 35 | spec.add_development_dependency 'selenium-webdriver', '~> 2.53.4' 36 | end 37 | -------------------------------------------------------------------------------- /lib/grell/capybara_driver.rb: -------------------------------------------------------------------------------- 1 | module Grell 2 | # This class setups the driver for capybara. Used internally by the CrawlerManager 3 | # It uses Portelgeist to control PhantomJS 4 | class CapybaraDriver 5 | USER_AGENT = "Mozilla/5.0 (Grell Crawler)".freeze 6 | 7 | # Returns a poltergeist driver 8 | def setup_capybara 9 | @poltergeist_driver = nil 10 | 11 | # Capybara will not re-run the block if the driver name already exists, so the driver name 12 | # will have a time integer appended to ensure uniqueness. 13 | driver_name = "poltergeist_crawler_#{Time.now.to_f}".to_sym 14 | Grell.logger.info "GRELL Registering poltergeist driver with name '#{driver_name}'" 15 | 16 | Capybara.register_driver driver_name do |app| 17 | @poltergeist_driver = Capybara::Poltergeist::Driver.new(app, 18 | js_errors: false, 19 | inspector: false, 20 | phantomjs_logger: FakePoltergeistLogger, 21 | phantomjs_options: ['--debug=no', '--load-images=no', '--ignore-ssl-errors=yes', '--ssl-protocol=TLSv1.2']) 22 | end 23 | 24 | Capybara.default_max_wait_time = 3 25 | Capybara.run_server = false 26 | Capybara.default_driver = driver_name 27 | Capybara.current_session.driver.headers = { # The driver gets initialized when modified here 28 | "DNT" => 1, 29 | "User-Agent" => USER_AGENT 30 | } 31 | 32 | raise 'Poltergeist Driver could not be properly initialized' unless @poltergeist_driver 33 | 34 | @poltergeist_driver 35 | end 36 | 37 | # Poltergeist driver needs a class with this signature. The javascript console.log is sent here. 38 | # We just discard that information. 39 | module FakePoltergeistLogger 40 | def self.puts(*) 41 | end 42 | end 43 | end 44 | end 45 | -------------------------------------------------------------------------------- /lib/grell/rawpage.rb: -------------------------------------------------------------------------------- 1 | module Grell 2 | # This class depends heavily on Capybara but contains no logic. 3 | class RawPage 4 | include Capybara::DSL 5 | 6 | def navigate(url) 7 | visit(url) 8 | follow_redirects! 9 | end 10 | 11 | def headers 12 | page.response_headers 13 | end 14 | 15 | def status 16 | page.status_code 17 | end 18 | 19 | def body 20 | page.body 21 | end 22 | 23 | def all_anchors 24 | # Some elements may not be "a" elements but still provide a link. This usually is done for Javascript 25 | # to convert other elements which are not links to be able to be clicked naturally. 26 | # Only return links which are visible. 
27 | all('[href]', visible: true).to_a + all('[data-href]', visible: true).to_a 28 | end 29 | 30 | def host 31 | page.current_host 32 | end 33 | 34 | def has_selector?(selector) 35 | page.has_selector?(selector) 36 | end 37 | 38 | def wait_for_all_ajax_requests(timeout, interval) 39 | Timeout::timeout(timeout) do 40 | (timeout / interval).ceil.times do 41 | jquery_active = page.evaluate_script("typeof jQuery !== 'undefined' && jQuery.active;") 42 | break if (!jquery_active || jquery_active.zero?) 43 | sleep(interval) 44 | end 45 | end 46 | true 47 | end 48 | 49 | private 50 | 51 | def follow_redirects! 52 | # Phantom is very weird, it will follow a redirect to provide the correct body but will not fill the 53 | # status and the headers, if we are in that situation, revisit the page with the correct url this time. 54 | # Note that we will still fail if we have more than 5 redirects on a row 55 | redirects = 0 56 | while(page.status_code == nil && redirects < 5) 57 | visit( CGI.unescape(page.current_url)) 58 | redirects = redirects + 1 59 | end 60 | end 61 | end 62 | end 63 | -------------------------------------------------------------------------------- /lib/grell/page_collection.rb: -------------------------------------------------------------------------------- 1 | module Grell 2 | # Keeps a record of all the pages crawled. 3 | # When a new url is found it is added to this collection, which makes sure it is unique. 4 | # This page is part of the discovered pages. Eventually that page will be navigated to, then 5 | # the page will be part of the visited pages. 6 | class PageCollection 7 | attr_reader :collection 8 | 9 | # A block containing the logic that determines if a new URL should be added 10 | # to the collection or if it is already present will be passed to the initializer. 11 | def initialize(add_match_block) 12 | @collection = [] 13 | @add_match_block = add_match_block || default_add_match 14 | end 15 | 16 | def create_page(url, parent_id) 17 | page_id = next_id 18 | page = Page.new(url, page_id, parent_id) 19 | add(page) 20 | page 21 | end 22 | 23 | def visited_pages 24 | @collection.select {|page| page.visited?} 25 | end 26 | 27 | def discovered_pages 28 | @collection - visited_pages 29 | end 30 | 31 | def next_page 32 | discovered_pages.sort_by{|page| page.parent_id}.first 33 | end 34 | 35 | private 36 | 37 | def next_id 38 | @collection.size 39 | end 40 | 41 | def add(page) 42 | # Although finding unique pages based on URL will add pages with different query parameters, 43 | # in some cases we do link to different pages depending on the query parameters like when using proxies 44 | new_url = @collection.none? 
do |collection_page| 45 | @add_match_block.call(collection_page, page) 46 | end 47 | 48 | if new_url 49 | @collection.push page 50 | end 51 | end 52 | 53 | # If add_match_block is not provided, url matching to determine if a new page should be added 54 | # to the page collection will default to this proc 55 | def default_add_match 56 | Proc.new do |collection_page, page| 57 | collection_page.url.downcase == page.url.downcase 58 | end 59 | end 60 | 61 | end 62 | end 63 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | require 'grell' 2 | require 'byebug' 3 | require 'timecop' 4 | require 'webmock/rspec' 5 | require 'billy/capybara/rspec' 6 | require 'rack' 7 | require 'rack/server' 8 | 9 | # This will trick Puffing-billy into using this logger instead of its own 10 | # Puffing billy is very noisy and we do not want to see that in our output 11 | class Rails 12 | def self.logger 13 | Logger.new(nil) 14 | end 15 | end 16 | 17 | WebMock.disable_net_connect! 18 | 19 | 20 | # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration 21 | RSpec.configure do |config| 22 | 23 | # We do not need to wait for pages to return all the data 24 | config.before do 25 | stub_const("Grell::Page::WAIT_TIME", 0) 26 | allow_any_instance_of(Capybara::Session).to receive(:evaluate_script).and_return(nil) 27 | end 28 | 29 | config.expect_with :rspec do |expectations| 30 | expectations.include_chain_clauses_in_custom_matcher_descriptions = true 31 | end 32 | 33 | config.mock_with :rspec do |mocks| 34 | # Prevents you from mocking or stubbing a method that does not exist on 35 | # a real object. 36 | mocks.verify_partial_doubles = true 37 | end 38 | 39 | # Limits the available syntax to the non-monkey patched syntax that is recommended. 40 | config.disable_monkey_patching! 41 | 42 | # This setting enables warnings. It's recommended, but in some cases may 43 | # be too noisy due to issues in dependencies. 44 | # TODO: Billy puffy has lots of warnings, test this with new versions 45 | # config.warnings = true 46 | 47 | # Many RSpec users commonly either run the entire suite or an individual 48 | # file, and it's useful to allow more verbose output when running an 49 | # individual spec file. 50 | if config.files_to_run.one? 51 | # Use the documentation formatter for detailed output, 52 | # unless a formatter has already been configured 53 | # (e.g. via a command-line flag). 54 | config.default_formatter = 'doc' 55 | end 56 | 57 | config.order = :random 58 | Kernel.srand config.seed 59 | 60 | Capybara.javascript_driver = :poltergeist_billy 61 | Capybara.default_driver = :poltergeist_billy 62 | 63 | # config.profile_examples = 10 64 | end 65 | 66 | 67 | -------------------------------------------------------------------------------- /lib/grell/crawler.rb: -------------------------------------------------------------------------------- 1 | module Grell 2 | # This is the class that starts and controls the crawling 3 | class Crawler 4 | attr_reader :collection, :manager 5 | 6 | # Creates a crawler 7 | # evaluate_in_each_page: javascript block to evaluate in each page we crawl 8 | # add_match_block: block to evaluate to consider if a page is part of the collection 9 | # manager_options: options passed to the manager class 10 | # allowlist: Sets an allowlist filter, allows a regexp, string or array of either to be matched. 
11 | # denylist: Sets a denylist filter, allows a regexp, string or array of either to be matched. 12 | def initialize(evaluate_in_each_page: nil, add_match_block: nil, allowlist: /.*/, denylist: /a^/, **manager_options) 13 | @collection = nil 14 | @manager = CrawlerManager.new(manager_options) 15 | @evaluate_in_each_page = evaluate_in_each_page 16 | @add_match_block = add_match_block 17 | @allowlist_regexp = Regexp.union(allowlist) 18 | @denylist_regexp = Regexp.union(denylist) 19 | end 20 | 21 | # Main method, it starts crawling on the given URL and calls a block for each of the pages found. 22 | def start_crawling(url, &block) 23 | Grell.logger.info "GRELL Started crawling" 24 | @collection = PageCollection.new(@add_match_block) 25 | @collection.create_page(url, nil) 26 | 27 | while !@collection.discovered_pages.empty? 28 | crawl(@collection.next_page, block) 29 | @manager.check_periodic_restart(@collection) 30 | end 31 | 32 | Grell.logger.info "GRELL finished crawling" 33 | end 34 | 35 | def crawl(site, block) 36 | Grell.logger.info "Visiting #{site.url}, visited_links: #{@collection.visited_pages.size}, discovered #{@collection.discovered_pages.size}" 37 | crawl_site(site) 38 | 39 | if block # The user of this block can send us a :retry to retry accessing the page 40 | while crawl_block(block, site) == :retry 41 | Grell.logger.info "Retrying our visit to #{site.url}" 42 | crawl_site(site) 43 | end 44 | end 45 | 46 | site.links.each do |url| 47 | @collection.create_page(url, site.id) 48 | end 49 | end 50 | 51 | private 52 | 53 | def crawl_site(site) 54 | site.navigate 55 | site.rawpage.page.evaluate_script(@evaluate_in_each_page) if @evaluate_in_each_page 56 | filter!(site.links) 57 | add_redirect_url(site) 58 | end 59 | 60 | # Treat any exceptions from the block as an unavailable page 61 | def crawl_block(block, site) 62 | block.call(site) 63 | rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient, 64 | Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::StatusFailError, 65 | Capybara::Poltergeist::TimeoutError, Errno::ECONNRESET, URI::InvalidURIError => e 66 | site.unavailable_page(404, e) 67 | end 68 | 69 | def filter!(links) 70 | links.select! { |link| link =~ @allowlist_regexp } if @allowlist_regexp 71 | links.delete_if { |link| link =~ @denylist_regexp } if @denylist_regexp 72 | end 73 | 74 | # Store the resulting redirected URL along with the original URL 75 | def add_redirect_url(site) 76 | if site.url != site.current_url 77 | @collection.create_page(site.current_url, site.id) 78 | end 79 | end 80 | 81 | end 82 | 83 | end 84 | -------------------------------------------------------------------------------- /lib/grell/crawler_manager.rb: -------------------------------------------------------------------------------- 1 | module Grell 2 | # Manages the state of the process crawling, does not care about individual pages but about logging, 3 | # restarting and quiting the crawler correctly. 4 | class CrawlerManager 5 | # logger: logger to use for Grell's messages 6 | # on_periodic_restart: if set, the driver will restart every :each visits (100 default) and execute the :do block 7 | # driver_options: Any extra options for the Capybara driver 8 | def initialize(logger: nil, on_periodic_restart: {}, driver: nil) 9 | Grell.logger = logger ? 
logger : Logger.new(STDOUT) 10 | @periodic_restart_block = on_periodic_restart[:do] 11 | @periodic_restart_period = on_periodic_restart[:each] || PAGES_TO_RESTART 12 | @driver = driver || CapybaraDriver.new.setup_capybara 13 | if @periodic_restart_period <= 0 14 | Grell.logger.warn "GRELL. Restart option misconfigured with a negative period. Ignoring option." 15 | end 16 | end 17 | 18 | # Restarts the PhantomJS process without modifying the state of visited and discovered pages. 19 | def restart 20 | Grell.logger.info "GRELL. Driver restarting" 21 | @driver.restart 22 | Grell.logger.info "GRELL. Driver restarted" 23 | end 24 | 25 | # Quits the poltergeist driver. 26 | def quit 27 | Grell.logger.info "GRELL. Driver quitting" 28 | @driver.quit 29 | end 30 | 31 | # PhantomJS seems to consume memory increasingly as it crawls, periodic restart allows to restart 32 | # the driver, potentially calling a block. 33 | def check_periodic_restart(collection) 34 | return unless @periodic_restart_block 35 | return unless @periodic_restart_period > 0 36 | return unless (collection.visited_pages.size % @periodic_restart_period).zero? 37 | restart 38 | @periodic_restart_block.call 39 | end 40 | 41 | def self.cleanup_all_processes 42 | PhantomJSManager.new.cleanup_all_processes 43 | end 44 | 45 | private 46 | 47 | PAGES_TO_RESTART = 100 # Default number of pages before we restart the driver. 48 | KILL_TIMEOUT = 2 # Number of seconds we wait till we kill the process. 49 | 50 | # Manages the PhantomJS process 51 | class PhantomJSManager 52 | def cleanup_all_processes 53 | pids = running_phantomjs_pids 54 | return if pids.empty? 55 | Grell.logger.warn "GRELL. Killing PhantomJS processes: #{pids.inspect}" 56 | pids.each do |pid| 57 | Grell.logger.warn "GRELL. Sending KILL to PhantomJS process #{pid}" 58 | kill_process(pid.to_i) 59 | end 60 | end 61 | 62 | def running_phantomjs_pids 63 | list_phantomjs_processes_cmd = "ps -ef | grep -E 'bin/phantomjs' | grep -v grep" 64 | `#{list_phantomjs_processes_cmd} | awk '{print $2;}'`.split("\n") 65 | end 66 | 67 | def kill_process(pid) 68 | Process.kill('TERM', pid) 69 | force_kill(pid) 70 | rescue Errno::ESRCH, Errno::ECHILD 71 | # successfully terminated 72 | rescue => e 73 | Grell.logger.error ["GRELL. PhantomJS process could not be killed", e.message, *e.backtrace].join($/) 74 | end 75 | 76 | def force_kill(pid) 77 | Timeout.timeout(KILL_TIMEOUT) { Process.wait(pid) } 78 | rescue Timeout::Error 79 | Process.kill('KILL', pid) 80 | Process.wait(pid) 81 | end 82 | end 83 | end 84 | end 85 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 2.1.2 2 | * Change white/black lists to allow/deny lists 3 | 4 | # 2.1.1 5 | * Update phantomjs_options to use 'TLSv1.2' 6 | 7 | # 2.1.0 8 | * Delete `driver_options` configuration key as it was never used. 9 | * `cleanup_all_processes` is a self method as intended to. 10 | 11 | # 2.0.0 12 | * New configuration key `on_periodic_restart`. 13 | * CrawlerManager.cleanup_all_processes method destroy all instances of phantomjs in this machine. 14 | 15 | * Breaking changes 16 | - Requires Ruby 2.1 or later. 17 | - Crawler.start_crawling does not accept options anymore, all options are passed to Crawler.new. 18 | - Crawler's methods `restart` and `quit` have been moved to CrawlerManager. 19 | - Crawler gets whitelist and blacklist as configuration options instead of being set in specific methods. 
20 | 21 | # 1.6.11 22 | * Ensure all links are loaded by waiting for Ajax requests to complete 23 | * Add '@evaluate_in_each_page' option to evaluate before extracting links (e.g. $('.dropdown').addClass('open');) 24 | 25 | # 1.6.10 26 | * Avoid following JS href links, add missing dependencies to fix Travis build 27 | 28 | # 1.6.9 29 | * Avoid following links when disabled by CSS (1.6.8 worked only for Javascript) 30 | 31 | # 1.6.8 32 | * Avoid following disabled links 33 | 34 | # 1.6.7 35 | * Increment '@times_visited' first to avoid infinite retries when rescuing errors 36 | 37 | # 1.6.6 38 | * Updated phantomjs_logger not to open '/dev/null' 39 | 40 | # 1.6.5 41 | * Added #quit to Crawler 42 | 43 | # 1.6.4 44 | * Added #quit to Capybara driver 45 | 46 | # 1.6.3 47 | * Only follow visible links 48 | 49 | # 1.6.2 50 | * Reset Capybara driver to Puffing Billy (used to rewrite URL requests in specs) 51 | * Use float timestamp for Poltergeist driver name to support fast test executions 52 | 53 | # 1.6.1 54 | * Use non-static name to support registering Poltergeist crawler multiple times 55 | * More exception handling, store redirected URLs in addition to original URL 56 | 57 | # 1.6 58 | * Support custom URL comparison when adding new pages during crawling 59 | * Don't rescue Timeout error, so that Delayed Job can properly terminate hanging jobs 60 | * Fail early if Capybara doesn't initialize properly 61 | 62 | # 1.5.1 63 | * Fixed deprecation warning (Thanks scott) 64 | * Updated Poltergeist dependency 65 | 66 | # 1.5.0 67 | * Grell will follow redirects. 68 | * Added #followed_redirects? #error? #current_url methods to the Page class 69 | 70 | # 1.4.0 71 | * Added crawler.restart to restart browser process 72 | * The block of code can make grell retry any given page. 73 | 74 | # 1.3.2 75 | * Rescue Timeout error and return an empty page when that happens 76 | 77 | # 1.3.1 78 | * Added whitelisting and blacklisting 79 | * Better info in gemspec 80 | 81 | # 1.3 82 | * The Crawler object allows you to provide an external logger object. 83 | * Clearer semantics when an error happens, special headers are returned so the user can inspect the error 84 | * Caveats: 85 | - The 'debug' option in the crawler does not have any affect anymore. Provide an external logger with 'logger' instead 86 | - The errors provided in the headers by grell has changed from 'grell_status' to 'grellStatus'. 87 | - The 'visited' property in the page was never supposed to be accesible. Use 'visited?' instead. 88 | 89 | # 1.2.1 90 | * Solve bug: URLs are case insensitive 91 | 92 | # 1.2 93 | * Grell now will consider two links to point to the same page only when the whole URL is exactly the same. 94 | Versions previously would only consider two links to be the same when they shared the path. 95 | 96 | # 1.1.2 97 | * Solve bug where we were adding links in heads as if there were normal links in the body 98 | 99 | # 1.1.1 100 | * Solve bug with the new data-href functionality 101 | 102 | # 1.1 103 | * Solve problem with randomly failing spec 104 | * Search for elements with 'href' or 'data-href' to find links 105 | 106 | # 1.0.1 107 | * Rescueing Javascript errors 108 | 109 | # 1.0 110 | * Initial implementation 111 | * Basic support to crawling pages. 
112 | -------------------------------------------------------------------------------- /spec/lib/page_collection_spec.rb: -------------------------------------------------------------------------------- 1 | 2 | RSpec.describe Grell::PageCollection do 3 | let(:add_match_block) do 4 | Proc.new do |collection_page, page| 5 | collection_page.url.downcase == page.url.downcase 6 | end 7 | end 8 | 9 | let(:collection) { Grell::PageCollection.new(add_match_block) } 10 | let(:url) { 'http://www.github.com/SomeUser/dragonlance?search=false' } 11 | let(:url2) { 'http://www.github.com/OtherUser/forgotten?search=false' } 12 | 13 | context 'empty collection' do 14 | 15 | it 'has no visited pages' do 16 | expect(collection.visited_pages).to be_empty 17 | end 18 | 19 | it 'has no discovered pages' do 20 | expect(collection.discovered_pages).to be_empty 21 | end 22 | 23 | it 'next page is nil' do 24 | expect(collection.next_page).to be_nil 25 | end 26 | end 27 | 28 | context 'one unvisited page' do 29 | let(:page) { collection.create_page(url, 0) } 30 | 31 | before do 32 | allow(page).to receive(:visited?).and_return(false) 33 | end 34 | 35 | it 'has no visited pages' do 36 | expect(collection.visited_pages).to be_empty 37 | end 38 | 39 | it 'has one discovered page' do 40 | expect(collection.discovered_pages).to eq([page]) 41 | 42 | end 43 | 44 | it 'next page is the unvisited page' do 45 | expect(collection.next_page).to eq(page) 46 | end 47 | end 48 | 49 | context 'one visited page' do 50 | let(:page) { collection.create_page(url, 0) } 51 | 52 | before do 53 | allow(page).to receive(:visited?).and_return(true) 54 | end 55 | 56 | it 'has one visited page' do 57 | expect(collection.visited_pages).to eq([page]) 58 | end 59 | 60 | it 'has no discovered pages' do 61 | expect(collection.discovered_pages).to be_empty 62 | end 63 | 64 | it 'next page is nil' do 65 | expect(collection.next_page).to be_nil 66 | end 67 | end 68 | 69 | context 'one visited and one unvisited page with the same url' do 70 | let(:page) { collection.create_page(url, 0) } 71 | let(:unvisited) { collection.create_page(url.upcase, 0) } 72 | 73 | before do 74 | allow(page).to receive(:visited?).and_return(true) 75 | allow(unvisited).to receive(:visited?).and_return(false) 76 | end 77 | 78 | it 'first page has id 0' do 79 | expect(page.id).to eq(0) 80 | end 81 | 82 | it 'second page has id 1' do 83 | expect(unvisited.id).to eq(1) 84 | end 85 | 86 | it 'has one visited page' do 87 | expect(collection.visited_pages).to eq([page]) 88 | end 89 | 90 | it 'has no discovered pages' do 91 | expect(collection.discovered_pages).to be_empty 92 | end 93 | 94 | it 'next page is nil' do 95 | expect(collection.next_page).to be_nil 96 | end 97 | end 98 | 99 | context 'one visited and one unvisited page with different URLs' do 100 | let(:page) { collection.create_page(url, 0) } 101 | let(:unvisited) { collection.create_page(url2, 0) } 102 | 103 | before do 104 | allow(page).to receive(:visited?).and_return(true) 105 | allow(unvisited).to receive(:visited?).and_return(false) 106 | end 107 | 108 | it 'has one visited page' do 109 | expect(collection.visited_pages).to eq([page]) 110 | end 111 | 112 | it 'has one discovered page' do 113 | expect(collection.discovered_pages).to eq([unvisited]) 114 | end 115 | 116 | it 'next page is the unvisited page' do 117 | expect(collection.next_page).to eq(unvisited) 118 | end 119 | end 120 | 121 | context 'one visited and one unvisited page with different URLs only different by the query' do 122 | let(:page) { 
collection.create_page(url, 0) } 123 | let(:url3) { 'http://www.github.com/SomeUser/dragonlance?search=true' } 124 | let(:unvisited) { collection.create_page(url3, 0) } 125 | 126 | before do 127 | allow(page).to receive(:visited?).and_return(true) 128 | allow(unvisited).to receive(:visited?).and_return(false) 129 | end 130 | 131 | it 'has one visited page' do 132 | expect(collection.visited_pages).to eq([page]) 133 | end 134 | 135 | it 'has one discovered page' do 136 | expect(collection.discovered_pages).to eq([unvisited]) 137 | end 138 | 139 | it 'next page is the unvisited page' do 140 | expect(collection.next_page).to eq(unvisited) 141 | end 142 | end 143 | 144 | context 'several unvisited pages' do 145 | let(:page) { collection.create_page(url, 2) } 146 | let(:page2) { collection.create_page(url2, 0) } 147 | 148 | before do 149 | allow(page).to receive(:visited?).and_return(true) 150 | allow(page2).to receive(:visited?).and_return(false) 151 | end 152 | 153 | it 'returns the page which has an earlier parent' do 154 | expect(collection.next_page).to eq(page2) 155 | end 156 | 157 | end 158 | 159 | end 160 | -------------------------------------------------------------------------------- /spec/lib/crawler_manager_spec.rb: -------------------------------------------------------------------------------- 1 | RSpec.describe Grell::CrawlerManager do 2 | let(:page) { Grell::Page.new(url, page_id, parent_page_id) } 3 | let(:host) { 'http://www.example.com' } 4 | let(:url) { 'http://www.example.com/test' } 5 | let(:driver) { double(Grell::CapybaraDriver) } 6 | let(:logger) { Logger.new(nil) } 7 | let(:crawler_manager) do 8 | described_class.new(logger: logger, driver: driver) 9 | end 10 | 11 | describe 'initialize' do 12 | context 'provides a logger' do 13 | let(:logger) { 33 } 14 | 15 | it 'sets custom logger' do 16 | crawler_manager 17 | expect(Grell.logger).to eq(33) 18 | Grell.logger = Logger.new(nil) 19 | end 20 | end 21 | 22 | context 'does not provides a logger' do 23 | let(:logger) { nil } 24 | 25 | it 'sets default logger' do 26 | crawler_manager 27 | expect(Grell.logger).to be_instance_of(Logger) 28 | Grell.logger = Logger.new(nil) 29 | end 30 | end 31 | 32 | context 'does not provide a driver' do 33 | let(:driver) { nil } 34 | 35 | it 'setups a new Capybara driver' do 36 | expect_any_instance_of(Grell::CapybaraDriver).to receive(:setup_capybara) 37 | crawler_manager 38 | end 39 | end 40 | end 41 | 42 | describe '#quit' do 43 | let(:driver) { double } 44 | 45 | it 'quits the poltergeist driver' do 46 | expect(logger).to receive(:info).with("GRELL. Driver quitting") 47 | expect(driver).to receive(:quit) 48 | crawler_manager.quit 49 | end 50 | end 51 | 52 | describe '#restart' do 53 | let(:driver) { double } 54 | 55 | it 'restarts the poltergeist driver' do 56 | expect(driver).to receive(:restart) 57 | expect(logger).to receive(:info).with("GRELL. Driver restarted") 58 | expect(logger).to receive(:info).with("GRELL. 
Driver restarting") 59 | crawler_manager.restart 60 | end 61 | end 62 | 63 | describe '#check_periodic_restart' do 64 | let(:collection) { double } 65 | 66 | context 'Periodic restart not setup' do 67 | it 'does not restart' do 68 | allow(collection).to receive_message_chain(:visited_pages, :size) { 100 } 69 | expect(crawler_manager).not_to receive(:restart) 70 | crawler_manager.check_periodic_restart(collection) 71 | end 72 | end 73 | 74 | context 'Periodic restart setup with default period' do 75 | let(:do_something) { proc {} } 76 | let(:crawler_manager) do 77 | Grell::CrawlerManager.new( 78 | logger: logger, 79 | driver: driver, 80 | on_periodic_restart: { do: do_something } 81 | ) 82 | end 83 | 84 | it 'does not restart after visiting 99 pages' do 85 | allow(collection).to receive_message_chain(:visited_pages, :size) { 99 } 86 | expect(crawler_manager).not_to receive(:restart) 87 | crawler_manager.check_periodic_restart(collection) 88 | end 89 | 90 | it 'restarts after visiting 100 pages' do 91 | allow(collection).to receive_message_chain(:visited_pages, :size) { 100 } 92 | expect(crawler_manager).to receive(:restart) 93 | crawler_manager.check_periodic_restart(collection) 94 | end 95 | end 96 | 97 | context 'Periodic restart setup with custom period' do 98 | let(:do_something) { proc {} } 99 | let(:period) { 50 } 100 | let(:crawler_manager) do 101 | Grell::CrawlerManager.new( 102 | logger: logger, 103 | driver: driver, 104 | on_periodic_restart: { do: do_something, each: period } 105 | ) 106 | end 107 | 108 | context 'restart option is not positive' do 109 | let(:period) { 0 } 110 | 111 | it 'logs a warning' do 112 | message = 'GRELL. Restart option misconfigured with a negative period. Ignoring option.' 113 | expect(logger).to receive(:warn).with(message) 114 | crawler_manager 115 | end 116 | end 117 | 118 | it 'does not restart after visiting a number different from custom period pages' do 119 | allow(collection).to receive_message_chain(:visited_pages, :size) { period * 1.2 } 120 | expect(crawler_manager).not_to receive(:restart) 121 | crawler_manager.check_periodic_restart(collection) 122 | end 123 | 124 | it 'restarts after visiting custom period pages' do 125 | allow(collection).to receive_message_chain(:visited_pages, :size) { period } 126 | expect(crawler_manager).to receive(:restart) 127 | crawler_manager.check_periodic_restart(collection) 128 | end 129 | end 130 | end 131 | 132 | describe '.cleanup_all_processes' do 133 | let(:driver) { double } 134 | 135 | context 'There are some phantomjs processes running' do 136 | let(:pids) { [10, 11] } 137 | before do 138 | allow_any_instance_of(Grell::CrawlerManager::PhantomJSManager) 139 | .to receive(:running_phantomjs_pids).and_return(pids) 140 | end 141 | 142 | it 'logs processes pids' do 143 | expect(Grell.logger).to receive(:warn).with('GRELL. Killing PhantomJS processes: [10, 11]') 144 | expect(Grell.logger).to receive(:warn).with('GRELL. Sending KILL to PhantomJS process 10') 145 | expect(Grell.logger).to receive(:warn).with('GRELL. 
Sending KILL to PhantomJS process 11') 146 | described_class.cleanup_all_processes 147 | end 148 | 149 | it 'kills all phantomjs processes' do 150 | expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).to receive(:kill_process).with(10) 151 | expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).to receive(:kill_process).with(11) 152 | described_class.cleanup_all_processes 153 | end 154 | end 155 | 156 | context 'There are no phantomjs processes running' do 157 | let(:pids) { [] } 158 | before do 159 | allow_any_instance_of(Grell::CrawlerManager::PhantomJSManager) 160 | .to receive(:running_phantomjs_pids).and_return(pids) 161 | end 162 | 163 | it 'no warning is logged' do 164 | expect(Grell.logger).not_to receive(:warn) 165 | described_class.cleanup_all_processes 166 | end 167 | 168 | it 'No process is killed' do 169 | expect_any_instance_of(Grell::CrawlerManager::PhantomJSManager).not_to receive(:kill_process) 170 | described_class.cleanup_all_processes 171 | end 172 | end 173 | end 174 | end 175 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Grell 2 | 3 | [![Build Status](https://travis-ci.org/mdsol/grell.svg?branch=develop)](https://travis-ci.org/mdsol/grell) 4 | 5 | Grell is a generic crawler for the web written in Ruby. 6 | It can be used to gather data, test pages in a given domain, etc. 7 | 8 | ## Installation 9 | 10 | Add this line to your application's Gemfile: 11 | 12 | ```ruby 13 | gem 'grell' 14 | ``` 15 | 16 | And then execute: 17 | 18 | $ bundle 19 | 20 | Or install it yourself as: 21 | 22 | $ gem install grell 23 | 24 | Grell uses PhantomJS as a browser, so you will need to download and install it on your 25 | system. Check for instructions at http://phantomjs.org/ 26 | Grell has been tested with PhantomJS v2.1.x 27 | 28 | ## Usage 29 | 30 | ### Crawling an entire site 31 | 32 | The main entry point of the library is Grell::Crawler#start_crawling. 33 | Grell will yield to your code with each page it finds: 34 | 35 | ```ruby 36 | require 'grell' 37 | 38 | crawler = Grell::Crawler.new 39 | crawler.start_crawling('http://www.google.com') do |page| 40 | # Grell will keep iterating this block with each unique page it finds 41 | puts "yes we crawled #{page.url}" 42 | puts "status: #{page.status}" 43 | puts "headers: #{page.headers}" 44 | puts "body: #{page.body}" 45 | puts "We crawled it at #{page.timestamp}" 46 | puts "We found #{page.links.size} links" 47 | puts "page id and parent_id #{page.id}, #{page.parent_id}" 48 | end 49 | 50 | ``` 51 | 52 | Grell keeps a list of pages previously crawled and does not visit the same page twice. 53 | This list is indexed by the complete URL, including query parameters. 54 | 55 | ### Re-retrieving a page 56 | If you want Grell to revisit a page and return the data to you again, 57 | return the symbol :retry in your block for the start_crawling method. 58 | For instance: 59 | ```ruby 60 | require 'grell' 61 | crawler = Grell::Crawler.new 62 | crawler.start_crawling('http://www.google.com') do |current_page| 63 | if current_page.status == 500 && current_page.retries == 0 64 | crawler.manager.restart 65 | :retry 66 | end 67 | end 68 | ``` 69 | 70 | ### Pages' id 71 | 72 | Each page has a unique id, accessed by the property `id`. Each page also stores the id of the page from which it was found, accessed by the property `parent_id`. 73 | The page object generated by accessing the first URL passed to start_crawling (the root) has a `parent_id` equal to `nil` and an `id` equal to 0. 74 | Using this information it is possible to construct a directed graph.
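For example, a minimal sketch (the URL and the printed output are purely illustrative) that collects the `id`/`parent_id` pairs while crawling and prints the resulting tree of pages:

```ruby
require 'grell'

pages = {}  # id => url
edges = []  # [parent_id, id] pairs describing which page discovered which

crawler = Grell::Crawler.new
crawler.start_crawling('http://www.example.com') do |page|
  pages[page.id] = page.url
  edges << [page.parent_id, page.id] unless page.parent_id.nil?
end

# The edges can be fed to any graph library; here we simply print them.
edges.each { |parent, child| puts "#{pages[parent]} -> #{pages[child]}" }
```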
75 | 76 | 77 | ### Restart and quit 78 | 79 | Grell can be restarted. The current lists of visited and yet-to-visit pages are not modified when restarting, 80 | but the browser is destroyed and recreated, and all cookies and local storage are lost. After restarting, crawling is resumed with a 81 | new browser. 82 | To destroy the crawler, call the `quit` method. This will free the memory taken in Ruby and destroy the PhantomJS process. 83 | ```ruby 84 | require 'grell' 85 | crawler = Grell::Crawler.new 86 | crawler.manager.restart # restarts the browser 87 | crawler.manager.quit # quits and destroys the crawler 88 | ``` 89 | 90 | ### Options 91 | 92 | The `Grell::Crawler` class can be passed options to customize its behavior: 93 | - `logger`: Sets the logger object, for instance `Rails.logger`. Default: `Logger.new(STDOUT)` 94 | - `on_periodic_restart`: Sets periodic restarts of the crawler after a given number of visits. Default: 100 pages. 95 | - `allowlist`: Sets an allowlist filter for URLs to be visited. Default: all URLs are allowlisted. 96 | - `denylist`: Sets a denylist filter for URLs to be avoided. Default: no URL is denylisted. 97 | - `add_match_block`: Block evaluated to consider if a given page should be part of the pages to be visited. Default: add unique URLs. 98 | - `evaluate_in_each_page`: JavaScript snippet to be evaluated on each page visited. Default: nothing evaluated. 99 | 100 | Grell by default will follow all the links it finds in the site being crawled. 101 | It will never follow links pointing outside your site. 102 | If you want to further limit the number of links crawled, you can use 103 | allowlisting, denylisting or manual filtering. 104 | Further details on these and other options are below. 105 | 106 | 107 | #### Automatically restarting PhantomJS 108 | If you are doing a long crawl, it is possible that PhantomJS gets into an inconsistent state or starts leaking memory. 109 | The crawler can be restarted manually by calling `crawler.manager.restart` or automatically by using the 110 | `on_periodic_restart` configuration key as follows: 111 | 112 | ```ruby 113 | require 'grell' 114 | 115 | crawler = Grell::Crawler.new(on_periodic_restart: { do: my_restart_procedure, each: 200 }) 116 | 117 | crawler.start_crawling('http://www.google.com') do |current_page| 118 | ... 119 | end 120 | ``` 121 | 122 | This code sets up the crawler to be restarted every 200 pages crawled and to call `my_restart_procedure` 123 | between restarts. A restart will destroy the cookies, so for instance this custom block can be used to log in again. 124 | 125 | 126 | #### Allowlisting 127 | 128 | ```ruby 129 | require 'grell' 130 | 131 | crawler = Grell::Crawler.new(allowlist: [/games\/.*/, '/fun']) 132 | crawler.start_crawling('http://www.google.com') 133 | ``` 134 | 135 | Grell here will only follow links to games and '/fun' and ignore all 136 | other links. You can provide a regexp, a string (a link is allowlisted if any part 137 | of it matches the string) or an array with regexps and/or strings. 138 | 139 | #### Denylisting 140 | 141 | ```ruby 142 | require 'grell' 143 | 144 | crawler = Grell::Crawler.new(denylist: /games\/.*/) 145 | crawler.start_crawling('http://www.google.com') 146 | ``` 147 | 148 | Similar to allowlisting, but now Grell will follow every link in 149 | this site which does not go to /games/... 150 | 151 | If you set both allowlist and denylist then both will apply: a link 152 | has to fulfill both conditions to survive. If you do not set any, then 153 | all links on this site will be crawled. Think of these options as 154 | filters. 155 | 156 | #### Manual link filtering 157 | 158 | If you have a more complex use case, you can modify the list of links 159 | manually. 160 | Grell yields the page to you before it adds the links to the list of 161 | links to visit. So, in your block of code, you can add links to or delete links from 162 | `page.links` to control which links Grell will 163 | visit next.
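For example, a minimal sketch (the `.pdf` rule and the extra URL are purely illustrative) that removes some links and queues an extra one from inside the block:

```ruby
require 'grell'

crawler = Grell::Crawler.new
crawler.start_crawling('http://www.example.com') do |page|
  # Links removed here will not be queued for crawling.
  page.links.delete_if { |link| link.end_with?('.pdf') }

  # Links added here are queued as if they had been found on this page.
  page.links.push('http://www.example.com/not-linked-from-anywhere')
end
```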
But now Grell will follow every other link in 149 | this site which does not go to /games/... 150 | 151 | If you call both allowlist and denylist then both will apply, a link 152 | has to fullfill both conditions to survive. If you do not call any, then 153 | all links on this site will be crawled. Think of these methods as 154 | filters. 155 | 156 | #### Manual link filtering 157 | 158 | If you have a more complex use-case, you can modify the list of links 159 | manually. 160 | Grell yields the page to you before it adds the links to the list of 161 | links to visit. So you can modify in your block of code "page.links" to 162 | add and delete links to instruct Grell to add them to the list of links 163 | to visit next. 164 | 165 | #### Custom URL Comparison 166 | By default, Grell will detect new URLs to visit by comparing the full URL 167 | with the URLs of the discovered and visited links. This functionality can 168 | be changed by passing a block of code to Grells `start_crawling` method. 169 | In the below example, the path of the URLs (instead of the full URL) will 170 | be compared. 171 | 172 | ```ruby 173 | require 'grell' 174 | 175 | add_match_block = Proc.new do |collection_page, page| 176 | collection_page.path == page.path 177 | end 178 | 179 | crawler = Grell::Crawler.new(add_match_block: add_match_block) 180 | 181 | crawler.start_crawling('http://www.google.com') do |current_page| 182 | ... 183 | end 184 | ``` 185 | 186 | #### Evaluate script 187 | 188 | You can evalute a JavaScript snippet in each page before extracting links by passing the snippet to the 'evaluate_in_each_page' option: 189 | 190 | ```ruby 191 | require 'grell' 192 | 193 | crawler = Grell::Crawler.new(evaluate_in_each_page: "typeof jQuery !== 'undefined' && $('.dropdown').addClass('open');") 194 | 195 | ``` 196 | 197 | ### Errors 198 | When there is an error in the page or an internal error in the crawler (Javascript crashed the browser, etc). Grell will return with status 404 and the headers will have the following keys: 199 | - grellStatus: 'Error' 200 | - errorClass: The class of the error which broke this page. 201 | - errorMessage: A descriptive message with the information Grell could gather about the error. 202 | 203 | ## Tests 204 | 205 | Run the tests with 206 | ```ruby 207 | bundle exec rake ci 208 | ``` 209 | 210 | ## Contributors 211 | Grell is (c) Medidata Solutions Worldwide and owned by its major contributors: 212 | * [Teruhide Hoshikawa](https://github.com/thoshikawa-mdsol) 213 | * [Jordi Polo Carres](https://github.com/jcarres-mdsol) 214 | -------------------------------------------------------------------------------- /lib/grell/page.rb: -------------------------------------------------------------------------------- 1 | require 'forwardable' 2 | 3 | module Grell 4 | # This class contains the logic related to work with each page we crawl. It is also the interface we use 5 | # To access the information of each page. 6 | # This information comes from result private classes below. 7 | class Page 8 | extend Forwardable 9 | 10 | WAIT_TIME = 10 11 | WAIT_INTERVAL = 0.5 12 | 13 | attr_reader :url, :timestamp, :id, :parent_id, :rawpage 14 | 15 | #Most of the interesting information accessed through this class is accessed by the methods below 16 | def_delegators :@result_page, :headers, :body, :status, :links, :has_selector?, :host, :visited? 
17 | 18 | def initialize( url, id, parent_id) 19 | @rawpage = RawPage.new 20 | @url = url 21 | @id = id 22 | @parent_id = parent_id 23 | @timestamp = nil 24 | @times_visited = 0 25 | @result_page = UnvisitedPage.new 26 | end 27 | 28 | def navigate 29 | # We wait a maximum of WAIT_TIME seconds to get an HTML page. We try our best to workaround inconsistencies on poltergeist 30 | Reader.wait_for(->{@rawpage.navigate(url)}, WAIT_TIME, WAIT_INTERVAL ) do 31 | @rawpage.status && !@rawpage.headers.empty? && 32 | @rawpage.headers["Content-Type"] && @rawpage.headers["Content-Type"].include?('text/html').equal?(true) 33 | end 34 | @rawpage.wait_for_all_ajax_requests(WAIT_TIME, WAIT_INTERVAL) 35 | @result_page = VisitedPage.new(@rawpage) 36 | @timestamp = Time.now 37 | rescue Capybara::Poltergeist::BrowserError, Capybara::Poltergeist::DeadClient, 38 | Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::StatusFailError, 39 | Capybara::Poltergeist::TimeoutError, Errno::ECONNRESET, URI::InvalidURIError => e 40 | unavailable_page(404, e) 41 | ensure 42 | @times_visited += 1 43 | end 44 | 45 | # Number of times we have retried the current page 46 | def retries 47 | [@times_visited - 1, 0].max 48 | end 49 | 50 | # The current URL, this may be different from the URL we asked for if there was some redirect 51 | def current_url 52 | @rawpage.current_url 53 | end 54 | 55 | # True if we followed a redirect to get the current contents 56 | def followed_redirects? 57 | current_url != @url 58 | end 59 | 60 | # True if there page responded with an error 61 | def error? 62 | !!(status.to_s =~ /[4|5]\d\d/) 63 | end 64 | 65 | # Extracts the path (e.g. /actions/test_action) from the URL 66 | def path 67 | URI.parse(@url).path 68 | rescue URI::InvalidURIError # Invalid URLs will be added and caught when we try to navigate to them 69 | @url 70 | end 71 | 72 | def unavailable_page(status, exception) 73 | Grell.logger.warn "The page with the URL #{@url} was not available. Exception #{exception}" 74 | @result_page = ErroredPage.new(status, exception) 75 | @timestamp = Time.now 76 | end 77 | 78 | private 79 | 80 | # Private class. 81 | # This is a result page when it has not been visited yet. Essentially empty of information 82 | # 83 | class UnvisitedPage 84 | def status 85 | nil 86 | end 87 | 88 | def body 89 | '' 90 | end 91 | 92 | def headers 93 | {grellStatus: 'NotVisited' } 94 | end 95 | 96 | def links 97 | [] 98 | end 99 | 100 | def host 101 | '' 102 | end 103 | 104 | def visited? 105 | false 106 | end 107 | 108 | def has_selector?(selector) 109 | false 110 | end 111 | 112 | end 113 | 114 | # Private class. 115 | # This is a result page when some error happened. It provides some information about the error. 116 | # 117 | class ErroredPage 118 | def initialize(error_code, exception) 119 | @error_code = error_code 120 | @exception = exception 121 | end 122 | 123 | def status 124 | @error_code 125 | end 126 | 127 | def body 128 | '' 129 | end 130 | 131 | def headers 132 | message = begin 133 | @exception.message 134 | rescue StandardError 135 | "Error message can not be accessed" #Poltergeist may try to access a nil object when accessing message 136 | end 137 | 138 | { 139 | grellStatus: 'Error', 140 | errorClass: @exception.class.to_s, 141 | errorMessage: message 142 | } 143 | end 144 | 145 | def links 146 | [] 147 | end 148 | 149 | def host 150 | '' 151 | end 152 | 153 | def visited? 
154 | true 155 | end 156 | 157 | def has_selector?(selector) 158 | false 159 | end 160 | 161 | end 162 | 163 | 164 | # Private class. 165 | # This is a result page when we successfully got some information back after visiting the page. 166 | # It delegates most of the information to the @rawpage capybara page. But any transformation or logic is here 167 | # 168 | class VisitedPage 169 | def initialize(rawpage) 170 | @rawpage = rawpage 171 | end 172 | 173 | def status 174 | @rawpage.status 175 | end 176 | 177 | def body 178 | @rawpage.body 179 | end 180 | 181 | def headers 182 | @rawpage.headers 183 | rescue Capybara::Poltergeist::BrowserError => e #This may happen internally on Poltergeist, they claim is a bug. 184 | { 185 | grellStatus: 'Error', 186 | errorClass: e.class.to_s, 187 | errorMessage: e.message 188 | } 189 | end 190 | 191 | def links 192 | @links ||= all_links 193 | end 194 | 195 | def host 196 | @rawpage.host 197 | end 198 | 199 | def visited? 200 | true 201 | end 202 | 203 | def has_selector?(selector) 204 | @rawpage.has_selector?(selector) 205 | end 206 | 207 | private 208 | def all_links 209 | links = @rawpage.all_anchors.map { |anchor| Link.new(anchor) } 210 | body_enabled_links = links.reject { |link| link.inside_header? || link.disabled? || link.js_href? } 211 | body_enabled_links.map { |link| link.to_url(host) }.uniq.compact 212 | 213 | rescue Capybara::Poltergeist::ObsoleteNode 214 | Grell.logger.warn "We found an obsolete node in #{@url}. Ignoring all links" 215 | # Sometimes Javascript and timing may screw this, we lose these links. 216 | # TODO: Can we do something more intelligent here? 217 | [] 218 | end 219 | 220 | # Private class to group all the methods related to links. 221 | class Link 222 | def initialize(anchor) 223 | @anchor = anchor 224 | end 225 | 226 | # can only be used in the as of: https://developer.mozilla.org/en/docs/Web/HTML/Element/link 227 | def inside_header? 228 | @anchor.tag_name == 'link' 229 | end 230 | 231 | # Is the link disabled by either Javascript or CSS? 232 | def disabled? 233 | @anchor.disabled? || !!@anchor.native.attributes['disabled'] 234 | end 235 | 236 | # Does the href use javascript? 237 | def js_href? 238 | href.start_with?('javascript:') 239 | end 240 | 241 | # Some links may use data-href + javascript to do interesting things 242 | def href 243 | @anchor['href'] || @anchor['data-href'] 244 | end 245 | 246 | # We only accept links in this same host that start with a path 247 | def to_url(host) 248 | uri = URI.parse(href) 249 | if uri.absolute? 250 | if uri.host != URI.parse(host).host 251 | Grell.logger.debug "GRELL does not follow links to external hosts: #{href}" 252 | nil 253 | else 254 | href # Absolute link to our own host 255 | end 256 | else 257 | if uri.path.nil? 
258 | Grell.logger.debug "GRELL does not follow links without a path: #{uri}" 259 | nil 260 | end 261 | if uri.path.start_with?('/') 262 | host + href # convert to full URL 263 | else # links like href="google.com" the browser would go to http://google.com like "http://#{link}" 264 | Grell.logger.debug "GRELL Bad formatted link: #{href}, assuming external" 265 | nil 266 | end 267 | end 268 | rescue URI::InvalidURIError # Invalid links propagating till we navigate to them 269 | href 270 | end 271 | end 272 | 273 | end 274 | end 275 | end 276 | -------------------------------------------------------------------------------- /spec/lib/crawler_spec.rb: -------------------------------------------------------------------------------- 1 | 2 | RSpec.describe Grell::Crawler do 3 | let(:page_id) { rand(10).floor + 10 } 4 | let(:parent_page_id) { rand(10).floor } 5 | let(:page) { Grell::Page.new(url, page_id, parent_page_id) } 6 | let(:host) { 'http://www.example.com' } 7 | let(:url) { 'http://www.example.com/test' } 8 | let(:add_match_block) { nil } 9 | let(:denylist) { /a^/ } 10 | let(:allowlist) { /.*/ } 11 | let(:crawler) do 12 | Grell::Crawler.new( 13 | logger: Logger.new(nil), 14 | driver: double(nil), 15 | evaluate_in_each_page: script, 16 | add_match_block: add_match_block, 17 | denylist: denylist, 18 | allowlist: allowlist) 19 | end 20 | let(:script) { nil } 21 | let(:body) { 'body' } 22 | let(:custom_add_match) do 23 | Proc.new do |collection_page, page| 24 | collection_page.path == page.path 25 | end 26 | end 27 | 28 | before do 29 | proxy.stub(url).and_return(body: body, code: 200) 30 | end 31 | 32 | describe '#crawl' do 33 | before do 34 | crawler.instance_variable_set('@collection', Grell::PageCollection.new(custom_add_match)) 35 | end 36 | 37 | it 'yields the result if a block is given' do 38 | result = [] 39 | block = Proc.new { |n| result.push(n) } 40 | crawler.crawl(page, block) 41 | expect(result.size).to eq(1) 42 | expect(result.first.url).to eq(url) 43 | expect(result.first.visited?).to eq(true) 44 | end 45 | 46 | it 'rescues any specified exceptions raised during the block execution' do 47 | block = Proc.new { |n| raise Capybara::Poltergeist::BrowserError, 'Exception' } 48 | expect{ crawler.crawl(page, block) }.to_not raise_error 49 | expect(page.status).to eq(404) 50 | end 51 | 52 | it 'logs interesting information' do 53 | crawler 54 | expect(Grell.logger).to receive(:info).with(/Visiting #{url}, visited_links: 0, discovered 0/) 55 | crawler.crawl(page, nil) 56 | end 57 | 58 | it 'retries when the block returns :retry' do 59 | counter = 0 60 | times_retrying = 2 61 | block = Proc.new do |n| 62 | if counter < times_retrying 63 | counter += 1 64 | :retry 65 | end 66 | end 67 | crawler.crawl(page, block) 68 | expect(counter).to eq(times_retrying) 69 | end 70 | 71 | it 'handles redirects by adding the current_url to the page collection' do 72 | redirect_url = 'http://www.example.com/test/landing_page' 73 | allow(page).to receive(:current_url).and_return(redirect_url) 74 | expect_any_instance_of(Grell::PageCollection).to receive(:create_page).with(redirect_url, page_id) 75 | crawler.crawl(page, nil) 76 | end 77 | 78 | context 'without script' do 79 | it 'does not evaluate a script' do 80 | expect_any_instance_of(Capybara::Session).not_to receive(:evaluate_script) 81 | crawler.crawl(page, nil) 82 | end 83 | end 84 | 85 | context 'with script' do 86 | let(:script) { "(typeof(jQuery)!='undefined') && $('.dropdown').addClass('open');" } 87 | it 'evaluates a script' do 88 | 
expect_any_instance_of(Capybara::Session).to receive(:evaluate_script).with(script) 89 | crawler.crawl(page, nil) 90 | end 91 | end 92 | end 93 | 94 | context '#start_crawling' do 95 | let(:body) do 96 | <<-EOS 97 | 98 | trusmis 99 | Hello world! 100 | 101 | EOS 102 | end 103 | let(:url_visited) { "http://www.example.com/musmis.html" } 104 | 105 | before do 106 | proxy.stub(url_visited).and_return(body: 'body', code: 200) 107 | end 108 | 109 | it 'calls the block we used to start_crawling' do 110 | result = [] 111 | block = Proc.new { |n| result.push(n) } 112 | crawler.start_crawling(url, &block) 113 | expect(result.size).to eq(2) 114 | expect(result[0].url).to eq(url) 115 | expect(result[1].url).to eq(url_visited) 116 | end 117 | 118 | end 119 | 120 | shared_examples_for 'visits all available pages' do 121 | it 'visits all the pages' do 122 | crawler.start_crawling(url) 123 | expect(crawler.collection.visited_pages.size).to eq(visited_pages_count) 124 | end 125 | 126 | it 'has no more pages to discover' do 127 | crawler.start_crawling(url) 128 | expect(crawler.collection.discovered_pages.size).to eq(0) 129 | end 130 | 131 | it 'contains the allowlisted page and the base page only' do 132 | crawler.start_crawling(url) 133 | expect(crawler.collection.visited_pages.map(&:url)). 134 | to eq(visited_pages) 135 | end 136 | end 137 | 138 | context 'the url has no links' do 139 | let(:body) do 140 | " 141 | Hello world! 142 | " 143 | end 144 | let(:visited_pages_count) { 1 } 145 | let(:visited_pages) { ['http://www.example.com/test'] } 146 | 147 | it_behaves_like 'visits all available pages' 148 | end 149 | 150 | context 'the url has several links' do 151 | let(:visited_pages_count) { 3 } 152 | let(:visited_pages) do 153 | ['http://www.example.com/test', 'http://www.example.com/trusmis.html', 'http://www.example.com/help.html'] 154 | end 155 | let(:body) do 156 | " 157 | trusmis 158 | help 159 | Hello world! 160 | " 161 | end 162 | 163 | before do 164 | proxy.stub('http://www.example.com/trusmis.html').and_return(body: 'body', code: 200) 165 | proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200) 166 | end 167 | 168 | it_behaves_like 'visits all available pages' 169 | end 170 | 171 | describe '#allowlist' do 172 | let(:body) do 173 | " 174 | trusmis 175 | help 176 | Hello world! 
177 | " 178 | end 179 | 180 | before do 181 | proxy.stub('http://www.example.com/trusmis.html').and_return(body: 'body', code: 200) 182 | proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200) 183 | end 184 | 185 | context 'using a single string' do 186 | let(:allowlist) { '/trusmis.html' } 187 | let(:visited_pages_count) { 2 } # my own page + trusmis 188 | let(:visited_pages) do 189 | ['http://www.example.com/test', 'http://www.example.com/trusmis.html'] 190 | end 191 | 192 | it_behaves_like 'visits all available pages' 193 | end 194 | 195 | context 'using an array of strings' do 196 | let(:allowlist) { ['/trusmis.html', '/nothere', 'another.html'] } 197 | let(:visited_pages_count) { 2 } 198 | let(:visited_pages) do 199 | ['http://www.example.com/test', 'http://www.example.com/trusmis.html'] 200 | end 201 | 202 | it_behaves_like 'visits all available pages' 203 | end 204 | 205 | context 'using a regexp' do 206 | let(:allowlist) { /\/trusmis\.html/ } 207 | let(:visited_pages_count) { 2 } 208 | let(:visited_pages) do 209 | ['http://www.example.com/test', 'http://www.example.com/trusmis.html'] 210 | end 211 | 212 | it_behaves_like 'visits all available pages' 213 | end 214 | 215 | context 'using an array of regexps' do 216 | let(:allowlist) { [/\/trusmis\.html/] } 217 | let(:visited_pages_count) { 2 } 218 | let(:visited_pages) do 219 | ['http://www.example.com/test', 'http://www.example.com/trusmis.html'] 220 | end 221 | 222 | it_behaves_like 'visits all available pages' 223 | end 224 | 225 | context 'using an empty array' do 226 | let(:allowlist) { [] } 227 | let(:visited_pages_count) { 1 } # my own page only 228 | let(:visited_pages) do 229 | ['http://www.example.com/test'] 230 | end 231 | 232 | it_behaves_like 'visits all available pages' 233 | end 234 | 235 | context 'adding all links to the allowlist' do 236 | let(:allowlist) { ['/trusmis', '/help'] } 237 | let(:visited_pages_count) { 3 } # all links 238 | let(:visited_pages) do 239 | ['http://www.example.com/test','http://www.example.com/trusmis.html', 'http://www.example.com/help.html'] 240 | end 241 | 242 | it_behaves_like 'visits all available pages' 243 | end 244 | end 245 | 246 | 247 | describe '#denylist' do 248 | let(:body) do 249 | " 250 | trusmis 251 | help 252 | Hello world! 
253 | " 254 | end 255 | 256 | before do 257 | proxy.stub('http://www.example.com/trusmis.html').and_return(body: 'body', code: 200) 258 | proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200) 259 | end 260 | 261 | context 'using a single string' do 262 | let(:denylist) { '/trusmis.html' } 263 | let(:visited_pages_count) {2} 264 | let(:visited_pages) do 265 | ['http://www.example.com/test','http://www.example.com/help.html'] 266 | end 267 | 268 | it_behaves_like 'visits all available pages' 269 | end 270 | 271 | context 'using an array of strings' do 272 | let(:denylist) { ['/trusmis.html', '/nothere', 'another.html'] } 273 | let(:visited_pages_count) {2} 274 | let(:visited_pages) do 275 | ['http://www.example.com/test','http://www.example.com/help.html'] 276 | end 277 | 278 | it_behaves_like 'visits all available pages' 279 | end 280 | 281 | context 'using a regexp' do 282 | let(:denylist) { /\/trusmis\.html/ } 283 | let(:visited_pages_count) {2} 284 | let(:visited_pages) do 285 | ['http://www.example.com/test','http://www.example.com/help.html'] 286 | end 287 | 288 | it_behaves_like 'visits all available pages' 289 | end 290 | 291 | context 'using an array of regexps' do 292 | let(:denylist) { [/\/trusmis\.html/] } 293 | let(:visited_pages_count) {2} 294 | let(:visited_pages) do 295 | ['http://www.example.com/test','http://www.example.com/help.html'] 296 | end 297 | 298 | it_behaves_like 'visits all available pages' 299 | end 300 | 301 | context 'using an empty array' do 302 | let(:denylist) { [] } 303 | let(:visited_pages_count) { 3 } # all links 304 | let(:visited_pages) do 305 | ['http://www.example.com/test','http://www.example.com/trusmis.html', 'http://www.example.com/help.html'] 306 | end 307 | 308 | it_behaves_like 'visits all available pages' 309 | end 310 | 311 | context 'adding all links to the denylist' do 312 | let(:denylist) { ['/trusmis', '/help'] } 313 | let(:visited_pages_count) { 1 } 314 | let(:visited_pages) do 315 | ['http://www.example.com/test'] 316 | end 317 | 318 | it_behaves_like 'visits all available pages' 319 | end 320 | end 321 | 322 | 323 | describe 'allowlisting and denylisting' do 324 | let(:body) do 325 | " 326 | trusmis 327 | help 328 | Hello world! 
329 | " 330 | end 331 | 332 | before do 333 | proxy.stub('http://www.example.com/trusmis.html').and_return(body: 'body', code: 200) 334 | proxy.stub('http://www.example.com/help.html').and_return(body: 'body', code: 200) 335 | end 336 | 337 | context 'we denylist the only allowlisted page' do 338 | let(:allowlist) { '/trusmis.html' } 339 | let(:denylist) { '/trusmis.html' } 340 | let(:visited_pages_count) { 1 } 341 | let(:visited_pages) do 342 | ['http://www.example.com/test'] 343 | end 344 | 345 | it_behaves_like 'visits all available pages' 346 | end 347 | 348 | context 'we denylist none of the allowlisted pages' do 349 | let(:allowlist) { '/trusmis.html' } 350 | let(:denylist) { '/raistlin.html' } 351 | let(:visited_pages_count) { 2 } 352 | let(:visited_pages) do 353 | ['http://www.example.com/test', 'http://www.example.com/trusmis.html'] 354 | end 355 | 356 | it_behaves_like 'visits all available pages' 357 | end 358 | end 359 | 360 | 361 | end 362 | -------------------------------------------------------------------------------- /spec/lib/page_spec.rb: -------------------------------------------------------------------------------- 1 | RSpec.describe Grell::Page do 2 | 3 | let(:page_id) { rand(10).floor + 10 } 4 | let(:parent_page_id) { rand(10).floor } 5 | let(:page) { Grell::Page.new(url, page_id, parent_page_id) } 6 | let(:host) { 'http://www.example.com' } 7 | let(:url) { 'http://www.example.com/test' } 8 | let(:returned_headers) { { 'Other-Header' => 'yes', 'Content-Type' => 'text/html' }} 9 | let(:now) { Time.now } 10 | 11 | before do 12 | allow(Time).to receive(:now).and_return(now) 13 | Grell.logger = Logger.new(nil) # avoids noise in rspec output 14 | end 15 | 16 | it 'gives access to the url' do 17 | expect(page.url).to eq(url) 18 | end 19 | 20 | it 'gives access to the path' do 21 | expect(page.path).to eq('/test') 22 | end 23 | 24 | it 'gives access to the page id' do 25 | expect(page.id).to eq(page_id) 26 | end 27 | 28 | it 'gives access to the parent page id' do 29 | expect(page.parent_id).to eq(parent_page_id) 30 | end 31 | 32 | it 'newly created page does not have status yet' do 33 | expect(page.status).to eq(nil) 34 | end 35 | 36 | shared_examples_for 'a grell page' do 37 | 38 | it 'returns the correct status' do 39 | expect(page.status).to eq(status) 40 | end 41 | 42 | it 'has the correct body' do 43 | expect(page.body).to eq(body) 44 | end 45 | 46 | it 'has correct headers' do 47 | expect(page.headers).to include(expected_headers) 48 | end 49 | 50 | it 'has the correct links' do 51 | expect(page.links.sort).to eq(links.sort) 52 | end 53 | 54 | it '#visited? 
returns the correct value' do 55 | expect(page.visited?).to eq(visited) 56 | end 57 | 58 | it 'has correct timestamp' do 59 | expect(page.timestamp).to eq(now) 60 | end 61 | 62 | end 63 | 64 | describe '#retries' do 65 | context 'page has not been navigated' do 66 | it '#retries return 0' do 67 | expect(page.retries).to eq(0) 68 | end 69 | end 70 | 71 | context 'page has been navigated once' do 72 | before do 73 | proxy.stub(url).and_return(body: '', code: 200, headers: {}) 74 | page.navigate 75 | end 76 | 77 | it '#retries return 0' do 78 | expect(page.retries).to eq(0) 79 | end 80 | end 81 | 82 | context 'page has been navigated twice' do 83 | before do 84 | proxy.stub(url).and_return(body: '', code: 200, headers: {}) 85 | page.navigate 86 | page.navigate 87 | end 88 | 89 | it '#retries return 1' do 90 | expect(page.retries).to eq(1) 91 | end 92 | end 93 | end 94 | 95 | describe '#navigate' do 96 | before do 97 | proxy.stub(url).and_return(body: '', code: 200, headers: {}) 98 | end 99 | 100 | it 'waits for all ajax requests' do 101 | expect_any_instance_of(Grell::RawPage).to receive(:wait_for_all_ajax_requests).with(0, 0.5) 102 | page.navigate 103 | end 104 | end 105 | 106 | shared_examples_for 'an errored grell page' do 107 | it 'returns empty status 404 page after navigating' do 108 | expect(page.status).to eq(404) 109 | expect(page.links).to eq([]) 110 | expect(page.headers).to eq(headers) 111 | expect(page.body).to eq('') 112 | expect(page.has_selector?('html')).to eq(false) 113 | expect(page).to be_visited 114 | expect(page.timestamp).to eq(now) 115 | expect(page.error?).to eq(true) 116 | expect(page.instance_variable_get(:@times_visited)).to eq(1) 117 | end 118 | end 119 | 120 | [ Capybara::Poltergeist::JavascriptError, Capybara::Poltergeist::BrowserError, URI::InvalidURIError, 121 | Capybara::Poltergeist::TimeoutError, Capybara::Poltergeist::StatusFailError, 122 | Capybara::Poltergeist::DeadClient, Errno::ECONNRESET ].each do |error_type| 123 | 124 | context "#{error_type}" do 125 | let(:headers) do 126 | { 127 | grellStatus: 'Error', 128 | errorClass: "#{error_type}", 129 | errorMessage: error_message 130 | } 131 | end 132 | let(:error_message) { 'Trusmis broke it again' } 133 | let(:now) { Time.now } 134 | 135 | before do 136 | allow_any_instance_of(Grell::RawPage).to receive(:navigate).and_raise(error_type, 'error') 137 | allow_any_instance_of(error_type).to receive(:message).and_return(error_message) 138 | page.navigate 139 | end 140 | 141 | it_behaves_like 'an errored grell page' 142 | end 143 | end 144 | 145 | 146 | context 'we have not yet navigated to the page' do 147 | let(:visited) { false } 148 | let(:status) { nil } 149 | let(:body) { '' } 150 | let(:links) { [] } 151 | let(:expected_headers) { {} } 152 | let(:now) { nil } 153 | 154 | before do 155 | proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup) 156 | end 157 | 158 | it_behaves_like 'a grell page' 159 | 160 | end 161 | 162 | context 'navigating to the URL we get a 404' do 163 | let(:visited) { true } 164 | let(:status) { 404 } 165 | let(:body) { 'nothing cool' } 166 | let(:links) { [] } 167 | let(:expected_headers) { returned_headers } 168 | 169 | before do 170 | proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup) 171 | page.navigate 172 | end 173 | 174 | it_behaves_like 'a grell page' 175 | 176 | end 177 | 178 | context 'navigating to an URL with redirects, follows them transparently' do 179 | let(:visited) { true } 180 | let(:status) { 200 } 181 | 
let(:body) { 'nothing cool' } 182 | let(:links) { [] } 183 | let(:expected_headers) { returned_headers } 184 | let(:real_url) { 'http://example.com/other' } 185 | 186 | before do 187 | proxy.stub(url).and_return(:redirect_to => real_url) 188 | proxy.stub(real_url).and_return(body: body, code: status, headers: returned_headers.dup) 189 | page.navigate 190 | end 191 | 192 | it_behaves_like 'a grell page' 193 | 194 | it 'followed_redirects? is true' do 195 | expect(page.followed_redirects?).to eq(true) 196 | end 197 | 198 | it 'current_url match the url we were redirected to' do 199 | expect(page.current_url).to eq(real_url) 200 | end 201 | end 202 | 203 | #Here also add examples that may happen for almost all pages (no errors, no redirects) 204 | context 'navigating to the URL we get page with no links' do 205 | let(:visited) { true } 206 | let(:status) { 200 } 207 | let(:body) { 'nothing cool' } 208 | let(:links) { [] } 209 | let(:expected_headers) { returned_headers } 210 | 211 | before do 212 | proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup) 213 | page.navigate 214 | end 215 | 216 | it_behaves_like 'a grell page' 217 | 218 | it 'followed_redirects is false' do 219 | expect(page.followed_redirects?).to eq(false) 220 | end 221 | 222 | it 'current_url is url' do 223 | expect(page.current_url).to eq(url) 224 | end 225 | 226 | it 'does not have errors' do 227 | expect(page.error?).to eq(false) 228 | end 229 | end 230 | 231 | context 'navigating to the URL we get page with links using a elements' do 232 | let(:visited) { true } 233 | let(:status) { 200 } 234 | let(:body) do 235 | " 236 | Hello world! 237 | trusmis 238 | help 239 | help 240 | " 241 | end 242 | let(:links) { ['http://www.example.com/trusmis.html', 'http://www.example.com/help.html'] } 243 | let(:expected_headers) { returned_headers } 244 | 245 | before do 246 | proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup) 247 | page.navigate 248 | end 249 | 250 | it_behaves_like 'a grell page' 251 | 252 | it 'do not return links to external websites' do 253 | expect(page.links).to_not include('http://www.outsidewebsite.com/help.html') 254 | end 255 | end 256 | 257 | 258 | context 'navigating to the URL we get page with disabled links' do 259 | let(:visited) { true } 260 | let(:status) { 200 } 261 | let(:body) do 262 | " 263 | Hello world! 264 | trusmis 265 | help 266 | help 267 | helpdisabled 268 | " 269 | end 270 | let(:links) { ['http://www.example.com/trusmis.html', 'http://www.example.com/help.html'] } 271 | let(:expected_headers) { returned_headers } 272 | 273 | before do 274 | proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup) 275 | page.navigate 276 | end 277 | 278 | it_behaves_like 'a grell page' 279 | end 280 | 281 | context 'navigating to the URL we get page with links with absolute links' do 282 | let(:visited) { true } 283 | let(:status) { 200 } 284 | let(:body) do 285 | " 286 | Hello world! 
287 | trusmis 288 | help 289 | help 290 | " 291 | end 292 | let(:links) { ['http://www.example.com/trusmis.html', 'http://www.example.com/help.html'] } 293 | let(:expected_headers) { returned_headers } 294 | 295 | before do 296 | proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup) 297 | page.navigate 298 | end 299 | 300 | it_behaves_like 'a grell page' 301 | 302 | it 'do not return links to external websites' do 303 | expect(page.links).to_not include('http://www.outsidewebsite.com/help.html') 304 | end 305 | end 306 | 307 | context 'navigating to the URL we get page with links using a mix of elements' do 308 | let(:visited) { true } 309 | let(:status) { 200 } 310 | let(:body) do 311 | " 312 | Hello world! 313 | trusmis 314 | 315 | 316 | 317 | 318 | 319 |
help help 320 | help 321 | help
322 | " 323 | end 324 | let(:links) do 325 | [ 'http://www.example.com/trusmis.html', 'http://www.example.com/help.html', 326 | 'http://www.example.com/more_help.html', 'http://www.example.com/help_me.html' ] 327 | end 328 | let(:expected_headers) { returned_headers } 329 | 330 | before do 331 | proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup) 332 | page.navigate 333 | end 334 | 335 | it_behaves_like 'a grell page' 336 | 337 | describe '#path' do 338 | context 'proper url' do 339 | let(:url) { 'http://www.anyurl.com/path' } 340 | let(:page) { Grell::Page.new(url, page_id, parent_page_id) } 341 | 342 | it 'returns the path' do 343 | expect(page.path).to eq('/path') 344 | end 345 | end 346 | 347 | context 'broken url' do 348 | let(:url) { 'www.an.asda.fasfasf.yurl.com/path' } 349 | let(:page) { Grell::Page.new(url, page_id, parent_page_id) } 350 | 351 | it 'returns the path' do 352 | expect(page.path).to eq(url) 353 | end 354 | end 355 | end 356 | 357 | it 'do not return links to external websites' do 358 | expect(page.links).to_not include('http://www.outsidewebsite.com/help.html') 359 | end 360 | end 361 | 362 | context 'navigating to the URL we get page with links inside the header section of the code' do 363 | let(:visited) { true } 364 | let(:status) { 200 } 365 | let(:css) { '/application.css' } 366 | let(:favicon) { '/favicon.ico' } 367 | let(:body) do 368 | " 369 | mimi 370 | 371 | 372 | 373 | 374 | Hello world! 375 | trusmis 376 | " 377 | end 378 | let(:links) do 379 | ['http://www.example.com/trusmis.html'] 380 | end 381 | let(:expected_headers) { returned_headers } 382 | 383 | before do 384 | proxy.stub(url).and_return(body: body, code: status, headers: returned_headers.dup) 385 | #We need to stub this or Phantomjs will get stuck trying to retrieve the resources 386 | proxy.stub(host + css).and_return(body: '', code: status) 387 | proxy.stub(host + favicon).and_return(body: '', code: status) 388 | page.navigate 389 | end 390 | 391 | it_behaves_like 'a grell page' 392 | 393 | it 'do not return links to resources in the header' do 394 | expect(page.links).to_not include('http://www.example.com/application.css') 395 | end 396 | 397 | end 398 | 399 | context 'status is never set' do #this may happen when there is nothing comming from the site 400 | before do 401 | stub_const('Grell::Page::WAIT_TIME', 0) 402 | allow_any_instance_of(Grell::RawPage).to receive(:status).and_return(nil) 403 | allow_any_instance_of(Grell::RawPage).to receive(:headers).and_return({}) 404 | allow_any_instance_of(Grell::RawPage).to receive(:body).and_return('') 405 | proxy.stub(url).and_return(body: body, code: nil, headers: {}) 406 | page.navigate 407 | end 408 | 409 | let(:visited) { true } 410 | let(:status) { nil } 411 | let(:body) { '' } 412 | let(:links) { [] } 413 | let(:expected_headers) { {} } 414 | 415 | it_behaves_like 'a grell page' 416 | end 417 | 418 | end 419 | --------------------------------------------------------------------------------
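A minimal standalone sketch (not part of the gem; the normalize_link helper name is ours) of the link-normalization rules that the Link#to_url code and the page specs above exercise: relative paths get the host prepended, absolute links are kept only when they stay on the same host, and bare or malformed links are dropped. Only Ruby's standard URI library is assumed.

    require 'uri'

    # Illustrative sketch only: mirrors the behaviour the specs expect from Link#to_url.
    # Returns a crawlable URL, or nil when the link should be ignored.
    def normalize_link(href, host)
      uri = URI.parse(href)
      if uri.absolute?
        # Keep absolute links only when they point at the crawled host.
        uri.host == URI.parse(host).host ? href : nil
      elsif uri.path.to_s.start_with?('/')
        host + href # relative path, e.g. "/help.html" -> "http://www.example.com/help.html"
      else
        nil # bare links like "google.com" are treated as malformed / external
      end
    rescue URI::InvalidURIError
      href # invalid links are passed through so the failure surfaces on navigation
    end

    normalize_link('/help.html', 'http://www.example.com')
    # => "http://www.example.com/help.html"
    normalize_link('http://www.outsidewebsite.com/help.html', 'http://www.example.com')
    # => nil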