├── .github
│   └── workflows
│       └── ruby.yml
├── .gitignore
├── .rspec
├── .travis.yml
├── CODE_OF_CONDUCT.md
├── Gemfile
├── Gemfile.lock
├── LICENSE.txt
├── README.md
├── Rakefile
├── arachnid2.gemspec
├── bin
│   ├── console
│   └── setup
├── lib
│   ├── arachnid2.rb
│   └── arachnid2
│       ├── cached_responses.rb
│       ├── exoskeleton.rb
│       ├── typhoeus.rb
│       ├── version.rb
│       └── watir.rb
└── spec
    ├── arachnid2
    │   ├── exoskeleton_spec.rb
    │   ├── typhoeus_spec.rb
    │   └── watir_spec.rb
    ├── arachnid2_spec.rb
    └── spec_helper.rb

/.github/workflows/ruby.yml:
--------------------------------------------------------------------------------
 1 | # This workflow uses actions that are not certified by GitHub.
 2 | # They are provided by a third-party and are governed by
 3 | # separate terms of service, privacy policy, and support
 4 | # documentation.
 5 | # This workflow will download a prebuilt Ruby version, install dependencies and run tests with Rake
 6 | # For more information see: https://github.com/marketplace/actions/setup-ruby-jruby-and-truffleruby
 7 | 
 8 | name: Ruby
 9 | 
10 | on:
11 |   push:
12 |     branches: [ "master" ]
13 |   pull_request:
14 |     branches: [ "master" ]
15 | 
16 | permissions:
17 |   contents: read
18 | 
19 | jobs:
20 |   test:
21 | 
22 |     runs-on: ubuntu-latest
23 |     strategy:
24 |       matrix:
25 |         ruby-version: ['2.6', '2.7', '3.0', 3.2]
26 | 
27 |     steps:
28 |     - uses: actions/checkout@v3
29 |     - name: Set up Ruby
30 |       uses: ruby/setup-ruby@v1
31 |       # uses: ruby/setup-ruby@55283cc23133118229fd3f97f9336ee23a179fcf # v1.146.0
32 |       with:
33 |         ruby-version: ${{ matrix.ruby-version }}
34 |         bundler-cache: true # runs 'bundle install' and caches installed gems automatically
35 |     - name: Run tests
36 |       run: bundle exec rspec spec/
37 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | /.bundle/
 2 | /.yardoc
 3 | /_yardoc/
 4 | /coverage/
 5 | /doc/
 6 | /pkg/
 7 | /spec/reports/
 8 | /tmp/
 9 | 
10 | # rspec failure tracking
11 | .rspec_status
12 | 
13 | arachnid2-*.gem
14 | 
--------------------------------------------------------------------------------
/.rspec:
--------------------------------------------------------------------------------
1 | --format documentation
2 | --color
3 | --require spec_helper
4 | 
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: false
2 | language: ruby
3 | rvm:
4 |   - 2.4.1
5 | before_install: gem install bundler -v 1.16.1
6 | 
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Covenant Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to making participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age, body
 8 | size, disability, ethnicity, gender identity and expression, level of experience,
 9 | nationality, personal appearance, race, religion, or sexual identity and
10 | orientation.
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at samuel.nissen@rakuten.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at [http://contributor-covenant.org/version/1/4][version] 72 | 73 | [homepage]: http://contributor-covenant.org 74 | [version]: http://contributor-covenant.org/version/1/4/ 75 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | git_source(:github) {|repo_name| "https://github.com/#{repo_name}" } 4 | 5 | # Specify your gem's dependencies in arachnid2.gemspec 6 | gemspec 7 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | PATH 2 | remote: . 3 | specs: 4 | arachnid2 (0.3.9) 5 | addressable 6 | adomain 7 | bloomfilter-rb 8 | nokogiri (>= 1.10.4) 9 | typhoeus 10 | watir 11 | webdriver-user-agent (>= 7.6) 12 | webdrivers 13 | 14 | GEM 15 | remote: https://rubygems.org/ 16 | specs: 17 | addressable (2.7.0) 18 | public_suffix (>= 2.0.2, < 5.0) 19 | adomain (0.2.3) 20 | addressable (~> 2.5) 21 | logger 22 | bloomfilter-rb (2.1.1) 23 | redis 24 | childprocess (3.0.0) 25 | diff-lcs (1.3) 26 | ethon (0.12.0) 27 | ffi (>= 1.3.0) 28 | facets (3.1.0) 29 | ffi (1.12.2) 30 | json (2.3.0) 31 | logger (1.4.2) 32 | mini_portile2 (2.4.0) 33 | nokogiri (1.10.9) 34 | mini_portile2 (~> 2.4.0) 35 | os (1.0.1) 36 | psych (3.1.0) 37 | public_suffix (4.0.3) 38 | rake (13.0.1) 39 | redis (4.1.3) 40 | regexp_parser (1.7.0) 41 | rspec (3.8.0) 42 | rspec-core (~> 3.8.0) 43 | rspec-expectations (~> 3.8.0) 44 | rspec-mocks (~> 3.8.0) 45 | rspec-core (3.8.0) 46 | rspec-support (~> 3.8.0) 47 | rspec-expectations (3.8.2) 48 | diff-lcs (>= 1.2.0, < 2.0) 49 | rspec-support (~> 3.8.0) 50 | rspec-mocks (3.8.0) 51 | diff-lcs (>= 1.2.0, < 2.0) 52 | rspec-support (~> 3.8.0) 53 | rspec-support (3.8.0) 54 | rubyzip (2.2.0) 55 | selenium-webdriver (3.142.7) 56 | childprocess (>= 0.5, < 4.0) 57 | rubyzip (>= 1.2.2) 58 | typhoeus (1.3.1) 59 | ethon (>= 0.9.0) 60 | watir (6.16.5) 61 | regexp_parser (~> 1.2) 62 | selenium-webdriver (~> 3.6) 63 | webdriver-user-agent (7.6) 64 | facets 65 | json 66 | os 67 | psych 68 | selenium-webdriver (>= 3.4.0) 69 | webdrivers (4.2.0) 70 | nokogiri (~> 1.6) 71 | rubyzip (>= 1.3.0) 72 | selenium-webdriver (>= 3.0, < 4.0) 73 | 74 | PLATFORMS 75 | ruby 76 | 77 | DEPENDENCIES 78 | arachnid2! 79 | bundler (~> 1.16) 80 | rake (>= 12.3.3) 81 | rspec (~> 3.0) 82 | 83 | BUNDLED WITH 84 | 1.17.3 85 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2018 Sam Nissen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Arachnid2
  2 | 
  3 | ## About
  4 | 
  5 | Arachnid2 is a simple, fast web-crawler written in Ruby.
  6 | You can use [typhoeus](https://github.com/typhoeus/typhoeus)
  7 | to make HTTP requests, or [Watir](https://github.com/watir/watir)
  8 | to render pages. [bloomfilter-rb](https://github.com/igrigorik/bloomfilter-rb)
  9 | stores the URLs,
 10 | and [nokogiri](https://github.com/sparklemotion/nokogiri)
 11 | finds the URLs on each webpage.
 12 | 
 13 | Arachnid2 is a successor to [Arachnid](https://github.com/dchuk/Arachnid),
 14 | and was abstracted out of the [Tellurion Bot](https://github.com/samnissen/tellurion_bot).
 15 | 
 16 | ## Usage
 17 | 
 18 | ### Typhoeus (cURL)
 19 | 
 20 | The default use case for Arachnid2 is surfacing responses from
 21 | a domain's URLs by visiting a URL, collecting any links to the
 22 | same domain on that page, and visiting those to do the same.
 23 | 
 24 | The simplest way to use the gem is to collect all of the
 25 | responses while spidering from some URL.
 26 | 
 27 | ```ruby
 28 | require "arachnid2"
 29 | 
 30 | url = "http://www.maximumfun.org"
 31 | spider = Arachnid2.new(url)
 32 | responses = []
 33 | 
 34 | spider.crawl { |response|
 35 |   responses << response
 36 | }
 37 | ```
 38 | 
 39 | Obviously this could become unwieldy,
 40 | so you can execute logic within the block to collect a narrow subset
 41 | of the responses, transform or dissect the response,
 42 | or both (or whatever you want).
 43 | 
 44 | ```ruby
 45 | require "arachnid2"
 46 | require "nokogiri"
 47 | 
 48 | url = "https://daringfireball.net"
 49 | spider = Arachnid2.new(url)
 50 | responses = []
 51 | 
 52 | spider.crawl { |response|
 53 |   responses << Nokogiri::HTML(response.body) if response.effective_url =~ /.*amazon.*/
 54 |   print '*'
 55 | }
 56 | ```
 57 | 
 58 | `Arachnid2#crawl` will always return `nil`.
 59 | 
 60 | #### Options
 61 | 
 62 | ```ruby
 63 | require "arachnid2"
 64 | 
 65 | url = "http://sixcolours.com"
 66 | spider = Arachnid2.new(url)
 67 | opts = {
 68 |   followlocation: true,
 69 |   timeout: 300,
 70 |   time_box: 60,
 71 |   max_urls: 50,
 72 |   :headers => {
 73 |     'Accept-Language' => "en-UK",
 74 |     'User-Agent' => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0",
 75 |   },
 76 |   memory_limit: 89.99,
 77 |   proxy: {
 78 |     ip: "1.2.3.4",
 79 |     port: "1234",
 80 |     username: "sam",
 81 |     password: "coolcoolcool",
 82 |   },
 83 |   :non_html_extensions => {
 84 |     3 => [".abc", ".xyz"],
 85 |     4 => [".abcd"],
 86 |     6 => [".abcdef"],
 87 |     11 => [".abcdefghijk"]
 88 |   }
 89 | }
 90 | responses = []
 91 | 
 92 | spider.crawl(opts) { |response|
 93 |   responses << response
 94 | }
 95 | ```
 96 | 
 97 | ##### `followlocation`
 98 | 
 99 | Tell Typhoeus to follow redirections.
100 | 
101 | ##### `timeout`
102 | 
103 | Tell Typhoeus or Watir how long to wait for page load.
104 | 
105 | ##### `time_box`
106 | 
107 | The crawler will time-bound your spidering.
108 | If no valid integer is provided,
109 | it will crawl for 15 seconds before exiting.
110 | 10000 seconds is the current maximum,
111 | and any value above it will be reduced to 10000.
112 | 
113 | ##### `max_urls`
114 | 
115 | The crawler will crawl a limited number of URLs before stopping.
116 | If no valid integer is provided,
117 | it will crawl 50 URLs before exiting.
118 | 10000 URLs is the current maximum,
119 | and any value above it will be reduced to 10000.
120 | 
121 | ##### `headers`
122 | 
123 | This is a hash that represents any HTTP header key/value pairs you desire,
124 | and is passed directly to Typhoeus. Before the request is sent, a default
125 | language and user agent are applied unless you provide your own:
126 | 
127 | ###### Defaults
128 | 
129 | The HTTP header `Accept-Language` default is
130 | `en-IE, en-UK;q=0.9, en-NL;q=0.8, en-MT;q=0.7, en-LU;q=0.6, en;q=0.5, \*;0.4`
131 | 
132 | The HTTP header `User-Agent` default is
133 | `Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15`
134 | 
135 | ##### `proxy`
136 | 
137 | Provide the IP and port for a proxy. If required, provide credentials for
138 | authenticating to that proxy. Proxy options and handling are done
139 | by Typhoeus.
140 | 
141 | ##### `non_html_extensions`
142 | 
143 | This is the list of file extensions to ignore when collecting URLs from the page.
144 | The extensions are formatted as a hash of key/value pairs, where the value
145 | is an array of extensions, and the keys represent the character length of those extensions.
146 | 
147 | ##### `memory_limit` and Docker
148 | 
149 | In case you are operating the crawler within a container, Arachnid2
150 | can attempt to prevent the container from running out of memory.
151 | By default, it will end the crawl when the container uses >= 80%
152 | of its available memory. You can override this threshold with the
153 | `memory_limit` option.
154 | 
155 | ##### Non-HTML links
156 | 
157 | The crawler attempts to stop itself from returning data from
158 | links that are not indicative of HTML, as detailed in
159 | `Arachnid2::NON_HTML_EXTENSIONS`.
160 | 
161 | #### Caching (optional)
162 | 
163 | If you have set up a cache to deduplicate crawls,
164 | set the cache service URL:
165 | `export ARACHNID_CACHED_SERVICE_ADDRESS=http://localhost:9000`
166 | 
167 | This expects a JSON API that accepts GET and POST requests
168 | at `/typhoeus_responses`, with the URL and the options passed
169 | exactly as received as parameters. The crawler will push any
170 | crawled responses to the service, and re-use previously crawled
171 | pages whenever a matching URL and options are found.
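
The sketch below is one way to stand up a compatible service, purely for illustration: it assumes Sinatra (not a dependency of this gem) and an in-memory hash. The only contract, as implemented in `lib/arachnid2/cached_responses.rb`, is that a GET returns JSON with an `encrypted_response` field holding a Base64-encoded, `Marshal`-dumped response, and that a POST accepts the same fields.

```ruby
# Hypothetical cache service, for illustration only. Any app exposing
# these two endpoints works; Sinatra and the in-memory store are assumptions.
require 'sinatra'
require 'json'

set :port, 9000

RESPONSES = {} # { [url, options] => Base64-encoded, Marshal-dumped response }

get '/typhoeus_responses' do
  key = [params['url'], params['options']]
  halt 404 unless RESPONSES.key?(key)

  content_type :json
  { encrypted_response: RESPONSES[key] }.to_json
end

post '/typhoeus_responses' do
  payload = JSON.parse(request.body.read)
  RESPONSES[[payload['url'], payload['options']]] = payload['encrypted_response']
  status 201
end
```

With a service like this running, exporting `ARACHNID_CACHED_SERVICE_ADDRESS=http://localhost:9000` is all the crawler needs to start reading and writing cached responses.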
172 | 
173 | ### With Watir
174 | 
175 | Arachnid2 can also crawl with Watir, gathering links
176 | just as it does with Typhoeus, but from pages that are
177 | actually rendered. You can access this option in one
178 | of two ways:
179 | 
180 | ```ruby
181 | # ...
182 | Arachnid2.new(url).crawl_watir(opts)
183 | # -or-
184 | with_watir = true # the default is `false`
185 | Arachnid2.new(url).crawl(opts, with_watir)
186 | ```
187 | 
188 | Arachnid2 has base defaults which you might want to address when
189 | employing Watir.
190 | 
191 | * First, the default crawl time is 15 seconds.
192 | As browser page loads can take this long, you will probably want to
193 | set a higher crawl time.
194 | * Also, simply storing the browser is not a great idea, since
195 | it will be inaccessible after it is closed.
196 | Instead, consider nabbing the HTML, cookies,
197 | or whatever content is required during the crawl.
198 | * Finally, note that Firefox is the default browser.
199 | 
200 | 
201 | ```ruby
202 | require 'arachnid2'
203 | 
204 | with_watir = true
205 | responses = []
206 | url = "http://maximumfun.org"
207 | max = 60
208 | browser = :chrome
209 | opts = {time_box: max, browser_type: browser}
210 | 
211 | spider = Arachnid2.new(url)
212 | spider.crawl(opts, with_watir) do |response|
213 |   response.body.wait_until(&:present?)
214 |   responses << response.body.html if response.body.present?
215 | end
216 | ```
217 | 
218 | #### Options
219 | 
220 | See the Typhoeus options above; most apply to Watir as well, with
221 | some exceptions:
222 | 
223 | ##### `proxy`
224 | 
225 | Watir proxy options are formatted differently:
226 | 
227 | ```ruby
228 | proxy: {
229 |   http: "troy.show:8080",
230 |   ssl: "abed.show:8080"
231 | },
232 | ```
233 | 
234 | Proxy options handling is done by Watir.
235 | 
236 | ##### `headless`
237 | 
238 | It accepts an argument to run the browser headlessly:
239 | 
240 | ```ruby
241 | opts = { headless: true }
242 | ```
243 | 
244 | ##### `agent`
245 | 
246 | It accepts an argument mapped to Webdriver::UserAgent::Driver's `agent` option:
247 | 
248 | ```ruby
249 | opts = { agent: :desktop }
250 | ```
251 | ##### `orientation`
252 | 
253 | It also accepts an argument mapped to Webdriver::UserAgent::Driver's `orientation` option:
254 | 
255 | ```ruby
256 | opts = { orientation: :landscape }
257 | ```
258 | 
259 | ##### `followlocation` and `max_concurrency`
260 | 
261 | These options do not apply to Watir, and will be ignored.
262 | 
263 | ## Development
264 | 
265 | Fork the repo and run the tests:
266 | 
267 | ```bash
268 | bundle exec rspec spec/
269 | ```
270 | 
271 | ## Contributing
272 | 
273 | Bug reports and pull requests are welcome on GitHub at
274 | https://github.com/samnissen/arachnid2.
275 | This project is intended to be a safe,
276 | welcoming space for collaboration,
277 | and contributors are expected to adhere to the
278 | [Contributor Covenant](http://contributor-covenant.org) code of conduct.
279 | 
280 | ## License
281 | 
282 | The gem is available as open source under the terms of the
283 | [MIT License](https://opensource.org/licenses/MIT).
284 | 
285 | ## Code of Conduct
286 | 
287 | Everyone interacting in the Arachnid2 project’s codebases,
288 | issue trackers, chat rooms and mailing lists is expected
289 | to follow the
290 | [code of conduct](https://github.com/samnissen/arachnid2/blob/master/CODE_OF_CONDUCT.md).
291 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require "rspec/core/rake_task" 3 | 4 | RSpec::Core::RakeTask.new(:spec) 5 | 6 | task :default => :spec 7 | -------------------------------------------------------------------------------- /arachnid2.gemspec: -------------------------------------------------------------------------------- 1 | 2 | lib = File.expand_path("../lib", __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require "arachnid2/version" 5 | 6 | Gem::Specification.new do |spec| 7 | spec.name = "arachnid2" 8 | spec.version = Arachnid2::VERSION 9 | spec.authors = ["Sam Nissen"] 10 | spec.email = ["scnissen@gmail.com"] 11 | 12 | spec.summary = %q{A simple, fast web crawler} 13 | # spec.description = %q{TODO: Write a longer description or delete this line.} 14 | spec.homepage = "https://github.com/samnissen/arachnid2" 15 | spec.license = "MIT" 16 | 17 | spec.files = `git ls-files -z`.split("\x0").reject do |f| 18 | f.match(%r{^(test|spec|features)/}) 19 | end 20 | spec.bindir = "exe" 21 | spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) } 22 | spec.require_paths = ["lib"] 23 | 24 | spec.add_development_dependency "bundler", "~> 1.16" 25 | spec.add_development_dependency "rake", ">= 12.3.3" 26 | spec.add_development_dependency "rspec", "~> 3.0" 27 | 28 | spec.add_dependency "webdriver-user-agent", ">= 7.6" 29 | spec.add_dependency "watir" 30 | spec.add_dependency "webdrivers" 31 | spec.add_dependency "typhoeus" 32 | spec.add_dependency "bloomfilter-rb" 33 | spec.add_dependency "adomain" 34 | spec.add_dependency "addressable" 35 | spec.add_dependency "nokogiri", ">= 1.10.4" 36 | end 37 | -------------------------------------------------------------------------------- /bin/console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "bundler/setup" 4 | require "arachnid2" 5 | 6 | # You can add fixtures and/or initialization code here to make experimenting 7 | # with your gem easier. You can also use a different console, if you like. 8 | 9 | # (If you use this, don't forget to add pry to your Gemfile!) 10 | # require "pry" 11 | # Pry.start 12 | 13 | require "irb" 14 | IRB.start(__FILE__) 15 | -------------------------------------------------------------------------------- /bin/setup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | set -vx 5 | 6 | bundle install 7 | 8 | # Do any other automated setup that you need to do here 9 | -------------------------------------------------------------------------------- /lib/arachnid2.rb: -------------------------------------------------------------------------------- 1 | require "arachnid2/version" 2 | require "arachnid2/cached_responses" 3 | require "arachnid2/exoskeleton" 4 | require "arachnid2/typhoeus" 5 | require "arachnid2/watir" 6 | 7 | require 'tempfile' 8 | require "typhoeus" 9 | require "bloomfilter-rb" 10 | require "adomain" 11 | require "addressable/uri" 12 | require "nokogiri" 13 | require "base64" 14 | require "webdrivers" 15 | require "webdriver-user-agent" 16 | require "watir" 17 | 18 | 19 | class Arachnid2 20 | # META: 21 | # About the origins of this crawling approach 22 | # The Crawler is heavily borrowed from by Arachnid. 
23 | # Original: https://github.com/dchuk/Arachnid 24 | # Other iterations I've borrowed liberally from: 25 | # - https://github.com/matstc/Arachnid 26 | # - https://github.com/intrigueio/Arachnid 27 | # - https://github.com/jhulme/Arachnid 28 | # And this was originally written as a part of Tellurion's bot 29 | # https://github.com/samnissen/tellurion_bot 30 | 31 | MAX_CRAWL_TIME = 10000 32 | BASE_CRAWL_TIME = 15 33 | MAX_URLS = 10000 34 | BASE_URLS = 50 35 | DEFAULT_LANGUAGE = "en-IE, en-UK;q=0.9, en-NL;q=0.8, en-MT;q=0.7, en-LU;q=0.6, en;q=0.5, *;0.4" 36 | DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15" 37 | 38 | DEFAULT_NON_HTML_EXTENSIONS = { 39 | 3 => ['.gz'], 40 | 4 => ['.jpg', '.png', '.m4a', '.mp3', '.mp4', '.pdf', '.zip', 41 | '.wmv', '.gif', '.doc', '.xls', '.pps', '.ppt', '.tar', 42 | '.iso', '.dmg', '.bin', '.ics', '.exe', '.wav', '.mid'], 43 | 5 => ['.xlsx', '.docx', '.pptx', '.tiff', '.zipx'], 44 | 8 => ['.torrent'] 45 | } 46 | MEMORY_USE_FILE = "/sys/fs/cgroup/memory/memory.usage_in_bytes" 47 | MEMORY_LIMIT_FILE = "/sys/fs/cgroup/memory/memory.limit_in_bytes" 48 | DEFAULT_MAXIMUM_LOAD_RATE = 79.9 49 | 50 | DEFAULT_TIMEOUT = 10_000 51 | MINIMUM_TIMEOUT = 1 52 | MAXIMUM_TIMEOUT = 999_999 53 | 54 | # 55 | # Creates the object to execute the crawl 56 | # 57 | # @example 58 | # url = "https://daringfireball.net" 59 | # spider = Arachnid2.new(url) 60 | # 61 | # @param [String] url 62 | # 63 | # @return [Arachnid2] self 64 | # 65 | def initialize(url) 66 | @url = url 67 | end 68 | 69 | # 70 | # Visits a URL, gathering links and visiting them, 71 | # until running out of time, memory or attempts. 72 | # 73 | # @example 74 | # url = "https://daringfireball.net" 75 | # spider = Arachnid2.new(url) 76 | # 77 | # opts = { 78 | # :followlocation => true, 79 | # :timeout => 25000, 80 | # :time_box => 30, 81 | # :headers => { 82 | # 'Accept-Language' => "en-UK", 83 | # 'User-Agent' => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0", 84 | # }, 85 | # :memory_limit => 89.99, 86 | # :proxy => { 87 | # :ip => "1.2.3.4", 88 | # :port => "1234", 89 | # :username => "sam", 90 | # :password => "coolcoolcool", 91 | # } 92 | # :non_html_extensions => { 93 | # 3 => [".abc", ".xyz"], 94 | # 4 => [".abcd"], 95 | # 6 => [".abcdef"], 96 | # 11 => [".abcdefghijk"] 97 | # } 98 | # } 99 | # responses = [] 100 | # spider.crawl(opts) { |response| 101 | # responses << response 102 | # } 103 | # 104 | # @param [Hash] opts 105 | # 106 | # @return nil 107 | # 108 | def crawl(opts = {}, with_watir = false) 109 | if with_watir 110 | crawl_watir(opts, &Proc.new) 111 | else 112 | Arachnid2::Typhoeus.new(@url).crawl(opts, &Proc.new) 113 | end 114 | end 115 | 116 | def crawl_watir(opts) 117 | Arachnid2::Watir.new(@url).crawl(opts, &Proc.new) 118 | end 119 | # https://mudge.name/2011/01/26/passing-blocks-in-ruby-without-block.html 120 | 121 | end 122 | -------------------------------------------------------------------------------- /lib/arachnid2/cached_responses.rb: -------------------------------------------------------------------------------- 1 | require 'net/http' 2 | require 'json' 3 | module CachedResponses 4 | CACHE_SERVICE_URL = ENV['ARACHNID_CACHED_SERVICE_ADDRESS'].freeze 5 | 6 | def load_data(_url, _options) 7 | return if check_config 8 | 9 | uri = URI("#{CACHE_SERVICE_URL}/typhoeus_responses?url=#{@url}&options=#{@options}") 10 | req = Net::HTTP::Get.new(uri) 11 | req['Accept'] = 
'json' 12 | Net::HTTP.start(uri.hostname, uri.port) do |http| 13 | response = http.request(req) 14 | return nil if response.code != '200' 15 | 16 | body = ::JSON.parse(response.body) 17 | responses_list = Base64.decode64(body['encrypted_response']) 18 | return Marshal.load responses_list # here we get an Array of `Typhoeus::Response`s 19 | end 20 | rescue StandardError 21 | nil 22 | end 23 | 24 | def put_cached_data(url, options, data) 25 | return if check_config 26 | 27 | uri = URI("#{CACHE_SERVICE_URL}/typhoeus_responses") 28 | 29 | header = { 'Content-Type': 'application/json' } 30 | req = Net::HTTP::Post.new(uri, header) 31 | processed_data = Base64.encode64(Marshal.dump(data)) 32 | req.body = { url: url, options: options, encrypted_response: processed_data }.to_json 33 | Net::HTTP.start(uri.hostname, uri.port) do |http| 34 | http.request(req) 35 | end 36 | end 37 | 38 | def check_config 39 | CACHE_SERVICE_URL.nil? 40 | end 41 | end 42 | -------------------------------------------------------------------------------- /lib/arachnid2/exoskeleton.rb: -------------------------------------------------------------------------------- 1 | class Arachnid2 2 | module Exoskeleton 3 | def browser_type 4 | unless @browser_type 5 | @browser_type = "#{@options[:browser_type]}".to_sym if @options[:browser_type] 6 | @browser_type ||= :firefox 7 | end 8 | 9 | @browser_type 10 | end 11 | 12 | def process(url, html) 13 | return false unless Adomain["#{url}"]&.include? @domain 14 | 15 | extract_hrefs(html) 16 | end 17 | 18 | def extract_hrefs(body) 19 | elements = Nokogiri::HTML.parse(body).css('a') 20 | return elements.map {|link| link.attribute('href').to_s}.uniq.sort.delete_if {|href| href.empty? } 21 | end 22 | 23 | def vacuum(links, url) 24 | links.each do |link| 25 | next if link.match(/^\(|^javascript:|^mailto:|^#|^\s*$|^about:/) 26 | 27 | begin 28 | absolute_link = make_absolute(link, url) 29 | 30 | next if skip_link?(absolute_link) 31 | 32 | @global_queue << absolute_link 33 | rescue Addressable::URI::InvalidURIError 34 | end 35 | end 36 | end 37 | 38 | def skip_link?(absolute_link) 39 | !internal_link?(absolute_link) || \ 40 | @global_visited.include?(absolute_link) || \ 41 | extension_ignored?(absolute_link) || \ 42 | @global_queue.include?(absolute_link) 43 | end 44 | 45 | def preflight(opts) 46 | @options = opts 47 | @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => true) 48 | @global_queue = [@url] 49 | end 50 | 51 | def proxy 52 | @options[:proxy] 53 | end 54 | 55 | def non_html_extensions 56 | return @non_html_extensions if @non_html_extensions 57 | 58 | @non_html_extensions = @options[:non_html_extensions] 59 | @non_html_extensions ||= DEFAULT_NON_HTML_EXTENSIONS 60 | end 61 | 62 | def bound_time 63 | boundary = "#{@options[:time_box]}".to_i 64 | boundary = BASE_CRAWL_TIME if boundary <= 0 65 | boundary = MAX_CRAWL_TIME if boundary > MAX_CRAWL_TIME 66 | 67 | return Time.now + boundary 68 | end 69 | 70 | def bound_urls 71 | amount = "#{@options[:max_urls]}".to_i 72 | amount = BASE_URLS if amount <= 0 73 | amount = MAX_URLS if amount > MAX_URLS 74 | 75 | amount 76 | end 77 | 78 | def timeout 79 | unless @timeout 80 | @timeout = @options[:timeout] 81 | @timeout = DEFAULT_TIMEOUT unless @timeout.is_a?(Integer) 82 | @timeout = DEFAULT_TIMEOUT if @timeout > MAXIMUM_TIMEOUT 83 | @timeout = DEFAULT_TIMEOUT if @timeout < MINIMUM_TIMEOUT 84 | end 85 | @timeout 86 | end 87 | 88 | def crawl_options 89 | @crawl_options ||= { max_urls: 
max_urls, time_limit: time_limit } 90 | end 91 | 92 | alias_method :max_urls, :bound_urls 93 | 94 | alias_method :time_limit, :bound_time 95 | 96 | def make_absolute(href, root) 97 | Addressable::URI.parse(root).join(Addressable::URI.parse(href)).to_s 98 | end 99 | 100 | def internal_link?(absolute_url) 101 | "#{Adomain[absolute_url]}".include? @domain 102 | end 103 | 104 | def extension_ignored?(url) 105 | return false if url.empty? 106 | 107 | !non_html_extensions.values.flatten.find { |e| url.downcase.end_with? e.downcase }.nil? 108 | end 109 | 110 | def memory_danger? 111 | return false unless in_docker? 112 | 113 | use = "#{File.open(MEMORY_USE_FILE, "rb").read}".to_f 114 | @limit ||= "#{File.open(MEMORY_LIMIT_FILE, "rb").read}".to_f 115 | 116 | return false unless ( (use > 0.0) && (@limit > 0.0) ) 117 | 118 | return ( ( (use / @limit) * 100.0 ) >= maximum_load_rate ) 119 | end 120 | 121 | def in_docker? 122 | File.file?(MEMORY_USE_FILE) 123 | end 124 | 125 | def maximum_load_rate 126 | return @maximum_load_rate if @maximum_load_rate 127 | 128 | @maximum_load_rate = "#{@options[:memory_limit]}".to_f 129 | @maximum_load_rate = DEFAULT_MAXIMUM_LOAD_RATE unless ((@maximum_load_rate > 0.0) && (@maximum_load_rate < 100.0)) 130 | @maximum_load_rate 131 | end 132 | end 133 | end 134 | -------------------------------------------------------------------------------- /lib/arachnid2/typhoeus.rb: -------------------------------------------------------------------------------- 1 | class Arachnid2 2 | class Typhoeus 3 | include CachedResponses 4 | include Arachnid2::Exoskeleton 5 | 6 | def initialize(url) 7 | @url = url 8 | @domain = Adomain[@url] 9 | @cached_data = [] 10 | end 11 | 12 | def crawl(opts = {}) 13 | preflight(opts) 14 | typhoeus_preflight 15 | 16 | until @global_queue.empty? 17 | max_concurrency.times do 18 | q = @global_queue.shift 19 | 20 | break if time_to_stop? 21 | @global_visited.insert(q) 22 | 23 | found_in_cache = use_cache(q, opts, &Proc.new) 24 | return if found_in_cache 25 | 26 | request = ::Typhoeus::Request.new(q, request_options) 27 | requestable = after_request(request, &Proc.new) 28 | @hydra.queue(request) if requestable 29 | end # max_concurrency.times do 30 | 31 | @hydra.run 32 | end # until @global_queue.empty? 33 | ensure 34 | @cookie_file.close! if @cookie_file 35 | end # def crawl(opts = {}) 36 | 37 | private 38 | def after_request(request) 39 | request.on_complete do |response| 40 | cacheable = use_response(response, &Proc.new) 41 | return unless cacheable 42 | 43 | put_cached_data(response.effective_url, @options, response) 44 | end 45 | 46 | true 47 | end 48 | 49 | def use_response(response) 50 | links = process(response.effective_url, response.body) 51 | return unless links 52 | 53 | yield response 54 | 55 | vacuum(links, response.effective_url) 56 | true 57 | end 58 | 59 | def use_cache(url, options) 60 | data = load_data(url, options) 61 | use_response(data, &Proc.new) if data 62 | 63 | data 64 | end 65 | 66 | def time_to_stop? 67 | @global_visited.size >= crawl_options[:max_urls] || \ 68 | Time.now > crawl_options[:time_limit] || \ 69 | memory_danger? 
70 | end 71 | 72 | def typhoeus_preflight 73 | @hydra = ::Typhoeus::Hydra.new(:max_concurrency => max_concurrency) 74 | typhoeus_proxy_options 75 | end 76 | 77 | def max_concurrency 78 | return @max_concurrency if @max_concurrency 79 | 80 | @max_concurrency = "#{@options[:max_concurrency]}".to_i 81 | @max_concurrency = 1 unless (@max_concurrency > 0) 82 | @max_concurrency 83 | end 84 | 85 | def followlocation 86 | return @followlocation unless @followlocation.nil? 87 | 88 | @followlocation = @options[:followlocation] 89 | @followlocation = true unless @followlocation.is_a?(FalseClass) 90 | end 91 | 92 | def request_options 93 | @cookie_file ||= Tempfile.new('cookies') 94 | 95 | @request_options = { 96 | timeout: timeout, 97 | followlocation: followlocation, 98 | cookiefile: @cookie_file.path, 99 | cookiejar: @cookie_file.path, 100 | headers: @options[:headers] 101 | }.merge(crawl_options[:proxy]) 102 | 103 | @request_options[:headers] ||= {} 104 | @request_options[:headers]['Accept-Language'] ||= DEFAULT_LANGUAGE 105 | @request_options[:headers]['User-Agent'] ||= DEFAULT_USER_AGENT 106 | 107 | @request_options 108 | end 109 | 110 | def typhoeus_proxy_options 111 | crawl_options[:proxy] = {} 112 | 113 | crawl_options[:proxy][:proxy] = "#{@options[:proxy][:ip]}:#{@options[:proxy][:port]}" if @options.dig(:proxy, :ip) 114 | crawl_options[:proxy][:proxyuserpwd] = "#{@options[:proxy][:username]}:#{@options[:proxy][:password]}" if @options.dig(:proxy, :username) 115 | end 116 | 117 | end 118 | end 119 | -------------------------------------------------------------------------------- /lib/arachnid2/version.rb: -------------------------------------------------------------------------------- 1 | class Arachnid2 2 | VERSION = "0.4.0" 3 | end 4 | -------------------------------------------------------------------------------- /lib/arachnid2/watir.rb: -------------------------------------------------------------------------------- 1 | class Arachnid2 2 | class Watir 3 | DEFAULT_AGENT = :desktop 4 | DEFAULT_ORIENTATION = :landscape 5 | 6 | include Arachnid2::Exoskeleton 7 | 8 | def initialize(url) 9 | @url = url 10 | @domain = Adomain[@url] 11 | end 12 | 13 | def crawl(opts) 14 | preflight(opts) 15 | watir_preflight 16 | @already_retried = false 17 | 18 | until @global_queue.empty? 19 | q = @global_queue.shift 20 | links = nil 21 | 22 | break if time_to_stop? 23 | 24 | @global_visited.insert(q) 25 | 26 | make_request(q, &Proc.new) 27 | end # until @global_queue.empty? 28 | ensure 29 | @browser.close if @browser rescue nil 30 | @headless.destroy if @headless rescue nil 31 | end 32 | 33 | private 34 | def make_request(q) 35 | begin 36 | links = browse_links(q, &Proc.new) 37 | return unless links 38 | 39 | vacuum(links, browser.url) 40 | rescue Selenium::WebDriver::Error::NoSuchWindowError, Net::ReadTimeout => e 41 | msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \ 42 | "is ignoring an error: " \ 43 | "#{e.class} - #{e.message}" 44 | puts msg 45 | rescue => e 46 | raise e if raise_before_retry?(e.class) 47 | msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \ 48 | "is retrying once after an error: " \ 49 | "#{e.class} - #{e.message}" 50 | puts msg 51 | e.backtrace[0..4].each{|l| puts "\t#{l}"}; puts "..." 52 | reset_for_retry 53 | end 54 | end 55 | 56 | def browse_links(url) 57 | return unless navigate(url) 58 | 59 | yield browser 60 | 61 | process(browser.url, browser.body.html) if browser.body.exists? 
62 | end 63 | 64 | def navigate(url) 65 | begin 66 | browser.goto url 67 | rescue Selenium::WebDriver::Error::UnknownError => e 68 | # Firefox and Selenium, in their infinite wisdom 69 | # raise an error when a page cannot be loaded. 70 | # At the time of writing this, the page at 71 | # thewirecutter.com/cars/accessories-auto 72 | # causes such an issue (too many redirects). 73 | # This error handling moves us on from those pages. 74 | raise e unless e.message =~ /.*Reached error page.*/i 75 | return 76 | end 77 | 78 | true 79 | end 80 | 81 | def time_to_stop? 82 | @global_visited.size >= crawl_options[:max_urls] || \ 83 | Time.now > crawl_options[:time_limit] || \ 84 | memory_danger? 85 | end 86 | 87 | def raise_before_retry?(klass) 88 | @already_retried || \ 89 | "#{klass}".include?("Selenium") || \ 90 | "#{klass}".include?("Watir") 91 | end 92 | 93 | def reset_for_retry 94 | @browser.close if @browser rescue nil 95 | @headless.destroy if @headless rescue nil 96 | @driver.quit if @headless rescue nil 97 | @driver = nil 98 | @browser = nil 99 | @already_retried = true 100 | end 101 | 102 | def browser 103 | unless @browser 104 | behead if @make_headless 105 | 106 | @browser = create_browser 107 | 108 | set_timeout 109 | end 110 | 111 | return @browser 112 | end 113 | 114 | def create_browser 115 | return ::Watir::Browser.new(driver, proxy: @proxy) if @proxy 116 | 117 | ::Watir::Browser.new driver 118 | end 119 | 120 | def set_timeout 121 | @browser.driver.manage.timeouts.page_load = timeout 122 | end 123 | 124 | def behead 125 | @headless = Headless.new 126 | @headless.start 127 | end 128 | 129 | def driver 130 | unless @driver 131 | language = @options.dig(:headers, "Accept-Language") || DEFAULT_LANGUAGE 132 | user_agent = @options.dig(:headers, "User-Agent") || DEFAULT_USER_AGENT 133 | agent = @options.dig(:agent) || DEFAULT_AGENT 134 | orientation = @options.dig(:orientation) || DEFAULT_ORIENTATION 135 | 136 | @driver = Webdriver::UserAgent.driver( 137 | browser: browser_type, 138 | agent: agent, 139 | orientation: orientation, 140 | accept_language_string: language, 141 | user_agent_string: user_agent 142 | ) 143 | end 144 | 145 | @driver 146 | end 147 | 148 | def watir_preflight 149 | watir_proxy_options 150 | @make_headless = @options[:headless] 151 | end 152 | 153 | def watir_proxy_options 154 | crawl_options[:proxy] = {} 155 | 156 | crawl_options[:proxy][:http] = @options[:proxy][:http] if @options.dig(:proxy, :http) 157 | crawl_options[:proxy][:ssl] = @options[:proxy][:ssl] if @options.dig(:proxy, :ssl) 158 | end 159 | end 160 | 161 | end 162 | -------------------------------------------------------------------------------- /spec/arachnid2/exoskeleton_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper.rb' 2 | 3 | RSpec.describe Arachnid2::Exoskeleton do 4 | describe "#memory_danger?" 
do 5 | let(:dummy) { (Class.new { include Arachnid2::Exoskeleton }).new } 6 | before(:each) do 7 | dummy.instance_variable_set(:@url, "http://dummy.com") 8 | dummy.instance_variable_set(:@domain, "dummy.com") 9 | 10 | allow(dummy).to receive(:in_docker?).and_return(true) 11 | dummy.instance_variable_set(:@maximum_load_rate, 50.00) 12 | end 13 | 14 | it "stops execution when memory limit is reached" do 15 | use_file = OpenStruct.new({read: 99.9999}) 16 | limit_file = OpenStruct.new({read: 100.0000}) 17 | 18 | allow(File).to receive(:open).with(Arachnid2::MEMORY_USE_FILE, 'rb').and_return(use_file) 19 | allow(File).to receive(:open).with(Arachnid2::MEMORY_LIMIT_FILE, 'rb').and_return(limit_file) 20 | 21 | expect(dummy.memory_danger?).to be_truthy 22 | end 23 | 24 | it "does not stop execution when memory limit is not yet reached" do 25 | use_file = OpenStruct.new({read: 1.0}) 26 | limit_file = OpenStruct.new({read: 100.0000}) 27 | 28 | allow(File).to receive(:open).with(Arachnid2::MEMORY_USE_FILE, 'rb').and_return(use_file) 29 | allow(File).to receive(:open).with(Arachnid2::MEMORY_LIMIT_FILE, 'rb').and_return(limit_file) 30 | 31 | expect(dummy.memory_danger?).to be_falsey 32 | end 33 | end 34 | end 35 | -------------------------------------------------------------------------------- /spec/arachnid2/typhoeus_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper.rb' 2 | 3 | RSpec.describe Arachnid2::Typhoeus do 4 | describe "#crawl" do 5 | it "accepts the options" do 6 | url = "https://daringfireball.net" 7 | spider = Arachnid2::Typhoeus.new(url) 8 | opts = { 9 | followlocation: true, 10 | timeout: 12000, 11 | time_box: 10, 12 | max_urls: 1, 13 | headers: { 14 | 'Accept-Language' => "en-UK", 15 | 'User-Agent' => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0", 16 | }, 17 | max_concurrency: 5, 18 | memory_limit: 39.99, 19 | proxy: { 20 | ip: "1.2.3.4", 21 | port: "1234", 22 | username: "sam", 23 | password: "coolcoolcool", 24 | }, 25 | non_html_extensions: { 26 | 2 => [".oh"], 27 | 3 => [".omg"], 28 | 5 => [".ohhai"], 29 | } 30 | } 31 | 32 | spider.crawl(opts){} 33 | 34 | crawl_options = spider.instance_variable_get(:@crawl_options) 35 | request_options = spider.instance_variable_get(:@request_options) 36 | maximum_load_rate = spider.send(:maximum_load_rate) 37 | max_concurrency = spider.send(:max_concurrency) 38 | hydra = spider.instance_variable_get(:@hydra) 39 | followlocation = spider.send(:followlocation) 40 | non_html_extensions = spider.send(:non_html_extensions) 41 | timeout = spider.instance_variable_get(:@timeout) 42 | 43 | expect(crawl_options[:time_limit]).to be_a(Time) 44 | expect(crawl_options[:max_urls]).to be_an(Integer) 45 | expect(crawl_options[:proxy][:proxy]).to eq("1.2.3.4:1234") 46 | expect(crawl_options[:proxy][:proxyuserpwd]).to eq("sam:coolcoolcool") 47 | expect(request_options).not_to be_nil 48 | expect(request_options[:headers]).to eq(opts[:headers]) 49 | expect(maximum_load_rate).to eq(39.99) 50 | expect(max_concurrency).to eq(5) 51 | expect(hydra).to be_a(Typhoeus::Hydra) 52 | expect(followlocation).to eq(true) 53 | expect(timeout).to eq(12000) 54 | expect(non_html_extensions.values.flatten).to eq([".oh", ".omg", ".ohhai"]) 55 | end 56 | 57 | it "visits the URL" do 58 | url = "https://daringfireball.net" 59 | spider = Arachnid2::Typhoeus.new(url) 60 | opts = { 61 | time_box: 10, 62 | max_urls: 2 63 | } 64 | responses = [] 65 | 66 | spider.crawl(opts){|r| responses << r} 67 | 
global_visited = spider.instance_variable_get(:@global_visited) 68 | global_queue = spider.instance_variable_get(:@global_queue) 69 | 70 | expect(global_visited.size).to be > 0 71 | expect(responses.size).to be > 0 72 | end 73 | 74 | context "data is available in the cache" do 75 | let!(:url) { "https://daringfireball.net" } 76 | let!(:spider) { Arachnid2::Typhoeus.new(url) } 77 | let!(:opts) { { time_box: 10, max_urls: 1 } } 78 | let!(:payload) { 79 | OpenStruct.new({effective_url: "http://daringfireball.net", body: ""}) 80 | } # note that the url and effective_url domains must match 81 | 82 | before(:each) do 83 | allow(spider).to receive(:load_data).with(url, opts).and_return(payload) 84 | end 85 | 86 | it "loads data from the cache" do 87 | responses = [] 88 | expect(spider).to receive(:load_data).with(url, opts).and_return(payload) 89 | 90 | spider.crawl(opts){|r| responses << r} 91 | expect(responses).to include(payload) 92 | end 93 | end 94 | end 95 | end 96 | -------------------------------------------------------------------------------- /spec/arachnid2/watir_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper.rb' 2 | 3 | RSpec.describe Arachnid2::Watir do 4 | describe "#crawl" do 5 | it "accepts the options" do 6 | url = "https://daringfireball.net" 7 | spider = Arachnid2::Watir.new(url) 8 | opts = { 9 | browser_type: :chrome, 10 | timeout: 12000, 11 | time_box: 10, 12 | max_urls: 1, 13 | headers: { 14 | 'Accept-Language' => "en-UK", 15 | 'User-Agent' => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0", 16 | }, 17 | memory_limit: 39.99, 18 | proxy: { 19 | http: "troy.show:8080", 20 | ssl: "abed.show:8080" 21 | }, 22 | non_html_extensions: { 23 | 2 => [".oh"], 24 | 3 => [".omg"], 25 | 5 => [".ohhai"], 26 | }, 27 | headers: { 28 | 'Accept-Language' => "es-ES", 29 | 'User-Agent' => "Sam's Custom Browser" 30 | }, 31 | headless: false, 32 | agent: :iphone, 33 | orientation: :portrait 34 | } 35 | 36 | spider.crawl(opts) { |browser| 37 | @header_language = browser.execute_script("return navigator.language") unless @header_language 38 | @header_user_agent = browser.execute_script("return navigator.userAgent") unless @header_user_agent 39 | @portrait = browser.execute_script("return (window.innerHeight > window.innerWidth)") unless @portrait 40 | @window_width = browser.execute_script("return window.innerWidth") unless @window_width 41 | } 42 | 43 | crawl_options = spider.crawl_options 44 | maximum_load_rate = spider.maximum_load_rate 45 | non_html_extensions = spider.non_html_extensions 46 | timeout = spider.timeout 47 | make_headless = spider.instance_variable_get(:@make_headless) 48 | 49 | expect(crawl_options[:time_limit]).to be_a(Time) 50 | expect(crawl_options[:max_urls]).to be_an(Integer) 51 | expect(crawl_options[:proxy][:http]).to eq("troy.show:8080") 52 | expect(crawl_options[:proxy][:ssl]).to eq("abed.show:8080") 53 | expect(@header_language).to include(opts[:headers]['Accept-Language']) 54 | expect(@header_user_agent).to eq(opts[:headers]['User-Agent']) 55 | expect(@portrait).to be_truthy 56 | expect(@window_width).to be < 525 57 | expect(maximum_load_rate).to eq(39.99) 58 | expect(timeout).to eq(12000) 59 | expect(non_html_extensions.values.flatten).to eq([".oh", ".omg", ".ohhai"]) 60 | end 61 | 62 | it "visits the URL" do 63 | url = "https://daringfireball.net" 64 | spider = Arachnid2::Watir.new(url) 65 | opts = { 66 | time_box: 10, 67 | max_urls: 2 68 | } 69 | responses = [] 70 | 
71 | spider.crawl(opts){|r| responses << r} 72 | global_visited = spider.instance_variable_get(:@global_visited) 73 | global_queue = spider.instance_variable_get(:@global_queue) 74 | 75 | expect(global_visited.size).to be > 0 76 | expect(responses.size).to be > 0 77 | end 78 | 79 | it "uses Watir when requested" do 80 | spider = Arachnid2.new("http://test.com") 81 | allow_any_instance_of(Arachnid2::Watir).to receive(:crawl).with(anything).and_return(true) 82 | expect{ spider.crawl(opts = {max_urls: 1, time_box: 1}, with_watir = true) {} }.not_to raise_error 83 | end 84 | 85 | it "only uses one crawling technology type" do 86 | spider = Arachnid2.new("http://daringfireball.net") 87 | # allow_any_instance_of(Arachnid2::Watir).to receive(:crawl).with(anything).and_return(true) 88 | expect_any_instance_of(Arachnid2::Typhoeus).not_to receive(:crawl) 89 | spider.crawl(opts = {max_urls: 2, time_box: 5}, with_watir = true) {} 90 | end 91 | 92 | it "crawls past any Net::ReadTimeout issues" do 93 | spider = Arachnid2.new("https://www.themcelroy.family") 94 | opts = {max_urls: 3, time_box: 10} 95 | with_watir = true 96 | 97 | allow_any_instance_of(::Watir::Browser).to receive(:goto).with(anything).and_raise(Net::ReadTimeout) 98 | 99 | expect{ 100 | spider.crawl(opts, with_watir) {} 101 | }.not_to raise_error 102 | end 103 | 104 | it "crawls past any Selenium::WebDriver::Error::NoSuchWindowError issues" do 105 | spider = Arachnid2.new("https://www.themcelroy.family") 106 | opts = {max_urls: 3, time_box: 10} 107 | with_watir = true 108 | 109 | allow_any_instance_of(::Watir::Browser).to receive(:goto).with(anything).and_raise(Selenium::WebDriver::Error::NoSuchWindowError) 110 | 111 | expect{ 112 | spider.crawl(opts, with_watir) {} 113 | }.not_to raise_error 114 | end 115 | 116 | it "does not fail when the browser cannot locate the " do 117 | spider = Arachnid2.new("https://www.themcelroy.family") 118 | opts = {max_urls: 3, time_box: 10} 119 | with_watir = true 120 | 121 | allow_any_instance_of(::Watir::Body).to receive(:html).and_raise(Watir::Exception::UnknownObjectException) 122 | allow_any_instance_of(::Watir::Body).to receive(:exists?).and_return(false) 123 | 124 | expect{ 125 | spider.crawl(opts, with_watir) {} 126 | }.not_to raise_error 127 | end 128 | 129 | it "rescues one error when the browser connection is lost" do 130 | spider = Arachnid2::Watir.new("https://stratechery.com") 131 | opts = {max_urls: 3, time_box: 60} 132 | 133 | Object.const_set("MyCustomTestError", Class.new(StandardError)) 134 | 135 | allow_any_instance_of(Arachnid2::Watir).to receive(:preflight).with(opts).and_return(true) 136 | allow_any_instance_of(Arachnid2::Watir).to receive(:watir_preflight).and_return(true) 137 | 138 | queue = [ 139 | "https://stratechery.com", 140 | "http://stratechery.com/about/", 141 | "https://stratechery.com/concepts/" 142 | ] 143 | spider.instance_variable_set(:@options, opts) 144 | spider.instance_variable_set(:@global_queue, queue) 145 | bf = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => true) 146 | spider.instance_variable_set(:@global_visited, bf) 147 | spider.instance_variable_set(:@make_headless, !OS.mac?) 
148 | 149 | browser = spider.send(:create_browser) 150 | spider.instance_variable_set(:@browser, browser) 151 | 152 | allow(browser).to receive(:url).and_raise(MyCustomTestError) 153 | 154 | expect{ 155 | spider.crawl(opts) {} 156 | }.not_to raise_error 157 | end 158 | 159 | it "stops after more than one error" do 160 | spider = Arachnid2::Watir.new("https://stratechery.com") 161 | opts = {max_urls: 3, time_box: 60} 162 | 163 | Object.const_set("MyCustomTestError", Class.new(StandardError)) unless Object.const_defined?("MyCustomTestError") 164 | 165 | allow_any_instance_of(::Watir::Browser).to receive(:url).and_raise(MyCustomTestError) 166 | allow_any_instance_of(Arachnid2::Watir).to receive(:preflight).with(opts).and_return(true) 167 | allow_any_instance_of(Arachnid2::Watir).to receive(:watir_preflight).and_return(true) 168 | 169 | queue = [ 170 | "https://stratechery.com", 171 | "http://stratechery.com/about/", 172 | "https://stratechery.com/concepts/" 173 | ] 174 | spider.instance_variable_set(:@options, opts) 175 | spider.instance_variable_set(:@global_queue, queue) 176 | bf = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => true) 177 | spider.instance_variable_set(:@global_visited, bf) 178 | spider.instance_variable_set(:@make_headless, !OS.mac?) 179 | 180 | expect{ 181 | spider.crawl(opts) {} 182 | }.to raise_error(MyCustomTestError) 183 | end 184 | end 185 | end 186 | -------------------------------------------------------------------------------- /spec/arachnid2_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper.rb' 2 | 3 | RSpec.describe Arachnid2 do 4 | it "has a version number" do 5 | expect(Arachnid2::VERSION).not_to be nil 6 | end 7 | 8 | describe "#initialize" do 9 | it "sets the URL" do 10 | url = "http://test.com" 11 | spider = Arachnid2.new url 12 | expect(spider.instance_variable_get(:@url)).to eq(url) 13 | end 14 | end 15 | end 16 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | require "bundler/setup" 2 | require "arachnid2" 3 | require "ostruct" 4 | 5 | RSpec.configure do |config| 6 | # Enable flags like --only-failures and --next-failure 7 | config.example_status_persistence_file_path = ".rspec_status" 8 | 9 | # Disable RSpec exposing methods globally on `Module` and `main` 10 | config.disable_monkey_patching! 11 | 12 | config.expect_with :rspec do |c| 13 | c.syntax = :expect 14 | end 15 | end 16 | --------------------------------------------------------------------------------