├── .github
│   └── workflows
│       └── ruby.yml
├── .gitignore
├── .rspec
├── .travis.yml
├── CODE_OF_CONDUCT.md
├── Gemfile
├── Gemfile.lock
├── LICENSE.txt
├── README.md
├── Rakefile
├── arachnid2.gemspec
├── bin
│   ├── console
│   └── setup
├── lib
│   ├── arachnid2.rb
│   └── arachnid2
│       ├── cached_responses.rb
│       ├── exoskeleton.rb
│       ├── typhoeus.rb
│       ├── version.rb
│       └── watir.rb
└── spec
    ├── arachnid2
    │   ├── exoskeleton_spec.rb
    │   ├── typhoeus_spec.rb
    │   └── watir_spec.rb
    ├── arachnid2_spec.rb
    └── spec_helper.rb

/.github/workflows/ruby.yml:
--------------------------------------------------------------------------------
 1 | # This workflow uses actions that are not certified by GitHub.
 2 | # They are provided by a third-party and are governed by
 3 | # separate terms of service, privacy policy, and support
 4 | # documentation.
 5 | # This workflow will download a prebuilt Ruby version, install dependencies and run tests with Rake
 6 | # For more information see: https://github.com/marketplace/actions/setup-ruby-jruby-and-truffleruby
 7 | 
 8 | name: Ruby
 9 | 
10 | on:
11 |   push:
12 |     branches: [ "master" ]
13 |   pull_request:
14 |     branches: [ "master" ]
15 | 
16 | permissions:
17 |   contents: read
18 | 
19 | jobs:
20 |   test:
21 | 
22 |     runs-on: ubuntu-latest
23 |     strategy:
24 |       matrix:
25 |         ruby-version: ['2.6', '2.7', '3.0', 3.2]
26 | 
27 |     steps:
28 |     - uses: actions/checkout@v3
29 |     - name: Set up Ruby
30 |       uses: ruby/setup-ruby@v1
31 |       # uses: ruby/setup-ruby@55283cc23133118229fd3f97f9336ee23a179fcf # v1.146.0
32 |       with:
33 |         ruby-version: ${{ matrix.ruby-version }}
34 |         bundler-cache: true # runs 'bundle install' and caches installed gems automatically
35 |     - name: Run tests
36 |       run: bundle exec rspec spec/
37 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | /.bundle/
 2 | /.yardoc
 3 | /_yardoc/
 4 | /coverage/
 5 | /doc/
 6 | /pkg/
 7 | /spec/reports/
 8 | /tmp/
 9 | 
10 | # rspec failure tracking
11 | .rspec_status
12 | 
13 | arachnid2-*.gem
14 | 
--------------------------------------------------------------------------------
/.rspec:
--------------------------------------------------------------------------------
1 | --format documentation
2 | --color
3 | --require spec_helper
4 | 
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: false
2 | language: ruby
3 | rvm:
4 |   - 2.4.1
5 | before_install: gem install bundler -v 1.16.1
6 | 
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Covenant Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to making participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age, body
 8 | size, disability, ethnicity, gender identity and expression, level of experience,
 9 | nationality, personal appearance, race, religion, or sexual identity and
10 | orientation.
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at samuel.nissen@rakuten.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at [http://contributor-covenant.org/version/1/4][version] 72 | 73 | [homepage]: http://contributor-covenant.org 74 | [version]: http://contributor-covenant.org/version/1/4/ 75 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | git_source(:github) {|repo_name| "https://github.com/#{repo_name}" } 4 | 5 | # Specify your gem's dependencies in arachnid2.gemspec 6 | gemspec 7 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | PATH 2 | remote: . 3 | specs: 4 | arachnid2 (0.3.9) 5 | addressable 6 | adomain 7 | bloomfilter-rb 8 | nokogiri (>= 1.10.4) 9 | typhoeus 10 | watir 11 | webdriver-user-agent (>= 7.6) 12 | webdrivers 13 | 14 | GEM 15 | remote: https://rubygems.org/ 16 | specs: 17 | addressable (2.7.0) 18 | public_suffix (>= 2.0.2, < 5.0) 19 | adomain (0.2.3) 20 | addressable (~> 2.5) 21 | logger 22 | bloomfilter-rb (2.1.1) 23 | redis 24 | childprocess (3.0.0) 25 | diff-lcs (1.3) 26 | ethon (0.12.0) 27 | ffi (>= 1.3.0) 28 | facets (3.1.0) 29 | ffi (1.12.2) 30 | json (2.3.0) 31 | logger (1.4.2) 32 | mini_portile2 (2.4.0) 33 | nokogiri (1.10.9) 34 | mini_portile2 (~> 2.4.0) 35 | os (1.0.1) 36 | psych (3.1.0) 37 | public_suffix (4.0.3) 38 | rake (13.0.1) 39 | redis (4.1.3) 40 | regexp_parser (1.7.0) 41 | rspec (3.8.0) 42 | rspec-core (~> 3.8.0) 43 | rspec-expectations (~> 3.8.0) 44 | rspec-mocks (~> 3.8.0) 45 | rspec-core (3.8.0) 46 | rspec-support (~> 3.8.0) 47 | rspec-expectations (3.8.2) 48 | diff-lcs (>= 1.2.0, < 2.0) 49 | rspec-support (~> 3.8.0) 50 | rspec-mocks (3.8.0) 51 | diff-lcs (>= 1.2.0, < 2.0) 52 | rspec-support (~> 3.8.0) 53 | rspec-support (3.8.0) 54 | rubyzip (2.2.0) 55 | selenium-webdriver (3.142.7) 56 | childprocess (>= 0.5, < 4.0) 57 | rubyzip (>= 1.2.2) 58 | typhoeus (1.3.1) 59 | ethon (>= 0.9.0) 60 | watir (6.16.5) 61 | regexp_parser (~> 1.2) 62 | selenium-webdriver (~> 3.6) 63 | webdriver-user-agent (7.6) 64 | facets 65 | json 66 | os 67 | psych 68 | selenium-webdriver (>= 3.4.0) 69 | webdrivers (4.2.0) 70 | nokogiri (~> 1.6) 71 | rubyzip (>= 1.3.0) 72 | selenium-webdriver (>= 3.0, < 4.0) 73 | 74 | PLATFORMS 75 | ruby 76 | 77 | DEPENDENCIES 78 | arachnid2! 79 | bundler (~> 1.16) 80 | rake (>= 12.3.3) 81 | rspec (~> 3.0) 82 | 83 | BUNDLED WITH 84 | 1.17.3 85 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2018 Sam Nissen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Arachnid2
  2 | 
  3 | ## About
  4 | 
  5 | Arachnid2 is a simple, fast web-crawler written in Ruby.
  6 | You can use [typhoeus](https://github.com/typhoeus/typhoeus)
  7 | to make HTTP requests, or [Watir](https://github.com/watir/watir)
  8 | to render pages. [bloomfilter-rb](https://github.com/igrigorik/bloomfilter-rb)
  9 | stores the URLs,
 10 | and [nokogiri](https://github.com/sparklemotion/nokogiri)
 11 | finds the URLs on each webpage.
 12 | 
 13 | Arachnid2 is a successor to [Arachnid](https://github.com/dchuk/Arachnid),
 14 | and was abstracted out of the [Tellurion Bot](https://github.com/samnissen/tellurion_bot).
 15 | 
 16 | ## Usage
 17 | 
 18 | ### Typhoeus (cURL)
 19 | 
 20 | The default use case for Arachnid2 is surfacing responses from
 21 | a domain's URLs by visiting a URL, collecting any links to the
 22 | same domain on that page, and visiting those to do the same.
 23 | 
 24 | The simplest way to use the gem is to collect all of the
 25 | responses while spidering from some URL.
 26 | 
 27 | ```ruby
 28 | require "arachnid2"
 29 | 
 30 | url = "http://www.maximumfun.org"
 31 | spider = Arachnid2.new(url)
 32 | responses = []
 33 | 
 34 | spider.crawl { |response|
 35 |   responses << response
 36 | }
 37 | ```
 38 | 
 39 | Obviously this could become unwieldy,
 40 | so you can execute logic within the block to collect a narrow subset
 41 | of the responses, transform or dissect the response,
 42 | or both (or whatever you want).
 43 | 
 44 | ```ruby
 45 | require "arachnid2"
 46 | require "nokogiri"
 47 | 
 48 | url = "https://daringfireball.net"
 49 | spider = Arachnid2.new(url)
 50 | responses = []
 51 | 
 52 | spider.crawl { |response|
 53 |   responses << Nokogiri::HTML(response.body) if response.effective_url =~ /.*amazon.*/
 54 |   print '*'
 55 | }
 56 | ```
 57 | 
 58 | `Arachnid2#crawl` will always return `nil`.
 59 | 
 60 | #### Options
 61 | 
 62 | ```ruby
 63 | require "arachnid2"
 64 | 
 65 | url = "http://sixcolours.com"
 66 | spider = Arachnid2.new(url)
 67 | opts = {
 68 |   followlocation: true,
 69 |   timeout: 300,
 70 |   time_box: 60,
 71 |   max_urls: 50,
 72 |   :headers => {
 73 |     'Accept-Language' => "en-UK",
 74 |     'User-Agent' => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0",
 75 |   },
 76 |   memory_limit: 89.99,
 77 |   proxy: {
 78 |     ip: "1.2.3.4",
 79 |     port: "1234",
 80 |     username: "sam",
 81 |     password: "coolcoolcool",
 82 |   },
 83 |   :non_html_extensions => {
 84 |     3 => [".abc", ".xyz"],
 85 |     4 => [".abcd"],
 86 |     6 => [".abcdef"],
 87 |     11 => [".abcdefghijk"]
 88 |   }
 89 | }
 90 | responses = []
 91 | 
 92 | spider.crawl(opts) { |response|
 93 |   responses << response
 94 | }
 95 | ```
 96 | 
 97 | ##### `followlocation`
 98 | 
 99 | Tell Typhoeus to follow redirections.
100 | 
101 | ##### `timeout`
102 | 
103 | Tell Typhoeus or Watir how long to wait for page load.
104 | 
105 | ##### `time_box`
106 | 
107 | The crawler will time-bound your spidering.
108 | If no valid integer is provided,
109 | it will crawl for 15 seconds before exiting.
110 | 10000 seconds is the current maximum,
111 | and any value above it will be reduced to 10000.
112 | 
113 | ##### `max_urls`
114 | 
115 | The crawler will crawl a limited number of URLs before stopping.
116 | If no valid integer is provided,
117 | it will crawl 50 URLs before exiting.
118 | 10000 URLs is the current maximum,
119 | and any value above it will be reduced to 10000.
120 | 
121 | ##### `headers`
122 | 
123 | This is a hash that represents any HTTP header key/value pairs you desire,
124 | and is passed directly to Typhoeus. Before the request is sent, a default
125 | language and user agent are applied unless you provide your own:
126 | 
127 | ###### Defaults
128 | 
129 | The HTTP header `Accept-Language` default is
130 | `en-IE, en-UK;q=0.9, en-NL;q=0.8, en-MT;q=0.7, en-LU;q=0.6, en;q=0.5, \*;0.4`
131 | 
132 | The HTTP header `User-Agent` default is
133 | `Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15`
134 | 
135 | ##### `proxy`
136 | 
137 | Provide the IP and port for a proxy. If required, provide credentials for
138 | authenticating to that proxy. Proxy options and handling are done
139 | by Typhoeus.
140 | 
141 | ##### `non_html_extensions`
142 | 
143 | This is the list of file extensions to ignore when collecting URLs from the page.
144 | The extensions are formatted as a hash of key/value pairs, where the value
145 | is an array of extensions, and the keys represent the character length of those extensions.
146 | 
147 | ##### `memory_limit` and Docker
148 | 
149 | In case you are operating the crawler within a container, Arachnid2
150 | can attempt to prevent the container from running out of memory.
151 | By default, it will end the crawl when the container uses >= 80%
152 | of its available memory. You can override this threshold with the
153 | `memory_limit` option.
154 | 
155 | ##### Non-HTML links
156 | 
157 | The crawler attempts to stop itself from returning data from
158 | links that are not indicative of HTML, as detailed in
159 | `Arachnid2::NON_HTML_EXTENSIONS`.
160 | 
161 | #### Caching (optional)
162 | 
163 | If you have set up a cache to deduplicate crawls,
164 | set the cache service URL:
165 | `export ARACHNID_CACHED_SERVICE_ADDRESS=http://localhost:9000`
166 | 
167 | This expects a JSON API that accepts GET and POST requests
168 | at `/typhoeus_responses`, with the URL and the options passed
169 | exactly as received as parameters. The crawler will push any
170 | crawled responses to the service, and re-use previously crawled
171 | pages whenever a matching URL and options are found.
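
The sketch below is one way to stand up a compatible service, purely for illustration: it assumes Sinatra (not a dependency of this gem) and an in-memory hash. The only contract, as implemented in `lib/arachnid2/cached_responses.rb`, is that a GET returns JSON with an `encrypted_response` field holding a Base64-encoded, `Marshal`-dumped response, and that a POST accepts the same fields.

```ruby
# Hypothetical cache service, for illustration only. Any app exposing
# these two endpoints works; Sinatra and the in-memory store are assumptions.
require 'sinatra'
require 'json'

set :port, 9000

RESPONSES = {} # { [url, options] => Base64-encoded, Marshal-dumped response }

get '/typhoeus_responses' do
  key = [params['url'], params['options']]
  halt 404 unless RESPONSES.key?(key)

  content_type :json
  { encrypted_response: RESPONSES[key] }.to_json
end

post '/typhoeus_responses' do
  payload = JSON.parse(request.body.read)
  RESPONSES[[payload['url'], payload['options']]] = payload['encrypted_response']
  status 201
end
```

With a service like this running, exporting `ARACHNID_CACHED_SERVICE_ADDRESS=http://localhost:9000` is all the crawler needs to start reading and writing cached responses.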
172 | 
173 | ### With Watir
174 | 
175 | Arachnid2 can also crawl with Watir, gathering links
176 | just as it does with Typhoeus, but from pages that are
177 | actually rendered. You can access this option in one
178 | of two ways:
179 | 
180 | ```ruby
181 | # ...
182 | Arachnid2.new(url).crawl_watir(opts)
183 | # -or-
184 | with_watir = true # the default is `false`
185 | Arachnid2.new(url).crawl(opts, with_watir)
186 | ```
187 | 
188 | Arachnid2 has base defaults which you might want to address when
189 | employing Watir.
190 | 
191 | * First, the default crawl time is 15 seconds.
192 | As browser page loads can take this long, you will probably want to
193 | set a higher crawl time.
194 | * Also, simply storing the browser is not a great idea, since
195 | it will be inaccessible after it is closed.
196 | Instead, consider nabbing the HTML, cookies,
197 | or whatever content is required during the crawl.
198 | * Finally, note that Firefox is the default browser.
199 | 
200 | 
201 | ```ruby
202 | require 'arachnid2'
203 | 
204 | with_watir = true
205 | responses = []
206 | url = "http://maximumfun.org"
207 | max = 60
208 | browser = :chrome
209 | opts = {time_box: max, browser_type: browser}
210 | 
211 | spider = Arachnid2.new(url)
212 | spider.crawl(opts, with_watir) do |response|
213 |   response.body.wait_until(&:present?)
214 |   responses << response.body.html if response.body.present?
215 | end
216 | ```
217 | 
218 | #### Options
219 | 
220 | See the Typhoeus options above; most apply to Watir as well, with
221 | some exceptions:
222 | 
223 | ##### `proxy`
224 | 
225 | Watir proxy options are formatted differently:
226 | 
227 | ```ruby
228 | proxy: {
229 |   http: "troy.show:8080",
230 |   ssl: "abed.show:8080"
231 | },
232 | ```
233 | 
234 | Proxy options handling is done by Watir.
235 | 
236 | ##### `headless`
237 | 
238 | It accepts an argument to run the browser headlessly:
239 | 
240 | ```ruby
241 | opts = { headless: true }
242 | ```
243 | 
244 | ##### `agent`
245 | 
246 | It accepts an argument mapped to Webdriver::UserAgent::Driver's `agent` option:
247 | 
248 | ```ruby
249 | opts = { agent: :desktop }
250 | ```
251 | ##### `orientation`
252 | 
253 | It also accepts an argument mapped to Webdriver::UserAgent::Driver's `orientation` option:
254 | 
255 | ```ruby
256 | opts = { orientation: :landscape }
257 | ```
258 | 
259 | ##### `followlocation` and `max_concurrency`
260 | 
261 | These options do not apply to Watir, and will be ignored.
262 | 
263 | ## Development
264 | 
265 | Fork the repo and run the tests:
266 | 
267 | ```bash
268 | bundle exec rspec spec/
269 | ```
270 | 
271 | ## Contributing
272 | 
273 | Bug reports and pull requests are welcome on GitHub at
274 | https://github.com/samnissen/arachnid2.
275 | This project is intended to be a safe,
276 | welcoming space for collaboration,
277 | and contributors are expected to adhere to the
278 | [Contributor Covenant](http://contributor-covenant.org) code of conduct.
279 | 
280 | ## License
281 | 
282 | The gem is available as open source under the terms of the
283 | [MIT License](https://opensource.org/licenses/MIT).
284 | 
285 | ## Code of Conduct
286 | 
287 | Everyone interacting in the Arachnid2 project’s codebases,
288 | issue trackers, chat rooms and mailing lists is expected
289 | to follow the
290 | [code of conduct](https://github.com/samnissen/arachnid2/blob/master/CODE_OF_CONDUCT.md).
291 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require "rspec/core/rake_task" 3 | 4 | RSpec::Core::RakeTask.new(:spec) 5 | 6 | task :default => :spec 7 | -------------------------------------------------------------------------------- /arachnid2.gemspec: -------------------------------------------------------------------------------- 1 | 2 | lib = File.expand_path("../lib", __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require "arachnid2/version" 5 | 6 | Gem::Specification.new do |spec| 7 | spec.name = "arachnid2" 8 | spec.version = Arachnid2::VERSION 9 | spec.authors = ["Sam Nissen"] 10 | spec.email = ["scnissen@gmail.com"] 11 | 12 | spec.summary = %q{A simple, fast web crawler} 13 | # spec.description = %q{TODO: Write a longer description or delete this line.} 14 | spec.homepage = "https://github.com/samnissen/arachnid2" 15 | spec.license = "MIT" 16 | 17 | spec.files = `git ls-files -z`.split("\x0").reject do |f| 18 | f.match(%r{^(test|spec|features)/}) 19 | end 20 | spec.bindir = "exe" 21 | spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) } 22 | spec.require_paths = ["lib"] 23 | 24 | spec.add_development_dependency "bundler", "~> 1.16" 25 | spec.add_development_dependency "rake", ">= 12.3.3" 26 | spec.add_development_dependency "rspec", "~> 3.0" 27 | 28 | spec.add_dependency "webdriver-user-agent", ">= 7.6" 29 | spec.add_dependency "watir" 30 | spec.add_dependency "webdrivers" 31 | spec.add_dependency "typhoeus" 32 | spec.add_dependency "bloomfilter-rb" 33 | spec.add_dependency "adomain" 34 | spec.add_dependency "addressable" 35 | spec.add_dependency "nokogiri", ">= 1.10.4" 36 | end 37 | -------------------------------------------------------------------------------- /bin/console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "bundler/setup" 4 | require "arachnid2" 5 | 6 | # You can add fixtures and/or initialization code here to make experimenting 7 | # with your gem easier. You can also use a different console, if you like. 8 | 9 | # (If you use this, don't forget to add pry to your Gemfile!) 10 | # require "pry" 11 | # Pry.start 12 | 13 | require "irb" 14 | IRB.start(__FILE__) 15 | -------------------------------------------------------------------------------- /bin/setup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | set -vx 5 | 6 | bundle install 7 | 8 | # Do any other automated setup that you need to do here 9 | -------------------------------------------------------------------------------- /lib/arachnid2.rb: -------------------------------------------------------------------------------- 1 | require "arachnid2/version" 2 | require "arachnid2/cached_responses" 3 | require "arachnid2/exoskeleton" 4 | require "arachnid2/typhoeus" 5 | require "arachnid2/watir" 6 | 7 | require 'tempfile' 8 | require "typhoeus" 9 | require "bloomfilter-rb" 10 | require "adomain" 11 | require "addressable/uri" 12 | require "nokogiri" 13 | require "base64" 14 | require "webdrivers" 15 | require "webdriver-user-agent" 16 | require "watir" 17 | 18 | 19 | class Arachnid2 20 | # META: 21 | # About the origins of this crawling approach 22 | # The Crawler is heavily borrowed from by Arachnid. 
23 | # Original: https://github.com/dchuk/Arachnid 24 | # Other iterations I've borrowed liberally from: 25 | # - https://github.com/matstc/Arachnid 26 | # - https://github.com/intrigueio/Arachnid 27 | # - https://github.com/jhulme/Arachnid 28 | # And this was originally written as a part of Tellurion's bot 29 | # https://github.com/samnissen/tellurion_bot 30 | 31 | MAX_CRAWL_TIME = 10000 32 | BASE_CRAWL_TIME = 15 33 | MAX_URLS = 10000 34 | BASE_URLS = 50 35 | DEFAULT_LANGUAGE = "en-IE, en-UK;q=0.9, en-NL;q=0.8, en-MT;q=0.7, en-LU;q=0.6, en;q=0.5, *;0.4" 36 | DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15" 37 | 38 | DEFAULT_NON_HTML_EXTENSIONS = { 39 | 3 => ['.gz'], 40 | 4 => ['.jpg', '.png', '.m4a', '.mp3', '.mp4', '.pdf', '.zip', 41 | '.wmv', '.gif', '.doc', '.xls', '.pps', '.ppt', '.tar', 42 | '.iso', '.dmg', '.bin', '.ics', '.exe', '.wav', '.mid'], 43 | 5 => ['.xlsx', '.docx', '.pptx', '.tiff', '.zipx'], 44 | 8 => ['.torrent'] 45 | } 46 | MEMORY_USE_FILE = "/sys/fs/cgroup/memory/memory.usage_in_bytes" 47 | MEMORY_LIMIT_FILE = "/sys/fs/cgroup/memory/memory.limit_in_bytes" 48 | DEFAULT_MAXIMUM_LOAD_RATE = 79.9 49 | 50 | DEFAULT_TIMEOUT = 10_000 51 | MINIMUM_TIMEOUT = 1 52 | MAXIMUM_TIMEOUT = 999_999 53 | 54 | # 55 | # Creates the object to execute the crawl 56 | # 57 | # @example 58 | # url = "https://daringfireball.net" 59 | # spider = Arachnid2.new(url) 60 | # 61 | # @param [String] url 62 | # 63 | # @return [Arachnid2] self 64 | # 65 | def initialize(url) 66 | @url = url 67 | end 68 | 69 | # 70 | # Visits a URL, gathering links and visiting them, 71 | # until running out of time, memory or attempts. 72 | # 73 | # @example 74 | # url = "https://daringfireball.net" 75 | # spider = Arachnid2.new(url) 76 | # 77 | # opts = { 78 | # :followlocation => true, 79 | # :timeout => 25000, 80 | # :time_box => 30, 81 | # :headers => { 82 | # 'Accept-Language' => "en-UK", 83 | # 'User-Agent' => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0", 84 | # }, 85 | # :memory_limit => 89.99, 86 | # :proxy => { 87 | # :ip => "1.2.3.4", 88 | # :port => "1234", 89 | # :username => "sam", 90 | # :password => "coolcoolcool", 91 | # } 92 | # :non_html_extensions => { 93 | # 3 => [".abc", ".xyz"], 94 | # 4 => [".abcd"], 95 | # 6 => [".abcdef"], 96 | # 11 => [".abcdefghijk"] 97 | # } 98 | # } 99 | # responses = [] 100 | # spider.crawl(opts) { |response| 101 | # responses << response 102 | # } 103 | # 104 | # @param [Hash] opts 105 | # 106 | # @return nil 107 | # 108 | def crawl(opts = {}, with_watir = false) 109 | if with_watir 110 | crawl_watir(opts, &Proc.new) 111 | else 112 | Arachnid2::Typhoeus.new(@url).crawl(opts, &Proc.new) 113 | end 114 | end 115 | 116 | def crawl_watir(opts) 117 | Arachnid2::Watir.new(@url).crawl(opts, &Proc.new) 118 | end 119 | # https://mudge.name/2011/01/26/passing-blocks-in-ruby-without-block.html 120 | 121 | end 122 | -------------------------------------------------------------------------------- /lib/arachnid2/cached_responses.rb: -------------------------------------------------------------------------------- 1 | require 'net/http' 2 | require 'json' 3 | module CachedResponses 4 | CACHE_SERVICE_URL = ENV['ARACHNID_CACHED_SERVICE_ADDRESS'].freeze 5 | 6 | def load_data(_url, _options) 7 | return if check_config 8 | 9 | uri = URI("#{CACHE_SERVICE_URL}/typhoeus_responses?url=#{@url}&options=#{@options}") 10 | req = Net::HTTP::Get.new(uri) 11 | req['Accept'] = 
'json' 12 | Net::HTTP.start(uri.hostname, uri.port) do |http| 13 | response = http.request(req) 14 | return nil if response.code != '200' 15 | 16 | body = ::JSON.parse(response.body) 17 | responses_list = Base64.decode64(body['encrypted_response']) 18 | return Marshal.load responses_list # here we get an Array of `Typhoeus::Response`s 19 | end 20 | rescue StandardError 21 | nil 22 | end 23 | 24 | def put_cached_data(url, options, data) 25 | return if check_config 26 | 27 | uri = URI("#{CACHE_SERVICE_URL}/typhoeus_responses") 28 | 29 | header = { 'Content-Type': 'application/json' } 30 | req = Net::HTTP::Post.new(uri, header) 31 | processed_data = Base64.encode64(Marshal.dump(data)) 32 | req.body = { url: url, options: options, encrypted_response: processed_data }.to_json 33 | Net::HTTP.start(uri.hostname, uri.port) do |http| 34 | http.request(req) 35 | end 36 | end 37 | 38 | def check_config 39 | CACHE_SERVICE_URL.nil? 40 | end 41 | end 42 | -------------------------------------------------------------------------------- /lib/arachnid2/exoskeleton.rb: -------------------------------------------------------------------------------- 1 | class Arachnid2 2 | module Exoskeleton 3 | def browser_type 4 | unless @browser_type 5 | @browser_type = "#{@options[:browser_type]}".to_sym if @options[:browser_type] 6 | @browser_type ||= :firefox 7 | end 8 | 9 | @browser_type 10 | end 11 | 12 | def process(url, html) 13 | return false unless Adomain["#{url}"]&.include? @domain 14 | 15 | extract_hrefs(html) 16 | end 17 | 18 | def extract_hrefs(body) 19 | elements = Nokogiri::HTML.parse(body).css('a') 20 | return elements.map {|link| link.attribute('href').to_s}.uniq.sort.delete_if {|href| href.empty? } 21 | end 22 | 23 | def vacuum(links, url) 24 | links.each do |link| 25 | next if link.match(/^\(|^javascript:|^mailto:|^#|^\s*$|^about:/) 26 | 27 | begin 28 | absolute_link = make_absolute(link, url) 29 | 30 | next if skip_link?(absolute_link) 31 | 32 | @global_queue << absolute_link 33 | rescue Addressable::URI::InvalidURIError 34 | end 35 | end 36 | end 37 | 38 | def skip_link?(absolute_link) 39 | !internal_link?(absolute_link) || \ 40 | @global_visited.include?(absolute_link) || \ 41 | extension_ignored?(absolute_link) || \ 42 | @global_queue.include?(absolute_link) 43 | end 44 | 45 | def preflight(opts) 46 | @options = opts 47 | @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => true) 48 | @global_queue = [@url] 49 | end 50 | 51 | def proxy 52 | @options[:proxy] 53 | end 54 | 55 | def non_html_extensions 56 | return @non_html_extensions if @non_html_extensions 57 | 58 | @non_html_extensions = @options[:non_html_extensions] 59 | @non_html_extensions ||= DEFAULT_NON_HTML_EXTENSIONS 60 | end 61 | 62 | def bound_time 63 | boundary = "#{@options[:time_box]}".to_i 64 | boundary = BASE_CRAWL_TIME if boundary <= 0 65 | boundary = MAX_CRAWL_TIME if boundary > MAX_CRAWL_TIME 66 | 67 | return Time.now + boundary 68 | end 69 | 70 | def bound_urls 71 | amount = "#{@options[:max_urls]}".to_i 72 | amount = BASE_URLS if amount <= 0 73 | amount = MAX_URLS if amount > MAX_URLS 74 | 75 | amount 76 | end 77 | 78 | def timeout 79 | unless @timeout 80 | @timeout = @options[:timeout] 81 | @timeout = DEFAULT_TIMEOUT unless @timeout.is_a?(Integer) 82 | @timeout = DEFAULT_TIMEOUT if @timeout > MAXIMUM_TIMEOUT 83 | @timeout = DEFAULT_TIMEOUT if @timeout < MINIMUM_TIMEOUT 84 | end 85 | @timeout 86 | end 87 | 88 | def crawl_options 89 | @crawl_options ||= { max_urls: 
max_urls, time_limit: time_limit } 90 | end 91 | 92 | alias_method :max_urls, :bound_urls 93 | 94 | alias_method :time_limit, :bound_time 95 | 96 | def make_absolute(href, root) 97 | Addressable::URI.parse(root).join(Addressable::URI.parse(href)).to_s 98 | end 99 | 100 | def internal_link?(absolute_url) 101 | "#{Adomain[absolute_url]}".include? @domain 102 | end 103 | 104 | def extension_ignored?(url) 105 | return false if url.empty? 106 | 107 | !non_html_extensions.values.flatten.find { |e| url.downcase.end_with? e.downcase }.nil? 108 | end 109 | 110 | def memory_danger? 111 | return false unless in_docker? 112 | 113 | use = "#{File.open(MEMORY_USE_FILE, "rb").read}".to_f 114 | @limit ||= "#{File.open(MEMORY_LIMIT_FILE, "rb").read}".to_f 115 | 116 | return false unless ( (use > 0.0) && (@limit > 0.0) ) 117 | 118 | return ( ( (use / @limit) * 100.0 ) >= maximum_load_rate ) 119 | end 120 | 121 | def in_docker? 122 | File.file?(MEMORY_USE_FILE) 123 | end 124 | 125 | def maximum_load_rate 126 | return @maximum_load_rate if @maximum_load_rate 127 | 128 | @maximum_load_rate = "#{@options[:memory_limit]}".to_f 129 | @maximum_load_rate = DEFAULT_MAXIMUM_LOAD_RATE unless ((@maximum_load_rate > 0.0) && (@maximum_load_rate < 100.0)) 130 | @maximum_load_rate 131 | end 132 | end 133 | end 134 | -------------------------------------------------------------------------------- /lib/arachnid2/typhoeus.rb: -------------------------------------------------------------------------------- 1 | class Arachnid2 2 | class Typhoeus 3 | include CachedResponses 4 | include Arachnid2::Exoskeleton 5 | 6 | def initialize(url) 7 | @url = url 8 | @domain = Adomain[@url] 9 | @cached_data = [] 10 | end 11 | 12 | def crawl(opts = {}) 13 | preflight(opts) 14 | typhoeus_preflight 15 | 16 | until @global_queue.empty? 17 | max_concurrency.times do 18 | q = @global_queue.shift 19 | 20 | break if time_to_stop? 21 | @global_visited.insert(q) 22 | 23 | found_in_cache = use_cache(q, opts, &Proc.new) 24 | return if found_in_cache 25 | 26 | request = ::Typhoeus::Request.new(q, request_options) 27 | requestable = after_request(request, &Proc.new) 28 | @hydra.queue(request) if requestable 29 | end # max_concurrency.times do 30 | 31 | @hydra.run 32 | end # until @global_queue.empty? 33 | ensure 34 | @cookie_file.close! if @cookie_file 35 | end # def crawl(opts = {}) 36 | 37 | private 38 | def after_request(request) 39 | request.on_complete do |response| 40 | cacheable = use_response(response, &Proc.new) 41 | return unless cacheable 42 | 43 | put_cached_data(response.effective_url, @options, response) 44 | end 45 | 46 | true 47 | end 48 | 49 | def use_response(response) 50 | links = process(response.effective_url, response.body) 51 | return unless links 52 | 53 | yield response 54 | 55 | vacuum(links, response.effective_url) 56 | true 57 | end 58 | 59 | def use_cache(url, options) 60 | data = load_data(url, options) 61 | use_response(data, &Proc.new) if data 62 | 63 | data 64 | end 65 | 66 | def time_to_stop? 67 | @global_visited.size >= crawl_options[:max_urls] || \ 68 | Time.now > crawl_options[:time_limit] || \ 69 | memory_danger? 
70 | end 71 | 72 | def typhoeus_preflight 73 | @hydra = ::Typhoeus::Hydra.new(:max_concurrency => max_concurrency) 74 | typhoeus_proxy_options 75 | end 76 | 77 | def max_concurrency 78 | return @max_concurrency if @max_concurrency 79 | 80 | @max_concurrency = "#{@options[:max_concurrency]}".to_i 81 | @max_concurrency = 1 unless (@max_concurrency > 0) 82 | @max_concurrency 83 | end 84 | 85 | def followlocation 86 | return @followlocation unless @followlocation.nil? 87 | 88 | @followlocation = @options[:followlocation] 89 | @followlocation = true unless @followlocation.is_a?(FalseClass) 90 | end 91 | 92 | def request_options 93 | @cookie_file ||= Tempfile.new('cookies') 94 | 95 | @request_options = { 96 | timeout: timeout, 97 | followlocation: followlocation, 98 | cookiefile: @cookie_file.path, 99 | cookiejar: @cookie_file.path, 100 | headers: @options[:headers] 101 | }.merge(crawl_options[:proxy]) 102 | 103 | @request_options[:headers] ||= {} 104 | @request_options[:headers]['Accept-Language'] ||= DEFAULT_LANGUAGE 105 | @request_options[:headers]['User-Agent'] ||= DEFAULT_USER_AGENT 106 | 107 | @request_options 108 | end 109 | 110 | def typhoeus_proxy_options 111 | crawl_options[:proxy] = {} 112 | 113 | crawl_options[:proxy][:proxy] = "#{@options[:proxy][:ip]}:#{@options[:proxy][:port]}" if @options.dig(:proxy, :ip) 114 | crawl_options[:proxy][:proxyuserpwd] = "#{@options[:proxy][:username]}:#{@options[:proxy][:password]}" if @options.dig(:proxy, :username) 115 | end 116 | 117 | end 118 | end 119 | -------------------------------------------------------------------------------- /lib/arachnid2/version.rb: -------------------------------------------------------------------------------- 1 | class Arachnid2 2 | VERSION = "0.4.0" 3 | end 4 | -------------------------------------------------------------------------------- /lib/arachnid2/watir.rb: -------------------------------------------------------------------------------- 1 | class Arachnid2 2 | class Watir 3 | DEFAULT_AGENT = :desktop 4 | DEFAULT_ORIENTATION = :landscape 5 | 6 | include Arachnid2::Exoskeleton 7 | 8 | def initialize(url) 9 | @url = url 10 | @domain = Adomain[@url] 11 | end 12 | 13 | def crawl(opts) 14 | preflight(opts) 15 | watir_preflight 16 | @already_retried = false 17 | 18 | until @global_queue.empty? 19 | q = @global_queue.shift 20 | links = nil 21 | 22 | break if time_to_stop? 23 | 24 | @global_visited.insert(q) 25 | 26 | make_request(q, &Proc.new) 27 | end # until @global_queue.empty? 28 | ensure 29 | @browser.close if @browser rescue nil 30 | @headless.destroy if @headless rescue nil 31 | end 32 | 33 | private 34 | def make_request(q) 35 | begin 36 | links = browse_links(q, &Proc.new) 37 | return unless links 38 | 39 | vacuum(links, browser.url) 40 | rescue Selenium::WebDriver::Error::NoSuchWindowError, Net::ReadTimeout => e 41 | msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \ 42 | "is ignoring an error: " \ 43 | "#{e.class} - #{e.message}" 44 | puts msg 45 | rescue => e 46 | raise e if raise_before_retry?(e.class) 47 | msg = "WARNING [arachnid2] Arachnid2::Watir#make_request " \ 48 | "is retrying once after an error: " \ 49 | "#{e.class} - #{e.message}" 50 | puts msg 51 | e.backtrace[0..4].each{|l| puts "\t#{l}"}; puts "..." 52 | reset_for_retry 53 | end 54 | end 55 | 56 | def browse_links(url) 57 | return unless navigate(url) 58 | 59 | yield browser 60 | 61 | process(browser.url, browser.body.html) if browser.body.exists? 
62 | end 63 | 64 | def navigate(url) 65 | begin 66 | browser.goto url 67 | rescue Selenium::WebDriver::Error::UnknownError => e 68 | # Firefox and Selenium, in their infinite wisdom 69 | # raise an error when a page cannot be loaded. 70 | # At the time of writing this, the page at 71 | # thewirecutter.com/cars/accessories-auto 72 | # causes such an issue (too many redirects). 73 | # This error handling moves us on from those pages. 74 | raise e unless e.message =~ /.*Reached error page.*/i 75 | return 76 | end 77 | 78 | true 79 | end 80 | 81 | def time_to_stop? 82 | @global_visited.size >= crawl_options[:max_urls] || \ 83 | Time.now > crawl_options[:time_limit] || \ 84 | memory_danger? 85 | end 86 | 87 | def raise_before_retry?(klass) 88 | @already_retried || \ 89 | "#{klass}".include?("Selenium") || \ 90 | "#{klass}".include?("Watir") 91 | end 92 | 93 | def reset_for_retry 94 | @browser.close if @browser rescue nil 95 | @headless.destroy if @headless rescue nil 96 | @driver.quit if @headless rescue nil 97 | @driver = nil 98 | @browser = nil 99 | @already_retried = true 100 | end 101 | 102 | def browser 103 | unless @browser 104 | behead if @make_headless 105 | 106 | @browser = create_browser 107 | 108 | set_timeout 109 | end 110 | 111 | return @browser 112 | end 113 | 114 | def create_browser 115 | return ::Watir::Browser.new(driver, proxy: @proxy) if @proxy 116 | 117 | ::Watir::Browser.new driver 118 | end 119 | 120 | def set_timeout 121 | @browser.driver.manage.timeouts.page_load = timeout 122 | end 123 | 124 | def behead 125 | @headless = Headless.new 126 | @headless.start 127 | end 128 | 129 | def driver 130 | unless @driver 131 | language = @options.dig(:headers, "Accept-Language") || DEFAULT_LANGUAGE 132 | user_agent = @options.dig(:headers, "User-Agent") || DEFAULT_USER_AGENT 133 | agent = @options.dig(:agent) || DEFAULT_AGENT 134 | orientation = @options.dig(:orientation) || DEFAULT_ORIENTATION 135 | 136 | @driver = Webdriver::UserAgent.driver( 137 | browser: browser_type, 138 | agent: agent, 139 | orientation: orientation, 140 | accept_language_string: language, 141 | user_agent_string: user_agent 142 | ) 143 | end 144 | 145 | @driver 146 | end 147 | 148 | def watir_preflight 149 | watir_proxy_options 150 | @make_headless = @options[:headless] 151 | end 152 | 153 | def watir_proxy_options 154 | crawl_options[:proxy] = {} 155 | 156 | crawl_options[:proxy][:http] = @options[:proxy][:http] if @options.dig(:proxy, :http) 157 | crawl_options[:proxy][:ssl] = @options[:proxy][:ssl] if @options.dig(:proxy, :ssl) 158 | end 159 | end 160 | 161 | end 162 | -------------------------------------------------------------------------------- /spec/arachnid2/exoskeleton_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper.rb' 2 | 3 | RSpec.describe Arachnid2::Exoskeleton do 4 | describe "#memory_danger?" 
do 5 | let(:dummy) { (Class.new { include Arachnid2::Exoskeleton }).new } 6 | before(:each) do 7 | dummy.instance_variable_set(:@url, "http://dummy.com") 8 | dummy.instance_variable_set(:@domain, "dummy.com") 9 | 10 | allow(dummy).to receive(:in_docker?).and_return(true) 11 | dummy.instance_variable_set(:@maximum_load_rate, 50.00) 12 | end 13 | 14 | it "stops execution when memory limit is reached" do 15 | use_file = OpenStruct.new({read: 99.9999}) 16 | limit_file = OpenStruct.new({read: 100.0000}) 17 | 18 | allow(File).to receive(:open).with(Arachnid2::MEMORY_USE_FILE, 'rb').and_return(use_file) 19 | allow(File).to receive(:open).with(Arachnid2::MEMORY_LIMIT_FILE, 'rb').and_return(limit_file) 20 | 21 | expect(dummy.memory_danger?).to be_truthy 22 | end 23 | 24 | it "does not stop execution when memory limit is not yet reached" do 25 | use_file = OpenStruct.new({read: 1.0}) 26 | limit_file = OpenStruct.new({read: 100.0000}) 27 | 28 | allow(File).to receive(:open).with(Arachnid2::MEMORY_USE_FILE, 'rb').and_return(use_file) 29 | allow(File).to receive(:open).with(Arachnid2::MEMORY_LIMIT_FILE, 'rb').and_return(limit_file) 30 | 31 | expect(dummy.memory_danger?).to be_falsey 32 | end 33 | end 34 | end 35 | -------------------------------------------------------------------------------- /spec/arachnid2/typhoeus_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper.rb' 2 | 3 | RSpec.describe Arachnid2::Typhoeus do 4 | describe "#crawl" do 5 | it "accepts the options" do 6 | url = "https://daringfireball.net" 7 | spider = Arachnid2::Typhoeus.new(url) 8 | opts = { 9 | followlocation: true, 10 | timeout: 12000, 11 | time_box: 10, 12 | max_urls: 1, 13 | headers: { 14 | 'Accept-Language' => "en-UK", 15 | 'User-Agent' => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0", 16 | }, 17 | max_concurrency: 5, 18 | memory_limit: 39.99, 19 | proxy: { 20 | ip: "1.2.3.4", 21 | port: "1234", 22 | username: "sam", 23 | password: "coolcoolcool", 24 | }, 25 | non_html_extensions: { 26 | 2 => [".oh"], 27 | 3 => [".omg"], 28 | 5 => [".ohhai"], 29 | } 30 | } 31 | 32 | spider.crawl(opts){} 33 | 34 | crawl_options = spider.instance_variable_get(:@crawl_options) 35 | request_options = spider.instance_variable_get(:@request_options) 36 | maximum_load_rate = spider.send(:maximum_load_rate) 37 | max_concurrency = spider.send(:max_concurrency) 38 | hydra = spider.instance_variable_get(:@hydra) 39 | followlocation = spider.send(:followlocation) 40 | non_html_extensions = spider.send(:non_html_extensions) 41 | timeout = spider.instance_variable_get(:@timeout) 42 | 43 | expect(crawl_options[:time_limit]).to be_a(Time) 44 | expect(crawl_options[:max_urls]).to be_an(Integer) 45 | expect(crawl_options[:proxy][:proxy]).to eq("1.2.3.4:1234") 46 | expect(crawl_options[:proxy][:proxyuserpwd]).to eq("sam:coolcoolcool") 47 | expect(request_options).not_to be_nil 48 | expect(request_options[:headers]).to eq(opts[:headers]) 49 | expect(maximum_load_rate).to eq(39.99) 50 | expect(max_concurrency).to eq(5) 51 | expect(hydra).to be_a(Typhoeus::Hydra) 52 | expect(followlocation).to eq(true) 53 | expect(timeout).to eq(12000) 54 | expect(non_html_extensions.values.flatten).to eq([".oh", ".omg", ".ohhai"]) 55 | end 56 | 57 | it "visits the URL" do 58 | url = "https://daringfireball.net" 59 | spider = Arachnid2::Typhoeus.new(url) 60 | opts = { 61 | time_box: 10, 62 | max_urls: 2 63 | } 64 | responses = [] 65 | 66 | spider.crawl(opts){|r| responses << r} 67 | 
global_visited = spider.instance_variable_get(:@global_visited) 68 | global_queue = spider.instance_variable_get(:@global_queue) 69 | 70 | expect(global_visited.size).to be > 0 71 | expect(responses.size).to be > 0 72 | end 73 | 74 | context "data is available in the cache" do 75 | let!(:url) { "https://daringfireball.net" } 76 | let!(:spider) { Arachnid2::Typhoeus.new(url) } 77 | let!(:opts) { { time_box: 10, max_urls: 1 } } 78 | let!(:payload) { 79 | OpenStruct.new({effective_url: "http://daringfireball.net", body: ""}) 80 | } # note that the url and effective_url domains must match 81 | 82 | before(:each) do 83 | allow(spider).to receive(:load_data).with(url, opts).and_return(payload) 84 | end 85 | 86 | it "loads data from the cache" do 87 | responses = [] 88 | expect(spider).to receive(:load_data).with(url, opts).and_return(payload) 89 | 90 | spider.crawl(opts){|r| responses << r} 91 | expect(responses).to include(payload) 92 | end 93 | end 94 | end 95 | end 96 | -------------------------------------------------------------------------------- /spec/arachnid2/watir_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper.rb' 2 | 3 | RSpec.describe Arachnid2::Watir do 4 | describe "#crawl" do 5 | it "accepts the options" do 6 | url = "https://daringfireball.net" 7 | spider = Arachnid2::Watir.new(url) 8 | opts = { 9 | browser_type: :chrome, 10 | timeout: 12000, 11 | time_box: 10, 12 | max_urls: 1, 13 | headers: { 14 | 'Accept-Language' => "en-UK", 15 | 'User-Agent' => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0", 16 | }, 17 | memory_limit: 39.99, 18 | proxy: { 19 | http: "troy.show:8080", 20 | ssl: "abed.show:8080" 21 | }, 22 | non_html_extensions: { 23 | 2 => [".oh"], 24 | 3 => [".omg"], 25 | 5 => [".ohhai"], 26 | }, 27 | headers: { 28 | 'Accept-Language' => "es-ES", 29 | 'User-Agent' => "Sam's Custom Browser" 30 | }, 31 | headless: false, 32 | agent: :iphone, 33 | orientation: :portrait 34 | } 35 | 36 | spider.crawl(opts) { |browser| 37 | @header_language = browser.execute_script("return navigator.language") unless @header_language 38 | @header_user_agent = browser.execute_script("return navigator.userAgent") unless @header_user_agent 39 | @portrait = browser.execute_script("return (window.innerHeight > window.innerWidth)") unless @portrait 40 | @window_width = browser.execute_script("return window.innerWidth") unless @window_width 41 | } 42 | 43 | crawl_options = spider.crawl_options 44 | maximum_load_rate = spider.maximum_load_rate 45 | non_html_extensions = spider.non_html_extensions 46 | timeout = spider.timeout 47 | make_headless = spider.instance_variable_get(:@make_headless) 48 | 49 | expect(crawl_options[:time_limit]).to be_a(Time) 50 | expect(crawl_options[:max_urls]).to be_an(Integer) 51 | expect(crawl_options[:proxy][:http]).to eq("troy.show:8080") 52 | expect(crawl_options[:proxy][:ssl]).to eq("abed.show:8080") 53 | expect(@header_language).to include(opts[:headers]['Accept-Language']) 54 | expect(@header_user_agent).to eq(opts[:headers]['User-Agent']) 55 | expect(@portrait).to be_truthy 56 | expect(@window_width).to be < 525 57 | expect(maximum_load_rate).to eq(39.99) 58 | expect(timeout).to eq(12000) 59 | expect(non_html_extensions.values.flatten).to eq([".oh", ".omg", ".ohhai"]) 60 | end 61 | 62 | it "visits the URL" do 63 | url = "https://daringfireball.net" 64 | spider = Arachnid2::Watir.new(url) 65 | opts = { 66 | time_box: 10, 67 | max_urls: 2 68 | } 69 | responses = [] 70 | 
71 | spider.crawl(opts){|r| responses << r} 72 | global_visited = spider.instance_variable_get(:@global_visited) 73 | global_queue = spider.instance_variable_get(:@global_queue) 74 | 75 | expect(global_visited.size).to be > 0 76 | expect(responses.size).to be > 0 77 | end 78 | 79 | it "uses Watir when requested" do 80 | spider = Arachnid2.new("http://test.com") 81 | allow_any_instance_of(Arachnid2::Watir).to receive(:crawl).with(anything).and_return(true) 82 | expect{ spider.crawl(opts = {max_urls: 1, time_box: 1}, with_watir = true) {} }.not_to raise_error 83 | end 84 | 85 | it "only uses one crawling technology type" do 86 | spider = Arachnid2.new("http://daringfireball.net") 87 | # allow_any_instance_of(Arachnid2::Watir).to receive(:crawl).with(anything).and_return(true) 88 | expect_any_instance_of(Arachnid2::Typhoeus).not_to receive(:crawl) 89 | spider.crawl(opts = {max_urls: 2, time_box: 5}, with_watir = true) {} 90 | end 91 | 92 | it "crawls past any Net::ReadTimeout issues" do 93 | spider = Arachnid2.new("https://www.themcelroy.family") 94 | opts = {max_urls: 3, time_box: 10} 95 | with_watir = true 96 | 97 | allow_any_instance_of(::Watir::Browser).to receive(:goto).with(anything).and_raise(Net::ReadTimeout) 98 | 99 | expect{ 100 | spider.crawl(opts, with_watir) {} 101 | }.not_to raise_error 102 | end 103 | 104 | it "crawls past any Selenium::WebDriver::Error::NoSuchWindowError issues" do 105 | spider = Arachnid2.new("https://www.themcelroy.family") 106 | opts = {max_urls: 3, time_box: 10} 107 | with_watir = true 108 | 109 | allow_any_instance_of(::Watir::Browser).to receive(:goto).with(anything).and_raise(Selenium::WebDriver::Error::NoSuchWindowError) 110 | 111 | expect{ 112 | spider.crawl(opts, with_watir) {} 113 | }.not_to raise_error 114 | end 115 | 116 | it "does not fail when the browser cannot locate the " do 117 | spider = Arachnid2.new("https://www.themcelroy.family") 118 | opts = {max_urls: 3, time_box: 10} 119 | with_watir = true 120 | 121 | allow_any_instance_of(::Watir::Body).to receive(:html).and_raise(Watir::Exception::UnknownObjectException) 122 | allow_any_instance_of(::Watir::Body).to receive(:exists?).and_return(false) 123 | 124 | expect{ 125 | spider.crawl(opts, with_watir) {} 126 | }.not_to raise_error 127 | end 128 | 129 | it "rescues one error when the browser connection is lost" do 130 | spider = Arachnid2::Watir.new("https://stratechery.com") 131 | opts = {max_urls: 3, time_box: 60} 132 | 133 | Object.const_set("MyCustomTestError", Class.new(StandardError)) 134 | 135 | allow_any_instance_of(Arachnid2::Watir).to receive(:preflight).with(opts).and_return(true) 136 | allow_any_instance_of(Arachnid2::Watir).to receive(:watir_preflight).and_return(true) 137 | 138 | queue = [ 139 | "https://stratechery.com", 140 | "http://stratechery.com/about/", 141 | "https://stratechery.com/concepts/" 142 | ] 143 | spider.instance_variable_set(:@options, opts) 144 | spider.instance_variable_set(:@global_queue, queue) 145 | bf = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => true) 146 | spider.instance_variable_set(:@global_visited, bf) 147 | spider.instance_variable_set(:@make_headless, !OS.mac?) 
148 | 149 | browser = spider.send(:create_browser) 150 | spider.instance_variable_set(:@browser, browser) 151 | 152 | allow(browser).to receive(:url).and_raise(MyCustomTestError) 153 | 154 | expect{ 155 | spider.crawl(opts) {} 156 | }.not_to raise_error 157 | end 158 | 159 | it "stops after more than one error" do 160 | spider = Arachnid2::Watir.new("https://stratechery.com") 161 | opts = {max_urls: 3, time_box: 60} 162 | 163 | Object.const_set("MyCustomTestError", Class.new(StandardError)) unless Object.const_defined?("MyCustomTestError") 164 | 165 | allow_any_instance_of(::Watir::Browser).to receive(:url).and_raise(MyCustomTestError) 166 | allow_any_instance_of(Arachnid2::Watir).to receive(:preflight).with(opts).and_return(true) 167 | allow_any_instance_of(Arachnid2::Watir).to receive(:watir_preflight).and_return(true) 168 | 169 | queue = [ 170 | "https://stratechery.com", 171 | "http://stratechery.com/about/", 172 | "https://stratechery.com/concepts/" 173 | ] 174 | spider.instance_variable_set(:@options, opts) 175 | spider.instance_variable_set(:@global_queue, queue) 176 | bf = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => true) 177 | spider.instance_variable_set(:@global_visited, bf) 178 | spider.instance_variable_set(:@make_headless, !OS.mac?) 179 | 180 | expect{ 181 | spider.crawl(opts) {} 182 | }.to raise_error(MyCustomTestError) 183 | end 184 | end 185 | end 186 | -------------------------------------------------------------------------------- /spec/arachnid2_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper.rb' 2 | 3 | RSpec.describe Arachnid2 do 4 | it "has a version number" do 5 | expect(Arachnid2::VERSION).not_to be nil 6 | end 7 | 8 | describe "#initialize" do 9 | it "sets the URL" do 10 | url = "http://test.com" 11 | spider = Arachnid2.new url 12 | expect(spider.instance_variable_get(:@url)).to eq(url) 13 | end 14 | end 15 | end 16 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | require "bundler/setup" 2 | require "arachnid2" 3 | require "ostruct" 4 | 5 | RSpec.configure do |config| 6 | # Enable flags like --only-failures and --next-failure 7 | config.example_status_persistence_file_path = ".rspec_status" 8 | 9 | # Disable RSpec exposing methods globally on `Module` and `main` 10 | config.disable_monkey_patching! 11 | 12 | config.expect_with :rspec do |c| 13 | c.syntax = :expect 14 | end 15 | end 16 | --------------------------------------------------------------------------------