├── .github
│   └── workflows
│       └── ci.yml
├── .gitignore
├── CHANGELOG.md
├── Gemfile
├── LICENSE.txt
├── README.md
├── Rakefile
├── grubby.gemspec
├── lib
│   ├── grubby.rb
│   └── grubby
│       ├── core_ext
│       │   ├── string.rb
│       │   └── uri.rb
│       ├── json_parser.rb
│       ├── json_scraper.rb
│       ├── log.rb
│       ├── mechanize
│       │   ├── download.rb
│       │   ├── fetch_with_retry.rb
│       │   ├── file.rb
│       │   ├── link.rb
│       │   ├── page.rb
│       │   └── parser.rb
│       ├── page_scraper.rb
│       ├── scraper.rb
│       └── version.rb
└── test
    ├── core_ext
    │   ├── string_test.rb
    │   └── uri_test.rb
    ├── grubby_test.rb
    ├── json_parser_test.rb
    ├── json_scraper_test.rb
    ├── log_test.rb
    ├── mechanize
    │   ├── download_test.rb
    │   ├── fetch_with_retry_test.rb
    │   ├── file_test.rb
    │   ├── link_test.rb
    │   ├── page_test.rb
    │   └── parser_test.rb
    ├── page_scraper_test.rb
    ├── scraper_test.rb
    └── test_helper.rb

/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 | on: [push, pull_request]
3 | jobs:
4 |   tests:
5 |     strategy:
6 |       matrix:
7 |         ruby: ["2.7", "3.0", "3.1", "3.2"]
8 |       fail-fast: false
9 | 
10 |     runs-on: ubuntu-latest
11 | 
12 |     steps:
13 |       - uses: actions/checkout@v2
14 | 
15 |       - uses: ruby/setup-ruby@v1
16 |         with:
17 |           ruby-version: ${{ matrix.ruby }}
18 |           bundler-cache: true
19 | 
20 |       - run: bundle exec rake test
21 | 

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /.bundle/
2 | /.yardoc
3 | /Gemfile.lock
4 | /_yardoc/
5 | /coverage/
6 | /doc/
7 | /gemfiles/*.lock
8 | /pkg/
9 | /spec/reports/
10 | /tmp/
11 | 

--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | ## 2.0.0
2 | 
3 | * [BREAKING] Drop support for Active Support < 6.0
4 | * [BREAKING] Require casual_support ~> 4.0
5 | * [BREAKING] Require mini_sanity ~> 2.0
6 | * [BREAKING] Require pleasant_path ~> 2.0
7 | * [BREAKING] Remove `JsonParser.json_parse_options`
8 |   * Use `::JSON.load_default_options` instead
9 | * [BREAKING] Rename `Grubby#singleton` to `Grubby#fulfill`
10 | * [BREAKING] Change `Grubby#fulfill` to return block's result
11 | 
12 | 
13 | ## 1.2.1
14 | 
15 | * Add `JsonParser#mech` attribute for parity with `Mechanize::Page#mech`
16 | * Ensure time spent fetching a response does not count toward the time
17 |   to sleep between requests
18 | * Prevent sleep between requests when following a redirect
19 | * Prevent duplicates in `Scraper.fields`
20 | * Fix `URI#query_param` when query is nil
21 | * Fix `PageScraper.scrape_file` and `JsonScraper.scrape_file` when path
22 |   contains characters that need to be URI-encoded
23 | 
24 | 
25 | ## 1.2.0
26 | 
27 | * Add `Grubby#journal=`
28 | * Add `$grubby` global default `Grubby` instance
29 | * Add `Scraper.scrape`
30 | * Add `Scraper.each`
31 | * Support `:if` and `:unless` options for `Scraper.scrapes`
32 | * Fix fail-fast behavior of inherited scraper fields
33 | * Fix `JsonParser` on empty response body
34 | * Loosen Active Support version constraint
35 | 
36 | 
37 | ## 1.1.0
38 | 
39 | * Add `Grubby#ok?`
40 | * Add `PageScraper.scrape_file` and `JsonScraper.scrape_file`
41 | * Add `Mechanize::Parser#save_to` and `Mechanize::Parser#save_to!`,
42 |   which are inherited by `Mechanize::Download` and `Mechanize::File`
43 | * Add `URI#basename`
44 | * Add `URI#query_param`
45 | * Add utility methods from [ryoba](https://rubygems.org/gems/ryoba)
46 | * Add `Scraper::Error#scraper` and 
`Scraper#errors` for interactive 47 | debugging with e.g. `byebug` 48 | * Improve log messages and error formatting 49 | * Fix compatibility with net-http-persistent gem v3.0 50 | 51 | 52 | ## 1.0.0 53 | 54 | * Initial release 55 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | # Specify your gem's dependencies in grubby.gemspec 4 | gemspec 5 | 6 | gem "rake", "~> 12.0" 7 | gem "minitest", "~> 5.0" 8 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017 Jonathan Hefner 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # grubby 2 | 3 | [Fail-fast] web scraping. *grubby* adds a layer of utility and 4 | error-checking atop the marvelous [Mechanize gem]. See API listing 5 | below, or browse the [full documentation]. 6 | 7 | [Fail-fast]: https://en.wikipedia.org/wiki/Fail-fast 8 | [Mechanize gem]: https://rubygems.org/gems/mechanize 9 | [full documentation]: https://www.rubydoc.info/gems/grubby/ 10 | 11 | 12 | ## Examples 13 | 14 | The following code scrapes stories from the [Hacker News]( 15 | https://news.ycombinator.com/news) front page: 16 | 17 | ```ruby 18 | require "grubby" 19 | 20 | class HackerNews < Grubby::PageScraper 21 | scrapes(:items) do 22 | page.search!(".athing").map{|element| Item.new(element) } 23 | end 24 | 25 | class Item < Grubby::Scraper 26 | scrapes(:story_link){ source.at!("a.storylink") } 27 | 28 | scrapes(:story_url){ expand_url(story_link["href"]) } 29 | 30 | scrapes(:title){ story_link.text } 31 | 32 | scrapes(:comments_link, optional: true) do 33 | source.next_sibling.search!(".subtext a").find do |link| 34 | link.text.match?(/comment|discuss/) 35 | end 36 | end 37 | 38 | scrapes(:comments_url, if: :comments_link) do 39 | expand_url(comments_link["href"]) 40 | end 41 | 42 | scrapes(:comment_count, if: :comments_link) do 43 | comments_link.text.to_i 44 | end 45 | 46 | def expand_url(url) 47 | url.include?("://") ? 
url : source.document.uri.merge(url).to_s 48 | end 49 | end 50 | end 51 | 52 | # The following line will raise an exception if anything goes wrong 53 | # during the scraping process. For example, if the structure of the 54 | # HTML does not match expectations due to a site change, the script will 55 | # terminate immediately with a helpful error message. This prevents bad 56 | # data from propagating and causing hard-to-trace errors. 57 | hn = HackerNews.scrape("https://news.ycombinator.com/news") 58 | 59 | # Your processing logic goes here: 60 | hn.items.take(10).each do |item| 61 | puts "* #{item.title}" 62 | puts " #{item.story_url}" 63 | puts " #{item.comment_count} comments: #{item.comments_url}" if item.comments_url 64 | puts 65 | end 66 | ``` 67 | 68 | Hacker News also offers a [JSON API](https://github.com/HackerNews/API), 69 | which may be more robust for scraping purposes. *grubby* can scrape 70 | JSON just as well: 71 | 72 | ```ruby 73 | require "grubby" 74 | 75 | class HackerNews < Grubby::JsonScraper 76 | scrapes(:items) do 77 | # API returns array of top 500 item IDs, so limit as necessary 78 | json.take(10).map do |item_id| 79 | Item.scrape("https://hacker-news.firebaseio.com/v0/item/#{item_id}.json") 80 | end 81 | end 82 | 83 | class Item < Grubby::JsonScraper 84 | scrapes(:story_url){ json["url"] || hn_url } 85 | 86 | scrapes(:title){ json["title"] } 87 | 88 | scrapes(:comments_url, optional: true) do 89 | hn_url if json["descendants"] 90 | end 91 | 92 | scrapes(:comment_count, optional: true) do 93 | json["descendants"]&.to_i 94 | end 95 | 96 | def hn_url 97 | "https://news.ycombinator.com/item?id=#{json["id"]}" 98 | end 99 | end 100 | end 101 | 102 | hn = HackerNews.scrape("https://hacker-news.firebaseio.com/v0/topstories.json") 103 | 104 | # Your processing logic goes here: 105 | hn.items.each do |item| 106 | puts "* #{item.title}" 107 | puts " #{item.story_url}" 108 | puts " #{item.comment_count} comments: #{item.comments_url}" if item.comments_url 109 | puts 110 | end 111 | ``` 112 | 113 | 114 | ## Core API 115 | 116 | - [Grubby](https://www.rubydoc.info/gems/grubby/Grubby) 117 | - [#fulfill](https://www.rubydoc.info/gems/grubby/Grubby:fulfill) 118 | - [#get_mirrored](https://www.rubydoc.info/gems/grubby/Grubby:get_mirrored) 119 | - [#ok?](https://www.rubydoc.info/gems/grubby/Grubby:ok%3F) 120 | - [#time_between_requests](https://www.rubydoc.info/gems/grubby/Grubby:time_between_requests) 121 | - [Scraper](https://www.rubydoc.info/gems/grubby/Grubby/Scraper) 122 | - [.each](https://www.rubydoc.info/gems/grubby/Grubby/Scraper.each) 123 | - [.scrape](https://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrape) 124 | - [.scrapes](https://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrapes) 125 | - [#[]](https://www.rubydoc.info/gems/grubby/Grubby/Scraper:[]) 126 | - [#to_h](https://www.rubydoc.info/gems/grubby/Grubby/Scraper:to_h) 127 | - [PageScraper](https://www.rubydoc.info/gems/grubby/Grubby/PageScraper) 128 | - [.scrape_file](https://www.rubydoc.info/gems/grubby/Grubby/PageScraper.scrape_file) 129 | - [#page](https://www.rubydoc.info/gems/grubby/Grubby/PageScraper:page) 130 | - [JsonScraper](https://www.rubydoc.info/gems/grubby/Grubby/JsonScraper) 131 | - [.scrape_file](https://www.rubydoc.info/gems/grubby/Grubby/JsonScraper.scrape_file) 132 | - [#json](https://www.rubydoc.info/gems/grubby/Grubby/JsonScraper:json) 133 | - Mechanize::File 134 | - [#save_to](https://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to) 135 | - 
[#save_to!](https://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to%21) 136 | - Mechanize::Page 137 | - [#at!](https://www.rubydoc.info/gems/grubby/Mechanize/Page:at%21) 138 | - [#search!](https://www.rubydoc.info/gems/grubby/Mechanize/Page:search%21) 139 | - Mechanize::Page::Link 140 | - [#to_absolute_uri](https://www.rubydoc.info/gems/grubby/Mechanize/Page/Link#to_absolute_uri) 141 | - URI 142 | - [#basename](https://www.rubydoc.info/gems/grubby/URI:basename) 143 | - [#query_param](https://www.rubydoc.info/gems/grubby/URI:query_param) 144 | 145 | 146 | ## Auxiliary API 147 | 148 | *grubby* loads several gems that extend Ruby objects with utility 149 | methods. Some of those methods are listed below. See each gem's 150 | documentation for a complete API listing. 151 | 152 | - [Active Support](https://rubygems.org/gems/activesupport) 153 | ([docs](https://www.rubydoc.info/gems/activesupport/)) 154 | - [Enumerable#index_by](https://www.rubydoc.info/gems/activesupport/Enumerable:index_by) 155 | - [File.atomic_write](https://www.rubydoc.info/gems/activesupport/File:atomic_write) 156 | - [Object#presence](https://www.rubydoc.info/gems/activesupport/Object:presence) 157 | - [String#blank?](https://www.rubydoc.info/gems/activesupport/String:blank%3F) 158 | - [String#squish](https://www.rubydoc.info/gems/activesupport/String:squish) 159 | - [casual_support](https://rubygems.org/gems/casual_support) 160 | ([docs](https://www.rubydoc.info/gems/casual_support/)) 161 | - [Enumerable#index_to](https://www.rubydoc.info/gems/casual_support/Enumerable:index_to) 162 | - [String#after](https://www.rubydoc.info/gems/casual_support/String:after) 163 | - [String#after_last](https://www.rubydoc.info/gems/casual_support/String:after_last) 164 | - [String#before](https://www.rubydoc.info/gems/casual_support/String:before) 165 | - [String#before_last](https://www.rubydoc.info/gems/casual_support/String:before_last) 166 | - [String#between](https://www.rubydoc.info/gems/casual_support/String:between) 167 | - [Time#to_hms](https://www.rubydoc.info/gems/casual_support/Time:to_hms) 168 | - [Time#to_ymd](https://www.rubydoc.info/gems/casual_support/Time:to_ymd) 169 | - [gorge](https://rubygems.org/gems/gorge) 170 | ([docs](https://www.rubydoc.info/gems/gorge/)) 171 | - [Pathname#file_crc32](https://www.rubydoc.info/gems/gorge/Pathname:file_crc32) 172 | - [Pathname#file_md5](https://www.rubydoc.info/gems/gorge/Pathname:file_md5) 173 | - [Pathname#file_sha1](https://www.rubydoc.info/gems/gorge/Pathname:file_sha1) 174 | - [mini_sanity](https://rubygems.org/gems/mini_sanity) 175 | ([docs](https://www.rubydoc.info/gems/mini_sanity/)) 176 | - [Enumerator#result!](https://www.rubydoc.info/gems/mini_sanity/Enumerator:result%21) 177 | - [Enumerator#results!](https://www.rubydoc.info/gems/mini_sanity/Enumerator:results%21) 178 | - [Object#assert!](https://www.rubydoc.info/gems/mini_sanity/Object:assert%21) 179 | - [Object#refute!](https://www.rubydoc.info/gems/mini_sanity/Object:refute%21) 180 | - [String#match!](https://www.rubydoc.info/gems/mini_sanity/String:match%21) 181 | - [pleasant_path](https://rubygems.org/gems/pleasant_path) 182 | ([docs](https://www.rubydoc.info/gems/pleasant_path/)) 183 | - [Pathname#available_name](https://www.rubydoc.info/gems/pleasant_path/Pathname:available_name) 184 | - [Pathname#existence](https://www.rubydoc.info/gems/pleasant_path/Pathname:existence) 185 | - [Pathname#make_dirname](https://www.rubydoc.info/gems/pleasant_path/Pathname:make_dirname) 186 | - 
[Pathname#move_as](https://www.rubydoc.info/gems/pleasant_path/Pathname:move_as) 187 | - [Pathname#rename_basename](https://www.rubydoc.info/gems/pleasant_path/Pathname:rename_basename) 188 | - [Pathname#rename_extname](https://www.rubydoc.info/gems/pleasant_path/Pathname:rename_extname) 189 | - [ryoba](https://rubygems.org/gems/ryoba) 190 | ([docs](https://www.rubydoc.info/gems/ryoba/)) 191 | - [Nokogiri::XML::Node#matches!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:matches%21) 192 | - [Nokogiri::XML::Node#text!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:text%21) 193 | - [Nokogiri::XML::Node#uri](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:uri) 194 | - [Nokogiri::XML::Searchable#ancestor!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestor%21) 195 | - [Nokogiri::XML::Searchable#ancestors!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestors%21) 196 | - [Nokogiri::XML::Searchable#at!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:at%21) 197 | - [Nokogiri::XML::Searchable#search!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:search%21) 198 | 199 | 200 | ## Installation 201 | 202 | Install the [`grubby` gem](https://rubygems.org/gems/grubby). 203 | 204 | 205 | ## Contributing 206 | 207 | Run `rake test` to run the tests. 208 | 209 | 210 | ## License 211 | 212 | [MIT License](LICENSE.txt) 213 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require "rake/testtask" 3 | 4 | Rake::TestTask.new(:test) do |t| 5 | t.libs << "test" 6 | t.libs << "lib" 7 | t.test_files = FileList["test/**/*_test.rb"] 8 | end 9 | 10 | task :default => :test 11 | -------------------------------------------------------------------------------- /grubby.gemspec: -------------------------------------------------------------------------------- 1 | require_relative "lib/grubby/version" 2 | 3 | Gem::Specification.new do |spec| 4 | spec.name = "grubby" 5 | spec.version = GRUBBY_VERSION 6 | spec.authors = ["Jonathan Hefner"] 7 | spec.email = ["jonathan@hefner.pro"] 8 | 9 | spec.summary = %q{Fail-fast web scraping} 10 | spec.homepage = "https://github.com/jonathanhefner/grubby" 11 | spec.license = "MIT" 12 | spec.required_ruby_version = ">= 2.7" 13 | 14 | spec.metadata["homepage_uri"] = spec.homepage 15 | spec.metadata["source_code_uri"] = spec.homepage 16 | spec.metadata["changelog_uri"] = spec.metadata["source_code_uri"] + "/blob/master/CHANGELOG.md" 17 | 18 | # Specify which files should be added to the gem when it is released. 19 | # The `git ls-files -z` loads the files in the RubyGem that have been added into git. 
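  # (Illustrative note: this keeps tracked files such as "lib/grubby.rb" in the
  # gem, while paths under "test/" or starting with ".git", e.g. ".gitignore",
  # are rejected by the filter below.)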
20 |   spec.files = Dir.chdir(__dir__) do
21 |     `git ls-files -z`.split("\x0").reject { |f| f.start_with?("test/", ".git") }
22 |   end
23 |   spec.bindir = "exe"
24 |   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
25 |   spec.require_paths = ["lib"]
26 | 
27 |   spec.add_dependency "activesupport", ">= 6.0"
28 |   spec.add_dependency "casual_support", "~> 4.0"
29 |   spec.add_dependency "gorge", "~> 1.0"
30 |   spec.add_dependency "mechanize", "~> 2.7"
31 |   spec.add_dependency "mini_sanity", "~> 2.0"
32 |   spec.add_dependency "pleasant_path", "~> 2.0"
33 |   spec.add_dependency "ryoba", "~> 1.0"
34 | end
35 | 

--------------------------------------------------------------------------------
/lib/grubby.rb:
--------------------------------------------------------------------------------
1 | require "active_support/all"
2 | require "casual_support"
3 | require "gorge"
4 | require "mechanize"
5 | require "mini_sanity"
6 | require "pleasant_path"
7 | require "ryoba"
8 | 
9 | require_relative "grubby/version"
10 | require_relative "grubby/log"
11 | 
12 | require_relative "grubby/core_ext/string"
13 | require_relative "grubby/core_ext/uri"
14 | require_relative "grubby/mechanize/fetch_with_retry"
15 | require_relative "grubby/mechanize/download"
16 | require_relative "grubby/mechanize/file"
17 | require_relative "grubby/mechanize/link"
18 | require_relative "grubby/mechanize/page"
19 | require_relative "grubby/mechanize/parser"
20 | 
21 | 
22 | class Grubby < Mechanize
23 | 
24 |   VERSION = GRUBBY_VERSION
25 | 
26 |   # The minimum amount of time enforced between requests, in seconds.
27 |   # If the value is a Range, a random number within the Range is chosen
28 |   # for each request.
29 |   #
30 |   # @return [Integer, Float, Range<Integer>, Range<Float>]
31 |   attr_accessor :time_between_requests
32 | 
33 |   # Journal file used to ensure only-once processing of resources by
34 |   # {fulfill} across multiple program runs.
35 |   #
36 |   # @return [Pathname, nil]
37 |   attr_reader :journal
38 | 
39 |   # @param journal [Pathname, String]
40 |   #   Optional journal file used to ensure only-once processing of
41 |   #   resources by {fulfill} across multiple program runs
42 |   def initialize(journal = nil)
43 |     super()
44 | 
45 |     # Prevent "memory leaks", and prevent mistakenly blank urls from
46 |     # resolving. (Blank urls resolve as a path relative to the last
47 |     # history entry. Without this setting, an erroneous `agent.get("")`
48 |     # could sometimes successfully fetch a page.)
49 |     self.max_history = 0
50 | 
51 |     # Prevent files of unforeseen content type from being buffered into
52 |     # memory by default, in case they are very large. However, increase
53 |     # the threshold for what is considered "large", to prevent
54 |     # unnecessary writes to disk.
55 |     #
56 |     # References:
57 |     # - http://docs.seattlerb.org/mechanize/Mechanize/PluggableParser.html
58 |     # - http://docs.seattlerb.org/mechanize/Mechanize/Download.html
59 |     # - http://docs.seattlerb.org/mechanize/Mechanize/File.html
60 |     self.max_file_buffer = 1_000_000 # only applies to Mechanize::Download
61 |     self.pluggable_parser.default = Mechanize::Download
62 |     self.pluggable_parser["text/plain"] = Mechanize::File
63 |     self.pluggable_parser["application/json"] = Grubby::JsonParser
64 | 
65 |     # Set up configurable rate limiting, and choose a reasonable default
66 |     # rate limit.
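    # Illustrative sketch (not part of this file's logic): the delay can
    # be tuned after construction, either as a fixed number of seconds or
    # as a Range from which a random delay is chosen per request.
    #
    #   grubby = Grubby.new
    #   grubby.time_between_requests = 2.0       # always wait 2 seconds
    #   grubby.time_between_requests = 2.0..5.0  # wait a random 2-5 seconds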
67 |     self.pre_connect_hooks << Proc.new{ self.send(:sleep_between_requests) }
68 |     self.post_connect_hooks << Proc.new do |agent, uri, response, body|
69 |       self.send(:mark_last_request_time, (Time.now unless response.code.to_s.start_with?("3")))
70 |     end
71 |     self.time_between_requests = 1.0
72 | 
73 |     self.journal = journal
74 |   end
75 | 
76 |   # Sets the journal file used to ensure only-once processing of
77 |   # resources by {fulfill} across multiple program runs. Setting the
78 |   # journal file will clear the in-memory list of previously-processed
79 |   # resources, and, if the journal file exists, load the list from file.
80 |   #
81 |   # @param path [Pathname, String, nil]
82 |   # @return [Pathname]
83 |   def journal=(path)
84 |     @journal = path&.to_pathname&.make_file
85 |     @fulfilled = if @journal
86 |       require "csv"
87 |       CSV.read(@journal).map{|row| FulfilledEntry.new(*row) }.to_set
88 |     else
89 |       Set.new
90 |     end
91 |     @journal
92 |   end
93 | 
94 |   # Calls +#head+ and returns true if a response code "200" is received,
95 |   # false otherwise. Unlike +#head+, error response codes (e.g. "404",
96 |   # "500") do not result in a +Mechanize::ResponseCodeError+ being
97 |   # raised.
98 |   #
99 |   # @param uri [URI, String]
100 |   # @return [Boolean]
101 |   def ok?(uri, query_params = {}, headers = {})
102 |     begin
103 |       head(uri, query_params, headers).code == "200"
104 |     rescue Mechanize::ResponseCodeError
105 |       false
106 |     end
107 |   end
108 | 
109 |   # Calls +#get+ with each of +mirror_uris+ until a successful
110 |   # ("200 OK") response is received, and returns that +#get+ result.
111 |   # Rescues and logs +Mechanize::ResponseCodeError+ failures for all but
112 |   # the last mirror.
113 |   #
114 |   # @example
115 |   #   grubby = Grubby.new
116 |   #
117 |   #   urls = [
118 |   #     "https://httpstat.us/404",
119 |   #     "https://httpstat.us/500",
120 |   #     "https://httpstat.us/200?foo",
121 |   #     "https://httpstat.us/200?bar",
122 |   #   ]
123 |   #
124 |   #   grubby.get_mirrored(urls).uri # == URI("https://httpstat.us/200?foo")
125 |   #
126 |   #   grubby.get_mirrored(urls.take(2)) # raise Mechanize::ResponseCodeError
127 |   #
128 |   # @param mirror_uris [Array<URI>, Array<String>]
129 |   # @return [Mechanize::Page, Mechanize::File, Mechanize::Download, ...]
130 |   # @raise [Mechanize::ResponseCodeError]
131 |   #   if all +mirror_uris+ fail
132 |   def get_mirrored(mirror_uris, parameters = [], referer = nil, headers = {})
133 |     i = 0
134 |     begin
135 |       get(mirror_uris[i], parameters, referer, headers)
136 |     rescue Mechanize::ResponseCodeError => e
137 |       i += 1
138 |       if i >= mirror_uris.length
139 |         raise
140 |       else
141 |         $log.debug("Mirror failed (code #{e.response_code}): #{mirror_uris[i - 1]}")
142 |         $log.debug("Try mirror: #{mirror_uris[i]}")
143 |         retry
144 |       end
145 |     end
146 |   end
147 | 
148 |   # Ensures only-once processing of the resource indicated by +uri+ for
149 |   # the specified +purpose+. The given block is executed and the result
150 |   # is returned if and only if the Grubby instance has not recorded a
151 |   # previous call to +fulfill+ for the same resource and purpose.
152 |   #
153 |   # Note that the resource is identified by both its URI and its content
154 |   # hash. The latter prevents superfluous and rearranged URI query
155 |   # string parameters from interfering with only-once processing.
156 |   #
157 |   # If {journal} is set, and if the block does not raise an exception,
158 |   # the resource and purpose are logged to the journal file. This
159 |   # enables only-once processing across multiple program runs. 
It also 160 | # provides a means to resume batch processing after an unexpected 161 | # termination. 162 | # 163 | # @example 164 | # grubby = Grubby.new 165 | # 166 | # grubby.fulfill("https://example.com/posts") do |page| 167 | # "first time" 168 | # end 169 | # # == "first time" 170 | # 171 | # grubby.fulfill("https://example.com/posts") do |page| 172 | # "already seen" # not evaluated 173 | # end 174 | # # == nil 175 | # 176 | # grubby.fulfill("https://example.com/posts?page=1") do |page| 177 | # "already seen content hash" # not evaluated 178 | # end 179 | # # == nil 180 | # 181 | # grubby.fulfill("https://example.com/posts", "again!") do |page| 182 | # "already seen, but new purpose" 183 | # end 184 | # # == "already seen, but new purpose" 185 | # 186 | # @param uri [URI, String] 187 | # @param purpose [String] 188 | # @yieldparam resource [Mechanize::Page, Mechanize::File, Mechanize::Download, ...] 189 | # @yieldreturn [Object] 190 | # @return [Object, nil] 191 | # @raise [Mechanize::ResponseCodeError] 192 | # if fetching the resource results in error (see +Mechanize#get+) 193 | def fulfill(uri, purpose = "") 194 | series = [] 195 | 196 | uri = uri.to_absolute_uri 197 | return unless add_fulfilled(uri, purpose, series) 198 | 199 | normalized_uri = normalize_uri(uri) 200 | return unless add_fulfilled(normalized_uri, purpose, series) 201 | 202 | $log.info("Fetch #{normalized_uri}") 203 | resource = get(normalized_uri) 204 | unprocessed = add_fulfilled(resource.uri, purpose, series) & 205 | add_fulfilled("content hash: #{resource.content_hash}", purpose, series) 206 | 207 | result = yield resource if unprocessed 208 | 209 | CSV.open(journal, "a") do |csv| 210 | series.each{|entry| csv << entry } 211 | end if journal 212 | 213 | result 214 | end 215 | 216 | 217 | private 218 | 219 | # @!visibility private 220 | FulfilledEntry = Struct.new(:purpose, :target) 221 | 222 | def add_fulfilled(target, purpose, series) 223 | series << FulfilledEntry.new(purpose, target.to_s) 224 | if (series.uniq!) || @fulfilled.add?(series.last) 225 | true 226 | else 227 | $log.info("Skip #{series.first.target}" \ 228 | " (seen#{" #{series.last.target}" unless series.length == 1})") 229 | false 230 | end 231 | end 232 | 233 | def normalize_uri(uri) 234 | uri = uri.dup 235 | $log.warn("Ignore ##{uri.fragment} in #{uri}") if uri.fragment 236 | uri.fragment = nil 237 | uri.path = uri.path.chomp("/") 238 | uri 239 | end 240 | 241 | def sleep_between_requests 242 | @last_request_at ||= 0.0 243 | delay_duration = time_between_requests.is_a?(Range) ? 244 | rand(time_between_requests) : time_between_requests 245 | sleep_duration = @last_request_at + delay_duration - Time.now.to_f 246 | sleep(sleep_duration) if sleep_duration > 0 247 | end 248 | 249 | def mark_last_request_time(time) 250 | @last_request_at = time.to_f 251 | end 252 | 253 | end 254 | 255 | 256 | require_relative "grubby/json_parser" 257 | require_relative "grubby/scraper" 258 | require_relative "grubby/page_scraper" 259 | require_relative "grubby/json_scraper" 260 | 261 | 262 | $grubby = Grubby.new 263 | -------------------------------------------------------------------------------- /lib/grubby/core_ext/string.rb: -------------------------------------------------------------------------------- 1 | class String 2 | 3 | # Constructs a URI from the String. Raises an exception if the String 4 | # does not denote an absolute URI. 
5 | # 6 | # @return [URI] 7 | # @raise [RuntimeError] 8 | # if the String does not denote an absolute URI 9 | def to_absolute_uri 10 | URI(self).to_absolute_uri 11 | end 12 | 13 | end 14 | -------------------------------------------------------------------------------- /lib/grubby/core_ext/uri.rb: -------------------------------------------------------------------------------- 1 | module URI 2 | 3 | # Returns the basename of the URI's +path+, a la +File.basename+. 4 | # 5 | # @example 6 | # URI("https://example.com/foo/bar").basename # == "bar" 7 | # URI("https://example.com/foo").basename # == "foo" 8 | # URI("https://example.com/").basename # == "" 9 | # 10 | # @return [String] 11 | def basename 12 | self.path == "/" ? "" : ::File.basename(self.path) 13 | end 14 | 15 | # Returns the value of the specified query param in the URI's query 16 | # string. The specified +name+ must be *exactly* as it appears in the 17 | # query string, and support for complex nested values is limited. 18 | # (See +CGI.parse+ for parsing behavior.) If +name+ contains +"[]"+, 19 | # all occurrences of the query param are returned as an Array. 20 | # Otherwise, only the last occurrence is returned. 21 | # 22 | # @example 23 | # URI("https://example.com/?foo=a").query_param("foo") # == "a" 24 | # 25 | # URI("https://example.com/?foo=a&foo=b").query_param("foo") # == "b" 26 | # URI("https://example.com/?foo=a&foo=b").query_param("foo[]") # == nil 27 | # 28 | # URI("https://example.com/?foo[]=a&foo[]=b").query_param("foo") # == nil 29 | # URI("https://example.com/?foo[]=a&foo[]=b").query_param("foo[]") # == ["a", "b"] 30 | # 31 | # URI("https://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[]") # == nil 32 | # URI("https://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[][x]") # == ["a"] 33 | # 34 | # @param name [String] 35 | # @return [String, Array, nil] 36 | def query_param(name) 37 | values = CGI.parse(self.query)[name] if self.query 38 | (values.nil? || name.include?("[]")) ? values : values.last 39 | end 40 | 41 | # Raises an exception if the URI is not +absolute?+. Otherwise, 42 | # returns the URI. 43 | # 44 | # @return [self] 45 | # @raise [RuntimeError] 46 | # if the URI is not +absolute?+ 47 | def to_absolute_uri 48 | raise "URI is not absolute: #{self}" unless self.absolute? 49 | self 50 | end 51 | 52 | end 53 | -------------------------------------------------------------------------------- /lib/grubby/json_parser.rb: -------------------------------------------------------------------------------- 1 | class Grubby::JsonParser < Mechanize::File 2 | 3 | # The parsed JSON data. 4 | # 5 | # @return [Hash, Array] 6 | attr_reader :json 7 | 8 | # The Mechanize agent used to make the request. 9 | # 10 | # @return [Mechanize, nil] 11 | attr_accessor :mech 12 | 13 | def initialize(uri = nil, response = nil, body = nil, code = nil, mech = nil) 14 | @json = JSON.load(body, nil, create_additions: false) 15 | @mech = mech 16 | super(uri, response, body, code) 17 | end 18 | 19 | end 20 | -------------------------------------------------------------------------------- /lib/grubby/json_scraper.rb: -------------------------------------------------------------------------------- 1 | class Grubby::JsonScraper < Grubby::Scraper 2 | 3 | # The parsed JSON data being scraped. 
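  #
  # @example
  #   parser = Grubby::JsonParser.new(nil, nil, '{"answer": 42}', "200")
  #   Grubby::JsonScraper.new(parser).json  # == { "answer" => 42 }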
4 |   #
5 |   # @return [Hash, Array]
6 |   attr_reader :json
7 | 
8 |   # @param source [Grubby::JsonParser]
9 |   # @raise [Grubby::Scraper::Error]
10 |   #   if any {Scraper.scrapes} blocks fail
11 |   def initialize(source)
12 |     @json = source.assert!(Grubby::JsonParser).json
13 |     super
14 |   end
15 | 
16 |   # Scrapes a locally-stored file. This method is intended for use with
17 |   # subclasses of +Grubby::JsonScraper+.
18 |   #
19 |   # @example
20 |   #   class MyScraper < Grubby::JsonScraper
21 |   #     # ...
22 |   #   end
23 |   #
24 |   #   MyScraper.scrape_file("path/to/local_file.json") # === MyScraper
25 |   #
26 |   # @param path [String]
27 |   # @param agent [Mechanize]
28 |   # @return [Grubby::JsonScraper]
29 |   # @raise [Grubby::Scraper::Error]
30 |   #   if any {Scraper.scrapes} blocks fail
31 |   def self.scrape_file(path, agent = $grubby)
32 |     self.new(Grubby::JsonParser.read_local(path).tap{|parser| parser.mech = agent })
33 |   end
34 | 
35 | end
36 | 

--------------------------------------------------------------------------------
/lib/grubby/log.rb:
--------------------------------------------------------------------------------
1 | $log ||= Logger.new($stderr).tap do |logger|
2 |   logger.formatter = ->(severity, datetime, progname, msg) do
3 |     "[#{datetime.to_ymd} #{datetime.to_hms}] #{severity} #{msg}\n"
4 |   end
5 | end
6 | 

--------------------------------------------------------------------------------
/lib/grubby/mechanize/download.rb:
--------------------------------------------------------------------------------
1 | # @!visibility private
2 | class Mechanize::Download
3 | 
4 |   def content_hash
5 |     @content_hash ||= Digest::SHA1.new.io(self.body_io).hexdigest
6 |   end
7 | 
8 | end
9 | 

--------------------------------------------------------------------------------
/lib/grubby/mechanize/fetch_with_retry.rb:
--------------------------------------------------------------------------------
1 | # This monkey patch attempts to fix the insidious "too many connection
2 | # resets" bug described here: https://github.com/sparklemotion/mechanize/issues/123
3 | #
4 | # The code is taken and modified from this helpful blog article:
5 | # http://scottwb.com/blog/2013/11/09/defeating-the-infamous-mechanize-too-many-connection-resets-bug/
6 | class Mechanize::HTTP::Agent
7 | 
8 |   MAX_CONNECTION_RESET_RETRIES = 9
9 |   IDEMPOTENT_HTTP_METHODS = [:get, :head, :options, :delete]
10 | 
11 |   # Replacement for +Mechanize::HTTP::Agent#fetch+. When a "too many
12 |   # connection resets" error is encountered, this method retries the
13 |   # request (up to {MAX_CONNECTION_RESET_RETRIES} times).
14 |   def fetch_with_retry(uri, http_method = :get, headers = {}, params = [], referer = current_page, redirects = 0)
15 |     retry_count = 0
16 |     begin
17 |       fetch_without_retry(uri, http_method, headers, params, referer, redirects)
18 |     rescue Net::HTTP::Persistent::Error => e
19 |       # raise if different type of error
20 |       raise unless e.message.include?("too many connection resets")
21 |       # raise if non-idempotent http method
22 |       raise unless IDEMPOTENT_HTTP_METHODS.include?(http_method)
23 |       # raise if we've tried too many times
24 |       raise if retry_count >= MAX_CONNECTION_RESET_RETRIES
25 | 
26 |       # otherwise, shut down the persistent HTTP connection and try again
27 |       retry_count += 1
28 |       $log.warn("#{e.message} (#{e.class}). 
Retry in #{retry_count} seconds.") 29 | sleep(retry_count) # incremental backoff to allow server to self-correct 30 | $log.warn("Retry #{http_method.to_s.upcase} #{uri}") 31 | retry 32 | end 33 | end 34 | 35 | alias_method :fetch_without_retry, :fetch 36 | alias_method :fetch, :fetch_with_retry 37 | 38 | end 39 | -------------------------------------------------------------------------------- /lib/grubby/mechanize/file.rb: -------------------------------------------------------------------------------- 1 | # @!visibility private 2 | class Mechanize::File 3 | 4 | def self.read_local(path) 5 | uri_path = File.expand_path(path).gsub(%r"[^/\\]+"){|component| CGI.escape(component) } 6 | self.new(URI::File.build(path: uri_path), nil, File.read(path), "200") 7 | end 8 | 9 | def content_hash 10 | @content_hash ||= self.body.to_s.sha1 11 | end 12 | 13 | end 14 | -------------------------------------------------------------------------------- /lib/grubby/mechanize/link.rb: -------------------------------------------------------------------------------- 1 | class Mechanize::Page::Link 2 | 3 | # Returns the URI represented by the Link, in absolute form. If the 4 | # href attribute of the Link is expressed in relative form, the URI is 5 | # converted to absolute form using the Link's +page.uri+. Raises an 6 | # exception if the URI cannot be converted to absolute form. 7 | # 8 | # @return [URI] 9 | # @raise [RuntimeError] 10 | # if the URI cannot be converted to absolute form 11 | def to_absolute_uri 12 | # Via the W3 spec[1]: "If the a element has no href attribute, then 13 | # the element represents a placeholder for where a link might 14 | # otherwise have been placed, if it had been relevant, consisting of 15 | # just the element's contents." So, we assume a link with no href 16 | # attribute (i.e. `uri == nil`) should be treated the same as an 17 | # intra-page link. 18 | # 19 | # [1]: https://www.w3.org/TR/2016/REC-html51-20161101/textlevel-semantics.html#the-a-element 20 | URI.join(self.page.uri, self.uri || "#").to_absolute_uri 21 | end 22 | 23 | end 24 | -------------------------------------------------------------------------------- /lib/grubby/mechanize/page.rb: -------------------------------------------------------------------------------- 1 | class Mechanize::Page 2 | 3 | # @!method search!(*queries) 4 | # See ryoba's {https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:search%21 5 | # +Nokogiri::XML::Searchable#search!+}. 6 | # 7 | # @param queries [Array] 8 | # @return [Nokogiri::XML::NodeSet] 9 | # @raise [Ryoba::Error] 10 | # if all queries yield no results 11 | def_delegators :parser, :search! 12 | 13 | # @!method at!(*queries) 14 | # See ryoba's {https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:at%21 15 | # +Nokogiri::XML::Searchable#at!+}. 16 | # 17 | # @param queries [Array] 18 | # @return [Nokogiri::XML::Element] 19 | # @raise [Ryoba::Error] 20 | # if all queries yield no results 21 | def_delegators :parser, :at! 22 | 23 | end 24 | -------------------------------------------------------------------------------- /lib/grubby/mechanize/parser.rb: -------------------------------------------------------------------------------- 1 | require "fileutils" 2 | 3 | module Mechanize::Parser 4 | 5 | # Saves the payload to a specified directory, using the default 6 | # filename suggested by the server. If a file with that name already 7 | # exists, this method will try to find a free filename by appending 8 | # numbers to the default filename. 
Returns the full path of the saved 9 | # file. 10 | # 11 | # @note This method expects a +#save!+ method to be defined by the 12 | # class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+ 13 | # and +Mechanize::Download#save!+. 14 | # 15 | # @param directory [String] 16 | # @return [String] 17 | def save_to(directory) 18 | raise "#{self.class}#save! is not defined" unless self.respond_to?(:save!) 19 | 20 | FileUtils.mkdir_p(directory) 21 | path = find_free_name(File.join(directory, @filename)) 22 | save!(path) 23 | path 24 | end 25 | 26 | # Saves the payload to a specified directory, using the default 27 | # filename suggested by the server. If a file with that name already 28 | # exists, that file will be overwritten. Returns the full path of the 29 | # saved file. 30 | # 31 | # @note This method expects a +#save!+ method to be defined by the 32 | # class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+ 33 | # and +Mechanize::Download#save!+. 34 | # 35 | # @param directory [String] 36 | # @return [String] 37 | def save_to!(directory) 38 | raise "#{self.class}#save! is not defined" unless self.respond_to?(:save!) 39 | 40 | FileUtils.mkdir_p(directory) 41 | path = File.join(directory, @filename) 42 | save!(path) 43 | path 44 | end 45 | 46 | end 47 | -------------------------------------------------------------------------------- /lib/grubby/page_scraper.rb: -------------------------------------------------------------------------------- 1 | class Grubby::PageScraper < Grubby::Scraper 2 | 3 | # The Page being scraped. 4 | # 5 | # @return [Mechanize::Page] 6 | attr_reader :page 7 | 8 | # @param source [Mechanize::Page] 9 | # @raise [Grubby::Scraper::Error] 10 | # if any {Scraper.scrapes} blocks fail 11 | def initialize(source) 12 | @page = source.assert!(Mechanize::Page) 13 | super 14 | end 15 | 16 | # Scrapes a locally-stored file. This method is intended for use with 17 | # subclasses of +Grubby::PageScraper+. 18 | # 19 | # @example 20 | # class MyScraper < Grubby::PageScraper 21 | # # ... 22 | # end 23 | # 24 | # MyScraper.scrape_file("path/to/local_file.html") # === MyScraper 25 | # 26 | # @param path [String] 27 | # @param agent [Mechanize] 28 | # @return [Grubby::PageScraper] 29 | # @raise [Grubby::Scraper::Error] 30 | # if any {Scraper.scrapes} blocks fail 31 | def self.scrape_file(path, agent = $grubby) 32 | self.new(Mechanize::Page.read_local(path).tap{|page| page.mech = agent }) 33 | end 34 | 35 | end 36 | -------------------------------------------------------------------------------- /lib/grubby/scraper.rb: -------------------------------------------------------------------------------- 1 | class Grubby::Scraper 2 | 3 | # Defines an attribute reader method named by +field+. During 4 | # {initialize}, the given block is called, and the attribute is set to 5 | # the block's return value. 6 | # 7 | # By default, raises an exception if the block's return value is nil. 8 | # To prevent this behavior, set the +:optional+ option to true. 9 | # Alternatively, the block can be conditionally evaluated, based on 10 | # another method's return value, using the +:if+ or +:unless+ options. 
11 | # 12 | # @example Default behavior 13 | # class GreetingScraper < Grubby::Scraper 14 | # scrapes(:name) do 15 | # source[/Hello (\w+)/, 1] 16 | # end 17 | # end 18 | # 19 | # scraper = GreetingScraper.new("Hello World!") 20 | # scraper.name # == "World" 21 | # 22 | # scraper = GreetingScraper.new("Hello!") # raises Grubby::Scraper::Error 23 | # 24 | # @example Optional scraped value 25 | # class GreetingScraper < Grubby::Scraper 26 | # scrapes(:name, optional: true) do 27 | # source[/Hello (\w+)/, 1] 28 | # end 29 | # end 30 | # 31 | # scraper = GreetingScraper.new("Hello World!") 32 | # scraper.name # == "World" 33 | # 34 | # scraper = GreetingScraper.new("Hello!") 35 | # scraper.name # == nil 36 | # 37 | # @example Conditional scraped value 38 | # class GreetingScraper < Grubby::Scraper 39 | # def hello? 40 | # source.start_with?("Hello ") 41 | # end 42 | # 43 | # scrapes(:name, if: :hello?) do 44 | # source[/Hello (\w+)/, 1] 45 | # end 46 | # end 47 | # 48 | # scraper = GreetingScraper.new("Hello World!") 49 | # scraper.name # == "World" 50 | # 51 | # scraper = GreetingScraper.new("Hello!") # raises Grubby::Scraper::Error 52 | # 53 | # scraper = GreetingScraper.new("How are you?") 54 | # scraper.name # == nil 55 | # 56 | # @param field [Symbol, String] 57 | # @param options [Hash] 58 | # @option options :optional [Boolean] (false) 59 | # Whether the block should be allowed to return a nil value 60 | # @option options :if [Symbol] (nil) 61 | # Name of predicate method that determines if the block should be 62 | # evaluated 63 | # @option options :unless [Symbol] (nil) 64 | # Name of predicate method that determines if the block should not 65 | # be evaluated 66 | # @yieldreturn [Object] 67 | # @return [void] 68 | def self.scrapes(field, **options, &block) 69 | field = field.to_sym 70 | (self.fields << field).uniq! 71 | 72 | define_method(field) do 73 | raise "#{self.class}#initialize does not invoke `super`" unless defined?(@scraped) 74 | 75 | if !@scraped.key?(field) && !@errors.key?(field) 76 | begin 77 | skip = (options[:if] && !self.send(options[:if])) || 78 | (options[:unless] && self.send(options[:unless])) 79 | 80 | if skip 81 | @scraped[field] = nil 82 | else 83 | @scraped[field] = instance_eval(&block) 84 | if @scraped[field].nil? 85 | raise FieldValueRequiredError.new(field) unless options[:optional] 86 | $log.debug("#{self.class}##{field} is nil") 87 | end 88 | end 89 | rescue RuntimeError, IndexError => e 90 | @errors[field] = e 91 | end 92 | end 93 | 94 | if @errors.key?(field) 95 | raise FieldScrapeFailedError.new(field, @errors[field]) 96 | else 97 | @scraped[field] 98 | end 99 | end 100 | end 101 | 102 | # Fields defined via {scrapes}. 103 | # 104 | # @return [Array] 105 | def self.fields 106 | @fields ||= self == Grubby::Scraper ? [] : self.superclass.fields.dup 107 | end 108 | 109 | # Instantiates the Scraper class with the resource indicated by +url+. 110 | # This method acts as a default factory method, and provides a 111 | # standard interface for overrides. 112 | # 113 | # @example Default factory method 114 | # class PostPageScraper < Grubby::PageScraper 115 | # # ... 116 | # end 117 | # 118 | # PostPageScraper.scrape("https://example.com/posts/42") 119 | # # == PostPageScraper.new($grubby.get("https://example.com/posts/42")) 120 | # 121 | # @example Override factory method 122 | # class PostApiScraper < Grubby::JsonScraper 123 | # # ... 
124 | # 125 | # def self.scrape(url, agent = $grubby) 126 | # api_url = url.to_s.sub(%r"//example.com/(.+)", '//api.example.com/\1.json') 127 | # super(api_url, agent) 128 | # end 129 | # end 130 | # 131 | # PostApiScraper.scrape("https://example.com/posts/42") 132 | # # == PostApiScraper.new($grubby.get("https://api.example.com/posts/42.json")) 133 | # 134 | # @param url [String, URI] 135 | # @param agent [Mechanize] 136 | # @return [Grubby::Scraper] 137 | # @raise [Grubby::Scraper::Error] 138 | # if any {Scraper.scrapes} blocks fail 139 | def self.scrape(url, agent = $grubby) 140 | self.new(agent.get(url)) 141 | end 142 | 143 | # Iterates a series of pages, starting at +start+. The Scraper class 144 | # is instantiated with each page, and each Scraper instance is passed 145 | # to the given block. Subsequent pages in the series are determined 146 | # by invoking the +next_method+ method on each Scraper instance. 147 | # 148 | # Iteration stops when the +next_method+ method returns falsy. If the 149 | # +next_method+ method returns a String or URI, that value will be 150 | # treated as the URL of the next page. Otherwise that value will be 151 | # treated as the page itself. 152 | # 153 | # @example Iterate from page object 154 | # class PostsIndexScraper < Grubby::PageScraper 155 | # def next 156 | # page.link_with(text: "Next >")&.click 157 | # end 158 | # end 159 | # 160 | # PostsIndexScraper.each("https://example.com/posts?page=1") do |scraper| 161 | # scraper.page.uri.query # == "page=1", "page=2", "page=3", ... 162 | # end 163 | # 164 | # @example Iterate from URI 165 | # class PostsIndexScraper < Grubby::PageScraper 166 | # def next 167 | # page.link_with(text: "Next >")&.to_absolute_uri 168 | # end 169 | # end 170 | # 171 | # PostsIndexScraper.each("https://example.com/posts?page=1") do |scraper| 172 | # scraper.page.uri.query # == "page=1", "page=2", "page=3", ... 173 | # end 174 | # 175 | # @example Specifying the iteration method 176 | # class PostsIndexScraper < Grubby::PageScraper 177 | # scrapes(:next_uri, optional: true) do 178 | # page.link_with(text: "Next >")&.to_absolute_uri 179 | # end 180 | # end 181 | # 182 | # PostsIndexScraper.each("https://example.com/posts?page=1", next_method: :next_uri) do |scraper| 183 | # scraper.page.uri.query # == "page=1", "page=2", "page=3", ... 184 | # end 185 | # 186 | # @param start [String, URI, Mechanize::Page, Mechanize::File] 187 | # @param agent [Mechanize] 188 | # @param next_method [Symbol] 189 | # @yieldparam scraper [Grubby::Scraper] 190 | # @return [void] 191 | # @raise [NoMethodError] 192 | # if the Scraper class does not define the method indicated by 193 | # +next_method+ 194 | # @raise [Grubby::Scraper::Error] 195 | # if any {Scraper.scrapes} blocks fail 196 | def self.each(start, agent = $grubby, next_method: :next) 197 | unless self.method_defined?(next_method) 198 | raise NoMethodError.new(nil, next_method), "#{self} does not define `#{next_method}`" 199 | end 200 | 201 | return to_enum(:each, start, agent, next_method: next_method) unless block_given? 202 | 203 | current = start 204 | while current 205 | current = agent.get(current) if current.is_a?(String) || current.is_a?(URI) 206 | scraper = self.new(current) 207 | yield scraper 208 | current = scraper.send(next_method) 209 | end 210 | end 211 | 212 | # The object being scraped. Typically an instance of a Mechanize 213 | # pluggable parser such as +Mechanize::Page+. 
214 | # 215 | # @return [Object] 216 | attr_reader :source 217 | 218 | # Collected errors raised during {initialize} by {Scraper.scrapes} 219 | # blocks, indexed by field name. This Hash will be empty if 220 | # {initialize} did not raise a +Grubby::Scraper::Error+. 221 | # 222 | # @return [Hash{Symbol => StandardError}] 223 | attr_reader :errors 224 | 225 | # @param source 226 | # @raise [Grubby::Scraper::Error] 227 | # if any {Scraper.scrapes} blocks fail 228 | def initialize(source) 229 | @source = source 230 | @scraped = {} 231 | @errors = {} 232 | 233 | self.class.fields.each do |field| 234 | begin 235 | self.send(field) 236 | rescue FieldScrapeFailedError 237 | end 238 | end 239 | 240 | raise Error.new(self) unless @errors.empty? 241 | end 242 | 243 | # Returns the scraped value named by +field+. 244 | # 245 | # @param field [Symbol, String] 246 | # @return [Object] 247 | # @raise [RuntimeError] 248 | # if +field+ is not a valid name 249 | def [](field) 250 | @scraped.fetch(field.to_sym) 251 | end 252 | 253 | # Returns all scraped values as a Hash. 254 | # 255 | # @return [Hash{Symbol => Object}] 256 | def to_h 257 | @scraped.dup 258 | end 259 | 260 | class Error < RuntimeError 261 | # @!visibility private 262 | BACKTRACE_CLEANER = ActiveSupport::BacktraceCleaner.new.tap do |cleaner| 263 | cleaner.add_silencer do |line| 264 | line.include?(__dir__) && line.include?("scraper.rb:") 265 | end 266 | end 267 | 268 | # The Scraper that raised this Error. 269 | # 270 | # @return [Grubby::Scraper] 271 | attr_accessor :scraper 272 | 273 | # @!visibility private 274 | def initialize(scraper) 275 | self.scraper = scraper 276 | 277 | listing = scraper.errors. 278 | reject{|field, error| error.is_a?(FieldScrapeFailedError) }. 279 | map do |field, error| 280 | "* `#{field}` (#{error.class})\n" + 281 | error.message.indent(2) + "\n\n" + 282 | BACKTRACE_CLEANER.clean(error.backtrace).join("\n").indent(4) + "\n" 283 | end. 
284 | join("\n") 285 | 286 | super("Failed to scrape the following fields:\n#{listing}") 287 | end 288 | end 289 | 290 | # @!visibility private 291 | class FieldScrapeFailedError < RuntimeError 292 | def initialize(field, field_error) 293 | super("`#{field}` raised #{field_error.class}") 294 | end 295 | end 296 | 297 | # @!visibility private 298 | class FieldValueRequiredError < RuntimeError 299 | def initialize(field) 300 | super("`#{field}` is nil but is not marked as optional") 301 | end 302 | end 303 | 304 | end 305 | -------------------------------------------------------------------------------- /lib/grubby/version.rb: -------------------------------------------------------------------------------- 1 | GRUBBY_VERSION = "2.0.0" 2 | -------------------------------------------------------------------------------- /test/core_ext/string_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class StringTest < Minitest::Test 4 | 5 | def test_to_absolute_uri_with_absolute_uri 6 | string = "http://localhost" 7 | uri = string.to_absolute_uri 8 | assert_kind_of URI, uri 9 | assert_equal string, uri.to_s 10 | end 11 | 12 | def test_to_absolute_uri_with_relative_uri 13 | assert_raises { "/index.html".to_absolute_uri } 14 | end 15 | 16 | end 17 | -------------------------------------------------------------------------------- /test/core_ext/uri_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class URITest < Minitest::Test 4 | 5 | def test_basename 6 | ["", "foo", "foo/bar"].each do |path| 7 | basename = File.basename(path) 8 | assert_equal basename, URI.join("http://localhost", path).basename 9 | assert_equal basename, URI.join("http://localhost", path + "/").basename 10 | end 11 | end 12 | 13 | def test_query_param 14 | keys = ["", "[]", "[][x]", "[][y]", "[x][]", "[y][]"].map{|brack| "foo#{brack}" } 15 | values = ["a", "b", "c"] 16 | query = keys.product(values).map{|key, value| "#{key}=#{value}" }.join("&") 17 | uri = URI("http://localhost/?#{query}") 18 | 19 | keys.each do |key| 20 | expected = key.include?("[]") ? 
values : values.last 21 | assert_equal expected, uri.query_param(key) 22 | end 23 | 24 | assert_nil uri.query_param("nope") 25 | end 26 | 27 | def test_query_param_with_no_query 28 | assert_nil URI("http://localhost/").query_param("nope") 29 | assert_nil URI("http://localhost/?").query_param("nope") 30 | end 31 | 32 | def test_to_absolute_uri_with_absolute_uri 33 | uri = URI("http://localhost") 34 | assert_same uri, uri.to_absolute_uri 35 | end 36 | 37 | def test_to_absolute_uri_with_relative_uri 38 | uri = URI("/index.html") 39 | assert_raises { uri.to_absolute_uri } 40 | end 41 | 42 | end 43 | -------------------------------------------------------------------------------- /test/grubby_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class GrubbyTest < Mechanize::TestCase 4 | 5 | def test_that_it_has_a_version_number 6 | refute_nil Grubby::VERSION 7 | end 8 | 9 | def test_default_constructor 10 | assert_kind_of Mechanize, Grubby.new 11 | end 12 | 13 | def test_global_default_instance 14 | assert_instance_of Grubby, $grubby 15 | end 16 | 17 | def test_time_between_requests_with_number 18 | $sleep_last_amount = 0.0 19 | amount = 5.0 20 | 21 | grubby = Grubby.new 22 | grubby.time_between_requests = amount 23 | grubby.get("http://localhost") 24 | assert_equal 0.0, $sleep_last_amount 25 | grubby.get("http://localhost") 26 | assert_includes ((amount - 0.1)..amount), $sleep_last_amount 27 | end 28 | 29 | def test_time_between_requests_with_range 30 | $sleep_last_amount = 0.0 31 | min_amount = 5.0 32 | max_amount = 10.0 33 | 34 | grubby = Grubby.new 35 | grubby.time_between_requests = min_amount..max_amount 36 | grubby.get("http://localhost") 37 | assert_equal 0.0, $sleep_last_amount 38 | grubby.get("http://localhost") 39 | assert_includes ((min_amount - 0.1)..max_amount), $sleep_last_amount 40 | end 41 | 42 | def test_time_between_requests_begins_after_request_finishes 43 | grubby = Grubby.new 44 | grubby.time_between_requests = 1.0 45 | # use content-encoding hook so that a time recorded by a pre-connect 46 | # hook will be disregarded, while a time recorded by a post-connect 47 | # hook will not 48 | grubby.content_encoding_hooks << Proc.new{ grubby.send(:mark_last_request_time, nil) } 49 | 50 | grubby.get("http://localhost") 51 | $sleep_last_amount = 0.0 52 | grubby.get("http://localhost") 53 | assert_operator $sleep_last_amount, :>, 0.0 54 | end 55 | 56 | def test_sleep_between_requests_after_redirect 57 | $sleep_calls = 0 58 | redirect_url = "http://localhost/redirect" 59 | grubby = Grubby.new 60 | 61 | actual_url = grubby.get(redirect_url).uri.to_s 62 | refute_equal redirect_url, actual_url # sanity check 63 | grubby.get(redirect_url) 64 | assert_equal 1, $sleep_calls 65 | end 66 | 67 | def test_ok_predicate_with_success_code 68 | assert Grubby.new.ok?(make_uris(1).first) 69 | end 70 | 71 | def test_ok_predicate_with_error_code 72 | refute Grubby.new.ok?(make_uris(1, "500").first) 73 | end 74 | 75 | def test_get_mirrored_with_first_successful 76 | uris = make_uris(2) 77 | 78 | assert_equal uris.first, get_mirrored_resultant_uri(uris) 79 | end 80 | 81 | def test_get_mirrored_with_last_successful 82 | uris = make_uris(2, "404") + make_uris(1) 83 | 84 | assert_equal uris.last, get_mirrored_resultant_uri(uris) 85 | end 86 | 87 | def test_get_mirrored_with_none_successful 88 | uris = make_uris(2, "404") 89 | 90 | assert_raises(Mechanize::ResponseCodeError) do 91 | get_mirrored_resultant_uri(uris) 92 | end 93 | end 94 | 
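  # Illustrative sketch (not from the original suite) of the only-once
  # contract exercised by the fulfill tests below, where `process` is a
  # hypothetical handler:
  #
  #   grubby.fulfill(uri){|page| process(page) } # block runs; returns its result
  #   grubby.fulfill(uri){|page| process(page) } # already seen; returns nil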
95 | def test_fulfill_returns_block_result 96 | expected = "foo" 97 | actual = silence_logging do 98 | Grubby.new.fulfill(make_uris(1).first){ expected } 99 | end 100 | 101 | assert_equal expected, actual 102 | end 103 | 104 | def test_fulfill_with_different_pages 105 | uris = make_uris(2) 106 | uris.last.path = "/form_test.html" 107 | 108 | assert_equal uris, fulfill_resultant_uris(uris) 109 | end 110 | 111 | def test_fulfill_with_same_url 112 | uris = make_uris(1) * 2 113 | 114 | assert_equal uris.uniq, fulfill_resultant_uris(uris) 115 | end 116 | 117 | def test_fulfill_with_same_page_content 118 | uris = make_uris(2) 119 | 120 | assert_equal uris.take(1), fulfill_resultant_uris(uris) 121 | end 122 | 123 | def test_fulfill_with_different_purposes 124 | purposes = 2.times.map{|i| "purpose #{i}" } 125 | uris = make_uris(1) * purposes.length 126 | 127 | assert_equal uris, fulfill_resultant_uris(uris.zip(purposes)) 128 | end 129 | 130 | def test_journal_fulfill 131 | uris = make_uris(2) 132 | 133 | in_tmpdir do 134 | refute_empty fulfill_resultant_uris(uris, Grubby.new("journal.txt")) 135 | assert_empty fulfill_resultant_uris(uris, Grubby.new("journal.txt")) 136 | end 137 | end 138 | 139 | def test_journal_fulfill_with_different_pages 140 | uris = make_uris(2) 141 | uris.last.path = "/form_test.html" 142 | 143 | in_tmpdir do 144 | refute_empty fulfill_resultant_uris(uris.take(1), Grubby.new("journal.txt")) 145 | refute_empty fulfill_resultant_uris(uris.drop(1), Grubby.new("journal.txt")) 146 | end 147 | end 148 | 149 | def test_journal_fulfill_with_different_purposes 150 | purposes = 2.times.map{|i| "purpose #{i}" } 151 | uris = make_uris(1) * purposes.length 152 | requests = uris.zip(purposes) 153 | 154 | in_tmpdir do 155 | refute_empty fulfill_resultant_uris(requests, Grubby.new("journal.txt")) 156 | assert_empty fulfill_resultant_uris(requests, Grubby.new("journal.txt")) 157 | end 158 | end 159 | 160 | def test_journal_initializer 161 | in_tmpdir do 162 | assert_equal Pathname.new("expected"), Grubby.new("expected").journal 163 | end 164 | end 165 | 166 | def test_journal_attr 167 | uris = make_uris(2) 168 | journal_a = Pathname.new("a") 169 | journal_b = Pathname.new("b") 170 | 171 | in_tmpdir do 172 | grubby = Grubby.new 173 | 174 | grubby.journal = journal_a.to_s 175 | assert_equal journal_a, grubby.journal 176 | refute_empty fulfill_resultant_uris(uris, grubby) 177 | 178 | grubby.journal = journal_b 179 | assert_equal journal_b, grubby.journal 180 | refute_empty fulfill_resultant_uris(uris, grubby) 181 | 182 | grubby.journal = journal_a 183 | assert_empty fulfill_resultant_uris(uris, grubby) 184 | 185 | grubby.journal = nil 186 | assert_nil grubby.journal 187 | refute_empty fulfill_resultant_uris(uris, grubby) 188 | end 189 | end 190 | 191 | def test_json_pluggable_parser 192 | grubby = Grubby.new 193 | parser = grubby.get("http://localhost/response_code?code=200&ct=application/json") 194 | 195 | assert_instance_of Grubby::JsonParser, parser 196 | assert_same grubby, parser.mech 197 | end 198 | 199 | 200 | private 201 | 202 | def make_uris(count, response_code = "200") 203 | count.times.map do |i| 204 | URI("http://localhost/response_code?code=#{response_code}&i=#{i}") 205 | end 206 | end 207 | 208 | def get_mirrored_resultant_uri(uris) 209 | silence_logging do 210 | Grubby.new.get_mirrored(uris).uri 211 | end 212 | end 213 | 214 | def fulfill_resultant_uris(requests, grubby = Grubby.new) 215 | silence_logging do 216 | requests.map do |args| 217 | grubby.fulfill(*args){|page| page.uri 
} 218 | end.compact 219 | end 220 | end 221 | 222 | end 223 | -------------------------------------------------------------------------------- /test/json_parser_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class GrubbyJsonParserTest < Minitest::Test 4 | 5 | def test_initialize 6 | uri = URI("http://localhost") 7 | data = [{ "key1" => "val1" }, { "key2" => "val2" }] 8 | body = data.to_json 9 | code = "200" 10 | mech = Grubby.new 11 | parser = Grubby::JsonParser.new(uri, nil, body, code, mech) 12 | 13 | assert_equal uri, parser.uri 14 | assert_equal body, parser.body 15 | assert_equal data, parser.json 16 | assert_equal code, parser.code 17 | assert_same mech, parser.mech 18 | end 19 | 20 | def test_initialize_with_blanks 21 | [[], [nil] * 5, [nil, nil, "", nil, nil]].each do |args| 22 | parser = Grubby::JsonParser.new(*args) # does not raise 23 | assert_nil parser.json 24 | end 25 | end 26 | 27 | def test_json_parsing_is_safe 28 | require "json/add/complex" 29 | body = JSON.dump(Complex(0, 1)) 30 | assert_instance_of Complex, JSON.load(body) # sanity check 31 | 32 | refute_instance_of Complex, Grubby::JsonParser.new(nil, nil, body).json 33 | end 34 | 35 | end 36 | -------------------------------------------------------------------------------- /test/json_scraper_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class GrubbyJsonScraperTest < Minitest::Test 4 | 5 | def test_initialize_with_valid_parser 6 | parser = Grubby::JsonParser.new(nil, nil, "[1, 2, 3]", nil) 7 | scraper = Grubby::JsonScraper.new(parser) 8 | assert_same parser.json, scraper.json 9 | end 10 | 11 | def test_initialize_with_invalid_parser 12 | page = Mechanize::Page.new 13 | assert_raises { Grubby::JsonScraper.new(page) } 14 | end 15 | 16 | def test_scrape_file 17 | Dir.mktmpdir do |dir| 18 | path = File.join(dir, "some file.json") 19 | hi = "Hello" 20 | File.write(path, "{ \"hi\": \"#{hi}\" }") 21 | scraper = MyScraper.scrape_file(path) 22 | 23 | assert_instance_of MyScraper, scraper 24 | assert_equal hi, scraper.hi 25 | assert_same $grubby, scraper.source.mech 26 | end 27 | end 28 | 29 | def test_scrape_file_with_agent 30 | Dir.mktmpdir do |dir| 31 | path = File.join(dir, "file.json") 32 | File.write(path, "{ \"hi\": \"...\" }") 33 | grubby = Grubby.new 34 | scraper = MyScraper.scrape_file(path, grubby) 35 | 36 | assert_same grubby, scraper.source.mech 37 | end 38 | end 39 | 40 | private 41 | 42 | class MyScraper < Grubby::JsonScraper 43 | scrapes(:hi){ json.fetch("hi") } 44 | end 45 | 46 | end 47 | -------------------------------------------------------------------------------- /test/log_test.rb: -------------------------------------------------------------------------------- 1 | require 'test_helper' 2 | 3 | class LogTest < Minitest::Test 4 | 5 | def test_log_global_exists 6 | assert_kind_of Logger, $log 7 | end 8 | 9 | def test_log_global_logs 10 | out, err = capture_subprocess_io do 11 | $log.error('testing123') 12 | end 13 | 14 | assert_match 'testing123', (out + err) 15 | end 16 | 17 | end 18 | -------------------------------------------------------------------------------- /test/mechanize/download_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class MechanizeDownloadTest < Mechanize::TestCase 4 | 5 | def test_content_hash 6 | content = "abcdef" 7 | assert_equal content.sha1, 
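# content_hash should be the SHA-1 hex digest of the download body
# (String#sha1 is a monkey-patch supplied by one of grubby's utility dependencies)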
make_mechanize_download(content).content_hash 8 | end 9 | 10 | 11 | private 12 | 13 | def make_mechanize_download(content) 14 | Mechanize::Download.new(nil, nil, StringIO.new(content), nil) 15 | end 16 | 17 | end 18 | -------------------------------------------------------------------------------- /test/mechanize/fetch_with_retry_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class FetchWithRetryTest < Mechanize::TestCase 4 | 5 | class ::Mechanize::HTTP::Agent 6 | def stubbed_fetch_without_retry(*args) 7 | $stubbed_fetch_error_queue ||= [] 8 | error = $stubbed_fetch_error_queue.shift 9 | raise error if error 10 | real_fetch_without_retry(*args) 11 | end 12 | 13 | alias_method :real_fetch_without_retry, :fetch_without_retry 14 | alias_method :fetch_without_retry, :stubbed_fetch_without_retry 15 | end 16 | 17 | 18 | def test_fetch_works_normally 19 | $stubbed_fetch_error_queue = [] 20 | assert_instance_of Mechanize::Page, do_fetch 21 | end 22 | 23 | def test_fetch_retries_upto_max 24 | max_retries = ::Mechanize::HTTP::Agent::MAX_CONNECTION_RESET_RETRIES 25 | $stubbed_fetch_error_queue = max_retries.times.map do 26 | Net::HTTP::Persistent::Error.new("too many connection resets") 27 | end 28 | 29 | out, err = capture_subprocess_io do 30 | assert_instance_of Mechanize::Page, do_fetch 31 | end 32 | assert_equal max_retries, (out + err).scan(/retry in \d+ seconds?/i).length 33 | end 34 | 35 | def test_fetch_fails_after_max_retries 36 | max_retries = ::Mechanize::HTTP::Agent::MAX_CONNECTION_RESET_RETRIES 37 | $stubbed_fetch_error_queue = (max_retries + 1).times.map do 38 | Net::HTTP::Persistent::Error.new("too many connection resets") 39 | end 40 | 41 | out, err = capture_subprocess_io do 42 | assert_raises(Net::HTTP::Persistent::Error) { do_fetch } 43 | end 44 | assert_equal max_retries, (out + err).scan(/retry in \d+ seconds?/i).length 45 | end 46 | 47 | def test_fetch_reraises_other_errors 48 | expected_error = RuntimeError.new("something else went wrong") 49 | $stubbed_fetch_error_queue = [expected_error] 50 | 51 | actual_error = assert_raises { do_fetch } 52 | assert_equal expected_error, actual_error 53 | end 54 | 55 | 56 | private 57 | 58 | def do_fetch 59 | Mechanize.new.get("http://localhost") 60 | end 61 | 62 | end 63 | -------------------------------------------------------------------------------- /test/mechanize/file_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class MechanizeFileTest < Mechanize::TestCase 4 | 5 | def test_read_local 6 | Dir.mktmpdir do |dir| 7 | path = File.join(dir, "`this` & {that}.txt") 8 | content = "stuff\nmorestuff\n" 9 | File.write(path, content) 10 | mech_file = Mechanize::File.read_local(path) 11 | 12 | assert_instance_of Mechanize::File, mech_file 13 | assert_equal "file", mech_file.uri.scheme 14 | assert_equal path, CGI.unescape(mech_file.uri.path) 15 | assert_equal content, mech_file.content 16 | assert_equal "200", mech_file.code 17 | end 18 | end 19 | 20 | def test_content_hash 21 | content = "abcdef" 22 | mech_file = Mechanize::File.new(nil, nil, content, nil) 23 | assert_equal content.sha1, mech_file.content_hash 24 | end 25 | 26 | end 27 | -------------------------------------------------------------------------------- /test/mechanize/link_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class MechanizeLinkTest < 
Mechanize::TestCase 4 | 5 | def test_to_absolute_uri_with_absolute_href 6 | href = "http://localhost" 7 | uri = make_link(href).to_absolute_uri 8 | assert_kind_of URI, uri 9 | assert_equal href, uri.to_s 10 | end 11 | 12 | def test_to_absolute_uri_with_relative_href 13 | href = "/index.html" 14 | uri = make_link(href).to_absolute_uri 15 | assert_kind_of URI, uri 16 | assert uri.absolute? 17 | assert uri.to_s.end_with?(href) 18 | end 19 | 20 | def test_to_absolute_uri_with_nil_href 21 | uri = make_link(nil).to_absolute_uri 22 | assert_kind_of URI, uri 23 | assert uri.absolute? 24 | end 25 | 26 | 27 | private 28 | 29 | def make_link(href) 30 | page = html_page(<<-HTML) 31 | <html> 32 | <body> 33 | <a href="#{href}">link</a> 34 | </body> 35 | </html> 36 | HTML 37 | 38 | page.links.first 39 | end 40 | 41 | end 42 | -------------------------------------------------------------------------------- /test/mechanize/page_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class MechanizePageTest < Mechanize::TestCase 4 | 5 | def test_searchbang_with_one_matching 6 | results = make_page.search!("#bad1", "#good2", "#bad2") 7 | assert_equal "good2", results.first.attr("id") 8 | end 9 | 10 | def test_atbang_with_multiple_matching 11 | result = make_page.at!("#good3", "#good2") 12 | assert_equal "good3", result.attr("id") 13 | end 14 | 15 | 16 | private 17 | 18 | def make_page 19 | html_page(<<-HTML) 20 | <html> 21 | <body> 22 | <div id="good1"></div> 23 | <div id="good2"></div> 24 | <div id="good3"></div>
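<!-- the "bad" selectors in the tests above intentionally match nothing; the empty divs give search! and at! their targets -->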
25 | </body> 26 | </html> 27 | HTML 28 | end 29 | 30 | end 31 | -------------------------------------------------------------------------------- /test/mechanize/parser_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class MechanizeParserTest < Mechanize::TestCase 4 | 5 | def test_save_to_sanity 6 | # sanity check that Mechanize includes the Mechanize::Parser module 7 | # and implements #save! in the relevant classes 8 | assert_includes Mechanize::File.included_modules, Mechanize::Parser 9 | assert_includes Mechanize::File.instance_methods, :save! 10 | assert_includes Mechanize::Download.included_modules, Mechanize::Parser 11 | assert_includes Mechanize::Download.instance_methods, :save! 12 | assert_includes Mechanize::Image.included_modules, Mechanize::Parser 13 | assert_includes Mechanize::Image.instance_methods, :save! 14 | end 15 | 16 | def test_save_to 17 | dir = "deeply/nested/dir/" 18 | html1 = "<html><body><p>Hello</p></body></html>" 19 | html2 = "<html><body><b>Hello</b></body></html>" 20 | 21 | in_tmpdir do 22 | path1 = html_page(html1).save_to(dir) 23 | assert_match %r"^#{dir}.+", path1 24 | assert_equal html1, File.read(path1) 25 | 26 | path2 = html_page(html2).save_to(dir) 27 | assert_match %r"^#{dir}.+", path2 28 | assert_equal html2, File.read(path2) 29 | refute_equal path1, path2 30 | assert_equal html1, File.read(path1) 31 | end 32 | end 33 | 34 | def test_save_to_bang 35 | dir = "deeply/nested/dir/" 36 | html1 = "<html><body><p>Hello</p></body></html>" 37 | html2 = "<html><body><b>Hello</b></body></html>" 38 | 39 | in_tmpdir do 40 | path1 = html_page(html1).save_to!(dir) 41 | assert_match %r"^#{dir}.+", path1 42 | assert_equal html1, File.read(path1) 43 | 44 | path2 = html_page(html2).save_to!(dir) 45 | assert_equal path1, path2 46 | assert_equal html2, File.read(path1) 47 | end 48 | end 49 | 50 | end 51 | -------------------------------------------------------------------------------- /test/page_scraper_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class GrubbyPageScraperTest < Minitest::Test 4 | 5 | def test_initialize_with_valid_parser 6 | page = Mechanize::Page.new 7 | scraper = Grubby::PageScraper.new(page) 8 | assert_same page, scraper.page 9 | end 10 | 11 | def test_initialize_with_invalid_parser 12 | download = Mechanize::Download.new 13 | assert_raises { Grubby::PageScraper.new(download) } 14 | end 15 | 16 | def test_scrape_file 17 | Dir.mktmpdir do |dir| 18 | path = File.join(dir, "some file.html") 19 | h1 = "Hello" 20 | File.write(path, "<html><body><h1>#{h1}</h1></body></html>") 21 | scraper = MyScraper.scrape_file(path) 22 | 23 | assert_instance_of MyScraper, scraper 24 | assert_equal h1, scraper.h1 25 | assert_same $grubby, scraper.page.mech 26 | end 27 | end 28 | 29 | def test_scrape_file_with_agent 30 | Dir.mktmpdir do |dir| 31 | path = File.join(dir, "file.html") 32 | File.write(path, "<html><body><h1>...</h1></body></html>
") 33 | grubby = Grubby.new 34 | scraper = MyScraper.scrape_file(path, grubby) 35 | 36 | assert_same grubby, scraper.page.mech 37 | end 38 | end 39 | 40 | private 41 | 42 | class MyScraper < Grubby::PageScraper 43 | scrapes(:h1){ page.at("h1").text } 44 | end 45 | 46 | end 47 | -------------------------------------------------------------------------------- /test/scraper_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class GrubbyScraperTest < Minitest::Test 4 | 5 | def test_scrapes_values 6 | scraper = make_scraper(CONTENT) 7 | 8 | EXPECTED.each do |field, expected| 9 | assert_equal [expected], [scraper.send(field)] 10 | end 11 | end 12 | 13 | def test_raises_on_nil_required_value 14 | assert_raises(Grubby::Scraper::Error) do 15 | make_scraper(CONTENT.merge(req: nil)) 16 | end 17 | end 18 | 19 | def test_allows_nil_optional_value 20 | scraper = make_scraper(CONTENT.merge(opt: nil)) 21 | 22 | assert_equal EXPECTED[:req_val], scraper.req_val # sanity check 23 | assert_nil scraper.opt_val 24 | end 25 | 26 | def test_obeys_conditional_modifiers 27 | scraper = make_scraper(CONTENT.merge(opt: nil)) 28 | 29 | assert_nil scraper.opt_word 30 | refute_nil scraper.opt_miss 31 | end 32 | 33 | def test_captures_all_errors 34 | error = assert_raises(Grubby::Scraper::Error){ make_scraper({}) } 35 | 36 | assert_instance_of MyScraper, error.scraper 37 | EXPECTED.compact.keys.each do |field| 38 | assert_kind_of StandardError, error.scraper.errors[field] 39 | end 40 | end 41 | 42 | def test_reports_only_original_errors 43 | error = assert_raises(Grubby::Scraper::Error){ make_scraper({}) } 44 | 45 | assert_match "req_val", error.message 46 | assert_match "opt_val", error.message 47 | refute_match "opt_word", error.message 48 | end 49 | 50 | def test_filters_error_backtrace 51 | error = assert_raises(Grubby::Scraper::Error){ make_scraper({}) } 52 | ruby_file = Grubby::Scraper.method(:scrapes).source_location[0] 53 | 54 | refute_match ruby_file, error.message 55 | end 56 | 57 | def test_fields_attr 58 | assert_equal EXPECTED.keys.sort, MyScraper.fields.sort 59 | end 60 | 61 | def test_fields_attr_includes_superclass_fields 62 | assert_equal INHERITING_EXPECTED.keys.sort, MyInheritingScraper.fields.sort 63 | end 64 | 65 | def test_source_attr 66 | scraper = make_scraper(CONTENT) 67 | 68 | assert_kind_of Mechanize::File, scraper.source 69 | assert_equal CONTENT, scraper.source.content 70 | end 71 | 72 | def test_lookup 73 | scraper = make_scraper(CONTENT) 74 | 75 | EXPECTED.each do |field, expected| 76 | assert_equal [expected], [scraper[field]] 77 | end 78 | end 79 | 80 | def test_to_h 81 | scraper = make_scraper(CONTENT) 82 | 83 | assert_equal EXPECTED, scraper.to_h 84 | end 85 | 86 | def test_to_h_includes_superclass_fields 87 | scraper = make_scraper(CONTENT, MyInheritingScraper) 88 | 89 | assert_equal INHERITING_EXPECTED, scraper.to_h 90 | end 91 | 92 | def test_initialize_missing_super_raises_friendly_error 93 | error = assert_raises{ IncorrectScraper.new.foo } 94 | 95 | assert_match "initialize", error.message 96 | end 97 | 98 | def test_factory_method 99 | scraper = UrlScraper.scrape(UrlScraper.url) 100 | 101 | assert_instance_of UrlScraper, scraper 102 | assert_equal UrlScraper.url, scraper.url 103 | assert_same $grubby, scraper.source.mech 104 | end 105 | 106 | def test_factory_method_with_agent 107 | agent = Mechanize.new 108 | scraper = UrlScraper.scrape(UrlScraper.url, agent) 109 | 110 | assert_same agent, 
scraper.source.mech 111 | end 112 | 113 | def test_each 114 | expected_urls = [3, 2, 1].map{|i| UrlScraper.url(i) } 115 | actual_urls = [] 116 | 117 | UrlScraper.each(expected_urls.first) do |scraper| 118 | assert_instance_of UrlScraper, scraper 119 | assert_same $grubby, scraper.source.mech 120 | actual_urls << scraper.url 121 | end 122 | assert_equal expected_urls, actual_urls 123 | end 124 | 125 | def test_each_without_block 126 | expected_urls = [3, 2, 1].map{|i| UrlScraper.url(i) } 127 | 128 | actual_enum = UrlScraper.each(expected_urls.first) 129 | assert_kind_of Enumerator, actual_enum 130 | 131 | actual_scrapers = actual_enum.to_a 132 | actual_scrapers.each{|scraper| assert_instance_of UrlScraper, scraper } 133 | assert_equal expected_urls, actual_scrapers.map(&:url) 134 | end 135 | 136 | def test_each_with_next_method 137 | expected_urls = [3, 2, 1].map{|i| UrlScraper.url(i) + "#next_uri" } 138 | actual_urls = UrlScraper.each(expected_urls.first, next_method: :next_uri).map(&:url) 139 | 140 | assert_equal expected_urls, actual_urls 141 | end 142 | 143 | def test_each_with_invalid_next_method 144 | error = assert_raises NoMethodError do 145 | UrlScraper.each(UrlScraper.url(2), next_method: :nope) do |scraper| 146 | assert false # should never get here 147 | end 148 | end 149 | assert_equal :nope, error.name 150 | 151 | error = assert_raises NoMethodError do 152 | UrlScraper.each(UrlScraper.url(2), next_method: :nope) 153 | end 154 | assert_equal :nope, error.name 155 | end 156 | 157 | def test_each_over_page_objects 158 | expected_urls = [3, 2, 1].map{|i| UrlScraper.url(i) + "#next_page" } 159 | start_page = Grubby.new.get(expected_urls.first) 160 | actual_urls = UrlScraper.each(start_page, next_method: :next_page).map(&:url) 161 | 162 | assert_equal expected_urls, actual_urls 163 | end 164 | 165 | def test_each_with_agent 166 | agent = Mechanize.new 167 | 168 | UrlScraper.each(UrlScraper.url(3), agent) do |scraper| 169 | assert_same agent, scraper.source.mech 170 | end 171 | end 172 | 173 | private 174 | 175 | CONTENT = { 176 | req: "required value", 177 | opt: "optional value", 178 | } 179 | 180 | EXPECTED = { 181 | req_val: "required value", 182 | opt_val: "optional value", 183 | opt_word: "optional", 184 | opt_miss: nil, 185 | } 186 | 187 | INHERITING_EXPECTED = EXPECTED.merge( 188 | opt_val: EXPECTED[:opt_val].swapcase, 189 | opt_word: EXPECTED[:opt_word].swapcase, 190 | add_val: EXPECTED[:req_val], 191 | ) 192 | 193 | class MyScraper < Grubby::Scraper 194 | scrapes :req_val do 195 | source.content.fetch(:req) 196 | end 197 | 198 | scrapes :opt_val, optional: true do 199 | source.content.fetch(:opt) 200 | end 201 | 202 | scrapes :opt_word, if: :opt_val do 203 | opt_val[/\w+/] 204 | end 205 | 206 | scrapes :opt_miss, unless: :opt_val do 207 | true 208 | end 209 | end 210 | 211 | class MyInheritingScraper < MyScraper 212 | scrapes :opt_val, optional: true do 213 | source.content.fetch(:opt)&.swapcase 214 | end 215 | 216 | scrapes :add_val do 217 | req_val 218 | end 219 | end 220 | 221 | def make_scraper(content, klass = MyScraper) 222 | source = Mechanize::File.new(nil, nil, content, nil) 223 | silence_logging do 224 | klass.new(source) 225 | end 226 | end 227 | 228 | class IncorrectScraper < Grubby::Scraper 229 | scrapes(:foo){ "FOO!" 
} 230 | 231 | def initialize(*args) 232 | # does not call `super` 233 | end 234 | end 235 | 236 | class UrlScraper < Grubby::Scraper 237 | def self.url(n = 1) 238 | "http://localhost/response_code?code=200&n=#{n}" 239 | end 240 | 241 | scrapes(:url){ source.uri.to_s } 242 | scrapes(:n){ source.uri.query[/\bn=(\d+)\b/, 1]&.to_i } 243 | 244 | def next 245 | self.class.url(n - 1) if n > 1 246 | end 247 | 248 | def next_uri 249 | self.next.try{|next_url| URI(next_url + "#next_uri") } 250 | end 251 | 252 | def next_page 253 | self.next.try{|next_url| source.mech.get(next_url + "#next_page") } 254 | end 255 | end 256 | 257 | end 258 | -------------------------------------------------------------------------------- /test/test_helper.rb: -------------------------------------------------------------------------------- 1 | $LOAD_PATH.unshift File.expand_path("../../lib", __FILE__) 2 | require "grubby" 3 | 4 | require "minitest/autorun" 5 | 6 | require "mechanize/test_case" 7 | # disable obnoxious coloring from "minitest/pride" forcibly included by "mechanize/test_case" 8 | if Minitest.const_defined?("PrideIO") 9 | class << Minitest::PrideIO 10 | remove_method :pride? 11 | 12 | def pride? 13 | false 14 | end 15 | end 16 | end 17 | 18 | 19 | module Kernel 20 | 21 | def dont_sleep(amount) 22 | $sleep_calls ||= 0 23 | $sleep_calls += 1 24 | $sleep_total_amount ||= 0 25 | $sleep_total_amount += amount 26 | $sleep_last_amount = amount 27 | end 28 | 29 | alias_method :actually_sleep, :sleep 30 | alias_method :sleep, :dont_sleep 31 | 32 | end 33 | 34 | 35 | class Minitest::Test 36 | 37 | def silence_logging 38 | log_level = $log.level 39 | $log.level = Logger::Severity::FATAL 40 | begin 41 | result = yield 42 | ensure 43 | $log.level = log_level 44 | end 45 | result 46 | end 47 | 48 | end 49 | --------------------------------------------------------------------------------
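The `Kernel#sleep` override in test_helper.rb records each call in globals rather than actually sleeping. As a rough sketch of how a test could use those globals to assert on grubby's request throttling — assuming grubby's `time_between_requests` setting, which the files above do not exercise directly:

    def test_sleeps_between_requests
      $sleep_calls = 0
      grubby = Grubby.new
      grubby.time_between_requests = 2  # assumed accessor; see the grubby docs

      grubby.get("http://localhost")
      grubby.get("http://localhost")  # second request should trigger a throttling sleep

      assert_operator $sleep_calls, :>=, 1
      assert_operator $sleep_last_amount, :<=, 2  # enforced delay should not exceed the configured time
    end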