├── .github
│   └── workflows
│       └── ci.yml
├── .gitignore
├── CHANGELOG.md
├── Gemfile
├── LICENSE.txt
├── README.md
├── Rakefile
├── grubby.gemspec
├── lib
│   ├── grubby.rb
│   └── grubby
│       ├── core_ext
│       │   ├── string.rb
│       │   └── uri.rb
│       ├── json_parser.rb
│       ├── json_scraper.rb
│       ├── log.rb
│       ├── mechanize
│       │   ├── download.rb
│       │   ├── fetch_with_retry.rb
│       │   ├── file.rb
│       │   ├── link.rb
│       │   ├── page.rb
│       │   └── parser.rb
│       ├── page_scraper.rb
│       ├── scraper.rb
│       └── version.rb
└── test
    ├── core_ext
    │   ├── string_test.rb
    │   └── uri_test.rb
    ├── grubby_test.rb
    ├── json_parser_test.rb
    ├── json_scraper_test.rb
    ├── log_test.rb
    ├── mechanize
    │   ├── download_test.rb
    │   ├── fetch_with_retry_test.rb
    │   ├── file_test.rb
    │   ├── link_test.rb
    │   ├── page_test.rb
    │   └── parser_test.rb
    ├── page_scraper_test.rb
    ├── scraper_test.rb
    └── test_helper.rb

/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 | on: [push, pull_request]
3 | jobs:
4 |   tests:
5 |     strategy:
6 |       matrix:
7 |         ruby: ["2.7", "3.0", "3.1", "3.2"]
8 |       fail-fast: false
9 | 
10 |     runs-on: ubuntu-latest
11 | 
12 |     steps:
13 |       - uses: actions/checkout@v2
14 | 
15 |       - uses: ruby/setup-ruby@v1
16 |         with:
17 |           ruby-version: ${{ matrix.ruby }}
18 |           bundler-cache: true
19 | 
20 |       - run: bundle exec rake test
21 | 

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /.bundle/
2 | /.yardoc
3 | /Gemfile.lock
4 | /_yardoc/
5 | /coverage/
6 | /doc/
7 | /gemfiles/*.lock
8 | /pkg/
9 | /spec/reports/
10 | /tmp/
11 | 

--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | ## 2.0.0
2 | 
3 | * [BREAKING] Drop support for Active Support < 6.0
4 | * [BREAKING] Require casual_support ~> 4.0
5 | * [BREAKING] Require mini_sanity ~> 2.0
6 | * [BREAKING] Require pleasant_path ~> 2.0
7 | * [BREAKING] Remove `JsonParser.json_parse_options`
8 |   * Use `::JSON.load_default_options` instead
9 | * [BREAKING] Rename `Grubby#singleton` to `Grubby#fulfill`
10 | * [BREAKING] Change `Grubby#fulfill` to return block's result
11 | 
12 | 
13 | ## 1.2.1
14 | 
15 | * Add `JsonParser#mech` attribute for parity with `Mechanize::Page#mech`
16 | * Ensure time spent fetching a response does not count toward the time
17 |   to sleep between requests
18 | * Prevent sleep between requests when following a redirect
19 | * Prevent duplicates in `Scraper.fields`
20 | * Fix `URI#query_param` when query is nil
21 | * Fix `PageScraper.scrape_file` and `JsonScraper.scrape_file` when path
22 |   contains characters that need to be URI-encoded
23 | 
24 | 
25 | ## 1.2.0
26 | 
27 | * Add `Grubby#journal=`
28 | * Add `$grubby` global default `Grubby` instance
29 | * Add `Scraper.scrape`
30 | * Add `Scraper.each`
31 | * Support `:if` and `:unless` options for `Scraper.scrapes`
32 | * Fix fail-fast behavior of inherited scraper fields
33 | * Fix `JsonParser` on empty response body
34 | * Loosen Active Support version constraint
35 | 
36 | 
37 | ## 1.1.0
38 | 
39 | * Add `Grubby#ok?`
40 | * Add `PageScraper.scrape_file` and `JsonScraper.scrape_file`
41 | * Add `Mechanize::Parser#save_to` and `Mechanize::Parser#save_to!`,
42 |   which are inherited by `Mechanize::Download` and `Mechanize::File`
43 | * Add `URI#basename`
44 | * Add `URI#query_param`
45 | * Add utility methods from [ryoba](https://rubygems.org/gems/ryoba)
46 | * Add `Scraper::Error#scraper` and 
`Scraper#errors` for interactive 47 | debugging with e.g. `byebug` 48 | * Improve log messages and error formatting 49 | * Fix compatibility with net-http-persistent gem v3.0 50 | 51 | 52 | ## 1.0.0 53 | 54 | * Initial release 55 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | # Specify your gem's dependencies in grubby.gemspec 4 | gemspec 5 | 6 | gem "rake", "~> 12.0" 7 | gem "minitest", "~> 5.0" 8 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017 Jonathan Hefner 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # grubby 2 | 3 | [Fail-fast] web scraping. *grubby* adds a layer of utility and 4 | error-checking atop the marvelous [Mechanize gem]. See API listing 5 | below, or browse the [full documentation]. 6 | 7 | [Fail-fast]: https://en.wikipedia.org/wiki/Fail-fast 8 | [Mechanize gem]: https://rubygems.org/gems/mechanize 9 | [full documentation]: https://www.rubydoc.info/gems/grubby/ 10 | 11 | 12 | ## Examples 13 | 14 | The following code scrapes stories from the [Hacker News]( 15 | https://news.ycombinator.com/news) front page: 16 | 17 | ```ruby 18 | require "grubby" 19 | 20 | class HackerNews < Grubby::PageScraper 21 | scrapes(:items) do 22 | page.search!(".athing").map{|element| Item.new(element) } 23 | end 24 | 25 | class Item < Grubby::Scraper 26 | scrapes(:story_link){ source.at!("a.storylink") } 27 | 28 | scrapes(:story_url){ expand_url(story_link["href"]) } 29 | 30 | scrapes(:title){ story_link.text } 31 | 32 | scrapes(:comments_link, optional: true) do 33 | source.next_sibling.search!(".subtext a").find do |link| 34 | link.text.match?(/comment|discuss/) 35 | end 36 | end 37 | 38 | scrapes(:comments_url, if: :comments_link) do 39 | expand_url(comments_link["href"]) 40 | end 41 | 42 | scrapes(:comment_count, if: :comments_link) do 43 | comments_link.text.to_i 44 | end 45 | 46 | def expand_url(url) 47 | url.include?("://") ? 
url : source.document.uri.merge(url).to_s 48 | end 49 | end 50 | end 51 | 52 | # The following line will raise an exception if anything goes wrong 53 | # during the scraping process. For example, if the structure of the 54 | # HTML does not match expectations due to a site change, the script will 55 | # terminate immediately with a helpful error message. This prevents bad 56 | # data from propagating and causing hard-to-trace errors. 57 | hn = HackerNews.scrape("https://news.ycombinator.com/news") 58 | 59 | # Your processing logic goes here: 60 | hn.items.take(10).each do |item| 61 | puts "* #{item.title}" 62 | puts " #{item.story_url}" 63 | puts " #{item.comment_count} comments: #{item.comments_url}" if item.comments_url 64 | puts 65 | end 66 | ``` 67 | 68 | Hacker News also offers a [JSON API](https://github.com/HackerNews/API), 69 | which may be more robust for scraping purposes. *grubby* can scrape 70 | JSON just as well: 71 | 72 | ```ruby 73 | require "grubby" 74 | 75 | class HackerNews < Grubby::JsonScraper 76 | scrapes(:items) do 77 | # API returns array of top 500 item IDs, so limit as necessary 78 | json.take(10).map do |item_id| 79 | Item.scrape("https://hacker-news.firebaseio.com/v0/item/#{item_id}.json") 80 | end 81 | end 82 | 83 | class Item < Grubby::JsonScraper 84 | scrapes(:story_url){ json["url"] || hn_url } 85 | 86 | scrapes(:title){ json["title"] } 87 | 88 | scrapes(:comments_url, optional: true) do 89 | hn_url if json["descendants"] 90 | end 91 | 92 | scrapes(:comment_count, optional: true) do 93 | json["descendants"]&.to_i 94 | end 95 | 96 | def hn_url 97 | "https://news.ycombinator.com/item?id=#{json["id"]}" 98 | end 99 | end 100 | end 101 | 102 | hn = HackerNews.scrape("https://hacker-news.firebaseio.com/v0/topstories.json") 103 | 104 | # Your processing logic goes here: 105 | hn.items.each do |item| 106 | puts "* #{item.title}" 107 | puts " #{item.story_url}" 108 | puts " #{item.comment_count} comments: #{item.comments_url}" if item.comments_url 109 | puts 110 | end 111 | ``` 112 | 113 | 114 | ## Core API 115 | 116 | - [Grubby](https://www.rubydoc.info/gems/grubby/Grubby) 117 | - [#fulfill](https://www.rubydoc.info/gems/grubby/Grubby:fulfill) 118 | - [#get_mirrored](https://www.rubydoc.info/gems/grubby/Grubby:get_mirrored) 119 | - [#ok?](https://www.rubydoc.info/gems/grubby/Grubby:ok%3F) 120 | - [#time_between_requests](https://www.rubydoc.info/gems/grubby/Grubby:time_between_requests) 121 | - [Scraper](https://www.rubydoc.info/gems/grubby/Grubby/Scraper) 122 | - [.each](https://www.rubydoc.info/gems/grubby/Grubby/Scraper.each) 123 | - [.scrape](https://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrape) 124 | - [.scrapes](https://www.rubydoc.info/gems/grubby/Grubby/Scraper.scrapes) 125 | - [#[]](https://www.rubydoc.info/gems/grubby/Grubby/Scraper:[]) 126 | - [#to_h](https://www.rubydoc.info/gems/grubby/Grubby/Scraper:to_h) 127 | - [PageScraper](https://www.rubydoc.info/gems/grubby/Grubby/PageScraper) 128 | - [.scrape_file](https://www.rubydoc.info/gems/grubby/Grubby/PageScraper.scrape_file) 129 | - [#page](https://www.rubydoc.info/gems/grubby/Grubby/PageScraper:page) 130 | - [JsonScraper](https://www.rubydoc.info/gems/grubby/Grubby/JsonScraper) 131 | - [.scrape_file](https://www.rubydoc.info/gems/grubby/Grubby/JsonScraper.scrape_file) 132 | - [#json](https://www.rubydoc.info/gems/grubby/Grubby/JsonScraper:json) 133 | - Mechanize::File 134 | - [#save_to](https://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to) 135 | - 
[#save_to!](https://www.rubydoc.info/gems/grubby/Mechanize/Parser:save_to%21) 136 | - Mechanize::Page 137 | - [#at!](https://www.rubydoc.info/gems/grubby/Mechanize/Page:at%21) 138 | - [#search!](https://www.rubydoc.info/gems/grubby/Mechanize/Page:search%21) 139 | - Mechanize::Page::Link 140 | - [#to_absolute_uri](https://www.rubydoc.info/gems/grubby/Mechanize/Page/Link#to_absolute_uri) 141 | - URI 142 | - [#basename](https://www.rubydoc.info/gems/grubby/URI:basename) 143 | - [#query_param](https://www.rubydoc.info/gems/grubby/URI:query_param) 144 | 145 | 146 | ## Auxiliary API 147 | 148 | *grubby* loads several gems that extend Ruby objects with utility 149 | methods. Some of those methods are listed below. See each gem's 150 | documentation for a complete API listing. 151 | 152 | - [Active Support](https://rubygems.org/gems/activesupport) 153 | ([docs](https://www.rubydoc.info/gems/activesupport/)) 154 | - [Enumerable#index_by](https://www.rubydoc.info/gems/activesupport/Enumerable:index_by) 155 | - [File.atomic_write](https://www.rubydoc.info/gems/activesupport/File:atomic_write) 156 | - [Object#presence](https://www.rubydoc.info/gems/activesupport/Object:presence) 157 | - [String#blank?](https://www.rubydoc.info/gems/activesupport/String:blank%3F) 158 | - [String#squish](https://www.rubydoc.info/gems/activesupport/String:squish) 159 | - [casual_support](https://rubygems.org/gems/casual_support) 160 | ([docs](https://www.rubydoc.info/gems/casual_support/)) 161 | - [Enumerable#index_to](https://www.rubydoc.info/gems/casual_support/Enumerable:index_to) 162 | - [String#after](https://www.rubydoc.info/gems/casual_support/String:after) 163 | - [String#after_last](https://www.rubydoc.info/gems/casual_support/String:after_last) 164 | - [String#before](https://www.rubydoc.info/gems/casual_support/String:before) 165 | - [String#before_last](https://www.rubydoc.info/gems/casual_support/String:before_last) 166 | - [String#between](https://www.rubydoc.info/gems/casual_support/String:between) 167 | - [Time#to_hms](https://www.rubydoc.info/gems/casual_support/Time:to_hms) 168 | - [Time#to_ymd](https://www.rubydoc.info/gems/casual_support/Time:to_ymd) 169 | - [gorge](https://rubygems.org/gems/gorge) 170 | ([docs](https://www.rubydoc.info/gems/gorge/)) 171 | - [Pathname#file_crc32](https://www.rubydoc.info/gems/gorge/Pathname:file_crc32) 172 | - [Pathname#file_md5](https://www.rubydoc.info/gems/gorge/Pathname:file_md5) 173 | - [Pathname#file_sha1](https://www.rubydoc.info/gems/gorge/Pathname:file_sha1) 174 | - [mini_sanity](https://rubygems.org/gems/mini_sanity) 175 | ([docs](https://www.rubydoc.info/gems/mini_sanity/)) 176 | - [Enumerator#result!](https://www.rubydoc.info/gems/mini_sanity/Enumerator:result%21) 177 | - [Enumerator#results!](https://www.rubydoc.info/gems/mini_sanity/Enumerator:results%21) 178 | - [Object#assert!](https://www.rubydoc.info/gems/mini_sanity/Object:assert%21) 179 | - [Object#refute!](https://www.rubydoc.info/gems/mini_sanity/Object:refute%21) 180 | - [String#match!](https://www.rubydoc.info/gems/mini_sanity/String:match%21) 181 | - [pleasant_path](https://rubygems.org/gems/pleasant_path) 182 | ([docs](https://www.rubydoc.info/gems/pleasant_path/)) 183 | - [Pathname#available_name](https://www.rubydoc.info/gems/pleasant_path/Pathname:available_name) 184 | - [Pathname#existence](https://www.rubydoc.info/gems/pleasant_path/Pathname:existence) 185 | - [Pathname#make_dirname](https://www.rubydoc.info/gems/pleasant_path/Pathname:make_dirname) 186 | - 
[Pathname#move_as](https://www.rubydoc.info/gems/pleasant_path/Pathname:move_as) 187 | - [Pathname#rename_basename](https://www.rubydoc.info/gems/pleasant_path/Pathname:rename_basename) 188 | - [Pathname#rename_extname](https://www.rubydoc.info/gems/pleasant_path/Pathname:rename_extname) 189 | - [ryoba](https://rubygems.org/gems/ryoba) 190 | ([docs](https://www.rubydoc.info/gems/ryoba/)) 191 | - [Nokogiri::XML::Node#matches!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:matches%21) 192 | - [Nokogiri::XML::Node#text!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:text%21) 193 | - [Nokogiri::XML::Node#uri](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Node:uri) 194 | - [Nokogiri::XML::Searchable#ancestor!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestor%21) 195 | - [Nokogiri::XML::Searchable#ancestors!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:ancestors%21) 196 | - [Nokogiri::XML::Searchable#at!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:at%21) 197 | - [Nokogiri::XML::Searchable#search!](https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:search%21) 198 | 199 | 200 | ## Installation 201 | 202 | Install the [`grubby` gem](https://rubygems.org/gems/grubby). 203 | 204 | 205 | ## Contributing 206 | 207 | Run `rake test` to run the tests. 208 | 209 | 210 | ## License 211 | 212 | [MIT License](LICENSE.txt) 213 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require "rake/testtask" 3 | 4 | Rake::TestTask.new(:test) do |t| 5 | t.libs << "test" 6 | t.libs << "lib" 7 | t.test_files = FileList["test/**/*_test.rb"] 8 | end 9 | 10 | task :default => :test 11 | -------------------------------------------------------------------------------- /grubby.gemspec: -------------------------------------------------------------------------------- 1 | require_relative "lib/grubby/version" 2 | 3 | Gem::Specification.new do |spec| 4 | spec.name = "grubby" 5 | spec.version = GRUBBY_VERSION 6 | spec.authors = ["Jonathan Hefner"] 7 | spec.email = ["jonathan@hefner.pro"] 8 | 9 | spec.summary = %q{Fail-fast web scraping} 10 | spec.homepage = "https://github.com/jonathanhefner/grubby" 11 | spec.license = "MIT" 12 | spec.required_ruby_version = ">= 2.7" 13 | 14 | spec.metadata["homepage_uri"] = spec.homepage 15 | spec.metadata["source_code_uri"] = spec.homepage 16 | spec.metadata["changelog_uri"] = spec.metadata["source_code_uri"] + "/blob/master/CHANGELOG.md" 17 | 18 | # Specify which files should be added to the gem when it is released. 19 | # The `git ls-files -z` loads the files in the RubyGem that have been added into git. 
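  # (Illustrative note: this keeps tracked files such as "lib/grubby.rb" in the
  # gem, while paths under "test/" or starting with ".git", e.g. ".gitignore",
  # are rejected by the filter below.)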
20 |   spec.files = Dir.chdir(__dir__) do
21 |     `git ls-files -z`.split("\x0").reject { |f| f.start_with?("test/", ".git") }
22 |   end
23 |   spec.bindir = "exe"
24 |   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
25 |   spec.require_paths = ["lib"]
26 | 
27 |   spec.add_dependency "activesupport", ">= 6.0"
28 |   spec.add_dependency "casual_support", "~> 4.0"
29 |   spec.add_dependency "gorge", "~> 1.0"
30 |   spec.add_dependency "mechanize", "~> 2.7"
31 |   spec.add_dependency "mini_sanity", "~> 2.0"
32 |   spec.add_dependency "pleasant_path", "~> 2.0"
33 |   spec.add_dependency "ryoba", "~> 1.0"
34 | end
35 | 

--------------------------------------------------------------------------------
/lib/grubby.rb:
--------------------------------------------------------------------------------
1 | require "active_support/all"
2 | require "casual_support"
3 | require "gorge"
4 | require "mechanize"
5 | require "mini_sanity"
6 | require "pleasant_path"
7 | require "ryoba"
8 | 
9 | require_relative "grubby/version"
10 | require_relative "grubby/log"
11 | 
12 | require_relative "grubby/core_ext/string"
13 | require_relative "grubby/core_ext/uri"
14 | require_relative "grubby/mechanize/fetch_with_retry"
15 | require_relative "grubby/mechanize/download"
16 | require_relative "grubby/mechanize/file"
17 | require_relative "grubby/mechanize/link"
18 | require_relative "grubby/mechanize/page"
19 | require_relative "grubby/mechanize/parser"
20 | 
21 | 
22 | class Grubby < Mechanize
23 | 
24 |   VERSION = GRUBBY_VERSION
25 | 
26 |   # The minimum amount of time enforced between requests, in seconds.
27 |   # If the value is a Range, a random number within the Range is chosen
28 |   # for each request.
29 |   #
30 |   # @return [Integer, Float, Range<Integer>, Range<Float>]
31 |   attr_accessor :time_between_requests
32 | 
33 |   # Journal file used to ensure only-once processing of resources by
34 |   # {fulfill} across multiple program runs.
35 |   #
36 |   # @return [Pathname, nil]
37 |   attr_reader :journal
38 | 
39 |   # @param journal [Pathname, String]
40 |   #   Optional journal file used to ensure only-once processing of
41 |   #   resources by {fulfill} across multiple program runs
42 |   def initialize(journal = nil)
43 |     super()
44 | 
45 |     # Prevent "memory leaks", and prevent mistakenly blank urls from
46 |     # resolving. (Blank urls resolve as a path relative to the last
47 |     # history entry. Without this setting, an erroneous `agent.get("")`
48 |     # could sometimes successfully fetch a page.)
49 |     self.max_history = 0
50 | 
51 |     # Prevent files of unforeseen content type from being buffered into
52 |     # memory by default, in case they are very large. However, increase
53 |     # the threshold for what is considered "large", to prevent
54 |     # unnecessary writes to disk.
55 |     #
56 |     # References:
57 |     # - http://docs.seattlerb.org/mechanize/Mechanize/PluggableParser.html
58 |     # - http://docs.seattlerb.org/mechanize/Mechanize/Download.html
59 |     # - http://docs.seattlerb.org/mechanize/Mechanize/File.html
60 |     self.max_file_buffer = 1_000_000 # only applies to Mechanize::Download
61 |     self.pluggable_parser.default = Mechanize::Download
62 |     self.pluggable_parser["text/plain"] = Mechanize::File
63 |     self.pluggable_parser["application/json"] = Grubby::JsonParser
64 | 
65 |     # Set up configurable rate limiting, and choose a reasonable default
66 |     # rate limit.
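    # Illustrative sketch (not part of this file's logic): the delay can
    # be tuned after construction, either as a fixed number of seconds or
    # as a Range from which a random delay is chosen per request.
    #
    #   grubby = Grubby.new
    #   grubby.time_between_requests = 2.0       # always wait 2 seconds
    #   grubby.time_between_requests = 2.0..5.0  # wait a random 2-5 seconds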
67 |     self.pre_connect_hooks << Proc.new{ self.send(:sleep_between_requests) }
68 |     self.post_connect_hooks << Proc.new do |agent, uri, response, body|
69 |       self.send(:mark_last_request_time, (Time.now unless response.code.to_s.start_with?("3")))
70 |     end
71 |     self.time_between_requests = 1.0
72 | 
73 |     self.journal = journal
74 |   end
75 | 
76 |   # Sets the journal file used to ensure only-once processing of
77 |   # resources by {fulfill} across multiple program runs. Setting the
78 |   # journal file will clear the in-memory list of previously-processed
79 |   # resources, and, if the journal file exists, load the list from file.
80 |   #
81 |   # @param path [Pathname, String, nil]
82 |   # @return [Pathname]
83 |   def journal=(path)
84 |     @journal = path&.to_pathname&.make_file
85 |     @fulfilled = if @journal
86 |       require "csv"
87 |       CSV.read(@journal).map{|row| FulfilledEntry.new(*row) }.to_set
88 |     else
89 |       Set.new
90 |     end
91 |     @journal
92 |   end
93 | 
94 |   # Calls +#head+ and returns true if a response code "200" is received,
95 |   # false otherwise. Unlike +#head+, error response codes (e.g. "404",
96 |   # "500") do not result in a +Mechanize::ResponseCodeError+ being
97 |   # raised.
98 |   #
99 |   # @param uri [URI, String]
100 |   # @return [Boolean]
101 |   def ok?(uri, query_params = {}, headers = {})
102 |     begin
103 |       head(uri, query_params, headers).code == "200"
104 |     rescue Mechanize::ResponseCodeError
105 |       false
106 |     end
107 |   end
108 | 
109 |   # Calls +#get+ with each of +mirror_uris+ until a successful
110 |   # ("200 OK") response is received, and returns that +#get+ result.
111 |   # Rescues and logs +Mechanize::ResponseCodeError+ failures for all but
112 |   # the last mirror.
113 |   #
114 |   # @example
115 |   #   grubby = Grubby.new
116 |   #
117 |   #   urls = [
118 |   #     "https://httpstat.us/404",
119 |   #     "https://httpstat.us/500",
120 |   #     "https://httpstat.us/200?foo",
121 |   #     "https://httpstat.us/200?bar",
122 |   #   ]
123 |   #
124 |   #   grubby.get_mirrored(urls).uri # == URI("https://httpstat.us/200?foo")
125 |   #
126 |   #   grubby.get_mirrored(urls.take(2)) # raise Mechanize::ResponseCodeError
127 |   #
128 |   # @param mirror_uris [Array<URI>, Array<String>]
129 |   # @return [Mechanize::Page, Mechanize::File, Mechanize::Download, ...]
130 |   # @raise [Mechanize::ResponseCodeError]
131 |   #   if all +mirror_uris+ fail
132 |   def get_mirrored(mirror_uris, parameters = [], referer = nil, headers = {})
133 |     i = 0
134 |     begin
135 |       get(mirror_uris[i], parameters, referer, headers)
136 |     rescue Mechanize::ResponseCodeError => e
137 |       i += 1
138 |       if i >= mirror_uris.length
139 |         raise
140 |       else
141 |         $log.debug("Mirror failed (code #{e.response_code}): #{mirror_uris[i - 1]}")
142 |         $log.debug("Try mirror: #{mirror_uris[i]}")
143 |         retry
144 |       end
145 |     end
146 |   end
147 | 
148 |   # Ensures only-once processing of the resource indicated by +uri+ for
149 |   # the specified +purpose+. The given block is executed and the result
150 |   # is returned if and only if the Grubby instance has not recorded a
151 |   # previous call to +fulfill+ for the same resource and purpose.
152 |   #
153 |   # Note that the resource is identified by both its URI and its content
154 |   # hash. The latter prevents superfluous and rearranged URI query
155 |   # string parameters from interfering with only-once processing.
156 |   #
157 |   # If {journal} is set, and if the block does not raise an exception,
158 |   # the resource and purpose are logged to the journal file. This
159 |   # enables only-once processing across multiple program runs. 
It also 160 | # provides a means to resume batch processing after an unexpected 161 | # termination. 162 | # 163 | # @example 164 | # grubby = Grubby.new 165 | # 166 | # grubby.fulfill("https://example.com/posts") do |page| 167 | # "first time" 168 | # end 169 | # # == "first time" 170 | # 171 | # grubby.fulfill("https://example.com/posts") do |page| 172 | # "already seen" # not evaluated 173 | # end 174 | # # == nil 175 | # 176 | # grubby.fulfill("https://example.com/posts?page=1") do |page| 177 | # "already seen content hash" # not evaluated 178 | # end 179 | # # == nil 180 | # 181 | # grubby.fulfill("https://example.com/posts", "again!") do |page| 182 | # "already seen, but new purpose" 183 | # end 184 | # # == "already seen, but new purpose" 185 | # 186 | # @param uri [URI, String] 187 | # @param purpose [String] 188 | # @yieldparam resource [Mechanize::Page, Mechanize::File, Mechanize::Download, ...] 189 | # @yieldreturn [Object] 190 | # @return [Object, nil] 191 | # @raise [Mechanize::ResponseCodeError] 192 | # if fetching the resource results in error (see +Mechanize#get+) 193 | def fulfill(uri, purpose = "") 194 | series = [] 195 | 196 | uri = uri.to_absolute_uri 197 | return unless add_fulfilled(uri, purpose, series) 198 | 199 | normalized_uri = normalize_uri(uri) 200 | return unless add_fulfilled(normalized_uri, purpose, series) 201 | 202 | $log.info("Fetch #{normalized_uri}") 203 | resource = get(normalized_uri) 204 | unprocessed = add_fulfilled(resource.uri, purpose, series) & 205 | add_fulfilled("content hash: #{resource.content_hash}", purpose, series) 206 | 207 | result = yield resource if unprocessed 208 | 209 | CSV.open(journal, "a") do |csv| 210 | series.each{|entry| csv << entry } 211 | end if journal 212 | 213 | result 214 | end 215 | 216 | 217 | private 218 | 219 | # @!visibility private 220 | FulfilledEntry = Struct.new(:purpose, :target) 221 | 222 | def add_fulfilled(target, purpose, series) 223 | series << FulfilledEntry.new(purpose, target.to_s) 224 | if (series.uniq!) || @fulfilled.add?(series.last) 225 | true 226 | else 227 | $log.info("Skip #{series.first.target}" \ 228 | " (seen#{" #{series.last.target}" unless series.length == 1})") 229 | false 230 | end 231 | end 232 | 233 | def normalize_uri(uri) 234 | uri = uri.dup 235 | $log.warn("Ignore ##{uri.fragment} in #{uri}") if uri.fragment 236 | uri.fragment = nil 237 | uri.path = uri.path.chomp("/") 238 | uri 239 | end 240 | 241 | def sleep_between_requests 242 | @last_request_at ||= 0.0 243 | delay_duration = time_between_requests.is_a?(Range) ? 244 | rand(time_between_requests) : time_between_requests 245 | sleep_duration = @last_request_at + delay_duration - Time.now.to_f 246 | sleep(sleep_duration) if sleep_duration > 0 247 | end 248 | 249 | def mark_last_request_time(time) 250 | @last_request_at = time.to_f 251 | end 252 | 253 | end 254 | 255 | 256 | require_relative "grubby/json_parser" 257 | require_relative "grubby/scraper" 258 | require_relative "grubby/page_scraper" 259 | require_relative "grubby/json_scraper" 260 | 261 | 262 | $grubby = Grubby.new 263 | -------------------------------------------------------------------------------- /lib/grubby/core_ext/string.rb: -------------------------------------------------------------------------------- 1 | class String 2 | 3 | # Constructs a URI from the String. Raises an exception if the String 4 | # does not denote an absolute URI. 
5 | # 6 | # @return [URI] 7 | # @raise [RuntimeError] 8 | # if the String does not denote an absolute URI 9 | def to_absolute_uri 10 | URI(self).to_absolute_uri 11 | end 12 | 13 | end 14 | -------------------------------------------------------------------------------- /lib/grubby/core_ext/uri.rb: -------------------------------------------------------------------------------- 1 | module URI 2 | 3 | # Returns the basename of the URI's +path+, a la +File.basename+. 4 | # 5 | # @example 6 | # URI("https://example.com/foo/bar").basename # == "bar" 7 | # URI("https://example.com/foo").basename # == "foo" 8 | # URI("https://example.com/").basename # == "" 9 | # 10 | # @return [String] 11 | def basename 12 | self.path == "/" ? "" : ::File.basename(self.path) 13 | end 14 | 15 | # Returns the value of the specified query param in the URI's query 16 | # string. The specified +name+ must be *exactly* as it appears in the 17 | # query string, and support for complex nested values is limited. 18 | # (See +CGI.parse+ for parsing behavior.) If +name+ contains +"[]"+, 19 | # all occurrences of the query param are returned as an Array. 20 | # Otherwise, only the last occurrence is returned. 21 | # 22 | # @example 23 | # URI("https://example.com/?foo=a").query_param("foo") # == "a" 24 | # 25 | # URI("https://example.com/?foo=a&foo=b").query_param("foo") # == "b" 26 | # URI("https://example.com/?foo=a&foo=b").query_param("foo[]") # == nil 27 | # 28 | # URI("https://example.com/?foo[]=a&foo[]=b").query_param("foo") # == nil 29 | # URI("https://example.com/?foo[]=a&foo[]=b").query_param("foo[]") # == ["a", "b"] 30 | # 31 | # URI("https://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[]") # == nil 32 | # URI("https://example.com/?foo[][x]=a&foo[][y]=b").query_param("foo[][x]") # == ["a"] 33 | # 34 | # @param name [String] 35 | # @return [String, Array, nil] 36 | def query_param(name) 37 | values = CGI.parse(self.query)[name] if self.query 38 | (values.nil? || name.include?("[]")) ? values : values.last 39 | end 40 | 41 | # Raises an exception if the URI is not +absolute?+. Otherwise, 42 | # returns the URI. 43 | # 44 | # @return [self] 45 | # @raise [RuntimeError] 46 | # if the URI is not +absolute?+ 47 | def to_absolute_uri 48 | raise "URI is not absolute: #{self}" unless self.absolute? 49 | self 50 | end 51 | 52 | end 53 | -------------------------------------------------------------------------------- /lib/grubby/json_parser.rb: -------------------------------------------------------------------------------- 1 | class Grubby::JsonParser < Mechanize::File 2 | 3 | # The parsed JSON data. 4 | # 5 | # @return [Hash, Array] 6 | attr_reader :json 7 | 8 | # The Mechanize agent used to make the request. 9 | # 10 | # @return [Mechanize, nil] 11 | attr_accessor :mech 12 | 13 | def initialize(uri = nil, response = nil, body = nil, code = nil, mech = nil) 14 | @json = JSON.load(body, nil, create_additions: false) 15 | @mech = mech 16 | super(uri, response, body, code) 17 | end 18 | 19 | end 20 | -------------------------------------------------------------------------------- /lib/grubby/json_scraper.rb: -------------------------------------------------------------------------------- 1 | class Grubby::JsonScraper < Grubby::Scraper 2 | 3 | # The parsed JSON data being scraped. 
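  #
  # @example
  #   parser = Grubby::JsonParser.new(nil, nil, '{"answer": 42}', "200")
  #   Grubby::JsonScraper.new(parser).json  # == { "answer" => 42 }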
4 |   #
5 |   # @return [Hash, Array]
6 |   attr_reader :json
7 | 
8 |   # @param source [Grubby::JsonParser]
9 |   # @raise [Grubby::Scraper::Error]
10 |   #   if any {Scraper.scrapes} blocks fail
11 |   def initialize(source)
12 |     @json = source.assert!(Grubby::JsonParser).json
13 |     super
14 |   end
15 | 
16 |   # Scrapes a locally-stored file. This method is intended for use with
17 |   # subclasses of +Grubby::JsonScraper+.
18 |   #
19 |   # @example
20 |   #   class MyScraper < Grubby::JsonScraper
21 |   #     # ...
22 |   #   end
23 |   #
24 |   #   MyScraper.scrape_file("path/to/local_file.json") # === MyScraper
25 |   #
26 |   # @param path [String]
27 |   # @param agent [Mechanize]
28 |   # @return [Grubby::JsonScraper]
29 |   # @raise [Grubby::Scraper::Error]
30 |   #   if any {Scraper.scrapes} blocks fail
31 |   def self.scrape_file(path, agent = $grubby)
32 |     self.new(Grubby::JsonParser.read_local(path).tap{|parser| parser.mech = agent })
33 |   end
34 | 
35 | end
36 | 

--------------------------------------------------------------------------------
/lib/grubby/log.rb:
--------------------------------------------------------------------------------
1 | $log ||= Logger.new($stderr).tap do |logger|
2 |   logger.formatter = ->(severity, datetime, progname, msg) do
3 |     "[#{datetime.to_ymd} #{datetime.to_hms}] #{severity} #{msg}\n"
4 |   end
5 | end
6 | 

--------------------------------------------------------------------------------
/lib/grubby/mechanize/download.rb:
--------------------------------------------------------------------------------
1 | # @!visibility private
2 | class Mechanize::Download
3 | 
4 |   def content_hash
5 |     @content_hash ||= Digest::SHA1.new.io(self.body_io).hexdigest
6 |   end
7 | 
8 | end
9 | 

--------------------------------------------------------------------------------
/lib/grubby/mechanize/fetch_with_retry.rb:
--------------------------------------------------------------------------------
1 | # This monkey patch attempts to fix the insidious "too many connection
2 | # resets" bug described here: https://github.com/sparklemotion/mechanize/issues/123
3 | #
4 | # The code is taken and modified from this helpful blog article:
5 | # http://scottwb.com/blog/2013/11/09/defeating-the-infamous-mechanize-too-many-connection-resets-bug/
6 | class Mechanize::HTTP::Agent
7 | 
8 |   MAX_CONNECTION_RESET_RETRIES = 9
9 |   IDEMPOTENT_HTTP_METHODS = [:get, :head, :options, :delete]
10 | 
11 |   # Replacement for +Mechanize::HTTP::Agent#fetch+. When a "too many
12 |   # connection resets" error is encountered, this method retries the
13 |   # request (up to {MAX_CONNECTION_RESET_RETRIES} times).
14 |   def fetch_with_retry(uri, http_method = :get, headers = {}, params = [], referer = current_page, redirects = 0)
15 |     retry_count = 0
16 |     begin
17 |       fetch_without_retry(uri, http_method, headers, params, referer, redirects)
18 |     rescue Net::HTTP::Persistent::Error => e
19 |       # raise if different type of error
20 |       raise unless e.message.include?("too many connection resets")
21 |       # raise if non-idempotent http method
22 |       raise unless IDEMPOTENT_HTTP_METHODS.include?(http_method)
23 |       # raise if we've tried too many times
24 |       raise if retry_count >= MAX_CONNECTION_RESET_RETRIES
25 | 
26 |       # otherwise, shut down the persistent HTTP connection and try again
27 |       retry_count += 1
28 |       $log.warn("#{e.message} (#{e.class}). 
Retry in #{retry_count} seconds.") 29 | sleep(retry_count) # incremental backoff to allow server to self-correct 30 | $log.warn("Retry #{http_method.to_s.upcase} #{uri}") 31 | retry 32 | end 33 | end 34 | 35 | alias_method :fetch_without_retry, :fetch 36 | alias_method :fetch, :fetch_with_retry 37 | 38 | end 39 | -------------------------------------------------------------------------------- /lib/grubby/mechanize/file.rb: -------------------------------------------------------------------------------- 1 | # @!visibility private 2 | class Mechanize::File 3 | 4 | def self.read_local(path) 5 | uri_path = File.expand_path(path).gsub(%r"[^/\\]+"){|component| CGI.escape(component) } 6 | self.new(URI::File.build(path: uri_path), nil, File.read(path), "200") 7 | end 8 | 9 | def content_hash 10 | @content_hash ||= self.body.to_s.sha1 11 | end 12 | 13 | end 14 | -------------------------------------------------------------------------------- /lib/grubby/mechanize/link.rb: -------------------------------------------------------------------------------- 1 | class Mechanize::Page::Link 2 | 3 | # Returns the URI represented by the Link, in absolute form. If the 4 | # href attribute of the Link is expressed in relative form, the URI is 5 | # converted to absolute form using the Link's +page.uri+. Raises an 6 | # exception if the URI cannot be converted to absolute form. 7 | # 8 | # @return [URI] 9 | # @raise [RuntimeError] 10 | # if the URI cannot be converted to absolute form 11 | def to_absolute_uri 12 | # Via the W3 spec[1]: "If the a element has no href attribute, then 13 | # the element represents a placeholder for where a link might 14 | # otherwise have been placed, if it had been relevant, consisting of 15 | # just the element's contents." So, we assume a link with no href 16 | # attribute (i.e. `uri == nil`) should be treated the same as an 17 | # intra-page link. 18 | # 19 | # [1]: https://www.w3.org/TR/2016/REC-html51-20161101/textlevel-semantics.html#the-a-element 20 | URI.join(self.page.uri, self.uri || "#").to_absolute_uri 21 | end 22 | 23 | end 24 | -------------------------------------------------------------------------------- /lib/grubby/mechanize/page.rb: -------------------------------------------------------------------------------- 1 | class Mechanize::Page 2 | 3 | # @!method search!(*queries) 4 | # See ryoba's {https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:search%21 5 | # +Nokogiri::XML::Searchable#search!+}. 6 | # 7 | # @param queries [Array] 8 | # @return [Nokogiri::XML::NodeSet] 9 | # @raise [Ryoba::Error] 10 | # if all queries yield no results 11 | def_delegators :parser, :search! 12 | 13 | # @!method at!(*queries) 14 | # See ryoba's {https://www.rubydoc.info/gems/ryoba/Nokogiri/XML/Searchable:at%21 15 | # +Nokogiri::XML::Searchable#at!+}. 16 | # 17 | # @param queries [Array] 18 | # @return [Nokogiri::XML::Element] 19 | # @raise [Ryoba::Error] 20 | # if all queries yield no results 21 | def_delegators :parser, :at! 22 | 23 | end 24 | -------------------------------------------------------------------------------- /lib/grubby/mechanize/parser.rb: -------------------------------------------------------------------------------- 1 | require "fileutils" 2 | 3 | module Mechanize::Parser 4 | 5 | # Saves the payload to a specified directory, using the default 6 | # filename suggested by the server. If a file with that name already 7 | # exists, this method will try to find a free filename by appending 8 | # numbers to the default filename. 
Returns the full path of the saved 9 | # file. 10 | # 11 | # @note This method expects a +#save!+ method to be defined by the 12 | # class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+ 13 | # and +Mechanize::Download#save!+. 14 | # 15 | # @param directory [String] 16 | # @return [String] 17 | def save_to(directory) 18 | raise "#{self.class}#save! is not defined" unless self.respond_to?(:save!) 19 | 20 | FileUtils.mkdir_p(directory) 21 | path = find_free_name(File.join(directory, @filename)) 22 | save!(path) 23 | path 24 | end 25 | 26 | # Saves the payload to a specified directory, using the default 27 | # filename suggested by the server. If a file with that name already 28 | # exists, that file will be overwritten. Returns the full path of the 29 | # saved file. 30 | # 31 | # @note This method expects a +#save!+ method to be defined by the 32 | # class extending +Mechanize::Parser+, e.g. +Mechanize::File#save!+ 33 | # and +Mechanize::Download#save!+. 34 | # 35 | # @param directory [String] 36 | # @return [String] 37 | def save_to!(directory) 38 | raise "#{self.class}#save! is not defined" unless self.respond_to?(:save!) 39 | 40 | FileUtils.mkdir_p(directory) 41 | path = File.join(directory, @filename) 42 | save!(path) 43 | path 44 | end 45 | 46 | end 47 | -------------------------------------------------------------------------------- /lib/grubby/page_scraper.rb: -------------------------------------------------------------------------------- 1 | class Grubby::PageScraper < Grubby::Scraper 2 | 3 | # The Page being scraped. 4 | # 5 | # @return [Mechanize::Page] 6 | attr_reader :page 7 | 8 | # @param source [Mechanize::Page] 9 | # @raise [Grubby::Scraper::Error] 10 | # if any {Scraper.scrapes} blocks fail 11 | def initialize(source) 12 | @page = source.assert!(Mechanize::Page) 13 | super 14 | end 15 | 16 | # Scrapes a locally-stored file. This method is intended for use with 17 | # subclasses of +Grubby::PageScraper+. 18 | # 19 | # @example 20 | # class MyScraper < Grubby::PageScraper 21 | # # ... 22 | # end 23 | # 24 | # MyScraper.scrape_file("path/to/local_file.html") # === MyScraper 25 | # 26 | # @param path [String] 27 | # @param agent [Mechanize] 28 | # @return [Grubby::PageScraper] 29 | # @raise [Grubby::Scraper::Error] 30 | # if any {Scraper.scrapes} blocks fail 31 | def self.scrape_file(path, agent = $grubby) 32 | self.new(Mechanize::Page.read_local(path).tap{|page| page.mech = agent }) 33 | end 34 | 35 | end 36 | -------------------------------------------------------------------------------- /lib/grubby/scraper.rb: -------------------------------------------------------------------------------- 1 | class Grubby::Scraper 2 | 3 | # Defines an attribute reader method named by +field+. During 4 | # {initialize}, the given block is called, and the attribute is set to 5 | # the block's return value. 6 | # 7 | # By default, raises an exception if the block's return value is nil. 8 | # To prevent this behavior, set the +:optional+ option to true. 9 | # Alternatively, the block can be conditionally evaluated, based on 10 | # another method's return value, using the +:if+ or +:unless+ options. 
11 | # 12 | # @example Default behavior 13 | # class GreetingScraper < Grubby::Scraper 14 | # scrapes(:name) do 15 | # source[/Hello (\w+)/, 1] 16 | # end 17 | # end 18 | # 19 | # scraper = GreetingScraper.new("Hello World!") 20 | # scraper.name # == "World" 21 | # 22 | # scraper = GreetingScraper.new("Hello!") # raises Grubby::Scraper::Error 23 | # 24 | # @example Optional scraped value 25 | # class GreetingScraper < Grubby::Scraper 26 | # scrapes(:name, optional: true) do 27 | # source[/Hello (\w+)/, 1] 28 | # end 29 | # end 30 | # 31 | # scraper = GreetingScraper.new("Hello World!") 32 | # scraper.name # == "World" 33 | # 34 | # scraper = GreetingScraper.new("Hello!") 35 | # scraper.name # == nil 36 | # 37 | # @example Conditional scraped value 38 | # class GreetingScraper < Grubby::Scraper 39 | # def hello? 40 | # source.start_with?("Hello ") 41 | # end 42 | # 43 | # scrapes(:name, if: :hello?) do 44 | # source[/Hello (\w+)/, 1] 45 | # end 46 | # end 47 | # 48 | # scraper = GreetingScraper.new("Hello World!") 49 | # scraper.name # == "World" 50 | # 51 | # scraper = GreetingScraper.new("Hello!") # raises Grubby::Scraper::Error 52 | # 53 | # scraper = GreetingScraper.new("How are you?") 54 | # scraper.name # == nil 55 | # 56 | # @param field [Symbol, String] 57 | # @param options [Hash] 58 | # @option options :optional [Boolean] (false) 59 | # Whether the block should be allowed to return a nil value 60 | # @option options :if [Symbol] (nil) 61 | # Name of predicate method that determines if the block should be 62 | # evaluated 63 | # @option options :unless [Symbol] (nil) 64 | # Name of predicate method that determines if the block should not 65 | # be evaluated 66 | # @yieldreturn [Object] 67 | # @return [void] 68 | def self.scrapes(field, **options, &block) 69 | field = field.to_sym 70 | (self.fields << field).uniq! 71 | 72 | define_method(field) do 73 | raise "#{self.class}#initialize does not invoke `super`" unless defined?(@scraped) 74 | 75 | if !@scraped.key?(field) && !@errors.key?(field) 76 | begin 77 | skip = (options[:if] && !self.send(options[:if])) || 78 | (options[:unless] && self.send(options[:unless])) 79 | 80 | if skip 81 | @scraped[field] = nil 82 | else 83 | @scraped[field] = instance_eval(&block) 84 | if @scraped[field].nil? 85 | raise FieldValueRequiredError.new(field) unless options[:optional] 86 | $log.debug("#{self.class}##{field} is nil") 87 | end 88 | end 89 | rescue RuntimeError, IndexError => e 90 | @errors[field] = e 91 | end 92 | end 93 | 94 | if @errors.key?(field) 95 | raise FieldScrapeFailedError.new(field, @errors[field]) 96 | else 97 | @scraped[field] 98 | end 99 | end 100 | end 101 | 102 | # Fields defined via {scrapes}. 103 | # 104 | # @return [Array] 105 | def self.fields 106 | @fields ||= self == Grubby::Scraper ? [] : self.superclass.fields.dup 107 | end 108 | 109 | # Instantiates the Scraper class with the resource indicated by +url+. 110 | # This method acts as a default factory method, and provides a 111 | # standard interface for overrides. 112 | # 113 | # @example Default factory method 114 | # class PostPageScraper < Grubby::PageScraper 115 | # # ... 116 | # end 117 | # 118 | # PostPageScraper.scrape("https://example.com/posts/42") 119 | # # == PostPageScraper.new($grubby.get("https://example.com/posts/42")) 120 | # 121 | # @example Override factory method 122 | # class PostApiScraper < Grubby::JsonScraper 123 | # # ... 
124 | # 125 | # def self.scrape(url, agent = $grubby) 126 | # api_url = url.to_s.sub(%r"//example.com/(.+)", '//api.example.com/\1.json') 127 | # super(api_url, agent) 128 | # end 129 | # end 130 | # 131 | # PostApiScraper.scrape("https://example.com/posts/42") 132 | # # == PostApiScraper.new($grubby.get("https://api.example.com/posts/42.json")) 133 | # 134 | # @param url [String, URI] 135 | # @param agent [Mechanize] 136 | # @return [Grubby::Scraper] 137 | # @raise [Grubby::Scraper::Error] 138 | # if any {Scraper.scrapes} blocks fail 139 | def self.scrape(url, agent = $grubby) 140 | self.new(agent.get(url)) 141 | end 142 | 143 | # Iterates a series of pages, starting at +start+. The Scraper class 144 | # is instantiated with each page, and each Scraper instance is passed 145 | # to the given block. Subsequent pages in the series are determined 146 | # by invoking the +next_method+ method on each Scraper instance. 147 | # 148 | # Iteration stops when the +next_method+ method returns falsy. If the 149 | # +next_method+ method returns a String or URI, that value will be 150 | # treated as the URL of the next page. Otherwise that value will be 151 | # treated as the page itself. 152 | # 153 | # @example Iterate from page object 154 | # class PostsIndexScraper < Grubby::PageScraper 155 | # def next 156 | # page.link_with(text: "Next >")&.click 157 | # end 158 | # end 159 | # 160 | # PostsIndexScraper.each("https://example.com/posts?page=1") do |scraper| 161 | # scraper.page.uri.query # == "page=1", "page=2", "page=3", ... 162 | # end 163 | # 164 | # @example Iterate from URI 165 | # class PostsIndexScraper < Grubby::PageScraper 166 | # def next 167 | # page.link_with(text: "Next >")&.to_absolute_uri 168 | # end 169 | # end 170 | # 171 | # PostsIndexScraper.each("https://example.com/posts?page=1") do |scraper| 172 | # scraper.page.uri.query # == "page=1", "page=2", "page=3", ... 173 | # end 174 | # 175 | # @example Specifying the iteration method 176 | # class PostsIndexScraper < Grubby::PageScraper 177 | # scrapes(:next_uri, optional: true) do 178 | # page.link_with(text: "Next >")&.to_absolute_uri 179 | # end 180 | # end 181 | # 182 | # PostsIndexScraper.each("https://example.com/posts?page=1", next_method: :next_uri) do |scraper| 183 | # scraper.page.uri.query # == "page=1", "page=2", "page=3", ... 184 | # end 185 | # 186 | # @param start [String, URI, Mechanize::Page, Mechanize::File] 187 | # @param agent [Mechanize] 188 | # @param next_method [Symbol] 189 | # @yieldparam scraper [Grubby::Scraper] 190 | # @return [void] 191 | # @raise [NoMethodError] 192 | # if the Scraper class does not define the method indicated by 193 | # +next_method+ 194 | # @raise [Grubby::Scraper::Error] 195 | # if any {Scraper.scrapes} blocks fail 196 | def self.each(start, agent = $grubby, next_method: :next) 197 | unless self.method_defined?(next_method) 198 | raise NoMethodError.new(nil, next_method), "#{self} does not define `#{next_method}`" 199 | end 200 | 201 | return to_enum(:each, start, agent, next_method: next_method) unless block_given? 202 | 203 | current = start 204 | while current 205 | current = agent.get(current) if current.is_a?(String) || current.is_a?(URI) 206 | scraper = self.new(current) 207 | yield scraper 208 | current = scraper.send(next_method) 209 | end 210 | end 211 | 212 | # The object being scraped. Typically an instance of a Mechanize 213 | # pluggable parser such as +Mechanize::Page+. 
214 | # 215 | # @return [Object] 216 | attr_reader :source 217 | 218 | # Collected errors raised during {initialize} by {Scraper.scrapes} 219 | # blocks, indexed by field name. This Hash will be empty if 220 | # {initialize} did not raise a +Grubby::Scraper::Error+. 221 | # 222 | # @return [Hash{Symbol => StandardError}] 223 | attr_reader :errors 224 | 225 | # @param source 226 | # @raise [Grubby::Scraper::Error] 227 | # if any {Scraper.scrapes} blocks fail 228 | def initialize(source) 229 | @source = source 230 | @scraped = {} 231 | @errors = {} 232 | 233 | self.class.fields.each do |field| 234 | begin 235 | self.send(field) 236 | rescue FieldScrapeFailedError 237 | end 238 | end 239 | 240 | raise Error.new(self) unless @errors.empty? 241 | end 242 | 243 | # Returns the scraped value named by +field+. 244 | # 245 | # @param field [Symbol, String] 246 | # @return [Object] 247 | # @raise [RuntimeError] 248 | # if +field+ is not a valid name 249 | def [](field) 250 | @scraped.fetch(field.to_sym) 251 | end 252 | 253 | # Returns all scraped values as a Hash. 254 | # 255 | # @return [Hash{Symbol => Object}] 256 | def to_h 257 | @scraped.dup 258 | end 259 | 260 | class Error < RuntimeError 261 | # @!visibility private 262 | BACKTRACE_CLEANER = ActiveSupport::BacktraceCleaner.new.tap do |cleaner| 263 | cleaner.add_silencer do |line| 264 | line.include?(__dir__) && line.include?("scraper.rb:") 265 | end 266 | end 267 | 268 | # The Scraper that raised this Error. 269 | # 270 | # @return [Grubby::Scraper] 271 | attr_accessor :scraper 272 | 273 | # @!visibility private 274 | def initialize(scraper) 275 | self.scraper = scraper 276 | 277 | listing = scraper.errors. 278 | reject{|field, error| error.is_a?(FieldScrapeFailedError) }. 279 | map do |field, error| 280 | "* `#{field}` (#{error.class})\n" + 281 | error.message.indent(2) + "\n\n" + 282 | BACKTRACE_CLEANER.clean(error.backtrace).join("\n").indent(4) + "\n" 283 | end. 
284 | join("\n") 285 | 286 | super("Failed to scrape the following fields:\n#{listing}") 287 | end 288 | end 289 | 290 | # @!visibility private 291 | class FieldScrapeFailedError < RuntimeError 292 | def initialize(field, field_error) 293 | super("`#{field}` raised #{field_error.class}") 294 | end 295 | end 296 | 297 | # @!visibility private 298 | class FieldValueRequiredError < RuntimeError 299 | def initialize(field) 300 | super("`#{field}` is nil but is not marked as optional") 301 | end 302 | end 303 | 304 | end 305 | -------------------------------------------------------------------------------- /lib/grubby/version.rb: -------------------------------------------------------------------------------- 1 | GRUBBY_VERSION = "2.0.0" 2 | -------------------------------------------------------------------------------- /test/core_ext/string_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class StringTest < Minitest::Test 4 | 5 | def test_to_absolute_uri_with_absolute_uri 6 | string = "http://localhost" 7 | uri = string.to_absolute_uri 8 | assert_kind_of URI, uri 9 | assert_equal string, uri.to_s 10 | end 11 | 12 | def test_to_absolute_uri_with_relative_uri 13 | assert_raises { "/index.html".to_absolute_uri } 14 | end 15 | 16 | end 17 | -------------------------------------------------------------------------------- /test/core_ext/uri_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class URITest < Minitest::Test 4 | 5 | def test_basename 6 | ["", "foo", "foo/bar"].each do |path| 7 | basename = File.basename(path) 8 | assert_equal basename, URI.join("http://localhost", path).basename 9 | assert_equal basename, URI.join("http://localhost", path + "/").basename 10 | end 11 | end 12 | 13 | def test_query_param 14 | keys = ["", "[]", "[][x]", "[][y]", "[x][]", "[y][]"].map{|brack| "foo#{brack}" } 15 | values = ["a", "b", "c"] 16 | query = keys.product(values).map{|key, value| "#{key}=#{value}" }.join("&") 17 | uri = URI("http://localhost/?#{query}") 18 | 19 | keys.each do |key| 20 | expected = key.include?("[]") ? 
values : values.last 21 | assert_equal expected, uri.query_param(key) 22 | end 23 | 24 | assert_nil uri.query_param("nope") 25 | end 26 | 27 | def test_query_param_with_no_query 28 | assert_nil URI("http://localhost/").query_param("nope") 29 | assert_nil URI("http://localhost/?").query_param("nope") 30 | end 31 | 32 | def test_to_absolute_uri_with_absolute_uri 33 | uri = URI("http://localhost") 34 | assert_same uri, uri.to_absolute_uri 35 | end 36 | 37 | def test_to_absolute_uri_with_relative_uri 38 | uri = URI("/index.html") 39 | assert_raises { uri.to_absolute_uri } 40 | end 41 | 42 | end 43 | -------------------------------------------------------------------------------- /test/grubby_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class GrubbyTest < Mechanize::TestCase 4 | 5 | def test_that_it_has_a_version_number 6 | refute_nil Grubby::VERSION 7 | end 8 | 9 | def test_default_constructor 10 | assert_kind_of Mechanize, Grubby.new 11 | end 12 | 13 | def test_global_default_instance 14 | assert_instance_of Grubby, $grubby 15 | end 16 | 17 | def test_time_between_requests_with_number 18 | $sleep_last_amount = 0.0 19 | amount = 5.0 20 | 21 | grubby = Grubby.new 22 | grubby.time_between_requests = amount 23 | grubby.get("http://localhost") 24 | assert_equal 0.0, $sleep_last_amount 25 | grubby.get("http://localhost") 26 | assert_includes ((amount - 0.1)..amount), $sleep_last_amount 27 | end 28 | 29 | def test_time_between_requests_with_range 30 | $sleep_last_amount = 0.0 31 | min_amount = 5.0 32 | max_amount = 10.0 33 | 34 | grubby = Grubby.new 35 | grubby.time_between_requests = min_amount..max_amount 36 | grubby.get("http://localhost") 37 | assert_equal 0.0, $sleep_last_amount 38 | grubby.get("http://localhost") 39 | assert_includes ((min_amount - 0.1)..max_amount), $sleep_last_amount 40 | end 41 | 42 | def test_time_between_requests_begins_after_request_finishes 43 | grubby = Grubby.new 44 | grubby.time_between_requests = 1.0 45 | # use content-encoding hook so that a time recorded by a pre-connect 46 | # hook will be disregarded, while a time recorded by a post-connect 47 | # hook will not 48 | grubby.content_encoding_hooks << Proc.new{ grubby.send(:mark_last_request_time, nil) } 49 | 50 | grubby.get("http://localhost") 51 | $sleep_last_amount = 0.0 52 | grubby.get("http://localhost") 53 | assert_operator $sleep_last_amount, :>, 0.0 54 | end 55 | 56 | def test_sleep_between_requests_after_redirect 57 | $sleep_calls = 0 58 | redirect_url = "http://localhost/redirect" 59 | grubby = Grubby.new 60 | 61 | actual_url = grubby.get(redirect_url).uri.to_s 62 | refute_equal redirect_url, actual_url # sanity check 63 | grubby.get(redirect_url) 64 | assert_equal 1, $sleep_calls 65 | end 66 | 67 | def test_ok_predicate_with_success_code 68 | assert Grubby.new.ok?(make_uris(1).first) 69 | end 70 | 71 | def test_ok_predicate_with_error_code 72 | refute Grubby.new.ok?(make_uris(1, "500").first) 73 | end 74 | 75 | def test_get_mirrored_with_first_successful 76 | uris = make_uris(2) 77 | 78 | assert_equal uris.first, get_mirrored_resultant_uri(uris) 79 | end 80 | 81 | def test_get_mirrored_with_last_successful 82 | uris = make_uris(2, "404") + make_uris(1) 83 | 84 | assert_equal uris.last, get_mirrored_resultant_uri(uris) 85 | end 86 | 87 | def test_get_mirrored_with_none_successful 88 | uris = make_uris(2, "404") 89 | 90 | assert_raises(Mechanize::ResponseCodeError) do 91 | get_mirrored_resultant_uri(uris) 92 | end 93 | end 94 | 
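  # Illustrative sketch (not from the original suite) of the only-once
  # contract exercised by the fulfill tests below, where `process` is a
  # hypothetical handler:
  #
  #   grubby.fulfill(uri){|page| process(page) } # block runs; returns its result
  #   grubby.fulfill(uri){|page| process(page) } # already seen; returns nil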
95 | def test_fulfill_returns_block_result 96 | expected = "foo" 97 | actual = silence_logging do 98 | Grubby.new.fulfill(make_uris(1).first){ expected } 99 | end 100 | 101 | assert_equal expected, actual 102 | end 103 | 104 | def test_fulfill_with_different_pages 105 | uris = make_uris(2) 106 | uris.last.path = "/form_test.html" 107 | 108 | assert_equal uris, fulfill_resultant_uris(uris) 109 | end 110 | 111 | def test_fulfill_with_same_url 112 | uris = make_uris(1) * 2 113 | 114 | assert_equal uris.uniq, fulfill_resultant_uris(uris) 115 | end 116 | 117 | def test_fulfill_with_same_page_content 118 | uris = make_uris(2) 119 | 120 | assert_equal uris.take(1), fulfill_resultant_uris(uris) 121 | end 122 | 123 | def test_fulfill_with_different_purposes 124 | purposes = 2.times.map{|i| "purpose #{i}" } 125 | uris = make_uris(1) * purposes.length 126 | 127 | assert_equal uris, fulfill_resultant_uris(uris.zip(purposes)) 128 | end 129 | 130 | def test_journal_fulfill 131 | uris = make_uris(2) 132 | 133 | in_tmpdir do 134 | refute_empty fulfill_resultant_uris(uris, Grubby.new("journal.txt")) 135 | assert_empty fulfill_resultant_uris(uris, Grubby.new("journal.txt")) 136 | end 137 | end 138 | 139 | def test_journal_fulfill_with_different_pages 140 | uris = make_uris(2) 141 | uris.last.path = "/form_test.html" 142 | 143 | in_tmpdir do 144 | refute_empty fulfill_resultant_uris(uris.take(1), Grubby.new("journal.txt")) 145 | refute_empty fulfill_resultant_uris(uris.drop(1), Grubby.new("journal.txt")) 146 | end 147 | end 148 | 149 | def test_journal_fulfill_with_different_purposes 150 | purposes = 2.times.map{|i| "purpose #{i}" } 151 | uris = make_uris(1) * purposes.length 152 | requests = uris.zip(purposes) 153 | 154 | in_tmpdir do 155 | refute_empty fulfill_resultant_uris(requests, Grubby.new("journal.txt")) 156 | assert_empty fulfill_resultant_uris(requests, Grubby.new("journal.txt")) 157 | end 158 | end 159 | 160 | def test_journal_initializer 161 | in_tmpdir do 162 | assert_equal Pathname.new("expected"), Grubby.new("expected").journal 163 | end 164 | end 165 | 166 | def test_journal_attr 167 | uris = make_uris(2) 168 | journal_a = Pathname.new("a") 169 | journal_b = Pathname.new("b") 170 | 171 | in_tmpdir do 172 | grubby = Grubby.new 173 | 174 | grubby.journal = journal_a.to_s 175 | assert_equal journal_a, grubby.journal 176 | refute_empty fulfill_resultant_uris(uris, grubby) 177 | 178 | grubby.journal = journal_b 179 | assert_equal journal_b, grubby.journal 180 | refute_empty fulfill_resultant_uris(uris, grubby) 181 | 182 | grubby.journal = journal_a 183 | assert_empty fulfill_resultant_uris(uris, grubby) 184 | 185 | grubby.journal = nil 186 | assert_nil grubby.journal 187 | refute_empty fulfill_resultant_uris(uris, grubby) 188 | end 189 | end 190 | 191 | def test_json_pluggable_parser 192 | grubby = Grubby.new 193 | parser = grubby.get("http://localhost/response_code?code=200&ct=application/json") 194 | 195 | assert_instance_of Grubby::JsonParser, parser 196 | assert_same grubby, parser.mech 197 | end 198 | 199 | 200 | private 201 | 202 | def make_uris(count, response_code = "200") 203 | count.times.map do |i| 204 | URI("http://localhost/response_code?code=#{response_code}&i=#{i}") 205 | end 206 | end 207 | 208 | def get_mirrored_resultant_uri(uris) 209 | silence_logging do 210 | Grubby.new.get_mirrored(uris).uri 211 | end 212 | end 213 | 214 | def fulfill_resultant_uris(requests, grubby = Grubby.new) 215 | silence_logging do 216 | requests.map do |args| 217 | grubby.fulfill(*args){|page| page.uri 
} 218 | end.compact 219 | end 220 | end 221 | 222 | end 223 | -------------------------------------------------------------------------------- /test/json_parser_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class GrubbyJsonParserTest < Minitest::Test 4 | 5 | def test_initialize 6 | uri = URI("http://localhost") 7 | data = [{ "key1" => "val1" }, { "key2" => "val2" }] 8 | body = data.to_json 9 | code = "200" 10 | mech = Grubby.new 11 | parser = Grubby::JsonParser.new(uri, nil, body, code, mech) 12 | 13 | assert_equal uri, parser.uri 14 | assert_equal body, parser.body 15 | assert_equal data, parser.json 16 | assert_equal code, parser.code 17 | assert_same mech, parser.mech 18 | end 19 | 20 | def test_initialize_with_blanks 21 | [[], [nil] * 5, [nil, nil, "", nil, nil]].each do |args| 22 | parser = Grubby::JsonParser.new(*args) # does not raise 23 | assert_nil parser.json 24 | end 25 | end 26 | 27 | def test_json_parsing_is_safe 28 | require "json/add/complex" 29 | body = JSON.dump(Complex(0, 1)) 30 | assert_instance_of Complex, JSON.load(body) # sanity check 31 | 32 | refute_instance_of Complex, Grubby::JsonParser.new(nil, nil, body).json 33 | end 34 | 35 | end 36 | -------------------------------------------------------------------------------- /test/json_scraper_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class GrubbyJsonScraperTest < Minitest::Test 4 | 5 | def test_initialize_with_valid_parser 6 | parser = Grubby::JsonParser.new(nil, nil, "[1, 2, 3]", nil) 7 | scraper = Grubby::JsonScraper.new(parser) 8 | assert_same parser.json, scraper.json 9 | end 10 | 11 | def test_initialize_with_invalid_parser 12 | page = Mechanize::Page.new 13 | assert_raises { Grubby::JsonScraper.new(page) } 14 | end 15 | 16 | def test_scrape_file 17 | Dir.mktmpdir do |dir| 18 | path = File.join(dir, "some file.json") 19 | hi = "Hello" 20 | File.write(path, "{ \"hi\": \"#{hi}\" }") 21 | scraper = MyScraper.scrape_file(path) 22 | 23 | assert_instance_of MyScraper, scraper 24 | assert_equal hi, scraper.hi 25 | assert_same $grubby, scraper.source.mech 26 | end 27 | end 28 | 29 | def test_scrape_file_with_agent 30 | Dir.mktmpdir do |dir| 31 | path = File.join(dir, "file.json") 32 | File.write(path, "{ \"hi\": \"...\" }") 33 | grubby = Grubby.new 34 | scraper = MyScraper.scrape_file(path, grubby) 35 | 36 | assert_same grubby, scraper.source.mech 37 | end 38 | end 39 | 40 | private 41 | 42 | class MyScraper < Grubby::JsonScraper 43 | scrapes(:hi){ json.fetch("hi") } 44 | end 45 | 46 | end 47 | -------------------------------------------------------------------------------- /test/log_test.rb: -------------------------------------------------------------------------------- 1 | require 'test_helper' 2 | 3 | class LogTest < Minitest::Test 4 | 5 | def test_log_global_exists 6 | assert_kind_of Logger, $log 7 | end 8 | 9 | def test_log_global_logs 10 | out, err = capture_subprocess_io do 11 | $log.error('testing123') 12 | end 13 | 14 | assert_match 'testing123', (out + err) 15 | end 16 | 17 | end 18 | -------------------------------------------------------------------------------- /test/mechanize/download_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class MechanizeDownloadTest < Mechanize::TestCase 4 | 5 | def test_content_hash 6 | content = "abcdef" 7 | assert_equal content.sha1, 
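# content_hash should be the SHA-1 hex digest of the download body
# (String#sha1 is a monkey-patch supplied by one of grubby's utility dependencies)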
make_mechanize_download(content).content_hash 8 | end 9 | 10 | 11 | private 12 | 13 | def make_mechanize_download(content) 14 | Mechanize::Download.new(nil, nil, StringIO.new(content), nil) 15 | end 16 | 17 | end 18 | -------------------------------------------------------------------------------- /test/mechanize/fetch_with_retry_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class FetchWithRetryTest < Mechanize::TestCase 4 | 5 | class ::Mechanize::HTTP::Agent 6 | def stubbed_fetch_without_retry(*args) 7 | $stubbed_fetch_error_queue ||= [] 8 | error = $stubbed_fetch_error_queue.shift 9 | raise error if error 10 | real_fetch_without_retry(*args) 11 | end 12 | 13 | alias_method :real_fetch_without_retry, :fetch_without_retry 14 | alias_method :fetch_without_retry, :stubbed_fetch_without_retry 15 | end 16 | 17 | 18 | def test_fetch_works_normally 19 | $stubbed_fetch_error_queue = [] 20 | assert_instance_of Mechanize::Page, do_fetch 21 | end 22 | 23 | def test_fetch_retries_upto_max 24 | max_retries = ::Mechanize::HTTP::Agent::MAX_CONNECTION_RESET_RETRIES 25 | $stubbed_fetch_error_queue = max_retries.times.map do 26 | Net::HTTP::Persistent::Error.new("too many connection resets") 27 | end 28 | 29 | out, err = capture_subprocess_io do 30 | assert_instance_of Mechanize::Page, do_fetch 31 | end 32 | assert_equal max_retries, (out + err).scan(/retry in \d+ seconds?/i).length 33 | end 34 | 35 | def test_fetch_fails_after_max_retries 36 | max_retries = ::Mechanize::HTTP::Agent::MAX_CONNECTION_RESET_RETRIES 37 | $stubbed_fetch_error_queue = (max_retries + 1).times.map do 38 | Net::HTTP::Persistent::Error.new("too many connection resets") 39 | end 40 | 41 | out, err = capture_subprocess_io do 42 | assert_raises(Net::HTTP::Persistent::Error) { do_fetch } 43 | end 44 | assert_equal max_retries, (out + err).scan(/retry in \d+ seconds?/i).length 45 | end 46 | 47 | def test_fetch_reraises_other_errors 48 | expected_error = RuntimeError.new("something else went wrong") 49 | $stubbed_fetch_error_queue = [expected_error] 50 | 51 | actual_error = assert_raises { do_fetch } 52 | assert_equal expected_error, actual_error 53 | end 54 | 55 | 56 | private 57 | 58 | def do_fetch 59 | Mechanize.new.get("http://localhost") 60 | end 61 | 62 | end 63 | -------------------------------------------------------------------------------- /test/mechanize/file_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class MechanizeFileTest < Mechanize::TestCase 4 | 5 | def test_read_local 6 | Dir.mktmpdir do |dir| 7 | path = File.join(dir, "`this` & {that}.txt") 8 | content = "stuff\nmorestuff\n" 9 | File.write(path, content) 10 | mech_file = Mechanize::File.read_local(path) 11 | 12 | assert_instance_of Mechanize::File, mech_file 13 | assert_equal "file", mech_file.uri.scheme 14 | assert_equal path, CGI.unescape(mech_file.uri.path) 15 | assert_equal content, mech_file.content 16 | assert_equal "200", mech_file.code 17 | end 18 | end 19 | 20 | def test_content_hash 21 | content = "abcdef" 22 | mech_file = Mechanize::File.new(nil, nil, content, nil) 23 | assert_equal content.sha1, mech_file.content_hash 24 | end 25 | 26 | end 27 | -------------------------------------------------------------------------------- /test/mechanize/link_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class MechanizeLinkTest < 
Mechanize::TestCase 4 | 5 | def test_to_absolute_uri_with_absolute_href 6 | href = "http://localhost" 7 | uri = make_link(href).to_absolute_uri 8 | assert_kind_of URI, uri 9 | assert_equal href, uri.to_s 10 | end 11 | 12 | def test_to_absolute_uri_with_relative_href 13 | href = "/index.html" 14 | uri = make_link(href).to_absolute_uri 15 | assert_kind_of URI, uri 16 | assert uri.absolute? 17 | assert uri.to_s.end_with?(href) 18 | end 19 | 20 | def test_to_absolute_uri_with_nil_href 21 | uri = make_link(nil).to_absolute_uri 22 | assert_kind_of URI, uri 23 | assert uri.absolute? 24 | end 25 | 26 | 27 | private 28 | 29 | def make_link(href) 30 | page = html_page(<<-HTML) 31 | <html> 32 | <body> 33 | <a href="#{href}">link</a> 34 | </body> 35 | </html> 36 | HTML 37 | 38 | page.links.first 39 | end 40 | 41 | end 42 | -------------------------------------------------------------------------------- /test/mechanize/page_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class MechanizePageTest < Mechanize::TestCase 4 | 5 | def test_searchbang_with_one_matching 6 | results = make_page.search!("#bad1", "#good2", "#bad2") 7 | assert_equal "good2", results.first.attr("id") 8 | end 9 | 10 | def test_atbang_with_multiple_matching 11 | result = make_page.at!("#good3", "#good2") 12 | assert_equal "good3", result.attr("id") 13 | end 14 | 15 | 16 | private 17 | 18 | def make_page 19 | html_page(<<-HTML) 20 | <html> 21 | <body> 22 | <div id="good1"></div> 23 | <div id="good2"></div> 24 | <div id="good3"></div>
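<!-- the "bad" selectors in the tests above intentionally match nothing; the empty divs give search! and at! their targets -->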
25 | </body> 26 | </html> 27 | HTML 28 | end 29 | 30 | end 31 | -------------------------------------------------------------------------------- /test/mechanize/parser_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class MechanizeParserTest < Mechanize::TestCase 4 | 5 | def test_save_to_sanity 6 | # sanity check that Mechanize includes the Mechanize::Parser module 7 | # and implements #save! in the relevant classes 8 | assert_includes Mechanize::File.included_modules, Mechanize::Parser 9 | assert_includes Mechanize::File.instance_methods, :save! 10 | assert_includes Mechanize::Download.included_modules, Mechanize::Parser 11 | assert_includes Mechanize::Download.instance_methods, :save! 12 | assert_includes Mechanize::Image.included_modules, Mechanize::Parser 13 | assert_includes Mechanize::Image.instance_methods, :save! 14 | end 15 | 16 | def test_save_to 17 | dir = "deeply/nested/dir/" 18 | html1 = "<html><body><p>Hello</p></body></html>" 19 | html2 = "<html><body><b>Hello</b></body></html>" 20 | 21 | in_tmpdir do 22 | path1 = html_page(html1).save_to(dir) 23 | assert_match %r"^#{dir}.+", path1 24 | assert_equal html1, File.read(path1) 25 | 26 | path2 = html_page(html2).save_to(dir) 27 | assert_match %r"^#{dir}.+", path2 28 | assert_equal html2, File.read(path2) 29 | refute_equal path1, path2 30 | assert_equal html1, File.read(path1) 31 | end 32 | end 33 | 34 | def test_save_to_bang 35 | dir = "deeply/nested/dir/" 36 | html1 = "<html><body><p>Hello</p></body></html>" 37 | html2 = "<html><body><b>Hello</b></body></html>" 38 | 39 | in_tmpdir do 40 | path1 = html_page(html1).save_to!(dir) 41 | assert_match %r"^#{dir}.+", path1 42 | assert_equal html1, File.read(path1) 43 | 44 | path2 = html_page(html2).save_to!(dir) 45 | assert_equal path1, path2 46 | assert_equal html2, File.read(path1) 47 | end 48 | end 49 | 50 | end 51 | -------------------------------------------------------------------------------- /test/page_scraper_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class GrubbyPageScraperTest < Minitest::Test 4 | 5 | def test_initialize_with_valid_parser 6 | page = Mechanize::Page.new 7 | scraper = Grubby::PageScraper.new(page) 8 | assert_same page, scraper.page 9 | end 10 | 11 | def test_initialize_with_invalid_parser 12 | download = Mechanize::Download.new 13 | assert_raises { Grubby::PageScraper.new(download) } 14 | end 15 | 16 | def test_scrape_file 17 | Dir.mktmpdir do |dir| 18 | path = File.join(dir, "some file.html") 19 | h1 = "Hello" 20 | File.write(path, "<html><body><h1>#{h1}</h1></body></html>") 21 | scraper = MyScraper.scrape_file(path) 22 | 23 | assert_instance_of MyScraper, scraper 24 | assert_equal h1, scraper.h1 25 | assert_same $grubby, scraper.page.mech 26 | end 27 | end 28 | 29 | def test_scrape_file_with_agent 30 | Dir.mktmpdir do |dir| 31 | path = File.join(dir, "file.html") 32 | File.write(path, "<html><body><h1>...</h1></body></html>
") 33 | grubby = Grubby.new 34 | scraper = MyScraper.scrape_file(path, grubby) 35 | 36 | assert_same grubby, scraper.page.mech 37 | end 38 | end 39 | 40 | private 41 | 42 | class MyScraper < Grubby::PageScraper 43 | scrapes(:h1){ page.at("h1").text } 44 | end 45 | 46 | end 47 | -------------------------------------------------------------------------------- /test/scraper_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class GrubbyScraperTest < Minitest::Test 4 | 5 | def test_scrapes_values 6 | scraper = make_scraper(CONTENT) 7 | 8 | EXPECTED.each do |field, expected| 9 | assert_equal [expected], [scraper.send(field)] 10 | end 11 | end 12 | 13 | def test_raises_on_nil_required_value 14 | assert_raises(Grubby::Scraper::Error) do 15 | make_scraper(CONTENT.merge(req: nil)) 16 | end 17 | end 18 | 19 | def test_allows_nil_optional_value 20 | scraper = make_scraper(CONTENT.merge(opt: nil)) 21 | 22 | assert_equal EXPECTED[:req_val], scraper.req_val # sanity check 23 | assert_nil scraper.opt_val 24 | end 25 | 26 | def test_obeys_conditional_modifiers 27 | scraper = make_scraper(CONTENT.merge(opt: nil)) 28 | 29 | assert_nil scraper.opt_word 30 | refute_nil scraper.opt_miss 31 | end 32 | 33 | def test_captures_all_errors 34 | error = assert_raises(Grubby::Scraper::Error){ make_scraper({}) } 35 | 36 | assert_instance_of MyScraper, error.scraper 37 | EXPECTED.compact.keys.each do |field| 38 | assert_kind_of StandardError, error.scraper.errors[field] 39 | end 40 | end 41 | 42 | def test_reports_only_original_errors 43 | error = assert_raises(Grubby::Scraper::Error){ make_scraper({}) } 44 | 45 | assert_match "req_val", error.message 46 | assert_match "opt_val", error.message 47 | refute_match "opt_word", error.message 48 | end 49 | 50 | def test_filters_error_backtrace 51 | error = assert_raises(Grubby::Scraper::Error){ make_scraper({}) } 52 | ruby_file = Grubby::Scraper.method(:scrapes).source_location[0] 53 | 54 | refute_match ruby_file, error.message 55 | end 56 | 57 | def test_fields_attr 58 | assert_equal EXPECTED.keys.sort, MyScraper.fields.sort 59 | end 60 | 61 | def test_fields_attr_includes_superclass_fields 62 | assert_equal INHERITING_EXPECTED.keys.sort, MyInheritingScraper.fields.sort 63 | end 64 | 65 | def test_source_attr 66 | scraper = make_scraper(CONTENT) 67 | 68 | assert_kind_of Mechanize::File, scraper.source 69 | assert_equal CONTENT, scraper.source.content 70 | end 71 | 72 | def test_lookup 73 | scraper = make_scraper(CONTENT) 74 | 75 | EXPECTED.each do |field, expected| 76 | assert_equal [expected], [scraper[field]] 77 | end 78 | end 79 | 80 | def test_to_h 81 | scraper = make_scraper(CONTENT) 82 | 83 | assert_equal EXPECTED, scraper.to_h 84 | end 85 | 86 | def test_to_h_includes_superclass_fields 87 | scraper = make_scraper(CONTENT, MyInheritingScraper) 88 | 89 | assert_equal INHERITING_EXPECTED, scraper.to_h 90 | end 91 | 92 | def test_initialize_missing_super_raises_friendly_error 93 | error = assert_raises{ IncorrectScraper.new.foo } 94 | 95 | assert_match "initialize", error.message 96 | end 97 | 98 | def test_factory_method 99 | scraper = UrlScraper.scrape(UrlScraper.url) 100 | 101 | assert_instance_of UrlScraper, scraper 102 | assert_equal UrlScraper.url, scraper.url 103 | assert_same $grubby, scraper.source.mech 104 | end 105 | 106 | def test_factory_method_with_agent 107 | agent = Mechanize.new 108 | scraper = UrlScraper.scrape(UrlScraper.url, agent) 109 | 110 | assert_same agent, 
scraper.source.mech 111 | end 112 | 113 | def test_each 114 | expected_urls = [3, 2, 1].map{|i| UrlScraper.url(i) } 115 | actual_urls = [] 116 | 117 | UrlScraper.each(expected_urls.first) do |scraper| 118 | assert_instance_of UrlScraper, scraper 119 | assert_same $grubby, scraper.source.mech 120 | actual_urls << scraper.url 121 | end 122 | assert_equal expected_urls, actual_urls 123 | end 124 | 125 | def test_each_without_block 126 | expected_urls = [3, 2, 1].map{|i| UrlScraper.url(i) } 127 | 128 | actual_enum = UrlScraper.each(expected_urls.first) 129 | assert_kind_of Enumerator, actual_enum 130 | 131 | actual_scrapers = actual_enum.to_a 132 | actual_scrapers.each{|scraper| assert_instance_of UrlScraper, scraper } 133 | assert_equal expected_urls, actual_scrapers.map(&:url) 134 | end 135 | 136 | def test_each_with_next_method 137 | expected_urls = [3, 2, 1].map{|i| UrlScraper.url(i) + "#next_uri" } 138 | actual_urls = UrlScraper.each(expected_urls.first, next_method: :next_uri).map(&:url) 139 | 140 | assert_equal expected_urls, actual_urls 141 | end 142 | 143 | def test_each_with_invalid_next_method 144 | error = assert_raises NoMethodError do 145 | UrlScraper.each(UrlScraper.url(2), next_method: :nope) do |scraper| 146 | assert false # should never get here 147 | end 148 | end 149 | assert_equal :nope, error.name 150 | 151 | error = assert_raises NoMethodError do 152 | UrlScraper.each(UrlScraper.url(2), next_method: :nope) 153 | end 154 | assert_equal :nope, error.name 155 | end 156 | 157 | def test_each_over_page_objects 158 | expected_urls = [3, 2, 1].map{|i| UrlScraper.url(i) + "#next_page" } 159 | start_page = Grubby.new.get(expected_urls.first) 160 | actual_urls = UrlScraper.each(start_page, next_method: :next_page).map(&:url) 161 | 162 | assert_equal expected_urls, actual_urls 163 | end 164 | 165 | def test_each_with_agent 166 | agent = Mechanize.new 167 | 168 | UrlScraper.each(UrlScraper.url(3), agent) do |scraper| 169 | assert_same agent, scraper.source.mech 170 | end 171 | end 172 | 173 | private 174 | 175 | CONTENT = { 176 | req: "required value", 177 | opt: "optional value", 178 | } 179 | 180 | EXPECTED = { 181 | req_val: "required value", 182 | opt_val: "optional value", 183 | opt_word: "optional", 184 | opt_miss: nil, 185 | } 186 | 187 | INHERITING_EXPECTED = EXPECTED.merge( 188 | opt_val: EXPECTED[:opt_val].swapcase, 189 | opt_word: EXPECTED[:opt_word].swapcase, 190 | add_val: EXPECTED[:req_val], 191 | ) 192 | 193 | class MyScraper < Grubby::Scraper 194 | scrapes :req_val do 195 | source.content.fetch(:req) 196 | end 197 | 198 | scrapes :opt_val, optional: true do 199 | source.content.fetch(:opt) 200 | end 201 | 202 | scrapes :opt_word, if: :opt_val do 203 | opt_val[/\w+/] 204 | end 205 | 206 | scrapes :opt_miss, unless: :opt_val do 207 | true 208 | end 209 | end 210 | 211 | class MyInheritingScraper < MyScraper 212 | scrapes :opt_val, optional: true do 213 | source.content.fetch(:opt)&.swapcase 214 | end 215 | 216 | scrapes :add_val do 217 | req_val 218 | end 219 | end 220 | 221 | def make_scraper(content, klass = MyScraper) 222 | source = Mechanize::File.new(nil, nil, content, nil) 223 | silence_logging do 224 | klass.new(source) 225 | end 226 | end 227 | 228 | class IncorrectScraper < Grubby::Scraper 229 | scrapes(:foo){ "FOO!" 
} 230 | 231 | def initialize(*args) 232 | # does not call `super` 233 | end 234 | end 235 | 236 | class UrlScraper < Grubby::Scraper 237 | def self.url(n = 1) 238 | "http://localhost/response_code?code=200&n=#{n}" 239 | end 240 | 241 | scrapes(:url){ source.uri.to_s } 242 | scrapes(:n){ source.uri.query[/\bn=(\d+)\b/, 1]&.to_i } 243 | 244 | def next 245 | self.class.url(n - 1) if n > 1 246 | end 247 | 248 | def next_uri 249 | self.next.try{|next_url| URI(next_url + "#next_uri") } 250 | end 251 | 252 | def next_page 253 | self.next.try{|next_url| source.mech.get(next_url + "#next_page") } 254 | end 255 | end 256 | 257 | end 258 | -------------------------------------------------------------------------------- /test/test_helper.rb: -------------------------------------------------------------------------------- 1 | $LOAD_PATH.unshift File.expand_path("../../lib", __FILE__) 2 | require "grubby" 3 | 4 | require "minitest/autorun" 5 | 6 | require "mechanize/test_case" 7 | # disable obnoxious coloring from "minitest/pride" forcibly included by "mechanize/test_case" 8 | if Minitest.const_defined?("PrideIO") 9 | class << Minitest::PrideIO 10 | remove_method :pride? 11 | 12 | def pride? 13 | false 14 | end 15 | end 16 | end 17 | 18 | 19 | module Kernel 20 | 21 | def dont_sleep(amount) 22 | $sleep_calls ||= 0 23 | $sleep_calls += 1 24 | $sleep_total_amount ||= 0 25 | $sleep_total_amount += amount 26 | $sleep_last_amount = amount 27 | end 28 | 29 | alias_method :actually_sleep, :sleep 30 | alias_method :sleep, :dont_sleep 31 | 32 | end 33 | 34 | 35 | class Minitest::Test 36 | 37 | def silence_logging 38 | log_level = $log.level 39 | $log.level = Logger::Severity::FATAL 40 | begin 41 | result = yield 42 | ensure 43 | $log.level = log_level 44 | end 45 | result 46 | end 47 | 48 | end 49 | --------------------------------------------------------------------------------
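The `Kernel#sleep` override in test_helper.rb records each call in globals rather than actually sleeping. As a rough sketch of how a test could use those globals to assert on grubby's request throttling — assuming grubby's `time_between_requests` setting, which the files above do not exercise directly:

    def test_sleeps_between_requests
      $sleep_calls = 0
      grubby = Grubby.new
      grubby.time_between_requests = 2  # assumed accessor; see the grubby docs

      grubby.get("http://localhost")
      grubby.get("http://localhost")  # second request should trigger a throttling sleep

      assert_operator $sleep_calls, :>=, 1
      assert_operator $sleep_last_amount, :<=, 2  # enforced delay should not exceed the configured time
    end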