├── .gitignore ├── .travis.yml ├── CHANGELOG.md ├── Gemfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── bin ├── console └── setup ├── exe └── kimurai ├── kimurai.gemspec ├── lib ├── kimurai.rb └── kimurai │ ├── automation │ ├── deploy.yml │ ├── setup.yml │ └── setup │ │ ├── chromium_chromedriver.yml │ │ ├── firefox_geckodriver.yml │ │ ├── phantomjs.yml │ │ └── ruby_environment.yml │ ├── base.rb │ ├── base │ ├── saver.rb │ └── storage.rb │ ├── base_helper.rb │ ├── browser_builder.rb │ ├── browser_builder │ ├── mechanize_builder.rb │ ├── poltergeist_phantomjs_builder.rb │ ├── selenium_chrome_builder.rb │ └── selenium_firefox_builder.rb │ ├── capybara_configuration.rb │ ├── capybara_ext │ ├── driver │ │ └── base.rb │ ├── mechanize │ │ └── driver.rb │ ├── poltergeist │ │ └── driver.rb │ ├── selenium │ │ └── driver.rb │ ├── session.rb │ └── session │ │ └── config.rb │ ├── cli.rb │ ├── cli │ ├── ansible_command_builder.rb │ └── generator.rb │ ├── core_ext │ ├── array.rb │ ├── hash.rb │ ├── numeric.rb │ └── string.rb │ ├── pipeline.rb │ ├── runner.rb │ ├── template │ ├── .gitignore │ ├── Gemfile │ ├── README.md │ ├── config │ │ ├── application.rb │ │ ├── automation.yml │ │ ├── boot.rb │ │ ├── initializers │ │ │ └── .keep │ │ └── schedule.rb │ ├── db │ │ └── .keep │ ├── helpers │ │ └── application_helper.rb │ ├── lib │ │ └── .keep │ ├── log │ │ └── .keep │ ├── pipelines │ │ ├── saver.rb │ │ └── validator.rb │ ├── spiders │ │ └── application_spider.rb │ └── tmp │ │ └── .keep │ └── version.rb └── test ├── kimurai_test.rb └── test_helper.rb /.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /_yardoc/ 4 | /coverage/ 5 | /doc/ 6 | /pkg/ 7 | /spec/reports/ 8 | /tmp/ 9 | Gemfile.lock 10 | 11 | *.retry 12 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: ruby 3 | rvm: 4 | - 2.5.1 5 | before_install: gem install bundler -v 1.16.2 6 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # CHANGELOG 2 | ## 1.4.0 3 | ### New 4 | * Add `encoding` config option (see [All available config options](https://github.com/vifreefly/kimuraframework#all-available-config-options)) 5 | * Validate url before processing a request (Base#request_to) 6 | 7 | ### Fixes 8 | * Fix console command bug (see [issue 21](https://github.com/vifreefly/kimuraframework/issues/21)) 9 | 10 | ## 1.3.2 11 | ### Fixes 12 | * In the project template, set Ruby version as >= 2.5 (before was hard-coded to 2.5.1) 13 | * Remove .ruby-version file (was hard-coded to 2.5.1) from the project template 14 | 15 | ## 1.3.1 16 | ### Fixes 17 | * Fixed bug in Base#save_to 18 | 19 | ## 1.3.0 20 | ### Breaking changes 1.3.0 21 | * Remove persistence database feature (because it's slow and makes things complicated) 22 | 23 | ### New 24 | * Add `--include` and `--exclude` options to CLI#runner 25 | * Add Base `#create_browser` method to easily create additional browser instances 26 | * Add Capybara::Session `#scroll_to_bottom` 27 | * Add skip_on_failure feature to `retry_request_errors` config option 28 | * Add info about `add_event` method to the README 29 | 30 | ### Fixes and improvements 31 | * Improve Runner 32 | * Fix time helper in schedule.rb 33 | * Add proxy validation to browser builders 34 | * 
Allow passing different arguments to the `Base.parse` method

## 1.2.0
### New
* Add the ability to add an array of values to the storage (`Base::Storage#add`)
* Add `exception_on_fail` option to `Base.crawl!`
* Add the ability to pass request hashes to `start_urls` (you can use an array of hashes as well, like: `@start_urls = [{ url: "https://example.com/cat?id=1", data: { category: "First Category" } }]`)
* Implement `skip_request_errors` config feature. Added [Handle request errors](https://github.com/vifreefly/kimuraframework#handle-request-errors) chapter to the README.
* Add option to choose the response type for `Session#current_response` (`:html` default, or `:json`)
* Add option to provide custom chrome and chromedriver paths

### Improvements
* Refactor `Runner`

### Fixes
* Fix `Base#Saver` (automatically create the file if it doesn't exist in case of persistence database)
* Do not deep merge config's `headers:` option

## 1.1.0
### Breaking changes 1.1.0
`browser` config option deprecated. Now all sub-options inside `browser` should be placed directly into the `@config` hash, without the `browser` parent key. Example:

```ruby
# Was:
@config = {
  browser: {
    retry_request_errors: [Net::ReadTimeout],
    restart_if: {
      memory_limit: 350_000,
      requests_limit: 100
    },
    before_request: {
      change_proxy: true,
      change_user_agent: true,
      clear_cookies: true,
      clear_and_set_cookies: true,
      delay: 1..3
    }
  }
}

# Now:
@config = {
  retry_request_errors: [Net::ReadTimeout],
  restart_if: {
    memory_limit: 350_000,
    requests_limit: 100
  },
  before_request: {
    change_proxy: true,
    change_user_agent: true,
    clear_cookies: true,
    clear_and_set_cookies: true,
    delay: 1..3
  }
}
```

### New
* Add `storage` object with additional methods and persistence database feature
* Add events feature to `run_info`
* Add `skip_duplicate_requests` config option to automatically skip already visited urls when using `request_to`
* Add `extensions` config option to allow injecting JS code into the browser (supported only by the poltergeist_phantomjs engine)
* Add `Capybara::Session#within_new_window_by` method

### Improvements
* Add the last backtrace line to pipeline output when an item is dropped
* Do not destroy the driver if it doesn't exist (for the Base.parse!
method)
* Handle possible Net::ReadTimeout error while trying to #quit the driver

### Fixes
* Fix Mechanize::Driver#proxy (there was a bug when using a proxy with the mechanize engine without authorization)
* Fix request retries logic


## 1.0.1
* Add missing `logger` method to pipeline
* Fix `set_proxy` in Mechanize and Poltergeist builders
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
source "https://rubygems.org"

git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }

# Specify your gem's dependencies in kimurai.gemspec
gemspec
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2018 Victor Afanasev

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Kimurai

> UPD: I will soon have time to work on issues for the current 1.4 version, and I also plan to release a new 2.0 version with the https://github.com/twalpole/apparition engine.

Kimurai is a modern web scraping framework written in Ruby which **works out of the box with Headless Chromium/Firefox, PhantomJS**, or simple HTTP requests, and **allows you to scrape and interact with JavaScript-rendered websites.**

Kimurai is based on the well-known [Capybara](https://github.com/teamcapybara/capybara) and [Nokogiri](https://github.com/sparklemotion/nokogiri) gems, so you don't have to learn anything new.
Let's see:

```ruby
# github_spider.rb
require 'kimurai'

class GithubSpider < Kimurai::Base
  @name = "github_spider"
  @engine = :selenium_chrome
  @start_urls = ["https://github.com/search?q=Ruby%20Web%20Scraping"]
  @config = {
    user_agent: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36",
    before_request: { delay: 4..7 }
  }

  def parse(response, url:, data: {})
    response.xpath("//ul[@class='repo-list']/div//h3/a").each do |a|
      request_to :parse_repo_page, url: absolute_url(a[:href], base: url)
    end

    if next_page = response.at_xpath("//a[@class='next_page']")
      request_to :parse, url: absolute_url(next_page[:href], base: url)
    end
  end

  def parse_repo_page(response, url:, data: {})
    item = {}

    item[:owner] = response.xpath("//h1//a[@rel='author']").text
    item[:repo_name] = response.xpath("//h1/strong[@itemprop='name']/a").text
    item[:repo_url] = url
    item[:description] = response.xpath("//span[@itemprop='about']").text.squish
    item[:tags] = response.xpath("//div[@id='topics-list-container']/div/a").map { |a| a.text.squish }
    item[:watch_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Watch')]/a[2]").text.squish
    item[:star_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Star')]/a[2]").text.squish
    item[:fork_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Fork')]/a[2]").text.squish
    item[:last_commit] = response.xpath("//span[@itemprop='dateModified']/*").text

    save_to "results.json", item, format: :pretty_json
  end
end

GithubSpider.crawl!
```
53 | Run: $ ruby github_spider.rb 54 | 55 | ``` 56 | I, [2018-08-22 13:08:03 +0400#15477] [M: 47377500980720] INFO -- github_spider: Spider: started: github_spider 57 | D, [2018-08-22 13:08:03 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: BrowserBuilder (selenium_chrome): created browser instance 58 | D, [2018-08-22 13:08:03 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: BrowserBuilder (selenium_chrome): enabled `browser before_request delay` 59 | D, [2018-08-22 13:08:03 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: Browser: sleep 7 seconds before request... 60 | D, [2018-08-22 13:08:10 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: BrowserBuilder (selenium_chrome): enabled custom user-agent 61 | D, [2018-08-22 13:08:10 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: BrowserBuilder (selenium_chrome): enabled native headless_mode 62 | I, [2018-08-22 13:08:10 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: started get request to: https://github.com/search?q=Ruby%20Web%20Scraping 63 | I, [2018-08-22 13:08:26 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: finished get request to: https://github.com/search?q=Ruby%20Web%20Scraping 64 | I, [2018-08-22 13:08:26 +0400#15477] [M: 47377500980720] INFO -- github_spider: Info: visits: requests: 1, responses: 1 65 | D, [2018-08-22 13:08:27 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: Browser: driver.current_memory: 107968 66 | D, [2018-08-22 13:08:27 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: Browser: sleep 5 seconds before request... 67 | I, [2018-08-22 13:08:32 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: started get request to: https://github.com/lorien/awesome-web-scraping 68 | I, [2018-08-22 13:08:33 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: finished get request to: https://github.com/lorien/awesome-web-scraping 69 | I, [2018-08-22 13:08:33 +0400#15477] [M: 47377500980720] INFO -- github_spider: Info: visits: requests: 2, responses: 2 70 | D, [2018-08-22 13:08:33 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: Browser: driver.current_memory: 212542 71 | D, [2018-08-22 13:08:33 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: Browser: sleep 4 seconds before request... 72 | I, [2018-08-22 13:08:37 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: started get request to: https://github.com/jaimeiniesta/metainspector 73 | 74 | ... 
75 | 76 | I, [2018-08-22 13:23:07 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: started get request to: https://github.com/preston/idclight 77 | I, [2018-08-22 13:23:08 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: finished get request to: https://github.com/preston/idclight 78 | I, [2018-08-22 13:23:08 +0400#15477] [M: 47377500980720] INFO -- github_spider: Info: visits: requests: 140, responses: 140 79 | D, [2018-08-22 13:23:08 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: Browser: driver.current_memory: 204198 80 | I, [2018-08-22 13:23:08 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: driver selenium_chrome has been destroyed 81 | 82 | I, [2018-08-22 13:23:08 +0400#15477] [M: 47377500980720] INFO -- github_spider: Spider: stopped: {:spider_name=>"github_spider", :status=>:completed, :environment=>"development", :start_time=>2018-08-22 13:08:03 +0400, :stop_time=>2018-08-22 13:23:08 +0400, :running_time=>"15m, 5s", :visits=>{:requests=>140, :responses=>140}, :error=>nil} 83 | ``` 84 |
85 | 86 |
87 | results.json 88 | 89 | ```json 90 | [ 91 | { 92 | "owner": "lorien", 93 | "repo_name": "awesome-web-scraping", 94 | "repo_url": "https://github.com/lorien/awesome-web-scraping", 95 | "description": "List of libraries, tools and APIs for web scraping and data processing.", 96 | "tags": [ 97 | "awesome", 98 | "awesome-list", 99 | "web-scraping", 100 | "data-processing", 101 | "python", 102 | "javascript", 103 | "php", 104 | "ruby" 105 | ], 106 | "watch_count": "159", 107 | "star_count": "2,423", 108 | "fork_count": "358", 109 | "last_commit": "4 days ago", 110 | "position": 1 111 | }, 112 | 113 | ... 114 | 115 | { 116 | "owner": "preston", 117 | "repo_name": "idclight", 118 | "repo_url": "https://github.com/preston/idclight", 119 | "description": "A Ruby gem for accessing the freely available IDClight (IDConverter Light) web service, which convert between different types of gene IDs such as Hugo and Entrez. Queries are screen scraped from http://idclight.bioinfo.cnio.es.", 120 | "tags": [ 121 | 122 | ], 123 | "watch_count": "6", 124 | "star_count": "1", 125 | "fork_count": "0", 126 | "last_commit": "on Apr 12, 2012", 127 | "position": 127 128 | } 129 | ] 130 | ``` 131 |

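The spider above is not tied to Chrome, by the way. Spider code is engine-agnostic, so the same class could be pointed at a lighter engine whenever the target pages don't depend on JavaScript; a minimal sketch (assuming GitHub's markup here is rendered server-side):

```ruby
# Hypothetical variation: plain HTTP requests instead of Headless Chrome.
# Everything else (@name, @start_urls, the parse methods) stays exactly the same.
class GithubSpider < Kimurai::Base
  @engine = :mechanize
  # ...
end
```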
Okay, that was easy. How about JavaScript-rendered websites with dynamic HTML? Let's scrape a page with infinite scroll:

```ruby
# infinite_scroll_spider.rb
require 'kimurai'

class InfiniteScrollSpider < Kimurai::Base
  @name = "infinite_scroll_spider"
  @engine = :selenium_chrome
  @start_urls = ["https://infinite-scroll.com/demo/full-page/"]

  def parse(response, url:, data: {})
    posts_headers_path = "//article/h2"
    count = response.xpath(posts_headers_path).count

    loop do
      browser.execute_script("window.scrollBy(0,10000)") ; sleep 2
      response = browser.current_response

      new_count = response.xpath(posts_headers_path).count
      if count == new_count
        logger.info "> Pagination is done" and break
      else
        count = new_count
        logger.info "> Continue scrolling, current count is #{count}..."
      end
    end

    posts_headers = response.xpath(posts_headers_path).map(&:text)
    logger.info "> All posts from page: #{posts_headers.join('; ')}"
  end
end

InfiniteScrollSpider.crawl!
```
170 | Run: $ ruby infinite_scroll_spider.rb 171 | 172 | ``` 173 | I, [2018-08-22 13:32:57 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: Spider: started: infinite_scroll_spider 174 | D, [2018-08-22 13:32:57 +0400#23356] [M: 47375890851320] DEBUG -- infinite_scroll_spider: BrowserBuilder (selenium_chrome): created browser instance 175 | D, [2018-08-22 13:32:57 +0400#23356] [M: 47375890851320] DEBUG -- infinite_scroll_spider: BrowserBuilder (selenium_chrome): enabled native headless_mode 176 | I, [2018-08-22 13:32:57 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: Browser: started get request to: https://infinite-scroll.com/demo/full-page/ 177 | I, [2018-08-22 13:33:03 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: Browser: finished get request to: https://infinite-scroll.com/demo/full-page/ 178 | I, [2018-08-22 13:33:03 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: Info: visits: requests: 1, responses: 1 179 | D, [2018-08-22 13:33:03 +0400#23356] [M: 47375890851320] DEBUG -- infinite_scroll_spider: Browser: driver.current_memory: 95463 180 | I, [2018-08-22 13:33:05 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: > Continue scrolling, current count is 5... 181 | I, [2018-08-22 13:33:18 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: > Continue scrolling, current count is 9... 182 | I, [2018-08-22 13:33:20 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: > Continue scrolling, current count is 11... 183 | I, [2018-08-22 13:33:26 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: > Continue scrolling, current count is 13... 184 | I, [2018-08-22 13:33:28 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: > Continue scrolling, current count is 15... 185 | I, [2018-08-22 13:33:30 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: > Pagination is done 186 | I, [2018-08-22 13:33:30 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: > All posts from page: 1a - Infinite Scroll full page demo; 1b - RGB Schemes logo in Computer Arts; 2a - RGB Schemes logo; 2b - Masonry gets horizontalOrder; 2c - Every vector 2016; 3a - Logo Pizza delivered; 3b - Some CodePens; 3c - 365daysofmusic.com; 3d - Holograms; 4a - Huebee: 1-click color picker; 4b - Word is Flickity is good; Flickity v2 released: groupCells, adaptiveHeight, parallax; New tech gets chatter; Isotope v3 released: stagger in, IE8 out; Packery v2 released 187 | I, [2018-08-22 13:33:30 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: Browser: driver selenium_chrome has been destroyed 188 | I, [2018-08-22 13:33:30 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: Spider: stopped: {:spider_name=>"infinite_scroll_spider", :status=>:completed, :environment=>"development", :start_time=>2018-08-22 13:32:57 +0400, :stop_time=>2018-08-22 13:33:30 +0400, :running_time=>"33s", :visits=>{:requests=>1, :responses=>1}, :error=>nil} 189 | 190 | ``` 191 |


## Features
* Scrape JavaScript-rendered websites out of the box
* Supported engines: [Headless Chrome](https://developers.google.com/web/updates/2017/04/headless-chrome), [Headless Firefox](https://developer.mozilla.org/en-US/docs/Mozilla/Firefox/Headless_mode), [PhantomJS](https://github.com/ariya/phantomjs) or simple HTTP requests ([mechanize](https://github.com/sparklemotion/mechanize) gem)
* Write spider code once, and use it with any supported engine later
* All the power of [Capybara](https://github.com/teamcapybara/capybara): use methods like `click_on`, `fill_in`, `select`, `choose`, `set`, `go_back`, etc. to interact with web pages
* Rich [configuration](#spider-config): **set default headers, cookies, delay between requests, enable proxy/user-agent rotation**
* Built-in helpers to make scraping easy, like [save_to](#save_to-helper) (save items to JSON, JSON lines, or CSV formats) or [unique?](#skip-duplicates) to skip duplicates
* Automatically [handle request errors](#handle-request-errors)
* Automatically restart browsers when reaching a memory limit [**(memory control)**](#spider-config) or a requests limit
* Easily [schedule spiders](#schedule-spiders-using-cron) within cron using [Whenever](https://github.com/javan/whenever) (no need to know cron syntax)
* [Parallel scraping](#parallel-crawling-using-in_parallel) using the simple method `in_parallel`
* **Two modes:** use a single file for a simple spider, or [generate](#project-mode) a Scrapy-like **project**
* Convenient development mode with [console](#interactive-console), colorized logger and debugger ([Pry](https://github.com/pry/pry), [Byebug](https://github.com/deivid-rodriguez/byebug))
* Automated [server environment setup](#setup) (for Ubuntu 18.04) and [deploy](#deploy) using the commands `kimurai setup` and `kimurai deploy` ([Ansible](https://github.com/ansible/ansible) under the hood)
* Command-line [runner](#runner) to run all project spiders one by one or in parallel

## Table of Contents
* [Kimurai](#kimurai)
  * [Features](#features)
  * [Table of Contents](#table-of-contents)
  * [Installation](#installation)
  * [Getting to Know](#getting-to-know)
    * [Interactive console](#interactive-console)
    * [Available engines](#available-engines)
    * [Minimum required spider structure](#minimum-required-spider-structure)
    * [Method arguments response, url and data](#method-arguments-response-url-and-data)
    * [browser object](#browser-object)
    * [request_to method](#request_to-method)
    * [save_to helper](#save_to-helper)
    * [Skip duplicates](#skip-duplicates)
      * [Automatically skip all duplicated requests urls](#automatically-skip-all-duplicated-requests-urls)
      * [Storage object](#storage-object)
    * [Handle request errors](#handle-request-errors)
      * [skip_request_errors](#skip_request_errors)
      * [retry_request_errors](#retry_request_errors)
    * [Logging custom events](#logging-custom-events)
    * [open_spider and close_spider callbacks](#open_spider-and-close_spider-callbacks)
    * [KIMURAI_ENV](#kimurai_env)
    * [Parallel crawling using in_parallel](#parallel-crawling-using-in_parallel)
    * [Active Support included](#active-support-included)
    * [Schedule spiders using Cron](#schedule-spiders-using-cron)
    * [Configuration options](#configuration-options)
    * [Using Kimurai inside existing Ruby application](#using-kimurai-inside-existing-ruby-application)
      * [crawl! method](#crawl-method)
      * [parse! method](#parsemethod_name-url-method)
      * [Kimurai.list and Kimurai.find_by_name](#kimurailist-and-kimuraifind_by_name)
    * [Automated server setup and deployment](#automated-sever-setup-and-deployment)
      * [Setup](#setup)
      * [Deploy](#deploy)
    * [Spider @config](#spider-config)
      * [All available @config options](#all-available-config-options)
      * [@config settings inheritance](#config-settings-inheritance)
  * [Project mode](#project-mode)
    * [Generate new spider](#generate-new-spider)
    * [Crawl](#crawl)
    * [List](#list)
    * [Parse](#parse)
    * [Pipelines, send_item method](#pipelines-send_item-method)
    * [Runner](#runner)
      * [Runner callbacks](#runner-callbacks)
  * [Chat Support and Feedback](#chat-support-and-feedback)
  * [License](#license)


## Installation
Kimurai requires Ruby version `>= 2.5.0`. Supported platforms: `Linux` and `Mac OS X`.

1) If your system doesn't have an appropriate Ruby version, install it:

264 | Ubuntu 18.04 265 | 266 | ```bash 267 | # Install required packages for ruby-build 268 | sudo apt update 269 | sudo apt install git-core curl zlib1g-dev build-essential libssl-dev libreadline-dev libreadline6-dev libyaml-dev libxml2-dev libxslt1-dev libcurl4-openssl-dev libffi-dev 270 | 271 | # Install rbenv and ruby-build 272 | cd && git clone https://github.com/rbenv/rbenv.git ~/.rbenv 273 | echo 'export PATH="$HOME/.rbenv/bin:$PATH"' >> ~/.bashrc 274 | echo 'eval "$(rbenv init -)"' >> ~/.bashrc 275 | exec $SHELL 276 | 277 | git clone https://github.com/rbenv/ruby-build.git ~/.rbenv/plugins/ruby-build 278 | echo 'export PATH="$HOME/.rbenv/plugins/ruby-build/bin:$PATH"' >> ~/.bashrc 279 | exec $SHELL 280 | 281 | # Install latest Ruby 282 | rbenv install 2.5.3 283 | rbenv global 2.5.3 284 | 285 | gem install bundler 286 | ``` 287 |
288 | 289 |
290 | Mac OS X 291 | 292 | ```bash 293 | # Install homebrew if you don't have it https://brew.sh/ 294 | # Install rbenv and ruby-build: 295 | brew install rbenv ruby-build 296 | 297 | # Add rbenv to bash so that it loads every time you open a terminal 298 | echo 'if which rbenv > /dev/null; then eval "$(rbenv init -)"; fi' >> ~/.bash_profile 299 | source ~/.bash_profile 300 | 301 | # Install latest Ruby 302 | rbenv install 2.5.3 303 | rbenv global 2.5.3 304 | 305 | gem install bundler 306 | ``` 307 |
308 | 309 | 2) Install Kimurai gem: `$ gem install kimurai` 310 | 311 | 3) Install browsers with webdrivers: 312 | 313 |
Ubuntu 18.04

Note: for Ubuntu 16.04-18.04, automatic installation is available using the `setup` command:
```bash
$ kimurai setup localhost --local --ask-sudo
```
It works using [Ansible](https://github.com/ansible/ansible), so you need to install it first: `$ sudo apt install ansible`. You can check the playbooks it uses [here](lib/kimurai/automation).

If you chose automatic installation, you can skip the following and go to the "Getting to Know" part. In case you want to install everything manually:

```bash
# Install basic tools
sudo apt install -q -y unzip wget tar openssl

# Install xvfb (for virtual_display headless mode, in addition to native)
sudo apt install -q -y xvfb

# Install chromium-browser and firefox
sudo apt install -q -y chromium-browser firefox

# Install chromedriver (version 2.44)
# All versions are located here: https://sites.google.com/a/chromium.org/chromedriver/downloads
cd /tmp && wget https://chromedriver.storage.googleapis.com/2.44/chromedriver_linux64.zip
sudo unzip chromedriver_linux64.zip -d /usr/local/bin
rm -f chromedriver_linux64.zip

# Install geckodriver (version 0.23.0)
# All versions are located here: https://github.com/mozilla/geckodriver/releases/
cd /tmp && wget https://github.com/mozilla/geckodriver/releases/download/v0.23.0/geckodriver-v0.23.0-linux64.tar.gz
sudo tar -xvzf geckodriver-v0.23.0-linux64.tar.gz -C /usr/local/bin
rm -f geckodriver-v0.23.0-linux64.tar.gz

# Install PhantomJS (2.1.1)
# All versions are located here: http://phantomjs.org/download.html
sudo apt install -q -y chrpath libxft-dev libfreetype6 libfreetype6-dev libfontconfig1 libfontconfig1-dev
cd /tmp && wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2
tar -xvjf phantomjs-2.1.1-linux-x86_64.tar.bz2
sudo mv phantomjs-2.1.1-linux-x86_64 /usr/local/lib
sudo ln -s /usr/local/lib/phantomjs-2.1.1-linux-x86_64/bin/phantomjs /usr/local/bin
rm -f phantomjs-2.1.1-linux-x86_64.tar.bz2
```
357 | 358 |
359 | Mac OS X 360 | 361 | ```bash 362 | # Install chrome and firefox 363 | brew cask install google-chrome firefox 364 | 365 | # Install chromedriver (latest) 366 | brew cask install chromedriver 367 | 368 | # Install geckodriver (latest) 369 | brew install geckodriver 370 | 371 | # Install PhantomJS (latest) 372 | brew install phantomjs 373 | ``` 374 |

375 | 376 | Also, if you want to save scraped items to the database (using [ActiveRecord](https://github.com/rails/rails/tree/master/activerecord), [Sequel](https://github.com/jeremyevans/sequel) or [MongoDB Ruby Driver](https://github.com/mongodb/mongo-ruby-driver)/[Mongoid](https://github.com/mongodb/mongoid)), you need to install database clients/servers: 377 | 378 |
Ubuntu 18.04

SQLite: `$ sudo apt -q -y install libsqlite3-dev sqlite3`.

If you want to connect to a remote database, you don't need a database server on the local machine (only a client):
```bash
# Install MySQL client
sudo apt -q -y install mysql-client libmysqlclient-dev

# Install Postgres client
sudo apt install -q -y postgresql-client libpq-dev

# Install MongoDB client
sudo apt install -q -y mongodb-clients
```

But if you want to save items to a local database, a database server is required as well:
```bash
# Install MySQL client and server
sudo apt -q -y install mysql-server mysql-client libmysqlclient-dev

# Install Postgres client and server
sudo apt install -q -y postgresql postgresql-contrib libpq-dev

# Install MongoDB client and server
# version 4.0 (check here https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/)
sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 9DA31620334BD75D9DCB49F368818C72E52529D4
# for 16.04:
# echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu xenial/mongodb-org/4.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-4.0.list
# for 18.04:
echo "deb [ arch=amd64 ] https://repo.mongodb.org/apt/ubuntu bionic/mongodb-org/4.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-4.0.list
sudo apt update
sudo apt install -q -y mongodb-org
sudo service mongod start
```
415 | 416 |
Mac OS X

SQLite: `$ brew install sqlite3`

```bash
# Install MySQL client and server
brew install mysql
# Start the server if you need it: brew services start mysql

# Install Postgres client and server
brew install postgresql
# Start the server if you need it: brew services start postgresql

# Install MongoDB client and server
brew install mongodb
# Start the server if you need it: brew services start mongodb
```


## Getting to Know
### Interactive console
Before you get to know all of Kimurai's features, there is the `$ kimurai console` command: an interactive console where you can try out and debug your scraping code very quickly, without having to run any spider (yes, it's like [Scrapy shell](https://doc.scrapy.org/en/latest/topics/shell.html#topics-shell)).

```bash
$ kimurai console --engine selenium_chrome --url https://github.com/vifreefly/kimuraframework
```
446 | Show output 447 | 448 | ``` 449 | $ kimurai console --engine selenium_chrome --url https://github.com/vifreefly/kimuraframework 450 | 451 | D, [2018-08-22 13:42:32 +0400#26079] [M: 47461994677760] DEBUG -- : BrowserBuilder (selenium_chrome): created browser instance 452 | D, [2018-08-22 13:42:32 +0400#26079] [M: 47461994677760] DEBUG -- : BrowserBuilder (selenium_chrome): enabled native headless_mode 453 | I, [2018-08-22 13:42:32 +0400#26079] [M: 47461994677760] INFO -- : Browser: started get request to: https://github.com/vifreefly/kimuraframework 454 | I, [2018-08-22 13:42:35 +0400#26079] [M: 47461994677760] INFO -- : Browser: finished get request to: https://github.com/vifreefly/kimuraframework 455 | D, [2018-08-22 13:42:35 +0400#26079] [M: 47461994677760] DEBUG -- : Browser: driver.current_memory: 201701 456 | 457 | From: /home/victor/code/kimurai/lib/kimurai/base.rb @ line 189 Kimurai::Base#console: 458 | 459 | 188: def console(response = nil, url: nil, data: {}) 460 | => 189: binding.pry 461 | 190: end 462 | 463 | [1] pry(#)> response.xpath("//title").text 464 | => "GitHub - vifreefly/kimuraframework: Modern web scraping framework written in Ruby which works out of box with Headless Chromium/Firefox, PhantomJS, or simple HTTP requests and allows to scrape and interact with JavaScript rendered websites" 465 | 466 | [2] pry(#)> ls 467 | Kimurai::Base#methods: browser console logger request_to save_to unique? 468 | instance variables: @browser @config @engine @logger @pipelines 469 | locals: _ __ _dir_ _ex_ _file_ _in_ _out_ _pry_ data response url 470 | 471 | [3] pry(#)> ls response 472 | Nokogiri::XML::PP::Node#methods: inspect pretty_print 473 | Nokogiri::XML::Searchable#methods: % / at at_css at_xpath css search xpath 474 | Enumerable#methods: 475 | all? collect drop each_with_index find_all grep_v lazy member? none? reject slice_when take_while without 476 | any? collect_concat drop_while each_with_object find_index group_by many? min one? reverse_each sort to_a zip 477 | as_json count each_cons entries first include? map min_by partition select sort_by to_h 478 | chunk cycle each_entry exclude? flat_map index_by max minmax pluck slice_after sum to_set 479 | chunk_while detect each_slice find grep inject max_by minmax_by reduce slice_before take uniq 480 | Nokogiri::XML::Node#methods: 481 | <=> append_class classes document? has_attribute? matches? node_name= processing_instruction? to_str 482 | == attr comment? each html? name= node_type read_only? to_xhtml 483 | > attribute content elem? inner_html namespace= parent= remove traverse 484 | [] attribute_nodes content= element? inner_html= namespace_scopes parse remove_attribute unlink 485 | []= attribute_with_ns create_external_subset element_children inner_text namespaced_key? path remove_class values 486 | accept before create_internal_subset elements internal_subset native_content= pointer_id replace write_html_to 487 | add_class blank? css_path encode_special_chars key? next prepend_child set_attribute write_to 488 | add_next_sibling cdata? decorate! external_subset keys next= previous text write_xhtml_to 489 | add_previous_sibling child delete first_element_child lang next_element previous= text? write_xml_to 490 | after children description fragment? lang= next_sibling previous_element to_html xml? 
491 | ancestors children= do_xinclude get_attribute last_element_child node_name previous_sibling to_s 492 | Nokogiri::XML::Document#methods: 493 | << canonicalize collect_namespaces create_comment create_entity decorate document encoding errors name remove_namespaces! root= to_java url version 494 | add_child clone create_cdata create_element create_text_node decorators dup encoding= errors= namespaces root slop! to_xml validate 495 | Nokogiri::HTML::Document#methods: fragment meta_encoding meta_encoding= serialize title title= type 496 | instance variables: @decorators @errors @node_cache 497 | 498 | [4] pry(#)> exit 499 | I, [2018-08-22 13:43:47 +0400#26079] [M: 47461994677760] INFO -- : Browser: driver selenium_chrome has been destroyed 500 | $ 501 | ``` 502 |


CLI options:
* `--engine` (optional) [engine](#available-engines) to use. Default is `mechanize`
* `--url` (optional) url to process. If the url is omitted, the `response` and `url` objects inside the console will be `nil` (use the [browser](#browser-object) object to navigate to any webpage).

### Available engines
Kimurai supports the following engines, and you can mostly switch between them without needing to rewrite any code:

* `:mechanize` - [pure Ruby fake http browser](https://github.com/sparklemotion/mechanize). Mechanize can't render JavaScript and doesn't know what the DOM is; it can only parse the original HTML code of a page. Because of this, mechanize is much faster, takes much less memory, and is in general much more stable than any real browser. Use mechanize when you can, i.e. when the website doesn't use JavaScript to render any meaningful parts of its structure. Still, because mechanize tries to mimic a real browser, it supports almost all of Capybara's [methods to interact with a web page](http://cheatrags.com/capybara) (filling forms, clicking buttons, checkboxes, etc).
* `:poltergeist_phantomjs` - [PhantomJS headless browser](https://github.com/ariya/phantomjs), can render JavaScript. In general, PhantomJS is still faster than Headless Chrome (and Headless Firefox). PhantomJS has memory leakage, but Kimurai has a [memory control feature](#spider-config), so you shouldn't consider it a problem. Also, some websites can recognize PhantomJS and block access. Like mechanize (and unlike the selenium engines), `:poltergeist_phantomjs` can freely rotate proxies and change headers _on the fly_ (see the [config section](#all-available-config-options)).
* `:selenium_chrome` Chrome in headless mode driven by selenium. A modern headless browser solution with proper JavaScript rendering.
* `:selenium_firefox` Firefox in headless mode driven by selenium. Usually takes more memory than the other drivers, but sometimes can be useful.

**Tip:** add the `HEADLESS=false` ENV variable before the command (`$ HEADLESS=false ruby spider.rb`) to run the browser in normal (not headless) mode and see its window (only for selenium-like engines). It works for the [console](#interactive-console) command as well.


### Minimum required spider structure
> You can manually create a spider file, or use the generator instead: `$ kimurai generate spider simple_spider`

```ruby
require 'kimurai'

class SimpleSpider < Kimurai::Base
  @name = "simple_spider"
  @engine = :selenium_chrome
  @start_urls = ["https://example.com/"]

  def parse(response, url:, data: {})
  end
end

SimpleSpider.crawl!
```

Where:
* `@name` name of the spider. You can omit the name when using a single-file spider
* `@engine` engine for the spider
* `@start_urls` array of start urls to process one by one inside the `parse` method
* Method `parse` is the start method and should always be present in a spider class


### Method arguments `response`, `url` and `data`

```ruby
def parse(response, url:, data: {})
end
```

* `response` ([Nokogiri::HTML::Document](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document) object) Contains the parsed HTML code of the processed webpage
* `url` (String) url of the processed webpage
* `data` (Hash) used to pass data between requests
Example of how to use data

Imagine there is a product page which doesn't contain the product category. The category name is present only on the category page with pagination. This is a case where we can use `data` to pass the category name from `parse` to the `parse_product` method:

```ruby
class ProductsSpider < Kimurai::Base
  @engine = :selenium_chrome
  @start_urls = ["https://example-shop.com/example-product-category"]

  def parse(response, url:, data: {})
    category_name = response.xpath("//path/to/category/name").text
    response.xpath("//path/to/products/urls").each do |product_url|
      # Merge category_name with the current data hash and pass it on to the parse_product method
      request_to(:parse_product, url: product_url[:href], data: data.merge(category_name: category_name))
    end

    # ...
  end

  def parse_product(response, url:, data: {})
    item = {}
    # Assign the item's category_name from data[:category_name]
    item[:category_name] = data[:category_name]

    # ...
  end
end

```

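As the 1.2.0 changelog entry above mentions, `data` can also be seeded directly from `@start_urls` by using request hashes instead of plain url strings. A short sketch (the urls are hypothetical):

```ruby
class CategoriesSpider < Kimurai::Base
  @engine = :mechanize
  # Each request hash attaches data to its start url, so data[:category]
  # is already filled in on the very first request:
  @start_urls = [
    { url: "https://example-shop.com/cat?id=1", data: { category: "First Category" } },
    { url: "https://example-shop.com/cat?id=2", data: { category: "Second Category" } }
  ]

  def parse(response, url:, data: {})
    # data[:category] is available here without any extra request_to hop
  end
end
```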

**You can query `response` using [XPath or CSS selectors](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/Searchable)**. Check these Nokogiri tutorials to understand how to work with `response`:
* [Parsing HTML with Nokogiri](http://ruby.bastardsbook.com/chapters/html-parsing/) - ruby.bastardsbook.com
* [HOWTO parse HTML with Ruby & Nokogiri](https://readysteadycode.com/howto-parse-html-with-ruby-and-nokogiri) - readysteadycode.com
* [Class: Nokogiri::HTML::Document](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document) (documentation) - rubydoc.info


### `browser` object

From any spider instance method, the `browser` object is available. It is a [Capybara::Session](https://www.rubydoc.info/github/jnicklas/capybara/Capybara/Session) object and is used to process requests and get the page response (`current_response` method). Usually you don't need to touch it directly, because there is `response` (see above) which contains the page response after it was loaded.

But if you need to interact with a page (like filling in form fields, clicking elements, checkboxes, etc.), `browser` is ready for you:

```ruby
class GoogleSpider < Kimurai::Base
  @name = "google_spider"
  @engine = :selenium_chrome
  @start_urls = ["https://www.google.com/"]

  def parse(response, url:, data: {})
    browser.fill_in "q", with: "Kimurai web scraping framework"
    browser.click_button "Google Search"

    # Update response to the current response after interacting with the browser
    response = browser.current_response

    # Collect results
    results = response.xpath("//div[@class='g']//h3/a").map do |a|
      { title: a.text, url: a[:href] }
    end

    # ...
  end
end
```

Check out these **Capybara cheat sheets**, where you can see all the available methods **to interact with the browser**:
* [UI Testing with RSpec and Capybara [cheat sheet]](http://cheatrags.com/capybara) - cheatrags.com
* [Capybara Cheatsheet PDF](https://thoughtbot.com/upcase/test-driven-rails-resources/capybara.pdf) - thoughtbot.com
* [Class: Capybara::Session](https://www.rubydoc.info/github/jnicklas/capybara/Capybara/Session) (documentation) - rubydoc.info

### `request_to` method

For making requests to a particular method there is `request_to`. It requires at minimum two arguments: `:method_name` and `url:`. An optional argument is `data:` (see above for what it's for). Example:

```ruby
class Spider < Kimurai::Base
  @engine = :selenium_chrome
  @start_urls = ["https://example.com/"]

  def parse(response, url:, data: {})
    # Process a request to the `parse_product` method with the `https://example.com/some_product` url:
    request_to :parse_product, url: "https://example.com/some_product"
  end

  def parse_product(response, url:, data: {})
    puts "From page https://example.com/some_product !"
  end
end
```

Under the hood, `request_to` simply calls [#visit](https://www.rubydoc.info/github/jnicklas/capybara/Capybara%2FSession:visit) (`browser.visit(url)`) and then the required method with arguments:
650 | request_to 651 | 652 | ```ruby 653 | def request_to(handler, url:, data: {}) 654 | request_data = { url: url, data: data } 655 | 656 | browser.visit(url) 657 | public_send(handler, browser.current_response, request_data) 658 | end 659 | ``` 660 |

661 | 662 | `request_to` just makes things simpler, and without it we could do something like: 663 | 664 |
665 | Check the code 666 | 667 | ```ruby 668 | class Spider < Kimurai::Base 669 | @engine = :selenium_chrome 670 | @start_urls = ["https://example.com/"] 671 | 672 | def parse(response, url:, data: {}) 673 | url_to_process = "https://example.com/some_product" 674 | 675 | browser.visit(url_to_process) 676 | parse_product(browser.current_response, url: url_to_process) 677 | end 678 | 679 | def parse_product(response, url:, data: {}) 680 | puts "From page https://example.com/some_product !" 681 | end 682 | end 683 | ``` 684 |

### `save_to` helper

Sometimes all you need is to simply save scraped data to a file format, like JSON or CSV. You can use `save_to` for that:

```ruby
class ProductsSpider < Kimurai::Base
  @engine = :selenium_chrome
  @start_urls = ["https://example-shop.com/"]

  # ...

  def parse_product(response, url:, data: {})
    item = {}

    item[:title] = response.xpath("//title/path").text
    item[:description] = response.xpath("//desc/path").text.squish
    item[:price] = response.xpath("//price/path").text[/\d+/]&.to_f

    # Add each new item to the `scraped_products.json` file:
    save_to "scraped_products.json", item, format: :json
  end
end
```

Supported formats:
* `:json` JSON
* `:pretty_json` "pretty" JSON (`JSON.pretty_generate`)
* `:jsonlines` [JSON Lines](http://jsonlines.org/)
* `:csv` CSV

Note: `save_to` requires the data (the item to save) to be a `Hash`.

By default, `save_to` adds a position key to the item hash. You can disable it with `position: false`: `save_to "scraped_products.json", item, format: :json, position: false`.

**How the helper works:**

While the spider is running, each new item is appended to the file. On the next run, the helper will clear the file's content first, and then start appending items to it again.

> If you don't want the file to be cleared before each run, add the option `append: true`: `save_to "scraped_products.json", item, format: :json, append: true`

### Skip duplicates

It's pretty common for websites to have duplicated pages, for example when an e-commerce shop has the same products in different categories. To skip duplicates, there is the simple `unique?` helper:

```ruby
class ProductsSpider < Kimurai::Base
  @engine = :selenium_chrome
  @start_urls = ["https://example-shop.com/"]

  def parse(response, url:, data: {})
    response.xpath("//categories/path").each do |category|
      request_to :parse_category, url: category[:href]
    end
  end

  # Check products for uniqueness using the product url inside parse_category:
  def parse_category(response, url:, data: {})
    response.xpath("//products/path").each do |product|
      # Skip the url if it's not unique:
      next unless unique?(:product_url, product[:href])
      # Otherwise process it:
      request_to :parse_product, url: product[:href]
    end
  end

  # Or/and check products for uniqueness using the product sku inside parse_product:
  def parse_product(response, url:, data: {})
    item = {}
    item[:sku] = response.xpath("//product/sku/path").text.strip.upcase
    # Don't save the product and return from the method if there is already a saved item with the same sku:
    return unless unique?(:sku, item[:sku])

    # ...
    save_to "results.json", item, format: :json
  end
end
```

The `unique?` helper works pretty simply:

```ruby
# Check the string "http://example.com" in the scope `url` for the first time:
unique?(:url, "http://example.com")
# => true

# Try again:
unique?(:url, "http://example.com")
# => false
```

To check something for uniqueness, you need to provide a scope:

```ruby
# `product_url` scope
unique?(:product_url, "http://example.com/product_1")

# `id` scope
unique?(:id, 324234232)

# `custom` scope
unique?(:custom, "Lorem Ipsum")
```

#### Automatically skip all duplicated requests urls

It is possible to automatically skip all already visited urls when calling the `request_to` method, using the [@config](#all-available-config-options) option `skip_duplicate_requests: true`. With this option enabled, all already visited urls will be skipped automatically. Also check the [@config](#all-available-config-options) for additional options of this setting.

#### `storage` object

The `unique?` method is just an alias for `storage#unique?`. Storage has several methods:

* `#all` - returns the storage hash, where keys are the existing scopes.
* `#include?(scope, value)` - returns `true` if the value exists in the scope, and `false` if not
* `#add(scope, value)` - adds a value to the scope
* `#unique?(scope, value)` - the method already described above; returns `false` if the value already exists in the scope, or returns `true` and adds the value to the scope if it doesn't exist yet.
* `#clear!` - resets the whole storage by deleting all values from all scopes.


### Handle request errors
It is quite common that some pages of a website being crawled return a response code other than `200 OK`. In such cases, the method `request_to` (or `browser.visit`) can raise an exception. Kimurai provides the `skip_request_errors` and `retry_request_errors` [config](#spider-config) options to handle such errors:

#### skip_request_errors
You can automatically skip some errors while requesting a page using the `skip_request_errors` [config](#spider-config) option. If a raised error matches one of the errors in the list, then the error will be caught and the request will be skipped. It is a good idea to skip errors like NotFound (404), etc.

Format for the option: an array whose elements are error classes or/and hashes. You can use the _hash_ format for more flexibility:

```
@config = {
  skip_request_errors: [{ error: RuntimeError, message: "404 => Net::HTTPNotFound" }]
}
```
In this case, the provided `message:` will be compared with the full error message using `String#include?`. You can also use a regex instead: `{ error: RuntimeError, message: /404|403/ }`.

#### retry_request_errors
You can automatically retry some errors with a few attempts while requesting a page using the `retry_request_errors` [config](#spider-config) option. If a raised error matches one of the errors in the list, then the error will be caught and the request will be processed again after a delay.

There are 3 attempts: first with a delay of _15 sec_, second with a delay of _30 sec_, and third with a delay of _45 sec_. If after 3 attempts there is still an exception, then the exception will be raised. It is a good idea to retry errors like `ReadTimeout`, `HTTPBadGateway`, etc.

Format for the option: the same as for the `skip_request_errors` option.

If you would like to skip (not raise) the error after all retries have failed, you can specify the `skip_on_failure: true` option:

```ruby
@config = {
  retry_request_errors: [{ error: RuntimeError, skip_on_failure: true }]
}
```

### Logging custom events

It is possible to save custom messages to the [run_info](#open_spider-and-close_spider-callbacks) hash using the `add_event('Some message')` method. This feature helps you keep track of important things that happened during crawling without checking the whole spider log (in case you're also logging these messages using `logger`). Example:

```ruby
def parse_product(response, url:, data: {})
  unless response.at_xpath("//path/to/add_to_card_button")
    add_event("Product is sold") and return
  end

  # ...
end
```

```
...
I, [2018-11-28 22:20:19 +0400#7402] [M: 47156576560640] INFO -- example_spider: Spider: new event (scope: custom): Product is sold
...
I, [2018-11-28 22:20:19 +0400#7402] [M: 47156576560640] INFO -- example_spider: Spider: stopped: {:events=>{:custom=>{"Product is sold"=>1}}}
```

### `open_spider` and `close_spider` callbacks

You can define `.open_spider` and `.close_spider` callbacks (class methods) to perform some action before the spider starts or after the spider has been stopped:

```ruby
require 'kimurai'

class ExampleSpider < Kimurai::Base
  @name = "example_spider"
  @engine = :selenium_chrome
  @start_urls = ["https://example.com/"]

  def self.open_spider
    logger.info "> Starting..."
  end

  def self.close_spider
    logger.info "> Stopped!"
  end

  def parse(response, url:, data: {})
    logger.info "> Scraping..."
  end
end

ExampleSpider.crawl!
```
884 | Output 885 | 886 | ``` 887 | I, [2018-08-22 14:26:32 +0400#6001] [M: 46996522083840] INFO -- example_spider: Spider: started: example_spider 888 | I, [2018-08-22 14:26:32 +0400#6001] [M: 46996522083840] INFO -- example_spider: > Starting... 889 | D, [2018-08-22 14:26:32 +0400#6001] [M: 46996522083840] DEBUG -- example_spider: BrowserBuilder (selenium_chrome): created browser instance 890 | D, [2018-08-22 14:26:32 +0400#6001] [M: 46996522083840] DEBUG -- example_spider: BrowserBuilder (selenium_chrome): enabled native headless_mode 891 | I, [2018-08-22 14:26:32 +0400#6001] [M: 46996522083840] INFO -- example_spider: Browser: started get request to: https://example.com/ 892 | I, [2018-08-22 14:26:34 +0400#6001] [M: 46996522083840] INFO -- example_spider: Browser: finished get request to: https://example.com/ 893 | I, [2018-08-22 14:26:34 +0400#6001] [M: 46996522083840] INFO -- example_spider: Info: visits: requests: 1, responses: 1 894 | D, [2018-08-22 14:26:34 +0400#6001] [M: 46996522083840] DEBUG -- example_spider: Browser: driver.current_memory: 82415 895 | I, [2018-08-22 14:26:34 +0400#6001] [M: 46996522083840] INFO -- example_spider: > Scraping... 896 | I, [2018-08-22 14:26:34 +0400#6001] [M: 46996522083840] INFO -- example_spider: Browser: driver selenium_chrome has been destroyed 897 | I, [2018-08-22 14:26:34 +0400#6001] [M: 46996522083840] INFO -- example_spider: > Stopped! 898 | I, [2018-08-22 14:26:34 +0400#6001] [M: 46996522083840] INFO -- example_spider: Spider: stopped: {:spider_name=>"example_spider", :status=>:completed, :environment=>"development", :start_time=>2018-08-22 14:26:32 +0400, :stop_time=>2018-08-22 14:26:34 +0400, :running_time=>"1s", :visits=>{:requests=>1, :responses=>1}, :error=>nil} 899 | ``` 900 |


Inside the `open_spider` and `close_spider` class methods, the `run_info` method is available, which contains useful information about the spider's state:

```ruby
11: def self.open_spider
=> 12:   binding.pry
13: end

[1] pry(example_spider)> run_info
=> {
  :spider_name=>"example_spider",
  :status=>:running,
  :environment=>"development",
  :start_time=>2018-08-05 23:32:00 +0400,
  :stop_time=>nil,
  :running_time=>nil,
  :visits=>{:requests=>0, :responses=>0},
  :error=>nil
}
```

Inside `close_spider`, `run_info` will be updated:

```ruby
15: def self.close_spider
=> 16:   binding.pry
17: end

[1] pry(example_spider)> run_info
=> {
  :spider_name=>"example_spider",
  :status=>:completed,
  :environment=>"development",
  :start_time=>2018-08-05 23:32:00 +0400,
  :stop_time=>2018-08-05 23:32:06 +0400,
  :running_time=>6.214,
  :visits=>{:requests=>1, :responses=>1},
  :error=>nil
}
```

`run_info[:status]` helps to determine whether the spider finished successfully or failed (possible values: `:completed`, `:failed`):

```ruby
class ExampleSpider < Kimurai::Base
  @name = "example_spider"
  @engine = :selenium_chrome
  @start_urls = ["https://example.com/"]

  def self.close_spider
    puts ">>> run info: #{run_info}"
  end

  def parse(response, url:, data: {})
    logger.info "> Scraping..."
    # Let's try to strip nil:
    nil.strip
  end
end
```
963 | Output
964 | 
965 | ```
966 | I, [2018-08-22 14:34:24 +0400#8459] [M: 47020523644400]  INFO -- example_spider: Spider: started: example_spider
967 | D, [2018-08-22 14:34:25 +0400#8459] [M: 47020523644400] DEBUG -- example_spider: BrowserBuilder (selenium_chrome): created browser instance
968 | D, [2018-08-22 14:34:25 +0400#8459] [M: 47020523644400] DEBUG -- example_spider: BrowserBuilder (selenium_chrome): enabled native headless_mode
969 | I, [2018-08-22 14:34:25 +0400#8459] [M: 47020523644400]  INFO -- example_spider: Browser: started get request to: https://example.com/
970 | I, [2018-08-22 14:34:26 +0400#8459] [M: 47020523644400]  INFO -- example_spider: Browser: finished get request to: https://example.com/
971 | I, [2018-08-22 14:34:26 +0400#8459] [M: 47020523644400]  INFO -- example_spider: Info: visits: requests: 1, responses: 1
972 | D, [2018-08-22 14:34:26 +0400#8459] [M: 47020523644400] DEBUG -- example_spider: Browser: driver.current_memory: 83351
973 | I, [2018-08-22 14:34:26 +0400#8459] [M: 47020523644400]  INFO -- example_spider: > Scraping...
974 | I, [2018-08-22 14:34:26 +0400#8459] [M: 47020523644400]  INFO -- example_spider: Browser: driver selenium_chrome has been destroyed
975 | 
976 | >>> run info: {:spider_name=>"example_spider", :status=>:failed, :environment=>"development", :start_time=>2018-08-22 14:34:24 +0400, :stop_time=>2018-08-22 14:34:26 +0400, :running_time=>2.01, :visits=>{:requests=>1, :responses=>1}, :error=>"#<NoMethodError: undefined method `strip' for nil:NilClass>"}
977 | 
978 | F, [2018-08-22 14:34:26 +0400#8459] [M: 47020523644400] FATAL -- example_spider: Spider: stopped: {:spider_name=>"example_spider", :status=>:failed, :environment=>"development", :start_time=>2018-08-22 14:34:24 +0400, :stop_time=>2018-08-22 14:34:26 +0400, :running_time=>"2s", :visits=>{:requests=>1, :responses=>1}, :error=>"#<NoMethodError: undefined method `strip' for nil:NilClass>"}
979 | Traceback (most recent call last):
980 |         6: from example_spider.rb:19:in `<main>'
981 |         5: from /home/victor/code/kimurai/lib/kimurai/base.rb:127:in `crawl!'
982 |         4: from /home/victor/code/kimurai/lib/kimurai/base.rb:127:in `each'
983 |         3: from /home/victor/code/kimurai/lib/kimurai/base.rb:128:in `block in crawl!'
984 |         2: from /home/victor/code/kimurai/lib/kimurai/base.rb:185:in `request_to'
985 |         1: from /home/victor/code/kimurai/lib/kimurai/base.rb:185:in `public_send'
986 | example_spider.rb:15:in `parse': undefined method `strip' for nil:NilClass (NoMethodError)
987 | ```
988 | 
989 | 
990 | **Usage example:** if a spider finishes successfully, send a JSON file with the scraped items to a remote FTP location; otherwise (if the spider failed), skip the incomplete results and send an email/notification to Slack about it:
991 | 
992 | 
993 | Example
994 | 
995 | You can also use the additional helper methods `completed?` and `failed?`:
996 | 
997 | ```ruby
998 | class Spider < Kimurai::Base
999 |   @engine = :selenium_chrome
1000 |   @start_urls = ["https://example.com/"]
1001 | 
1002 |   def self.close_spider
1003 |     if completed?
1004 |       send_file_to_ftp("results.json")
1005 |     else
1006 |       send_error_notification(run_info[:error])
1007 |     end
1008 |   end
1009 | 
1010 |   def self.send_file_to_ftp(file_path)
1011 |     # ...
1012 |   end
1013 | 
1014 |   def self.send_error_notification(error)
1015 |     # ...
1016 |   end
1017 | 
1018 |   # ...
1019 | 
1020 |   def parse_item(response, url:, data: {})
1021 |     item = {}
1022 |     # ...
1023 | 
1024 |     save_to "results.json", item, format: :json
1025 |   end
1026 | end
1027 | ```
1028 | 
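One way to fill in those stubs using only the Ruby standard library; a minimal sketch, where the FTP host, credentials, and the Slack webhook URL are placeholders (not part of Kimurai):

```ruby
require 'net/ftp'
require 'net/http'
require 'json'

class Spider < Kimurai::Base
  # Upload the results file to a remote FTP server.
  # Host and credentials below are placeholders:
  def self.send_file_to_ftp(file_path)
    Net::FTP.open("ftp.example.com", "username", "password") do |ftp|
      ftp.putbinaryfile(file_path)
    end
  end

  # Post the error message to a Slack incoming webhook (placeholder URL):
  def self.send_error_notification(error)
    uri = URI("https://hooks.slack.com/services/YOUR/WEBHOOK/URL")
    payload = { text: "Spider #{name} failed: #{error}" }.to_json
    Net::HTTP.post(uri, payload, "Content-Type" => "application/json")
  end
end
```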
1029 | 
1030 | 
1031 | ### `KIMURAI_ENV`
1032 | Kimurai supports environments; the default is `development`. To set a custom environment, pass the `KIMURAI_ENV` ENV variable before the command: `$ KIMURAI_ENV=production ruby spider.rb`. To access the current environment, use the `Kimurai.env` method.
1033 | 
1034 | Usage example:
1035 | ```ruby
1036 | class Spider < Kimurai::Base
1037 |   @engine = :selenium_chrome
1038 |   @start_urls = ["https://example.com/"]
1039 | 
1040 |   def self.close_spider
1041 |     if failed? && Kimurai.env == "production"
1042 |       send_error_notification(run_info[:error])
1043 |     else
1044 |       # Do nothing
1045 |     end
1046 |   end
1047 | 
1048 |   # ...
1049 | end
1050 | ```
1051 | 
1052 | ### Parallel crawling using `in_parallel`
1053 | Kimurai can process web pages concurrently in one single line: `in_parallel(:parse_product, urls, threads: 3)`, where `:parse_product` is the method to process, `urls` is an array of urls to crawl, and `threads:` is the number of threads:
1054 | 
1055 | ```ruby
1056 | # amazon_spider.rb
1057 | require 'kimurai'
1058 | 
1059 | class AmazonSpider < Kimurai::Base
1060 |   @name = "amazon_spider"
1061 |   @engine = :mechanize
1062 |   @start_urls = ["https://www.amazon.com/"]
1063 | 
1064 |   def parse(response, url:, data: {})
1065 |     browser.fill_in "field-keywords", with: "Web Scraping Books"
1066 |     browser.click_on "Go"
1067 | 
1068 |     # Walk through pagination and collect products urls:
1069 |     urls = []
1070 |     loop do
1071 |       response = browser.current_response
1072 |       response.xpath("//li//a[contains(@class, 's-access-detail-page')]").each do |a|
1073 |         urls << a[:href].sub(/ref=.+/, "")
1074 |       end
1075 | 
1076 |       browser.find(:xpath, "//a[@id='pagnNextLink']", wait: 1).click rescue break
1077 |     end
1078 | 
1079 |     # Process all collected urls concurrently within 3 threads:
1080 |     in_parallel(:parse_book_page, urls, threads: 3)
1081 |   end
1082 | 
1083 |   def parse_book_page(response, url:, data: {})
1084 |     item = {}
1085 | 
1086 |     item[:title] = response.xpath("//h1/span[@id]").text.squish
1087 |     item[:url] = url
1088 |     item[:price] = response.xpath("(//span[contains(@class, 'a-color-price')])[1]").text.squish.presence
1089 |     item[:publisher] = response.xpath("//h2[text()='Product details']/following::b[text()='Publisher:']/following-sibling::text()[1]").text.squish.presence
1090 | 
1091 |     save_to "books.json", item, format: :pretty_json
1092 |   end
1093 | end
1094 | 
1095 | AmazonSpider.crawl!
1096 | ```
1097 | 
1098 | 
1099 | Run: $ ruby amazon_spider.rb 1100 | 1101 | ``` 1102 | I, [2018-08-22 14:48:37 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Spider: started: amazon_spider 1103 | D, [2018-08-22 14:48:37 +0400#13033] [M: 46982297486840] DEBUG -- amazon_spider: BrowserBuilder (mechanize): created browser instance 1104 | I, [2018-08-22 14:48:37 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/ 1105 | I, [2018-08-22 14:48:38 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/ 1106 | I, [2018-08-22 14:48:38 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Info: visits: requests: 1, responses: 1 1107 | 1108 | I, [2018-08-22 14:48:43 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Spider: in_parallel: starting processing 52 urls within 3 threads 1109 | D, [2018-08-22 14:48:43 +0400#13033] [C: 46982320219020] DEBUG -- amazon_spider: BrowserBuilder (mechanize): created browser instance 1110 | I, [2018-08-22 14:48:43 +0400#13033] [C: 46982320219020] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Practical-Web-Scraping-Data-Science/dp/1484235819/ 1111 | D, [2018-08-22 14:48:44 +0400#13033] [C: 46982320189640] DEBUG -- amazon_spider: BrowserBuilder (mechanize): created browser instance 1112 | I, [2018-08-22 14:48:44 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Python-Web-Scraping-Cookbook-scraping/dp/1787285219/ 1113 | D, [2018-08-22 14:48:44 +0400#13033] [C: 46982319187320] DEBUG -- amazon_spider: BrowserBuilder (mechanize): created browser instance 1114 | I, [2018-08-22 14:48:44 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Scraping-Python-Community-Experience-Distilled/dp/1782164367/ 1115 | I, [2018-08-22 14:48:45 +0400#13033] [C: 46982320219020] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/Practical-Web-Scraping-Data-Science/dp/1484235819/ 1116 | I, [2018-08-22 14:48:45 +0400#13033] [C: 46982320219020] INFO -- amazon_spider: Info: visits: requests: 4, responses: 2 1117 | I, [2018-08-22 14:48:45 +0400#13033] [C: 46982320219020] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Web-Scraping-Python-Collecting-Modern/dp/1491910291/ 1118 | I, [2018-08-22 14:48:46 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/Python-Web-Scraping-Cookbook-scraping/dp/1787285219/ 1119 | I, [2018-08-22 14:48:46 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Info: visits: requests: 5, responses: 3 1120 | I, [2018-08-22 14:48:46 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Web-Scraping-Python-Collecting-Modern/dp/1491985577/ 1121 | I, [2018-08-22 14:48:46 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/Scraping-Python-Community-Experience-Distilled/dp/1782164367/ 1122 | I, [2018-08-22 14:48:46 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Info: visits: requests: 6, responses: 4 1123 | I, [2018-08-22 14:48:46 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Web-Scraping-Excel-Effective-Scrapes-ebook/dp/B01CMMJGZ8/ 1124 | 1125 | ... 
1126 | 1127 | I, [2018-08-22 14:49:10 +0400#13033] [C: 46982320219020] INFO -- amazon_spider: Info: visits: requests: 51, responses: 49 1128 | I, [2018-08-22 14:49:10 +0400#13033] [C: 46982320219020] INFO -- amazon_spider: Browser: driver mechanize has been destroyed 1129 | I, [2018-08-22 14:49:11 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/Scraping-Ice-Life-Bill-Rayburn-ebook/dp/B00C0NF1L8/ 1130 | I, [2018-08-22 14:49:11 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Info: visits: requests: 51, responses: 50 1131 | I, [2018-08-22 14:49:11 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Instant-Scraping-Jacob-Ward-2013-07-26/dp/B01FJ1G3G4/ 1132 | I, [2018-08-22 14:49:11 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/Php-architects-Guide-Scraping-Author/dp/B010DTKYY4/ 1133 | I, [2018-08-22 14:49:11 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Info: visits: requests: 52, responses: 51 1134 | I, [2018-08-22 14:49:11 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Ship-Tracking-Maritime-Domain-Awareness/dp/B001J5MTOK/ 1135 | I, [2018-08-22 14:49:12 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/Instant-Scraping-Jacob-Ward-2013-07-26/dp/B01FJ1G3G4/ 1136 | I, [2018-08-22 14:49:12 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Info: visits: requests: 53, responses: 52 1137 | I, [2018-08-22 14:49:12 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Browser: driver mechanize has been destroyed 1138 | I, [2018-08-22 14:49:12 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/Ship-Tracking-Maritime-Domain-Awareness/dp/B001J5MTOK/ 1139 | I, [2018-08-22 14:49:12 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Info: visits: requests: 53, responses: 53 1140 | I, [2018-08-22 14:49:12 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Browser: driver mechanize has been destroyed 1141 | 1142 | I, [2018-08-22 14:49:12 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Spider: in_parallel: stopped processing 52 urls within 3 threads, total time: 29s 1143 | I, [2018-08-22 14:49:12 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Browser: driver mechanize has been destroyed 1144 | 1145 | I, [2018-08-22 14:49:12 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Spider: stopped: {:spider_name=>"amazon_spider", :status=>:completed, :environment=>"development", :start_time=>2018-08-22 14:48:37 +0400, :stop_time=>2018-08-22 14:49:12 +0400, :running_time=>"35s", :visits=>{:requests=>53, :responses=>53}, :error=>nil} 1146 | 1147 | ``` 1148 |
1149 | 1150 |
1151 | books.json 1152 | 1153 | ```json 1154 | [ 1155 | { 1156 | "title": "Web Scraping with Python: Collecting More Data from the Modern Web2nd Edition", 1157 | "url": "https://www.amazon.com/Web-Scraping-Python-Collecting-Modern/dp/1491985577/", 1158 | "price": "$26.94", 1159 | "publisher": "O'Reilly Media; 2 edition (April 14, 2018)", 1160 | "position": 1 1161 | }, 1162 | { 1163 | "title": "Python Web Scraping Cookbook: Over 90 proven recipes to get you scraping with Python, micro services, Docker and AWS", 1164 | "url": "https://www.amazon.com/Python-Web-Scraping-Cookbook-scraping/dp/1787285219/", 1165 | "price": "$39.99", 1166 | "publisher": "Packt Publishing - ebooks Account (February 9, 2018)", 1167 | "position": 2 1168 | }, 1169 | { 1170 | "title": "Web Scraping with Python: Collecting Data from the Modern Web1st Edition", 1171 | "url": "https://www.amazon.com/Web-Scraping-Python-Collecting-Modern/dp/1491910291/", 1172 | "price": "$15.75", 1173 | "publisher": "O'Reilly Media; 1 edition (July 24, 2015)", 1174 | "position": 3 1175 | }, 1176 | 1177 | ... 1178 | 1179 | { 1180 | "title": "Instant Web Scraping with Java by Ryan Mitchell (2013-08-26)", 1181 | "url": "https://www.amazon.com/Instant-Scraping-Java-Mitchell-2013-08-26/dp/B01FEM76X2/", 1182 | "price": "$35.82", 1183 | "publisher": "Packt Publishing (2013-08-26) (1896)", 1184 | "position": 52 1185 | } 1186 | ] 1187 | ``` 1188 |
1189 | 
1190 | > Note that the [save_to](#save_to-helper) and [unique?](#skip-duplicates-unique-helper) helpers are thread-safe (protected by a [Mutex](https://ruby-doc.org/core-2.5.1/Mutex.html)) and can be freely used inside threads.
1191 | 
1192 | `in_parallel` can take additional options:
1193 | * `data:` pass a custom data hash along with the urls: `in_parallel(:method, urls, threads: 3, data: { category: "Scraping" })`
1194 | * `delay:` set a delay between requests: `in_parallel(:method, urls, threads: 3, delay: 2)`. Delay can be an `Integer`, `Float` or `Range` (`2..5`). In case of a range, the delay will be chosen randomly for each request: `rand(2..5) # => 3`
1195 | * `engine:` set a custom engine instead of the default one: `in_parallel(:method, urls, threads: 3, engine: :poltergeist_phantomjs)`
1196 | * `config:` pass custom config options (see [config section](#crawler-config))
1197 | 
1198 | ### Active Support included
1199 | 
1200 | You can use all the power of familiar [Rails core-ext methods](https://guides.rubyonrails.org/active_support_core_extensions.html#loading-all-core-extensions) for scraping inside Kimurai. Especially take a look at [squish](https://apidock.com/rails/String/squish), [truncate_words](https://apidock.com/rails/String/truncate_words), [titleize](https://apidock.com/rails/String/titleize), [remove](https://apidock.com/rails/String/remove), [present?](https://guides.rubyonrails.org/active_support_core_extensions.html#blank-questionmark-and-present-questionmark) and [presence](https://guides.rubyonrails.org/active_support_core_extensions.html#presence).
1201 | 
1202 | ### Schedule spiders using Cron
1203 | 
1204 | 1) Inside the spider directory, generate a [Whenever](https://github.com/javan/whenever) config: `$ kimurai generate schedule`.
1205 | 
1206 | 
1207 | schedule.rb
1208 | 
1209 | ```ruby
1210 | ### Settings ###
1211 | require 'tzinfo'
1212 | 
1213 | # Export current PATH to the cron
1214 | env :PATH, ENV["PATH"]
1215 | 
1216 | # Use 24 hour format when using `at:` option
1217 | set :chronic_options, hours24: true
1218 | 
1219 | # Use the local_to_utc helper to set execution times in your local timezone instead
1220 | # of the server's timezone (which probably is, and should be, UTC; to check, run `$ timedatectl`).
1221 | # You may also want to set the same timezone in Kimurai (use `Kimurai.configuration.time_zone =` for that),
1222 | # to have spider logs in a specific time zone format.
1223 | # Example usage of the helper:
1224 | # every 1.day, at: local_to_utc("7:00", zone: "Europe/Moscow") do
1225 | #   crawl "google_spider.com", output: "log/google_spider.com.log"
1226 | # end
1227 | def local_to_utc(time_string, zone:)
1228 |   TZInfo::Timezone.get(zone).local_to_utc(Time.parse(time_string))
1229 | end
1230 | 
1231 | # Note: by default Whenever exports cron commands with :environment == "production".
1232 | # Note: Whenever can only append log data to a log file (>>). If you want
1233 | # to overwrite (>) the log file before each run, pass a lambda:
1234 | # crawl "google_spider.com", output: -> { "> log/google_spider.com.log 2>&1" }
1235 | 
1236 | # Project job types
1237 | job_type :crawl,  "cd :path && KIMURAI_ENV=:environment bundle exec kimurai crawl :task :output"
1238 | job_type :runner, "cd :path && KIMURAI_ENV=:environment bundle exec kimurai runner --jobs :task :output"
1239 | 
1240 | # Single file job type
1241 | job_type :single, "cd :path && KIMURAI_ENV=:environment ruby :task :output"
1242 | # Single with bundle exec
1243 | job_type :single_bundle, "cd :path && KIMURAI_ENV=:environment bundle exec ruby :task :output"
1244 | 
1245 | ### Schedule ###
1246 | # Usage (check examples here https://github.com/javan/whenever#example-schedulerb-file):
1247 | # every 1.day do
1248 | #   Example to schedule a single spider in the project:
1249 | #   crawl "google_spider.com", output: "log/google_spider.com.log"
1250 | 
1251 | #   Example to schedule all spiders in the project using the runner. Each spider will write
1252 | #   its own output to a `log/spider_name.log` file (handled by the runner itself).
1253 | #   Runner output will be written to the log/runner.log file.
1254 | #   The numeric argument is the count of concurrent jobs:
1255 | #   runner 3, output: "log/runner.log"
1256 | 
1257 | #   Example to schedule a single spider (without a project):
1258 | #   single "single_spider.rb", output: "single_spider.log"
1259 | # end
1260 | 
1261 | ### How to set a cron schedule ###
1262 | # Run: `$ whenever --update-crontab --load-file config/schedule.rb`.
1263 | # If you don't have the whenever command, install the gem: `$ gem install whenever`.
1264 | 
1265 | ### How to cancel a schedule ###
1266 | # Run: `$ whenever --clear-crontab --load-file config/schedule.rb`.
1267 | ```
1268 | 
1269 | 
1270 | 2) Add the following code at the bottom of `schedule.rb`:
1271 | 
1272 | ```ruby
1273 | every 1.day, at: "7:00" do
1274 |   single "example_spider.rb", output: "example_spider.log"
1275 | end
1276 | ```
1277 | 
1278 | 3) Run: `$ whenever --update-crontab --load-file schedule.rb`. Done!
1279 | 
1280 | You can check Whenever examples [here](https://github.com/javan/whenever#example-schedulerb-file). To cancel the schedule, run: `$ whenever --clear-crontab --load-file schedule.rb`.
1281 | 
1282 | ### Configuration options
1283 | You can configure several options using a `configure` block:
1284 | 
1285 | ```ruby
1286 | Kimurai.configure do |config|
1287 |   # The default logger has colored mode in development.
1288 |   # If you would like to disable it, set `colorize_logger` to false.
1289 |   # config.colorize_logger = false
1290 | 
1291 |   # Logger level for the default logger:
1292 |   # config.log_level = :info
1293 | 
1294 |   # Custom logger:
1295 |   # config.logger = Logger.new(STDOUT)
1296 | 
1297 |   # Custom time zone (for logs):
1298 |   # config.time_zone = "UTC"
1299 |   # config.time_zone = "Europe/Moscow"
1300 | 
1301 |   # Provide a custom chrome binary path (default is any available chrome/chromium in the PATH):
1302 |   # config.selenium_chrome_path = "/usr/bin/chromium-browser"
1303 |   # Provide a custom selenium chromedriver path (default is "/usr/local/bin/chromedriver"):
1304 |   # config.chromedriver_path = "~/.local/bin/chromedriver"
1305 | end
1306 | ```
1307 | 
1308 | ### Using Kimurai inside existing Ruby application
1309 | 
1310 | You can integrate Kimurai spiders (which are just Ruby classes) into an existing Ruby application like Rails or Sinatra, and run them using background jobs, for example. Check the following to understand how spiders run:
1311 | 
1312 | #### `.crawl!` method
1313 | 
1314 | `.crawl!` (class method) performs a _full run_ of a particular spider. It returns `run_info` if the run was successful, or raises an exception if something went wrong.
1315 | 
1316 | ```ruby
1317 | class ExampleSpider < Kimurai::Base
1318 |   @name = "example_spider"
1319 |   @engine = :mechanize
1320 |   @start_urls = ["https://example.com/"]
1321 | 
1322 |   def parse(response, url:, data: {})
1323 |     title = response.xpath("//title").text.squish
1324 |   end
1325 | end
1326 | 
1327 | ExampleSpider.crawl!
1328 | # => { :spider_name => "example_spider", :status => :completed, :environment => "development", :start_time => 2018-08-22 18:20:16 +0400, :stop_time => 2018-08-22 18:20:17 +0400, :running_time => 1.216, :visits => { :requests => 1, :responses => 1 }, :items => { :sent => 0, :processed => 0 }, :error => nil }
1329 | ```
1330 | 
1331 | You can't `.crawl!` a spider in a different thread while it's still running (spider instances store shared data in the `@run_info` class variable while crawling):
1332 | 
1333 | ```ruby
1334 | 2.times do |i|
1335 |   Thread.new { p i, ExampleSpider.crawl! }
1336 | end # =>
1337 | 
1338 | # 1
1339 | # false
1340 | 
1341 | # 0
1342 | # {:spider_name=>"example_spider", :status=>:completed, :environment=>"development", :start_time=>2018-08-22 18:49:22 +0400, :stop_time=>2018-08-22 18:49:23 +0400, :running_time=>0.801, :visits=>{:requests=>1, :responses=>1}, :items=>{:sent=>0, :processed=>0}, :error=>nil}
1343 | ```
1344 | 
1345 | So what if you don't care about stats and just want to send a request to a particular spider method and get that method's return value?
Use `.parse!` instead:
1346 | 
1347 | #### `.parse!(:method_name, url:)` method
1348 | 
1349 | `.parse!` (class method) creates a new spider instance and performs a request to the given method with the given url. The method's return value is passed back:
1350 | 
1351 | ```ruby
1352 | class ExampleSpider < Kimurai::Base
1353 |   @name = "example_spider"
1354 |   @engine = :mechanize
1355 |   @start_urls = ["https://example.com/"]
1356 | 
1357 |   def parse(response, url:, data: {})
1358 |     title = response.xpath("//title").text.squish
1359 |   end
1360 | end
1361 | 
1362 | ExampleSpider.parse!(:parse, url: "https://example.com/")
1363 | # => "Example Domain"
1364 | ```
1365 | 
1366 | Like `.crawl!`, `.parse!` takes care of the browser instance and kills it (`browser.destroy_driver!`) before returning the value. Unlike `.crawl!`, `.parse!` can be called from different threads at the same time:
1367 | 
1368 | ```ruby
1369 | urls = ["https://www.google.com/", "https://www.reddit.com/", "https://en.wikipedia.org/"]
1370 | 
1371 | urls.each do |url|
1372 |   Thread.new { p ExampleSpider.parse!(:parse, url: url) }
1373 | end # =>
1374 | 
1375 | # "Google"
1376 | # "Wikipedia, the free encyclopedia"
1377 | # "reddit: the front page of the internetHotHot"
1378 | ```
1379 | 
1380 | Keep in mind that the [save_to](#save_to-helper) and [unique?](#skip-duplicates) helpers are not thread-safe when using the `.parse!` method.
1381 | 
1382 | #### `Kimurai.list` and `Kimurai.find_by_name()`
1383 | 
1384 | ```ruby
1385 | class GoogleSpider < Kimurai::Base
1386 |   @name = "google_spider"
1387 | end
1388 | 
1389 | class RedditSpider < Kimurai::Base
1390 |   @name = "reddit_spider"
1391 | end
1392 | 
1393 | class WikipediaSpider < Kimurai::Base
1394 |   @name = "wikipedia_spider"
1395 | end
1396 | 
1397 | # To get the list of all available spider classes:
1398 | Kimurai.list
1399 | # => {"google_spider"=>GoogleSpider, "reddit_spider"=>RedditSpider, "wikipedia_spider"=>WikipediaSpider}
1400 | 
1401 | # To find a particular spider class by its name:
1402 | Kimurai.find_by_name("reddit_spider")
1403 | # => RedditSpider
1404 | ```
1405 | 
1406 | 
1407 | ### Automated server setup and deployment
1408 | > **EXPERIMENTAL**
1409 | 
1410 | #### Setup
1411 | You can automatically set up the [required environment](#installation) for Kimurai on a remote server (currently only Ubuntu Server 18.04 is supported) using the `$ kimurai setup` command. `setup` installs: the latest Ruby with Rbenv, browsers with webdrivers, and, in addition, database clients (clients only) for MySQL, Postgres and MongoDB (so you can connect to a remote database from Ruby).
1412 | 
1413 | > To perform a remote server setup, [Ansible](https://github.com/ansible/ansible) is required **on the desktop** machine (to install on Ubuntu: `$ sudo apt install ansible`; on Mac OS X: `$ brew install ansible`)
1414 | 
1415 | > It's recommended to use a regular user to set up the server, not `root`. To create a new user, log in to the server with `$ ssh root@your_server_ip`, type `$ adduser username` to create the user, and `$ gpasswd -a username sudo` to add the new user to the sudo group.
1416 | 
1417 | Example:
1418 | 
1419 | ```bash
1420 | $ kimurai setup deploy@123.123.123.123 --ask-sudo --ssh-key-path path/to/private_key
1421 | ```
1422 | 
1423 | CLI options:
1424 | * `--ask-sudo` pass this option to prompt for the sudo (user) password, needed for system-wide installation of packages (`apt install`)
1425 | * `--ssh-key-path path/to/private_key` authorize on the server using a private ssh key.
You can omit it if the required key is already [added to the keychain](https://help.github.com/articles/generating-a-new-ssh-key-and-adding-it-to-the-ssh-agent/#adding-your-ssh-key-to-the-ssh-agent) on your desktop (Ansible uses [SSH agent forwarding](https://developer.github.com/v3/guides/using-ssh-agent-forwarding/))
1426 | * `--ask-auth-pass` authorize on the server using a user password; an alternative to `--ssh-key-path`.
1427 | * `-p port_number` custom port for the ssh connection (`-p 2222`)
1428 | 
1429 | > You can check the setup playbook [here](lib/kimurai/automation/setup.yml)
1430 | 
1431 | #### Deploy
1432 | 
1433 | After a successful `setup`, you can deploy a spider to the remote server using the `$ kimurai deploy` command. Each deploy performs several tasks: 1) pull the repo from the remote origin to the `~/repo_name` user directory, 2) run `bundle install`, 3) update the crontab with `whenever --update-crontab` (to refresh the spider schedule from the schedule.rb file).
1434 | 
1435 | Before `deploy`, make sure the spider directory contains: 1) a git repository with a remote origin (Bitbucket, GitHub, etc.), 2) a `Gemfile`, 3) schedule.rb inside the `config` subfolder (`config/schedule.rb`).
1436 | 
1437 | Example:
1438 | 
1439 | ```bash
1440 | $ kimurai deploy deploy@123.123.123.123 --ssh-key-path path/to/private_key --repo-key-path path/to/repo_private_key
1441 | ```
1442 | 
1443 | CLI options: _same as for the [setup](#setup) command_ (except `--ask-sudo`), plus
1444 | * `--repo-url` provide a custom repo url (`--repo-url git@bitbucket.org:username/repo_name.git`), otherwise the current `origin/master` will be taken (the output of `$ git remote get-url origin`)
1445 | * `--repo-key-path` if the git repository is private, authorization is required to pull the code on the remote server. Use this option to provide a private repository SSH key. You can omit it if the required key is already added to the keychain on your desktop (same as with the `--ssh-key-path` option)
1446 | 
1447 | > You can check the deploy playbook [here](lib/kimurai/automation/deploy.yml)
1448 | 
1449 | ## Spider `@config`
1450 | 
1451 | Using `@config` you can set several options for a spider: a proxy, user-agent, default cookies/headers, delay between requests, browser **memory control**, and so on:
1452 | 
1453 | ```ruby
1454 | class Spider < Kimurai::Base
1455 |   USER_AGENTS = ["Chrome", "Firefox", "Safari", "Opera"]
1456 |   PROXIES = ["2.3.4.5:8080:http:username:password", "3.4.5.6:3128:http", "1.2.3.4:3000:socks5"]
1457 | 
1458 |   @engine = :poltergeist_phantomjs
1459 |   @start_urls = ["https://example.com/"]
1460 |   @config = {
1461 |     headers: { "custom_header" => "custom_value" },
1462 |     cookies: [{ name: "cookie_name", value: "cookie_value", domain: ".example.com" }],
1463 |     user_agent: -> { USER_AGENTS.sample },
1464 |     proxy: -> { PROXIES.sample },
1465 |     window_size: [1366, 768],
1466 |     disable_images: true,
1467 |     restart_if: {
1468 |       # Restart browser if the provided memory limit (in kilobytes) is exceeded:
1469 |       memory_limit: 350_000
1470 |     },
1471 |     before_request: {
1472 |       # Change user agent before each request:
1473 |       change_user_agent: true,
1474 |       # Change proxy before each request:
1475 |       change_proxy: true,
1476 |       # Clear all cookies and set default cookies (if provided) before each request:
1477 |       clear_and_set_cookies: true,
1478 |       # Process delay before each request:
1479 |       delay: 1..3
1480 |     }
1481 |   }
1482 | 
1483 |   def parse(response, url:, data: {})
1484 |     # ...
1485 |   end
1486 | end
1487 | ```
1488 | 
1489 | ### All available `@config` options
1490 | 
1491 | ```ruby
1492 | @config = {
1493 |   # Custom headers, format: hash. Example: { "some header" => "some value", "another header" => "another value" }
1494 |   # Works only for :mechanize and :poltergeist_phantomjs engines (Selenium doesn't allow setting/getting headers)
1495 |   headers: {},
1496 | 
1497 |   # Custom User Agent, format: string or lambda.
1498 |   # Use a lambda if you want to rotate user agents before each run:
1499 |   # user_agent: -> { ARRAY_OF_USER_AGENTS.sample }
1500 |   # Works for all engines
1501 |   user_agent: "Mozilla/5.0 Firefox/61.0",
1502 | 
1503 |   # Custom cookies, format: array of hashes.
1504 |   # Format for a single cookie: { name: "cookie name", value: "cookie value", domain: ".example.com" }
1505 |   # Works for all engines
1506 |   cookies: [],
1507 | 
1508 |   # Proxy, format: string or lambda. Format of a proxy string: "ip:port:protocol:user:password"
1509 |   # `protocol` can be http or socks5. User and password are optional.
1510 |   # Use a lambda if you want to rotate proxies before each run:
1511 |   # proxy: -> { ARRAY_OF_PROXIES.sample }
1512 |   # Works for all engines, but keep in mind that Selenium drivers don't support proxies
1513 |   # with authorization. Also, Mechanize doesn't support the socks5 proxy format (only http)
1514 |   proxy: "3.4.5.6:3128:http:user:pass",
1515 | 
1516 |   # If enabled, the browser will ignore any https errors. It's handy while using a proxy
1517 |   # with a self-signed SSL cert (for example Crawlera or Mitmproxy).
1518 |   # It also allows visiting webpages with an expired SSL certificate.
1519 |   # Works for all engines
1520 |   ignore_ssl_errors: true,
1521 | 
1522 |   # Custom window size, works for all engines
1523 |   window_size: [1366, 768],
1524 | 
1525 |   # Skip downloading images if true, works for all engines
1526 |   disable_images: true,
1527 | 
1528 |   # Selenium engines only: headless mode, `:native` or `:virtual_display` (default is :native)
1529 |   # Although native mode has better performance, virtual display mode
1530 |   # can sometimes be useful. For example, some websites can detect (and block)
1531 |   # headless chrome, so you can use virtual_display mode instead
1532 |   headless_mode: :native,
1533 | 
1534 |   # This option tells the browser not to use a proxy for the provided list of domains or IP addresses.
1535 |   # Format: array of strings. Works only for :selenium_firefox and :selenium_chrome
1536 |   proxy_bypass_list: [],
1537 | 
1538 |   # Option to provide a custom SSL certificate. Works only for :poltergeist_phantomjs and :mechanize
1539 |   ssl_cert_path: "path/to/ssl_cert",
1540 | 
1541 |   # Inject some JavaScript code into the browser.
1542 |   # Format: array of strings, where each string is a path to a JS file.
1543 |   # Works only for the poltergeist_phantomjs engine (Selenium doesn't support JS code injection)
1544 |   extensions: ["lib/code_to_inject.js"],
1545 | 
1546 |   # Automatically skip duplicated (already visited) urls when using the `request_to` method.
1547 |   # Possible values: `true` or a hash with options.
1548 |   # In case of `true`, all visited urls will be added to the storage's scope `:requests_urls`,
1549 |   # and if a url is already contained in this scope, the request will be skipped.
1550 |   # You can configure this setting by providing additional options as a hash:
1551 |   # `skip_duplicate_requests: { scope: :custom_scope, check_only: true }`, where:
1552 |   # `scope:` - use a custom scope instead of `:requests_urls`
1553 |   # `check_only:` - if true, the scope will only be checked for the url; the url will not
1554 |   # be added to the scope.
1555 |   # Works for all engines
1556 |   skip_duplicate_requests: true,
1557 | 
1558 |   # Automatically skip the provided errors while requesting a page.
1559 |   # If a raised error matches one of the errors in the list, then the error will be caught,
1560 |   # and the request will be skipped.
1561 |   # It is a good idea to skip errors like NotFound(404), etc.
1562 |   # Format: array whose elements are error classes or/and hashes. You can use the hash format
1563 |   # for more flexibility: `{ error: "RuntimeError", message: "404 => Net::HTTPNotFound" }`.
1564 |   # The provided `message:` will be compared with the full error message using `String#include?`. Also,
1565 |   # you can use a regex instead: `{ error: "RuntimeError", message: /404|403/ }`.
1566 |   skip_request_errors: [{ error: RuntimeError, message: "404 => Net::HTTPNotFound" }],
1567 | 
1568 |   # Automatically retry the provided errors with a few attempts while requesting a page.
1569 |   # If a raised error matches one of the errors in the list, then the error will be caught
1570 |   # and the request will be processed again after a delay. There are 3 attempts:
1571 |   # first: delay 15 sec, second: delay 30 sec, third: delay 45 sec.
1572 |   # If there is still an exception after 3 attempts, then the exception will be raised.
1573 |   # It is a good idea to retry errors like `ReadTimeout`, `HTTPBadGateway`, etc.
1574 |   # Format: same as for the `skip_request_errors` option.
1575 |   retry_request_errors: [Net::ReadTimeout],
1576 | 
1577 |   # Handle page encoding while parsing an html response using Nokogiri. There are two modes:
1578 |   # auto (`:auto`): try to fetch the correct encoding from `<meta>` or `<xml>` tags;
1579 |   # or set the required encoding manually, example: `encoding: "GB2312"`.
1580 |   # By default this option is unset.
1581 |   encoding: nil,
1582 | 
1583 |   # Restart the browser if one of the options is true:
1584 |   restart_if: {
1585 |     # Restart the browser if the provided memory limit (in kilobytes) is exceeded (works for all engines)
1586 |     memory_limit: 350_000,
1587 | 
1588 |     # Restart the browser if the provided requests limit is exceeded (works for all engines)
1589 |     requests_limit: 100
1590 |   },
1591 | 
1592 |   # Perform several actions before each request:
1593 |   before_request: {
1594 |     # Change the proxy before each request. The `proxy:` option above should be present
1595 |     # and have lambda format. Works only for poltergeist and mechanize engines
1596 |     # (Selenium doesn't support proxy rotation).
1597 |     change_proxy: true,
1598 | 
1599 |     # Change the user agent before each request. The `user_agent:` option above should be present
1600 |     # and have lambda format. Works only for poltergeist and mechanize engines
1601 |     # (Selenium doesn't support getting/setting headers).
1602 |     change_user_agent: true,
1603 | 
1604 |     # Clear all cookies before each request, works for all engines
1605 |     clear_cookies: true,
1606 | 
1607 |     # If you want to clear all cookies + set custom cookies (the `cookies:` option above should be present),
1608 |     # use this option instead (works for all engines)
1609 |     clear_and_set_cookies: true,
1610 | 
1611 |     # Global option to set a delay between requests.
1612 |     # Delay can be an `Integer`, `Float` or `Range` (`2..5`). In case of a range,
1613 |     # the delay will be chosen randomly for each request: `rand(2..5) # => 3`
1614 |     delay: 1..3
1615 |   }
1616 | }
1617 | ```
1618 | 
1619 | As you can see, most of the options are universal for any engine.
1620 | 
1621 | ### `@config` settings inheritance
1622 | Settings can be inherited:
1623 | 
1624 | ```ruby
1625 | class ApplicationSpider < Kimurai::Base
1626 |   @engine = :poltergeist_phantomjs
1627 |   @config = {
1628 |     user_agent: "Firefox",
1629 |     disable_images: true,
1630 |     restart_if: { memory_limit: 350_000 },
1631 |     before_request: { delay: 1..2 }
1632 |   }
1633 | end
1634 | 
1635 | class CustomSpider < ApplicationSpider
1636 |   @name = "custom_spider"
1637 |   @start_urls = ["https://example.com/"]
1638 |   @config = {
1639 |     before_request: { delay: 4..6 }
1640 |   }
1641 | 
1642 |   def parse(response, url:, data: {})
1643 |     # ...
1644 |   end
1645 | end
1646 | ```
1647 | 
1648 | Here, the `@config` of `CustomSpider` will be _[deep merged](https://apidock.com/rails/Hash/deep_merge)_ with the `ApplicationSpider` config, so `CustomSpider` will keep all inherited options, with only `delay` updated.
1649 | 
1650 | ## Project mode
1651 | 
1652 | Kimurai can work in project mode ([like Scrapy](https://doc.scrapy.org/en/latest/intro/tutorial.html#creating-a-project)). To generate a new project, run: `$ kimurai generate project web_spiders` (where `web_spiders` is the name of the project).
1653 | 
1654 | Structure of the project:
1655 | 
1656 | ```bash
1657 | .
1658 | ├── config/
1659 | │   ├── initializers/
1660 | │   ├── application.rb
1661 | │   ├── automation.yml
1662 | │   ├── boot.rb
1663 | │   └── schedule.rb
1664 | ├── spiders/
1665 | │   └── application_spider.rb
1666 | ├── db/
1667 | ├── helpers/
1668 | │   └── application_helper.rb
1669 | ├── lib/
1670 | ├── log/
1671 | ├── pipelines/
1672 | │   ├── validator.rb
1673 | │   └── saver.rb
1674 | ├── tmp/
1675 | ├── .env
1676 | ├── Gemfile
1677 | ├── Gemfile.lock
1678 | └── README.md
1679 | ```
1680 | 
1681 | 
1682 | Description
1683 | 
1684 | * `config/` folder for configuration files
1685 | * `config/initializers` [Rails-like initializers](https://guides.rubyonrails.org/configuring.html#using-initializer-files) to load custom code at framework start
1686 | * `config/application.rb` configuration settings for Kimurai (the `Kimurai.configure do` block)
1687 | * `config/automation.yml` settings for [setup and deploy](#automated-server-setup-and-deployment)
1688 | * `config/boot.rb` loads the framework and project
1689 | * `config/schedule.rb` Cron [schedule for spiders](#schedule-spiders-using-cron)
1690 | * `spiders/` folder for spiders
1691 | * `spiders/application_spider.rb` base parent class for all spiders
1692 | * `db/` store all database files here (`sqlite`, `json`, `csv`, etc.)
1693 | * `helpers/` Rails-like helpers for spiders
1694 | * `helpers/application_helper.rb` all methods inside the ApplicationHelper module will be available to all spiders (see the sketch after this list)
1695 | * `lib/` put custom Ruby code here
1696 | * `log/` folder for logs
1697 | * `pipelines/` folder for [Scrapy-like](https://doc.scrapy.org/en/latest/topics/item-pipeline.html) pipelines. One file = one pipeline
1698 | * `pipelines/validator.rb` example pipeline to validate an item
1699 | * `pipelines/saver.rb` example pipeline to save an item
1700 | * `tmp/` folder for temporary files
1701 | * `.env` file to store ENV variables for the project, loaded by [Dotenv](https://github.com/bkeepers/dotenv)
1702 | * `Gemfile` dependency file
1703 | * `README.md` example project readme
1704 | 
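For instance, a project-wide helper might look like the sketch below. The `clean_price` method is hypothetical (not part of the generated template); the point is that any method defined in `ApplicationHelper` becomes available inside every project spider:

```ruby
# helpers/application_helper.rb
module ApplicationHelper
  # Hypothetical example helper: normalize a scraped price string
  # like "$1,299.00" into a Float (1299.0). String#remove comes
  # from Active Support, which Kimurai already loads:
  def clean_price(string)
    string.remove("$", ",").to_f
  end
end
```

Inside any spider's parse method you could then write something like `item[:price] = clean_price(response.xpath("//span[@class='price']").text)`.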
1705 | 
1706 | 
1707 | ### Generate new spider
1708 | To generate a new spider in the project, run:
1709 | 
1710 | ```bash
1711 | $ kimurai generate spider example_spider
1712 |       create  spiders/example_spider.rb
1713 | ```
1714 | 
1715 | The command will generate a new spider class inherited from `ApplicationSpider`:
1716 | 
1717 | ```ruby
1718 | class ExampleSpider < ApplicationSpider
1719 |   @name = "example_spider"
1720 |   @start_urls = []
1721 |   @config = {}
1722 | 
1723 |   def parse(response, url:, data: {})
1724 |   end
1725 | end
1726 | ```
1727 | 
1728 | ### Crawl
1729 | To run a particular spider in the project, run: `$ bundle exec kimurai crawl example_spider`. Don't forget to add `bundle exec` before the command to load the required environment.
1730 | 
1731 | ### List
1732 | To list all project spiders, run: `$ bundle exec kimurai list`
1733 | 
1734 | ### Parse
1735 | For project spiders you can use the `$ kimurai parse` command, which helps to debug spiders:
1736 | 
1737 | ```bash
1738 | $ bundle exec kimurai parse example_spider parse_product --url https://example-shop.com/product-1
1739 | ```
1740 | 
1741 | where `example_spider` is the spider to run, `parse_product` is the spider method to process, and `--url` is the url to open inside the processing method.
1742 | 
1743 | ### Pipelines, `send_item` method
1744 | You can use item pipelines to organize item-processing logic for all project spiders in one place (also check Scrapy's [description of pipelines](https://doc.scrapy.org/en/latest/topics/item-pipeline.html#item-pipeline)).
1745 | 
1746 | Imagine you have three spiders, each crawling a different e-commerce shop and saving only shoe listings. For each spider, you want to save only items with the "shoe" category, a unique sku, a valid title/price, and existing images. To avoid code duplication between spiders, use pipelines:
1747 | 
1748 | 
1749 | Example
1750 | 
1751 | pipelines/validator.rb
1752 | ```ruby
1753 | class Validator < Kimurai::Pipeline
1754 |   def process_item(item, options: {})
1755 |     # Here you can validate the item and raise `DropItemError`
1756 |     # if one of the validations fails. Examples:
1757 | 
1758 |     # Drop the item if its category is not "shoe":
1759 |     if item[:category] != "shoe"
1760 |       raise DropItemError, "Wrong item category"
1761 |     end
1762 | 
1763 |     # Check the item sku for uniqueness using the built-in unique? helper:
1764 |     unless unique?(:sku, item[:sku])
1765 |       raise DropItemError, "Item sku is not unique"
1766 |     end
1767 | 
1768 |     # Drop the item if its title is shorter than 5 characters:
1769 |     if item[:title].size < 5
1770 |       raise DropItemError, "Item title is short"
1771 |     end
1772 | 
1773 |     # Drop the item if the price is not present:
1774 |     unless item[:price].present?
1775 |       raise DropItemError, "Item price is not present"
1776 |     end
1777 | 
1778 |     # Drop the item if it doesn't contain any images:
1779 |     unless item[:images].present?
1780 |       raise DropItemError, "Item images are not present"
1781 |     end
1782 | 
1783 |     # Pass the item to the next pipeline (if it wasn't dropped):
1784 |     item
1785 |   end
1786 | end
1787 | ```
1788 | 
1789 | 
1790 | pipelines/saver.rb
1791 | ```ruby
1792 | class Saver < Kimurai::Pipeline
1793 |   def process_item(item, options: {})
1794 |     # Here you can save the item to a database, send it to a remote API, or
1795 |     # simply save it to a file format using the `save_to` helper:
1796 | 
1797 |     # To get the name of the current spider: `spider.class.name`
1798 |     save_to "db/#{spider.class.name}.json", item, format: :json
1799 | 
1800 |     item
1801 |   end
1802 | end
1803 | ```
1804 | 
1805 | spiders/application_spider.rb
1806 | ```ruby
1807 | class ApplicationSpider < Kimurai::Base
1808 |   @engine = :selenium_chrome
1809 |   # Define pipelines (in order) for all spiders:
1810 |   @pipelines = [:validator, :saver]
1811 | end
1812 | ```
1813 | 
1814 | spiders/shop_spider_1.rb
1815 | ```ruby
1816 | class ShopSpiderOne < ApplicationSpider
1817 |   @name = "shop_spider_1"
1818 |   @start_urls = ["https://shop-1.com"]
1819 | 
1820 |   # ...
1821 | 
1822 |   def parse_product(response, url:, data: {})
1823 |     # ...
1824 | 
1825 |     # Send the item to the pipelines:
1826 |     send_item item
1827 |   end
1828 | end
1829 | ```
1830 | 
1831 | spiders/shop_spider_2.rb
1832 | ```ruby
1833 | class ShopSpiderTwo < ApplicationSpider
1834 |   @name = "shop_spider_2"
1835 |   @start_urls = ["https://shop-2.com"]
1836 | 
1837 |   def parse_product(response, url:, data: {})
1838 |     # ...
1839 | 
1840 |     # Send the item to the pipelines:
1841 |     send_item item
1842 |   end
1843 | end
1844 | ```
1845 | 
1846 | spiders/shop_spider_3.rb
1847 | ```ruby
1848 | class ShopSpiderThree < ApplicationSpider
1849 |   @name = "shop_spider_3"
1850 |   @start_urls = ["https://shop-3.com"]
1851 | 
1852 |   def parse_product(response, url:, data: {})
1853 |     # ...
1854 | 
1855 |     # Send the item to the pipelines:
1856 |     send_item item
1857 |   end
1858 | end
1859 | ```
1860 | 
1861 | 
1862 | Once you start using pipelines, item stats appear as well:
1863 | 
1864 | 
1865 | Example 1866 | 1867 | pipelines/validator.rb 1868 | ```ruby 1869 | class Validator < Kimurai::Pipeline 1870 | def process_item(item, options: {}) 1871 | if item[:star_count] < 10 1872 | raise DropItemError, "Repository doesn't have enough stars" 1873 | end 1874 | 1875 | item 1876 | end 1877 | end 1878 | ``` 1879 | 1880 | spiders/github_spider.rb 1881 | ```ruby 1882 | class GithubSpider < ApplicationSpider 1883 | @name = "github_spider" 1884 | @engine = :selenium_chrome 1885 | @pipelines = [:validator] 1886 | @start_urls = ["https://github.com/search?q=Ruby%20Web%20Scraping"] 1887 | @config = { 1888 | user_agent: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36", 1889 | before_request: { delay: 4..7 } 1890 | } 1891 | 1892 | def parse(response, url:, data: {}) 1893 | response.xpath("//ul[@class='repo-list']/div//h3/a").each do |a| 1894 | request_to :parse_repo_page, url: absolute_url(a[:href], base: url) 1895 | end 1896 | 1897 | if next_page = response.at_xpath("//a[@class='next_page']") 1898 | request_to :parse, url: absolute_url(next_page[:href], base: url) 1899 | end 1900 | end 1901 | 1902 | def parse_repo_page(response, url:, data: {}) 1903 | item = {} 1904 | 1905 | item[:owner] = response.xpath("//h1//a[@rel='author']").text 1906 | item[:repo_name] = response.xpath("//h1/strong[@itemprop='name']/a").text 1907 | item[:repo_url] = url 1908 | item[:description] = response.xpath("//span[@itemprop='about']").text.squish 1909 | item[:tags] = response.xpath("//div[@id='topics-list-container']/div/a").map { |a| a.text.squish } 1910 | item[:watch_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Watch')]/a[2]").text.squish.delete(",").to_i 1911 | item[:star_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Star')]/a[2]").text.squish.delete(",").to_i 1912 | item[:fork_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Fork')]/a[2]").text.squish.delete(",").to_i 1913 | item[:last_commit] = response.xpath("//span[@itemprop='dateModified']/*").text 1914 | 1915 | send_item item 1916 | end 1917 | end 1918 | ``` 1919 | 1920 | ``` 1921 | $ bundle exec kimurai crawl github_spider 1922 | 1923 | I, [2018-08-22 15:56:35 +0400#1358] [M: 47347279209980] INFO -- github_spider: Spider: started: github_spider 1924 | D, [2018-08-22 15:56:35 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: BrowserBuilder (selenium_chrome): created browser instance 1925 | I, [2018-08-22 15:56:40 +0400#1358] [M: 47347279209980] INFO -- github_spider: Browser: started get request to: https://github.com/search?q=Ruby%20Web%20Scraping 1926 | I, [2018-08-22 15:56:44 +0400#1358] [M: 47347279209980] INFO -- github_spider: Browser: finished get request to: https://github.com/search?q=Ruby%20Web%20Scraping 1927 | I, [2018-08-22 15:56:44 +0400#1358] [M: 47347279209980] INFO -- github_spider: Info: visits: requests: 1, responses: 1 1928 | D, [2018-08-22 15:56:44 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: Browser: driver.current_memory: 116182 1929 | D, [2018-08-22 15:56:44 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: Browser: sleep 5 seconds before request... 
1930 | 
1931 | I, [2018-08-22 15:56:49 +0400#1358] [M: 47347279209980]  INFO -- github_spider: Browser: started get request to: https://github.com/lorien/awesome-web-scraping
1932 | I, [2018-08-22 15:56:50 +0400#1358] [M: 47347279209980]  INFO -- github_spider: Browser: finished get request to: https://github.com/lorien/awesome-web-scraping
1933 | I, [2018-08-22 15:56:50 +0400#1358] [M: 47347279209980]  INFO -- github_spider: Info: visits: requests: 2, responses: 2
1934 | D, [2018-08-22 15:56:50 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: Browser: driver.current_memory: 217432
1935 | D, [2018-08-22 15:56:50 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: Pipeline: starting processing item through 1 pipeline...
1936 | I, [2018-08-22 15:56:50 +0400#1358] [M: 47347279209980]  INFO -- github_spider: Pipeline: processed: {"owner":"lorien","repo_name":"awesome-web-scraping","repo_url":"https://github.com/lorien/awesome-web-scraping","description":"List of libraries, tools and APIs for web scraping and data processing.","tags":["awesome","awesome-list","web-scraping","data-processing","python","javascript","php","ruby"],"watch_count":159,"star_count":2423,"fork_count":358,"last_commit":"4 days ago"}
1937 | I, [2018-08-22 15:56:50 +0400#1358] [M: 47347279209980]  INFO -- github_spider: Info: items: sent: 1, processed: 1
1938 | D, [2018-08-22 15:56:50 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: Browser: sleep 6 seconds before request...
1939 | 
1940 | ...
1941 | 
1942 | I, [2018-08-22 16:11:50 +0400#1358] [M: 47347279209980]  INFO -- github_spider: Browser: started get request to: https://github.com/preston/idclight
1943 | I, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980]  INFO -- github_spider: Browser: finished get request to: https://github.com/preston/idclight
1944 | I, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980]  INFO -- github_spider: Info: visits: requests: 140, responses: 140
1945 | D, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: Browser: driver.current_memory: 211713
1946 | 
1947 | D, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: Pipeline: starting processing item through 1 pipeline...
1948 | E, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980] ERROR -- github_spider: Pipeline: dropped: #<Kimurai::Pipeline::DropItemError: Repository doesn't have enough stars>, item: {:owner=>"preston", :repo_name=>"idclight", :repo_url=>"https://github.com/preston/idclight", :description=>"A Ruby gem for accessing the freely available IDClight (IDConverter Light) web service, which convert between different types of gene IDs such as Hugo and Entrez. Queries are screen scraped from http://idclight.bioinfo.cnio.es.", :tags=>[], :watch_count=>6, :star_count=>1, :fork_count=>0, :last_commit=>"on Apr 12, 2012"}
1949 | 
1950 | I, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980]  INFO -- github_spider: Info: items: sent: 127, processed: 12
1951 | 
1952 | I, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980]  INFO -- github_spider: Browser: driver selenium_chrome has been destroyed
1953 | I, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980]  INFO -- github_spider: Spider: stopped: {:spider_name=>"github_spider", :status=>:completed, :environment=>"development", :start_time=>2018-08-22 15:56:35 +0400, :stop_time=>2018-08-22 16:11:51 +0400, :running_time=>"15m, 16s", :visits=>{:requests=>140, :responses=>140}, :items=>{:sent=>127, :processed=>12}, :error=>nil}
1954 | ```
1955 | 
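These item stats are also available programmatically: the `:items` key of `run_info` (and of the hash returned by `.crawl!`) holds the `sent` and `processed` counters, so you can, for example, report them from `close_spider`. A minimal sketch:

```ruby
class GithubSpider < ApplicationSpider
  # ...

  def self.close_spider
    stats = run_info[:items]
    # `sent` is how many items were passed to send_item;
    # `processed` is how many made it through all pipelines without being dropped:
    puts "Items sent: #{stats[:sent]}, processed: #{stats[:processed]}"
  end
end
```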
1956 | 
1957 | Also, you can pass custom options to a pipeline from a particular spider if you want to change the pipeline's behavior for that spider:
1958 | 
1959 | 
1960 | Example
1961 | 
1962 | spiders/custom_spider.rb
1963 | ```ruby
1964 | class CustomSpider < ApplicationSpider
1965 |   @name = "custom_spider"
1966 |   @start_urls = ["https://example.com"]
1967 |   @pipelines = [:validator]
1968 | 
1969 |   # ...
1970 | 
1971 |   def parse_item(response, url:, data: {})
1972 |     # ...
1973 | 
1974 |     # Pass the custom option `skip_uniq_checking` to the Validator pipeline:
1975 |     send_item item, validator: { skip_uniq_checking: true }
1976 |   end
1977 | end
1978 | ```
1979 | 
1980 | pipelines/validator.rb
1981 | ```ruby
1982 | class Validator < Kimurai::Pipeline
1983 |   def process_item(item, options: {})
1984 |     # Do not check the item sku for uniqueness if options[:skip_uniq_checking] is true:
1985 |     if options[:skip_uniq_checking] != true
1986 |       raise DropItemError, "Item sku is not unique" unless unique?(:sku, item[:sku])
1987 |     end
1988 | 
1989 |     # Don't forget to pass the item on (pipelines should return the item):
1990 |     item
1991 |   end
1992 | end
1993 | ```
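The options hash is keyed by pipeline name, so, following the same pattern, a spider that uses several pipelines could pass separate options to each of them in one `send_item` call. A hypothetical sketch; the `path:` option and the way Saver reads it are assumptions for illustration, not part of the template:

```ruby
# In a spider with @pipelines = [:validator, :saver];
# each pipeline reads only its own options hash:
send_item item, validator: { skip_uniq_checking: true },
                saver: { path: "db/custom_items.json" }
```

```ruby
# pipelines/saver.rb, adjusted to honor the hypothetical `path:` option:
class Saver < Kimurai::Pipeline
  def process_item(item, options: {})
    # Use the per-spider path if one was passed, otherwise fall back
    # to a file named after the current spider:
    path = options[:path] || "db/#{spider.class.name}.json"
    save_to path, item, format: :json

    item
  end
end
```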
1994 | 
1995 | 
1996 | ### Runner
1997 | 
1998 | You can run project spiders one by one or in parallel using the `$ kimurai runner` command:
1999 | 
2000 | ```
2001 | $ bundle exec kimurai list
2002 | custom_spider
2003 | example_spider
2004 | github_spider
2005 | 
2006 | $ bundle exec kimurai runner -j 3
2007 | >>> Runner: started: {:id=>1533727423, :status=>:processing, :start_time=>2018-08-08 15:23:43 +0400, :stop_time=>nil, :environment=>"development", :concurrent_jobs=>3, :spiders=>["custom_spider", "github_spider", "example_spider"]}
2008 | > Runner: started spider: custom_spider, index: 0
2009 | > Runner: started spider: github_spider, index: 1
2010 | > Runner: started spider: example_spider, index: 2
2011 | < Runner: stopped spider: custom_spider, index: 0
2012 | < Runner: stopped spider: example_spider, index: 2
2013 | < Runner: stopped spider: github_spider, index: 1
2014 | <<< Runner: stopped: {:id=>1533727423, :status=>:completed, :start_time=>2018-08-08 15:23:43 +0400, :stop_time=>2018-08-08 15:25:11 +0400, :environment=>"development", :concurrent_jobs=>3, :spiders=>["custom_spider", "github_spider", "example_spider"]}
2015 | ```
2016 | 
2017 | Each spider runs in a separate process. Spider logs are available in the `log/` folder. Pass the `-j` option to specify how many spiders should be processed at the same time (default is 1).
2018 | 
2019 | You can provide additional arguments like `--include` or `--exclude` to specify which spiders to run:
2020 | 
2021 | ```bash
2022 | # Run only custom_spider and example_spider:
2023 | $ bundle exec kimurai runner --include custom_spider example_spider
2024 | 
2025 | # Run all except github_spider:
2026 | $ bundle exec kimurai runner --exclude github_spider
2027 | ```
2028 | 
2029 | #### Runner callbacks
2030 | 
2031 | You can perform custom actions before the runner starts and after the runner stops using `config.runner_at_start_callback` and `config.runner_at_stop_callback`. Check [config/application.rb](lib/kimurai/template/config/application.rb) to see an example.
2032 | 
2033 | 
2034 | ## Chat Support and Feedback
2035 | Will be updated
2036 | 
2037 | ## License
2038 | The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
2039 | 
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | require "bundler/gem_tasks"
2 | require "rake/testtask"
3 | 
4 | Rake::TestTask.new(:test) do |t|
5 |   t.libs << "test"
6 |   t.libs << "lib"
7 |   t.test_files = FileList["test/**/*_test.rb"]
8 | end
9 | 
10 | task :default => :test
--------------------------------------------------------------------------------
/bin/console:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | 
3 | require "bundler/setup"
4 | require "kimurai"
5 | 
6 | # You can add fixtures and/or initialization code here to make experimenting
7 | # with your gem easier. You can also use a different console, if you like.
8 | 
9 | # (If you use this, don't forget to add pry to your Gemfile!)
10 | # require "pry" 11 | # Pry.start 12 | 13 | require "irb" 14 | IRB.start(__FILE__) 15 | -------------------------------------------------------------------------------- /bin/setup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | set -vx 5 | 6 | bundle install 7 | 8 | # Do any other automated setup that you need to do here 9 | -------------------------------------------------------------------------------- /exe/kimurai: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'kimurai' 4 | require 'kimurai/cli' 5 | 6 | Kimurai::CLI.start(ARGV) 7 | -------------------------------------------------------------------------------- /kimurai.gemspec: -------------------------------------------------------------------------------- 1 | 2 | lib = File.expand_path("../lib", __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require "kimurai/version" 5 | 6 | Gem::Specification.new do |spec| 7 | spec.name = "kimurai" 8 | spec.version = Kimurai::VERSION 9 | spec.authors = ["Victor Afanasev"] 10 | spec.email = ["vicfreefly@gmail.com"] 11 | 12 | spec.summary = "Modern web scraping framework written in Ruby and based on Capybara/Nokogiri" 13 | spec.homepage = "https://github.com/vifreefly/kimuraframework" 14 | spec.license = "MIT" 15 | 16 | # Specify which files should be added to the gem when it is released. 17 | # The `git ls-files -z` loads the files in the RubyGem that have been added into git. 18 | spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do 19 | `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) } 20 | end 21 | spec.bindir = "exe" 22 | spec.executables = "kimurai" 23 | spec.require_paths = ["lib"] 24 | spec.required_ruby_version = ">= 2.5.0" 25 | 26 | spec.add_dependency "thor" 27 | spec.add_dependency "cliver" 28 | spec.add_dependency "activesupport" 29 | spec.add_dependency "murmurhash3" 30 | spec.add_dependency "nokogiri" 31 | 32 | spec.add_dependency "capybara", ">= 2.15", "< 4.0" 33 | spec.add_dependency "capybara-mechanize" 34 | spec.add_dependency "poltergeist" 35 | spec.add_dependency "selenium-webdriver" 36 | 37 | spec.add_dependency "headless" 38 | spec.add_dependency "pmap" 39 | 40 | spec.add_dependency "whenever" 41 | 42 | spec.add_dependency "rbcat", "~> 0.2" 43 | spec.add_dependency "pry" 44 | 45 | spec.add_development_dependency "bundler", "~> 1.16" 46 | spec.add_development_dependency "rake", "~> 10.0" 47 | spec.add_development_dependency "minitest", "~> 5.0" 48 | end 49 | -------------------------------------------------------------------------------- /lib/kimurai.rb: -------------------------------------------------------------------------------- 1 | require 'ostruct' 2 | require 'logger' 3 | require 'json' 4 | require 'active_support' 5 | require 'active_support/core_ext' 6 | require 'rbcat' 7 | 8 | require_relative 'kimurai/version' 9 | 10 | require_relative 'kimurai/core_ext/numeric' 11 | require_relative 'kimurai/core_ext/string' 12 | require_relative 'kimurai/core_ext/array' 13 | require_relative 'kimurai/core_ext/hash' 14 | 15 | require_relative 'kimurai/browser_builder' 16 | require_relative 'kimurai/base_helper' 17 | require_relative 'kimurai/pipeline' 18 | require_relative 'kimurai/base' 19 | 20 | module Kimurai 21 | class << self 22 | def configuration 23 | @configuration ||= OpenStruct.new 24 | end 25 | 26 | def configure 27 | 
yield(configuration) 28 | end 29 | 30 | def env 31 | ENV.fetch("KIMURAI_ENV") { "development" } 32 | end 33 | 34 | def time_zone 35 | ENV["TZ"] 36 | end 37 | 38 | def time_zone=(value) 39 | ENV.store("TZ", value) 40 | end 41 | 42 | def list 43 | Base.descendants.map do |klass| 44 | next unless klass.name 45 | [klass.name, klass] 46 | end.compact.to_h 47 | end 48 | 49 | def find_by_name(name) 50 | return unless name 51 | Base.descendants.find { |klass| klass.name == name } 52 | end 53 | end 54 | end 55 | -------------------------------------------------------------------------------- /lib/kimurai/automation/deploy.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | vars: 4 | rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv 5 | rbenv_shims_path: "{{ rbenv_root_path }}/shims" 6 | repo_url: 7 | repo_name: 8 | repo_key_path: 9 | 10 | tasks: 11 | - name: Copy custom git ssh key to /tmp/private_key (if provided) 12 | when: repo_key_path is not none 13 | copy: 14 | src: "{{ repo_key_path }}" 15 | dest: /tmp/private_key 16 | mode: 0600 17 | 18 | - name: Clone/pull project repo to ~/{{ repo_name }} user directory (using ssh-agent forwarding or https) 19 | when: repo_key_path is none 20 | git: 21 | repo: "{{ repo_url }}" 22 | dest: "~/{{ repo_name }}" 23 | force: true 24 | accept_hostkey: true 25 | 26 | - name: Clone/pull project repo to ~/{{ repo_name }} user directory (using custom git ssh key) 27 | when: repo_key_path is not none 28 | git: 29 | repo: "{{ repo_url }}" 30 | dest: "~/{{ repo_name }}" 31 | force: true 32 | accept_hostkey: true 33 | key_file: /tmp/private_key 34 | 35 | - name: Delete custom git ssh key from /tmp/private_key (if provided) 36 | when: repo_key_path is not none 37 | file: 38 | state: absent 39 | path: /tmp/private_key 40 | 41 | - name: Run bundle install 42 | command: bundle install 43 | args: 44 | chdir: ~/{{ repo_name }} 45 | environment: 46 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}" 47 | 48 | - name: Run whenever to update crontab 49 | command: whenever --update-crontab 50 | args: 51 | chdir: ~/{{ repo_name }} 52 | environment: 53 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}" 54 | 55 | -------------------------------------------------------------------------------- /lib/kimurai/automation/setup.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | vars: 4 | ruby: 2.5.3 5 | rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv 6 | rbenv_shims_path: "{{ rbenv_root_path }}/shims" 7 | ruby_versions_path: "{{ rbenv_root_path }}/versions" 8 | # check latest here http://phantomjs.org/download.html 9 | phantomjs: 2.1.1 10 | # check latest here https://github.com/mozilla/geckodriver/releases/ 11 | geckodriver: 0.23.0 12 | # check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads 13 | chromedriver: 2.44 14 | 15 | tasks: 16 | - name: Update apt cache 17 | become: true 18 | apt: update_cache=yes cache_valid_time=86400 19 | 20 | - name: Install base packages 21 | become: true 22 | apt: 23 | pkg: "{{ item }}" 24 | state: present 25 | with_items: 26 | - git 27 | - xvfb 28 | - libsqlite3-dev 29 | - sqlite3 30 | - mongodb-clients 31 | - mysql-client 32 | - libmysqlclient-dev 33 | - postgresql-client 34 | - libpq-dev 35 | 36 | - import_tasks: setup/ruby_environment.yml 37 | 38 | - import_tasks: setup/phantomjs.yml 39 | become: true 40 | 41 | - import_tasks: 
setup/firefox_geckodriver.yml 42 | become: true 43 | 44 | - import_tasks: setup/chromium_chromedriver.yml 45 | become: true 46 | -------------------------------------------------------------------------------- /lib/kimurai/automation/setup/chromium_chromedriver.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install chromium browser 3 | apt: 4 | pkg: chromium-browser 5 | state: present 6 | 7 | - name: Get current chromedriver version 8 | shell: chromedriver --version 9 | args: 10 | executable: /bin/bash 11 | register: current_chromedriver_version 12 | changed_when: false 13 | ignore_errors: true 14 | 15 | - name: Install unzip tool to unarchive chromedriver archive 16 | apt: 17 | pkg: unzip 18 | state: present 19 | 20 | - name: Download chromedriver binary archive and unarchive it to /usr/local/bin 21 | unarchive: 22 | src: https://chromedriver.storage.googleapis.com/{{ chromedriver }}/chromedriver_linux64.zip 23 | dest: /usr/local/bin 24 | remote_src: true 25 | mode: a+x 26 | when: chromedriver not in current_chromedriver_version.stdout # substring match against the full version output, like the geckodriver/phantomjs checks (stdout_lines would require an exact line match and re-download every run) 27 | -------------------------------------------------------------------------------- /lib/kimurai/automation/setup/firefox_geckodriver.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install firefox 3 | apt: 4 | pkg: firefox 5 | state: present 6 | 7 | - name: Get current geckodriver version 8 | shell: geckodriver --version 9 | args: 10 | executable: /bin/bash 11 | register: current_geckodriver_version 12 | changed_when: false 13 | ignore_errors: true 14 | 15 | - name: Download geckodriver binary archive and unarchive it to /usr/local/bin 16 | unarchive: 17 | src: https://github.com/mozilla/geckodriver/releases/download/v{{ geckodriver }}/geckodriver-v{{ geckodriver }}-linux64.tar.gz 18 | dest: /usr/local/bin 19 | remote_src: true 20 | when: geckodriver not in current_geckodriver_version.stdout 21 | -------------------------------------------------------------------------------- /lib/kimurai/automation/setup/phantomjs.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install dependencies for PhantomJS 3 | apt: 4 | pkg: "{{ item }}" 5 | state: present 6 | with_items: 7 | - chrpath 8 | - libxft-dev 9 | - libfreetype6 10 | - libfreetype6-dev 11 | - libfontconfig1 12 | - libfontconfig1-dev 13 | 14 | - name: Get current phantomjs version 15 | shell: phantomjs -v 16 | args: 17 | executable: /bin/bash 18 | register: current_phantomjs_version 19 | changed_when: false 20 | ignore_errors: true 21 | 22 | - name: Download PhantomJS archive and unarchive it to /usr/local/lib 23 | unarchive: 24 | src: https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-{{ phantomjs }}-linux-x86_64.tar.bz2 25 | dest: /usr/local/lib 26 | remote_src: true 27 | when: phantomjs not in current_phantomjs_version.stdout 28 | 29 | - name: Link PhantomJS binary to /usr/local/bin/phantomjs 30 | file: 31 | src: /usr/local/lib/phantomjs-{{ phantomjs }}-linux-x86_64/bin/phantomjs 32 | dest: /usr/local/bin/phantomjs 33 | state: link 34 | -------------------------------------------------------------------------------- /lib/kimurai/automation/setup/ruby_environment.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install dependencies for ruby-build 3 | become: true 4 | apt: 5 | pkg: "{{ item }}" 6 | state: present 7 | with_items: 8 | - zlib1g-dev 9 | -
build-essential 10 | - libssl-dev 11 | - libreadline-dev 12 | - libreadline6-dev 13 | - libyaml-dev 14 | - libxml2-dev 15 | - libxslt1-dev 16 | - libcurl4-openssl-dev 17 | - libffi-dev 18 | 19 | - name: Clone Rbenv repository to the {{ ansible_user_id }} user directory 20 | git: 21 | repo: https://github.com/sstephenson/rbenv.git 22 | dest: "{{ rbenv_root_path }}" 23 | 24 | - name: Clone ruby-build repo to the {{ ansible_user_id }} user directory 25 | git: 26 | repo: https://github.com/sstephenson/ruby-build.git 27 | dest: "{{ rbenv_root_path }}/plugins/ruby-build" 28 | 29 | - name: Add Rbenv path to the .bashrc 30 | lineinfile: 31 | dest: ~/.bashrc 32 | regexp: '^export PATH="\$HOME\/\.rbenv' 33 | line: export PATH="$HOME/.rbenv/bin:$PATH" 34 | state: present 35 | 36 | - name: Add Rbenv init to the .bashrc 37 | lineinfile: 38 | dest: ~/.bashrc 39 | regexp: '^eval "\$\(rbenv' 40 | line: eval "$(rbenv init -)" 41 | state: present 42 | 43 | - name: Check if desired Ruby version already installed 44 | stat: 45 | path: "{{ ruby_versions_path }}/{{ ruby }}" 46 | register: ruby_present 47 | 48 | - name: Install desired Ruby version using ruby-build (this can take a while) 49 | command: rbenv install {{ ruby }} 50 | when: not ruby_present.stat.exists 51 | environment: 52 | CONFIGURE_OPTS: "--disable-install-doc" 53 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}" 54 | 55 | - name: Get current Ruby version 56 | command: "ruby -v" 57 | register: current_ruby_version 58 | changed_when: false 59 | ignore_errors: true 60 | environment: 61 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}" 62 | 63 | - name: Set desired Ruby version as a global version 64 | command: "rbenv global {{ ruby }}" 65 | when: ruby not in current_ruby_version.stdout 66 | environment: 67 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}" 68 | register: set_ruby 69 | 70 | - name: Execute `rbenv rehash` command 71 | command: rbenv rehash 72 | when: set_ruby.changed 73 | environment: 74 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}" 75 | 76 | - name: Create ~/.gemrc file to skip docs 77 | copy: 78 | dest: ~/.gemrc 79 | content: "gem: --no-ri --no-rdoc" 80 | 81 | - name: Create ~/.bundle directory 82 | file: 83 | dest: ~/.bundle 84 | state: directory 85 | 86 | - name: Create ~/.bundle/config file with default settings `BUNDLE_GIT__ALLOW_INSECURE true` and `BUNDLE_JOBS 4` 87 | copy: 88 | dest: ~/.bundle/config 89 | content: | 90 | BUNDLE_GIT__ALLOW_INSECURE: "true" 91 | BUNDLE_JOBS: "4" 92 | 93 | - name: Check if Bundler gem installed 94 | stat: 95 | path: "{{ ruby_versions_path }}/{{ ruby }}/bin/bundler" 96 | register: bundler_gem_present 97 | 98 | - name: Install Bundler gem 99 | command: gem install bundler 100 | when: not bundler_gem_present.stat.exists 101 | environment: 102 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}" 103 | 104 | - name: Check if Whenever gem installed 105 | stat: 106 | path: "{{ ruby_versions_path }}/{{ ruby }}/bin/whenever" 107 | register: whenever_gem_present 108 | 109 | - name: Install Whenever gem 110 | command: gem install whenever 111 | when: not whenever_gem_present.stat.exists 112 | environment: 113 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}" 114 | 115 | - name: Check if Kimurai gem installed 116 | stat: 117 | path: "{{ ruby_versions_path }}/{{ ruby }}/bin/kimurai" 118 | 
register: kimurai_gem_present 119 | 120 | - name: Install Kimurai gem 121 | command: gem install kimurai 122 | when: not kimurai_gem_present.stat.exists 123 | environment: 124 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}" 125 | -------------------------------------------------------------------------------- /lib/kimurai/base.rb: -------------------------------------------------------------------------------- 1 | require_relative 'base/saver' 2 | require_relative 'base/storage' 3 | 4 | module Kimurai 5 | class Base 6 | class InvalidUrlError < StandardError; end 7 | 8 | # don't deep merge config's headers hash option 9 | DMERGE_EXCLUDE = [:headers] 10 | 11 | LoggerFormatter = proc do |severity, datetime, progname, msg| 12 | current_thread_id = Thread.current.object_id 13 | thread_type = Thread.main == Thread.current ? "M" : "C" 14 | output = "%s, [%s#%d] [%s: %s] %5s -- %s: %s\n" 15 | .freeze % [severity[0..0], datetime, $$, thread_type, current_thread_id, severity, progname, msg] 16 | 17 | if Kimurai.configuration.colorize_logger != false && Kimurai.env == "development" 18 | Rbcat.colorize(output, predefined: [:jsonhash, :logger]) 19 | else 20 | output 21 | end 22 | end 23 | 24 | include BaseHelper 25 | 26 | ### 27 | 28 | class << self 29 | attr_reader :run_info, :savers, :storage 30 | end 31 | 32 | def self.running? 33 | @run_info && @run_info[:status] == :running 34 | end 35 | 36 | def self.completed? 37 | @run_info && @run_info[:status] == :completed 38 | end 39 | 40 | def self.failed? 41 | @run_info && @run_info[:status] == :failed 42 | end 43 | 44 | def self.visits 45 | @run_info && @run_info[:visits] 46 | end 47 | 48 | def self.items 49 | @run_info && @run_info[:items] 50 | end 51 | 52 | def self.update(type, subtype) 53 | return unless @run_info 54 | @update_mutex.synchronize { @run_info[type][subtype] += 1 } 55 | end 56 | 57 | def self.add_event(scope, event) 58 | return unless @run_info 59 | @update_mutex.synchronize { @run_info[:events][scope][event] += 1 } 60 | end 61 | 62 | ### 63 | 64 | @engine = :mechanize 65 | @pipelines = [] 66 | @config = {} 67 | 68 | def self.name 69 | @name 70 | end 71 | 72 | def self.engine 73 | @engine ||= superclass.engine 74 | end 75 | 76 | def self.pipelines 77 | @pipelines ||= superclass.pipelines 78 | end 79 | 80 | def self.start_urls 81 | @start_urls 82 | end 83 | 84 | def self.config 85 | if superclass.equal?(::Object) 86 | @config 87 | else 88 | superclass.config.deep_merge_excl(@config || {}, DMERGE_EXCLUDE) 89 | end 90 | end 91 | 92 | ### 93 | 94 | def self.logger 95 | @logger ||= Kimurai.configuration.logger || begin 96 | log_level = (ENV["LOG_LEVEL"] || Kimurai.configuration.log_level || "DEBUG").to_s.upcase 97 | log_level = "Logger::#{log_level}".constantize 98 | Logger.new(STDOUT, formatter: LoggerFormatter, level: log_level, progname: name) 99 | end 100 | end 101 | 102 | def self.crawl!(exception_on_fail: true) 103 | logger.error "Spider: already running: #{name}" and return false if running? 
104 | 105 | @storage = Storage.new 106 | @savers = {} 107 | @update_mutex = Mutex.new 108 | 109 | @run_info = { 110 | spider_name: name, status: :running, error: nil, environment: Kimurai.env, 111 | start_time: Time.new, stop_time: nil, running_time: nil, 112 | visits: { requests: 0, responses: 0 }, items: { sent: 0, processed: 0 }, 113 | events: { requests_errors: Hash.new(0), drop_items_errors: Hash.new(0), custom: Hash.new(0) } 114 | } 115 | 116 | ### 117 | 118 | logger.info "Spider: started: #{name}" 119 | open_spider if self.respond_to? :open_spider 120 | 121 | spider = self.new 122 | spider.with_info = true 123 | if start_urls 124 | start_urls.each do |start_url| 125 | if start_url.class == Hash 126 | spider.request_to(:parse, start_url) 127 | else 128 | spider.request_to(:parse, url: start_url) 129 | end 130 | end 131 | else 132 | spider.parse 133 | end 134 | rescue StandardError, SignalException, SystemExit => e 135 | @run_info.merge!(status: :failed, error: e.inspect) 136 | exception_on_fail ? raise(e) : [@run_info, e] 137 | else 138 | @run_info.merge!(status: :completed) 139 | ensure 140 | if spider 141 | spider.browser.destroy_driver! if spider.instance_variable_get("@browser") 142 | 143 | stop_time = Time.now 144 | total_time = (stop_time - @run_info[:start_time]).round(3) 145 | @run_info.merge!(stop_time: stop_time, running_time: total_time) 146 | 147 | close_spider if self.respond_to? :close_spider 148 | 149 | message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}" 150 | failed? ? logger.fatal(message) : logger.info(message) 151 | 152 | @run_info, @storage, @savers, @update_mutex = nil 153 | end 154 | end 155 | 156 | def self.parse!(handler, *args, **request) 157 | spider = self.new 158 | 159 | if args.present? 160 | spider.public_send(handler, *args) 161 | elsif request.present? 162 | spider.request_to(handler, request) 163 | else 164 | spider.public_send(handler) 165 | end 166 | ensure 167 | spider.browser.destroy_driver! if spider.instance_variable_get("@browser") 168 | end 169 | 170 | ### 171 | 172 | attr_reader :logger 173 | attr_accessor :with_info 174 | 175 | def initialize(engine = self.class.engine, config: {}) 176 | @engine = engine || self.class.engine 177 | @config = self.class.config.deep_merge_excl(config, DMERGE_EXCLUDE) 178 | @pipelines = self.class.pipelines.map do |pipeline_name| 179 | klass = Pipeline.descendants.find { |kl| kl.name == pipeline_name } 180 | instance = klass.new 181 | instance.spider = self 182 | [pipeline_name, instance] 183 | end.to_h 184 | 185 | @logger = self.class.logger 186 | @savers = {} 187 | end 188 | 189 | def browser 190 | @browser ||= BrowserBuilder.build(@engine, @config, spider: self) 191 | end 192 |
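The request flow implemented by `request_to` below is easiest to see from a spider's point of view. A minimal sketch (the spider class, selectors and urls are hypothetical; `@name`, `@engine`, `@start_urls`, `request_to`, `send_item` and the handler signature all come from this file and BaseHelper):

```ruby
# Hypothetical spider illustrating the Base API above (not part of the gem)
class ProductsSpider < Kimurai::Base
  @name = "products_spider"
  @engine = :mechanize
  @start_urls = ["https://example.com/products"]

  def parse(response, url:, data: {})
    response.xpath("//a[@class='product']/@href").each do |href|
      # schedules a visit; :parse_product receives the parsed product page
      request_to(:parse_product, url: absolute_url(href.value, base: url))
    end
  end

  def parse_product(response, url:, data: {})
    send_item(title: response.xpath("//h1").text)
  end
end

ProductsSpider.crawl!
```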
193 | def request_to(handler, delay = nil, url:, data: {}, response_type: :html) 194 | raise InvalidUrlError, "Requested url is invalid: #{url}" unless URI.parse(url).kind_of?(URI::HTTP) 195 | 196 | if @config[:skip_duplicate_requests] && !unique_request?(url) 197 | add_event(:duplicate_requests) if self.with_info 198 | logger.warn "Spider: request_to: not unique url: #{url}, skipped" and return 199 | end 200 | 201 | visited = delay ? browser.visit(url, delay: delay) : browser.visit(url) 202 | return unless visited 203 | 204 | public_send(handler, browser.current_response(response_type), { url: url, data: data }) 205 | end 206 | 207 | def console(response = nil, url: nil, data: {}) 208 | binding.pry 209 | end 210 | 211 | ### 212 | 213 | def storage 214 | # Note: `.crawl!` uses a shared, thread-safe Storage instance; 215 | # otherwise, each spider instance has its own Storage 216 | @storage ||= self.with_info ? self.class.storage : Storage.new 217 | end 218 | 219 | def unique?(scope, value) 220 | storage.unique?(scope, value) 221 | end 222 | 223 | def save_to(path, item, format:, position: true, append: false) 224 | @savers[path] ||= begin 225 | options = { format: format, position: position, append: append } 226 | if self.with_info 227 | self.class.savers[path] ||= Saver.new(path, options) 228 | else 229 | Saver.new(path, options) 230 | end 231 | end 232 | 233 | @savers[path].save(item) 234 | end 235 | 236 | ### 237 | 238 | def add_event(scope = :custom, event) 239 | if self.with_info 240 | self.class.add_event(scope, event) 241 | end 242 | 243 | logger.info "Spider: new event (scope: #{scope}): #{event}" if scope == :custom 244 | end 245 | 246 | ### 247 | 248 | private 249 | 250 | def create_browser(engine, config = {}) 251 | Kimurai::BrowserBuilder.build(engine, config, spider: self) 252 | end 253 | 254 | def unique_request?(url) 255 | options = @config[:skip_duplicate_requests] 256 | if options.class == Hash 257 | scope = options[:scope] || :requests_urls 258 | if options[:check_only] 259 | storage.include?(scope, url) ? false : true 260 | else 261 | storage.unique?(scope, url) ? true : false 262 | end 263 | else 264 | storage.unique?(:requests_urls, url) ? true : false 265 | end 266 | end 267 | 268 | def send_item(item, options = {}) 269 | logger.debug "Pipeline: starting processing item through #{@pipelines.size} #{'pipeline'.pluralize(@pipelines.size)}..." 270 | self.class.update(:items, :sent) if self.with_info 271 | 272 | @pipelines.each do |name, instance| 273 | item = options[name] ? instance.process_item(item, options: options[name]) : instance.process_item(item) 274 | end 275 | rescue => e 276 | logger.error "Pipeline: dropped: #{e.inspect} (#{e.backtrace.first}), item: #{item}" 277 | add_event(:drop_items_errors, e.inspect) if self.with_info 278 | false 279 | else 280 | self.class.update(:items, :processed) if self.with_info 281 | logger.info "Pipeline: processed: #{JSON.generate(item)}" 282 | true 283 | ensure 284 | if self.with_info 285 | logger.info "Info: items: sent: #{self.class.items[:sent]}, processed: #{self.class.items[:processed]}" 286 | end 287 | end 288 |
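`in_parallel` below fans the given urls out across several spider instances, each with its own browser, and joins the threads at the end. A usage sketch from inside a handler (selectors and the handler name are hypothetical):

```ruby
# Inside a hypothetical spider: crawl category pages with 3 browser threads
def parse(response, url:, data: {})
  urls = response.xpath("//a[@class='category']/@href").map(&:value)
  # :parse_category is called as the handler for every visited url;
  # `data` is passed through to each request
  in_parallel(:parse_category, urls, threads: 3, data: { source: url })
end
```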
289 | def in_parallel(handler, urls, threads:, data: {}, delay: nil, engine: @engine, config: {}) 290 | parts = urls.in_sorted_groups(threads, false) 291 | urls_count = urls.size 292 | 293 | all = [] 294 | start_time = Time.now 295 | logger.info "Spider: in_parallel: starting processing #{urls_count} urls within #{threads} threads" 296 | 297 | parts.each do |part| 298 | all << Thread.new(part) do |part| 299 | Thread.current.abort_on_exception = true 300 | 301 | spider = self.class.new(engine, config: @config.deep_merge_excl(config, DMERGE_EXCLUDE)) 302 | spider.with_info = true if self.with_info 303 | 304 | part.each do |url_data| 305 | if url_data.class == Hash 306 | if url_data[:url].present? && url_data[:data].present? 307 | spider.request_to(handler, delay, url_data) 308 | else 309 | spider.public_send(handler, url_data) 310 | end 311 | else 312 | spider.request_to(handler, delay, url: url_data, data: data) 313 | end 314 | end 315 | ensure 316 | spider.browser.destroy_driver! if spider.instance_variable_get("@browser") 317 | end 318 | 319 | sleep 0.5 320 | end 321 | 322 | all.each(&:join) 323 | logger.info "Spider: in_parallel: stopped processing #{urls_count} urls within #{threads} threads, total time: #{(Time.now - start_time).duration}" 324 | end 325 | end 326 | end 327 | -------------------------------------------------------------------------------- /lib/kimurai/base/saver.rb: -------------------------------------------------------------------------------- 1 | require 'json' 2 | require 'csv' 3 | 4 | module Kimurai 5 | class Base 6 | class Saver 7 | attr_reader :format, :path, :position, :append 8 | 9 | def initialize(path, format:, position: true, append: false) 10 | unless %i(json pretty_json jsonlines csv).include?(format) 11 | raise "Saver: wrong type of format: #{format}" 12 | end 13 | 14 | @path = path 15 | @format = format 16 | @position = position 17 | @index = 0 18 | @append = append 19 | @mutex = Mutex.new 20 | end 21 | 22 | def save(item) 23 | @mutex.synchronize do 24 | @index += 1 25 | item[:position] = @index if position 26 | 27 | case format 28 | when :json 29 | save_to_json(item) 30 | when :pretty_json 31 | save_to_pretty_json(item) 32 | when :jsonlines 33 | save_to_jsonlines(item) 34 | when :csv 35 | save_to_csv(item) 36 | end 37 | end 38 | end 39 | 40 | private 41 | 42 | def save_to_json(item) 43 | data = JSON.generate([item]) 44 | 45 | if @index > 1 || (append && File.exist?(path)) 46 | file_content = File.read(path).sub(/\}\]\Z/, "\}\,") 47 | File.open(path, "w") do |f| 48 | f.write(file_content + data.sub(/\A\[/, "")) 49 | end 50 | else 51 | File.open(path, "w") { |f| f.write(data) } 52 | end 53 | end 54 | 55 | def save_to_pretty_json(item) 56 | data = JSON.pretty_generate([item]) 57 | 58 | if @index > 1 || (append && File.exist?(path)) 59 | file_content = File.read(path).sub(/\}\n\]\Z/, "\}\,\n") 60 | File.open(path, "w") do |f| 61 | f.write(file_content + data.sub(/\A\[\n/, "")) 62 | end 63 | else 64 | File.open(path, "w") { |f| f.write(data) } 65 | end 66 | end 67 | 68 | def save_to_jsonlines(item) 69 | data = JSON.generate(item) 70 | 71 | if @index > 1 || (append && File.exist?(path)) 72 | File.open(path, "a") { |file| file.write("\n" + data) } 73 | else 74 | File.open(path, "w") { |file| file.write(data) } 75 | end 76 | end 77 | 78 | def save_to_csv(item) 79 | data = flatten_hash(item) 80 | 81 | if @index > 1 || (append && File.exist?(path)) 82 | CSV.open(path, "a+", force_quotes: true) do |csv| 83 | csv << data.values 84 | end 85 | else 86 | CSV.open(path, "w", force_quotes: true) do |csv| 87 | csv << data.keys 88 | csv << data.values 89 | end 90 | end 91 | end 92 | 93 | def flatten_hash(hash) 94 | hash.each_with_object({}) do |(k, v), h| 95 | if v.is_a?
Hash 96 | flatten_hash(v).map { |h_k, h_v| h["#{k}.#{h_k}"] = h_v } 97 | else 98 | h[k&.to_s] = v 99 | end 100 | end 101 | end 102 | end 103 | end 104 | end 105 | 106 | 107 | -------------------------------------------------------------------------------- /lib/kimurai/base/storage.rb: -------------------------------------------------------------------------------- 1 | module Kimurai 2 | class Base 3 | class Storage 4 | attr_reader :database 5 | 6 | def initialize 7 | @mutex = Mutex.new 8 | @database = {} 9 | end 10 | 11 | def all(scope = nil) 12 | @mutex.synchronize do 13 | scope ? database.fetch(scope, []) : database 14 | end 15 | end 16 | 17 | def include?(scope, value) 18 | @mutex.synchronize do 19 | database[scope] ||= [] 20 | database[scope].include?(value) 21 | end 22 | end 23 | 24 | def add(scope, value) 25 | @mutex.synchronize do 26 | database[scope] ||= [] 27 | if value.kind_of?(Array) 28 | database[scope] += value 29 | database[scope].uniq! 30 | else 31 | database[scope].push(value) unless database[scope].include?(value) 32 | end 33 | end 34 | end 35 | 36 | ### 37 | 38 | def unique?(scope, value) 39 | @mutex.synchronize do 40 | database[scope] ||= [] 41 | database[scope].include?(value) ? false : database[scope].push(value) and true # returns false if the value was already seen, otherwise stores it and returns true 42 | end 43 | end 44 | 45 | ### 46 | 47 | def clear! 48 | @mutex.synchronize do 49 | @database = {} 50 | end 51 | end 52 | end 53 | end 54 | end 55 | -------------------------------------------------------------------------------- /lib/kimurai/base_helper.rb: -------------------------------------------------------------------------------- 1 | module Kimurai 2 | module BaseHelper 3 | private 4 | 5 | def absolute_url(url, base:) 6 | return unless url 7 | URI.join(base, URI.escape(url)).to_s 8 | end 9 | 10 | def escape_url(url) 11 | uri = URI.parse(url) 12 | rescue URI::InvalidURIError => e 13 | URI.parse(URI.escape url).to_s rescue url 14 | else 15 | url 16 | end 17 | 18 | def normalize_url(url, base:) 19 | escape_url(absolute_url(url, base: base)) 20 | end 21 | end 22 | end 23 |
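How the three helpers above compose, sketched at a console (the relative href is a made-up example):

```ruby
include Kimurai::BaseHelper

# absolute_url resolves a (possibly relative) href against the current page;
# escape_url percent-escapes the result only if it isn't already a valid URI
normalize_url("/catalog?page=2", base: "https://example.com/shop")
# => "https://example.com/catalog?page=2"
```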
-------------------------------------------------------------------------------- /lib/kimurai/browser_builder.rb: -------------------------------------------------------------------------------- 1 | module Kimurai 2 | module BrowserBuilder 3 | def self.build(engine, config = {}, spider:) 4 | if config[:browser].present? 5 | raise "++++++ BrowserBuilder: browser option is deprecated. Now all sub-options inside " \ 6 | "`browser` should be placed right into `@config` hash, without `browser` parent key.\n" \ 7 | "See more here: https://github.com/vifreefly/kimuraframework/blob/master/CHANGELOG.md#breaking-changes-110 ++++++" 8 | end 9 | 10 | begin 11 | require "kimurai/browser_builder/#{engine}_builder" 12 | rescue LoadError => e 13 | end 14 | 15 | builder_class_name = "#{engine}_builder".classify 16 | builder = "Kimurai::BrowserBuilder::#{builder_class_name}".constantize 17 | builder.new(config, spider: spider).build 18 | end 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /lib/kimurai/browser_builder/mechanize_builder.rb: -------------------------------------------------------------------------------- 1 | require 'capybara' 2 | require 'capybara/mechanize' 3 | require_relative '../capybara_configuration' 4 | require_relative '../capybara_ext/mechanize/driver' 5 | require_relative '../capybara_ext/session' 6 | 7 | module Kimurai::BrowserBuilder 8 | class MechanizeBuilder 9 | attr_reader :logger, :spider 10 | 11 | def initialize(config, spider:) 12 | @config = config 13 | @spider = spider 14 | @logger = spider.logger 15 | end 16 | 17 | def build 18 | # Register driver 19 | Capybara.register_driver :mechanize do |app| 20 | driver = Capybara::Mechanize::Driver.new("app") 21 | # keep the history as small as possible (by default it's unlimited) 22 | driver.configure { |a| a.history.max_size = 2 } 23 | driver 24 | end 25 | 26 | # Create browser instance (Capybara session) 27 | @browser = Capybara::Session.new(:mechanize) 28 | @browser.spider = spider 29 | logger.debug "BrowserBuilder (mechanize): created browser instance" 30 | 31 | if @config[:extensions].present? 32 | logger.error "BrowserBuilder (mechanize): `extensions` option not supported, skipped" 33 | end 34 | 35 | # Proxy 36 | if proxy = @config[:proxy].presence 37 | proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip 38 | ip, port, type = proxy_string.split(":") 39 | 40 | if type == "http" 41 | @browser.driver.set_proxy(*proxy_string.split(":")) 42 | logger.debug "BrowserBuilder (mechanize): enabled http proxy, ip: #{ip}, port: #{port}" 43 | else 44 | logger.error "BrowserBuilder (mechanize): can't set #{type} proxy (not supported), skipped" 45 | end 46 | end 47 | 48 | # SSL 49 | if ssl_cert_path = @config[:ssl_cert_path].presence 50 | @browser.driver.browser.agent.http.ca_file = ssl_cert_path 51 | logger.debug "BrowserBuilder (mechanize): enabled custom ssl_cert" 52 | end 53 | 54 | if @config[:ignore_ssl_errors].present? 55 | @browser.driver.browser.agent.verify_mode = OpenSSL::SSL::VERIFY_NONE 56 | logger.debug "BrowserBuilder (mechanize): enabled ignore_ssl_errors" 57 | end 58 | 59 | # Headers 60 | if headers = @config[:headers].presence 61 | @browser.driver.headers = headers 62 | logger.debug "BrowserBuilder (mechanize): enabled custom headers" 63 | end 64 | 65 | if user_agent = @config[:user_agent].presence 66 | user_agent_string = (user_agent.class == Proc ?
user_agent.call : user_agent).strip 67 | 68 | @browser.driver.add_header("User-Agent", user_agent_string) 69 | logger.debug "BrowserBuilder (mechanize): enabled custom user_agent" 70 | end 71 | 72 | # Cookies 73 | if cookies = @config[:cookies].presence 74 | cookies.each do |cookie| 75 | @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie) 76 | end 77 | 78 | logger.debug "BrowserBuilder (mechanize): enabled custom cookies" 79 | end 80 | 81 | # Browser instance options 82 | # skip_request_errors 83 | if skip_errors = @config[:skip_request_errors].presence 84 | @browser.config.skip_request_errors = skip_errors 85 | logger.debug "BrowserBuilder (mechanize): enabled skip_request_errors" 86 | end 87 | 88 | # retry_request_errors 89 | if retry_errors = @config[:retry_request_errors].presence 90 | @browser.config.retry_request_errors = retry_errors 91 | logger.debug "BrowserBuilder (mechanize): enabled retry_request_errors" 92 | end 93 | 94 | # restart_if 95 | if @config[:restart_if].present? 96 | logger.warn "BrowserBuilder (mechanize): restart_if options not supported by Mechanize, skipped" 97 | end 98 | 99 | # before_request clear_cookies 100 | if @config.dig(:before_request, :clear_cookies) 101 | @browser.config.before_request[:clear_cookies] = true 102 | logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_cookies" 103 | end 104 | 105 | # before_request clear_and_set_cookies 106 | if @config.dig(:before_request, :clear_and_set_cookies) 107 | if cookies = @config[:cookies].presence 108 | @browser.config.cookies = cookies 109 | @browser.config.before_request[:clear_and_set_cookies] = true 110 | logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_and_set_cookies" 111 | else 112 | logger.error "BrowserBuilder (mechanize): cookies should be present to enable before_request.clear_and_set_cookies, skipped" 113 | end 114 | end 115 | 116 | # before_request change_user_agent 117 | if @config.dig(:before_request, :change_user_agent) 118 | if @config[:user_agent].present? && @config[:user_agent].class == Proc 119 | @browser.config.user_agent = @config[:user_agent] 120 | @browser.config.before_request[:change_user_agent] = true 121 | logger.debug "BrowserBuilder (mechanize): enabled before_request.change_user_agent" 122 | else 123 | logger.error "BrowserBuilder (mechanize): user_agent should be present and should be a Proc (lambda) to enable before_request.change_user_agent, skipped" 124 | end 125 | end 126 | 127 | # before_request change_proxy 128 | if @config.dig(:before_request, :change_proxy) 129 | if @config[:proxy].present?
&& @config[:proxy].class == Proc 130 | @browser.config.proxy = @config[:proxy] 131 | @browser.config.before_request[:change_proxy] = true 132 | logger.debug "BrowserBuilder (mechanize): enabled before_request.change_proxy" 133 | else 134 | logger.error "BrowserBuilder (mechanize): proxy should be present and should be a Proc (lambda) to enable before_request.change_proxy, skipped" 135 | end 136 | end 137 | 138 | # before_request delay 139 | if delay = @config.dig(:before_request, :delay).presence 140 | @browser.config.before_request[:delay] = delay 141 | logger.debug "BrowserBuilder (mechanize): enabled before_request.delay" 142 | end 143 | 144 | # encoding 145 | if encoding = @config[:encoding] 146 | @browser.config.encoding = encoding 147 | logger.debug "BrowserBuilder (mechanize): enabled encoding: #{encoding}" 148 | end 149 | 150 | # return Capybara session instance 151 | @browser 152 | end 153 | end 154 | end 155 | -------------------------------------------------------------------------------- /lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb: -------------------------------------------------------------------------------- 1 | require 'capybara' 2 | require 'capybara/poltergeist' 3 | require_relative '../capybara_configuration' 4 | require_relative '../capybara_ext/poltergeist/driver' 5 | require_relative '../capybara_ext/session' 6 | 7 | module Kimurai::BrowserBuilder 8 | class PoltergeistPhantomjsBuilder 9 | attr_reader :logger, :spider 10 | 11 | def initialize(config, spider:) 12 | @config = config 13 | @spider = spider 14 | @logger = spider.logger 15 | end 16 | 17 | def build 18 | # Register driver 19 | Capybara.register_driver :poltergeist_phantomjs do |app| 20 | # Create driver options 21 | driver_options = { 22 | js_errors: false, debug: false, inspector: false, phantomjs_options: [] 23 | } 24 | 25 | if extensions = @config[:extensions].presence 26 | driver_options[:extensions] = extensions 27 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled extensions" 28 | end 29 | 30 | # Window size 31 | if size = @config[:window_size].presence 32 | driver_options[:window_size] = size 33 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled window_size" 34 | end 35 | 36 | # SSL 37 | if ssl_cert_path = @config[:ssl_cert_path].presence 38 | driver_options[:phantomjs_options] << "--ssl-certificates-path=#{ssl_cert_path}" 39 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom ssl_cert" 40 | end 41 | 42 | if @config[:ignore_ssl_errors].present? 43 | driver_options[:phantomjs_options].push("--ignore-ssl-errors=yes", "--ssl-protocol=any") 44 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled ignore_ssl_errors" 45 | end 46 | 47 | # Disable images 48 | if @config[:disable_images].present? 49 | driver_options[:phantomjs_options] << "--load-images=no" 50 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled disable_images" 51 | end 52 | 53 | Capybara::Poltergeist::Driver.new(app, driver_options) 54 | end 55 | 56 | # Create browser instance (Capybara session) 57 | @browser = Capybara::Session.new(:poltergeist_phantomjs) 58 | @browser.spider = spider 59 | logger.debug "BrowserBuilder (poltergeist_phantomjs): created browser instance" 60 | 61 | # Proxy 62 | if proxy = @config[:proxy].presence 63 | proxy_string = (proxy.class == Proc ?
proxy.call : proxy).strip 64 | ip, port, type = proxy_string.split(":") 65 | 66 | if %w(http socks5).include?(type) 67 | @browser.driver.set_proxy(*proxy_string.split(":")) 68 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled #{type} proxy, ip: #{ip}, port: #{port}" 69 | else 70 | logger.error "BrowserBuilder (poltergeist_phantomjs): wrong type of proxy: #{type}, skipped" 71 | end 72 | end 73 | 74 | # Headers 75 | if headers = @config[:headers].presence 76 | @browser.driver.headers = headers 77 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom headers" 78 | end 79 | 80 | if user_agent = @config[:user_agent].presence 81 | user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip 82 | 83 | @browser.driver.add_header("User-Agent", user_agent_string) 84 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom user_agent" 85 | end 86 | 87 | # Cookies 88 | if cookies = @config[:cookies].presence 89 | cookies.each do |cookie| 90 | @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie) 91 | end 92 | 93 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom cookies" 94 | end 95 | 96 | # Browser instance options 97 | # skip_request_errors 98 | if skip_errors = @config[:skip_request_errors].presence 99 | @browser.config.skip_request_errors = skip_errors 100 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled skip_request_errors" 101 | end 102 | 103 | # retry_request_errors 104 | if retry_errors = @config[:retry_request_errors].presence 105 | @browser.config.retry_request_errors = retry_errors 106 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled retry_request_errors" 107 | end 108 | 109 | # restart_if 110 | if requests_limit = @config.dig(:restart_if, :requests_limit).presence 111 | @browser.config.restart_if[:requests_limit] = requests_limit 112 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.requests_limit >= #{requests_limit}" 113 | end 114 | 115 | if memory_limit = @config.dig(:restart_if, :memory_limit).presence 116 | @browser.config.restart_if[:memory_limit] = memory_limit 117 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.memory_limit >= #{memory_limit}" 118 | end 119 | 120 | # before_request clear_cookies 121 | if @config.dig(:before_request, :clear_cookies) 122 | @browser.config.before_request[:clear_cookies] = true 123 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_cookies" 124 | end 125 | 126 | # before_request clear_and_set_cookies 127 | if @config.dig(:before_request, :clear_and_set_cookies) 128 | if cookies = @config[:cookies].presence 129 | @browser.config.cookies = cookies 130 | @browser.config.before_request[:clear_and_set_cookies] = true 131 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_and_set_cookies" 132 | else 133 | logger.error "BrowserBuilder (poltergeist_phantomjs): cookies should be present to enable before_request.clear_and_set_cookies, skipped" 134 | end 135 | end 136 | 137 | # before_request change_user_agent 138 | if @config.dig(:before_request, :change_user_agent) 139 | if @config[:user_agent].present? 
&& @config[:user_agent].class == Proc 140 | @browser.config.user_agent = @config[:user_agent] 141 | @browser.config.before_request[:change_user_agent] = true 142 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_user_agent" 143 | else 144 | logger.error "BrowserBuilder (poltergeist_phantomjs): user_agent should be present and should be a Proc (lambda) to enable before_request.change_user_agent, skipped" 145 | end 146 | end 147 | 148 | # before_request change_proxy 149 | if @config.dig(:before_request, :change_proxy) 150 | if @config[:proxy].present? && @config[:proxy].class == Proc 151 | @browser.config.proxy = @config[:proxy] 152 | @browser.config.before_request[:change_proxy] = true 153 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_proxy" 154 | else 155 | logger.error "BrowserBuilder (poltergeist_phantomjs): proxy should be present and should be a Proc (lambda) to enable before_request.change_proxy, skipped" 156 | end 157 | end 158 | 159 | # before_request delay 160 | if delay = @config.dig(:before_request, :delay).presence 161 | @browser.config.before_request[:delay] = delay 162 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.delay" 163 | end 164 | 165 | # encoding 166 | if encoding = @config[:encoding] 167 | @browser.config.encoding = encoding 168 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled encoding: #{encoding}" 169 | end 170 | 171 | # return Capybara session instance 172 | @browser 173 | end 174 | end 175 | end 176 |
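The `change_proxy`/`change_user_agent` options handled above expect Procs, so a fresh value can be picked before every request. A config sketch (the proxy list and user agents are placeholders; the `ip:port:type` format follows the `proxy_string.split(":")` parsing in these builders):

```ruby
# Hypothetical values, inside a spider class definition
PROXIES = ["11.22.33.44:3128:http", "55.66.77.88:1080:socks5"]
USER_AGENTS = ["Mozilla/5.0 (X11; Linux x86_64)", "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"]

@config = {
  proxy: -> { PROXIES.sample },           # called again before each request
  user_agent: -> { USER_AGENTS.sample },
  before_request: { change_proxy: true, change_user_agent: true }
}
```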
-------------------------------------------------------------------------------- /lib/kimurai/browser_builder/selenium_chrome_builder.rb: -------------------------------------------------------------------------------- 1 | require 'capybara' 2 | require 'selenium-webdriver' 3 | require_relative '../capybara_configuration' 4 | require_relative '../capybara_ext/selenium/driver' 5 | require_relative '../capybara_ext/session' 6 | 7 | module Kimurai::BrowserBuilder 8 | class SeleniumChromeBuilder 9 | class << self 10 | attr_accessor :virtual_display 11 | end 12 | 13 | attr_reader :logger, :spider 14 | 15 | def initialize(config, spider:) 16 | @config = config 17 | @spider = spider 18 | @logger = spider.logger 19 | end 20 | 21 | def build 22 | # Register driver 23 | Capybara.register_driver :selenium_chrome do |app| 24 | # Create driver options 25 | opts = { args: %w[--disable-gpu --no-sandbox --disable-translate] } 26 | 27 | # Provide custom chrome browser path: 28 | if chrome_path = Kimurai.configuration.selenium_chrome_path 29 | opts.merge!(binary: chrome_path) 30 | end 31 | 32 | # See all options here: https://seleniumhq.github.io/selenium/docs/api/rb/Selenium/WebDriver/Chrome/Options.html 33 | driver_options = Selenium::WebDriver::Chrome::Options.new(opts) 34 | 35 | # Window size 36 | if size = @config[:window_size].presence 37 | driver_options.args << "--window-size=#{size.join(',')}" 38 | logger.debug "BrowserBuilder (selenium_chrome): enabled window_size" 39 | end 40 | 41 | # Proxy 42 | if proxy = @config[:proxy].presence 43 | proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip 44 | ip, port, type, user, password = proxy_string.split(":") 45 | 46 | if %w(http socks5).include?(type) 47 | if user.nil? && password.nil? 48 | driver_options.args << "--proxy-server=#{type}://#{ip}:#{port}" 49 | logger.debug "BrowserBuilder (selenium_chrome): enabled #{type} proxy, ip: #{ip}, port: #{port}" 50 | else 51 | logger.error "BrowserBuilder (selenium_chrome): proxy with authentication isn't supported by Selenium, skipped" 52 | end 53 | else 54 | logger.error "BrowserBuilder (selenium_chrome): wrong type of proxy: #{type}, skipped" 55 | end 56 | end 57 | 58 | if proxy_bypass_list = @config[:proxy_bypass_list].presence 59 | if proxy 60 | driver_options.args << "--proxy-bypass-list=#{proxy_bypass_list.join(';')}" 61 | logger.debug "BrowserBuilder (selenium_chrome): enabled proxy_bypass_list" 62 | else 63 | logger.error "BrowserBuilder (selenium_chrome): provide `proxy` to set proxy_bypass_list, skipped" 64 | end 65 | end 66 | 67 | # SSL 68 | if @config[:ignore_ssl_errors].present? 69 | driver_options.args << "--ignore-certificate-errors" 70 | driver_options.args << "--allow-insecure-localhost" 71 | logger.debug "BrowserBuilder (selenium_chrome): enabled ignore_ssl_errors" 72 | end 73 | 74 | # Disable images 75 | if @config[:disable_images].present? 76 | driver_options.prefs["profile.managed_default_content_settings.images"] = 2 77 | logger.debug "BrowserBuilder (selenium_chrome): enabled disable_images" 78 | end 79 | 80 | # Headers 81 | if @config[:headers].present? 82 | logger.warn "BrowserBuilder (selenium_chrome): custom headers aren't supported by Selenium, skipped" 83 | end 84 | 85 | if user_agent = @config[:user_agent].presence 86 | user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip 87 | driver_options.args << "--user-agent=#{user_agent_string}" # no shell quoting: args are passed as a list, so literal quotes would end up inside the user agent 88 | logger.debug "BrowserBuilder (selenium_chrome): enabled custom user_agent" 89 | end 90 | 91 | # Headless mode 92 | if ENV["HEADLESS"] != "false" 93 | if @config[:headless_mode] == :virtual_display 94 | if Gem::Platform.local.os == "linux" 95 | unless self.class.virtual_display 96 | require 'headless' 97 | self.class.virtual_display = Headless.new(reuse: true, destroy_at_exit: false) 98 | self.class.virtual_display.start 99 | end 100 | 101 | logger.debug "BrowserBuilder (selenium_chrome): enabled virtual_display headless_mode" 102 | else 103 | logger.error "BrowserBuilder (selenium_chrome): virtual_display headless_mode works only " \ 104 | "on Linux platform. Browser will run in normal mode. Set `native` mode instead." 105 | end 106 | else 107 | driver_options.args << "--headless" 108 | logger.debug "BrowserBuilder (selenium_chrome): enabled native headless_mode" 109 | end 110 | end 111 | 112 | chromedriver_path = Kimurai.configuration.chromedriver_path || "/usr/local/bin/chromedriver" 113 | service = Selenium::WebDriver::Service.chrome(path: chromedriver_path) 114 | Capybara::Selenium::Driver.new(app, browser: :chrome, options: driver_options, service: service) 115 | end 116 | 117 | # Create browser instance (Capybara session) 118 | @browser = Capybara::Session.new(:selenium_chrome) 119 | @browser.spider = spider 120 | logger.debug "BrowserBuilder (selenium_chrome): created browser instance" 121 | 122 | if @config[:extensions].present?
123 | logger.error "BrowserBuilder (selenium_chrome): `extensions` option not supported by Selenium, skipped" 124 | end 125 | 126 | # Cookies 127 | if cookies = @config[:cookies].presence 128 | @browser.config.cookies = cookies 129 | logger.debug "BrowserBuilder (selenium_chrome): enabled custom cookies" 130 | end 131 | 132 | # Browser instance options 133 | # skip_request_errors 134 | if skip_errors = @config[:skip_request_errors].presence 135 | @browser.config.skip_request_errors = skip_errors 136 | logger.debug "BrowserBuilder (selenium_chrome): enabled skip_request_errors" 137 | end 138 | 139 | # retry_request_errors 140 | if retry_errors = @config[:retry_request_errors].presence 141 | @browser.config.retry_request_errors = retry_errors 142 | logger.debug "BrowserBuilder (selenium_chrome): enabled retry_request_errors" 143 | end 144 | 145 | # restart_if 146 | if requests_limit = @config.dig(:restart_if, :requests_limit).presence 147 | @browser.config.restart_if[:requests_limit] = requests_limit 148 | logger.debug "BrowserBuilder (selenium_chrome): enabled restart_if.requests_limit >= #{requests_limit}" 149 | end 150 | 151 | if memory_limit = @config.dig(:restart_if, :memory_limit).presence 152 | @browser.config.restart_if[:memory_limit] = memory_limit 153 | logger.debug "BrowserBuilder (selenium_chrome): enabled restart_if.memory_limit >= #{memory_limit}" 154 | end 155 | 156 | # before_request clear_cookies 157 | if @config.dig(:before_request, :clear_cookies) 158 | @browser.config.before_request[:clear_cookies] = true 159 | logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.clear_cookies" 160 | end 161 | 162 | # before_request clear_and_set_cookies 163 | if @config.dig(:before_request, :clear_and_set_cookies) 164 | if cookies = @config[:cookies].presence 165 | @browser.config.cookies = cookies 166 | @browser.config.before_request[:clear_and_set_cookies] = true 167 | logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.clear_and_set_cookies" 168 | else 169 | logger.error "BrowserBuilder (selenium_chrome): cookies should be present to enable before_request.clear_and_set_cookies, skipped" 170 | end 171 | end 172 | 173 | # before_request change_user_agent 174 | if @config.dig(:before_request, :change_user_agent) 175 | logger.error "BrowserBuilder (selenium_chrome): before_request.change_user_agent option not supported by Selenium, skipped" 176 | end 177 | 178 | # before_request change_proxy 179 | if @config.dig(:before_request, :change_proxy) 180 | logger.error "BrowserBuilder (selenium_chrome): before_request.change_proxy option not supported by Selenium, skipped" 181 | end 182 | 183 | # before_request delay 184 | if delay = @config.dig(:before_request, :delay).presence 185 | @browser.config.before_request[:delay] = delay 186 | logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.delay" 187 | end 188 | 189 | # encoding 190 | if encoding = @config[:encoding] 191 | @browser.config.encoding = encoding 192 | logger.debug "BrowserBuilder (selenium_chrome): enabled encoding: #{encoding}" 193 | end 194 | 195 | # return Capybara session instance 196 | @browser 197 | end 198 | end 199 | end 200 |
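The chrome builder above consults two optional global settings, `selenium_chrome_path` and `chromedriver_path`. A sketch of providing them via `Kimurai.configure` (the paths are examples):

```ruby
Kimurai.configure do |config|
  config.selenium_chrome_path = "/usr/bin/chromium-browser"   # custom browser binary
  config.chromedriver_path    = "/usr/local/bin/chromedriver" # custom driver binary
end
```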
-------------------------------------------------------------------------------- /lib/kimurai/browser_builder/selenium_firefox_builder.rb: -------------------------------------------------------------------------------- 1 | require 'capybara' 2 | require 'selenium-webdriver' 3 | require_relative '../capybara_configuration' 4 | require_relative '../capybara_ext/selenium/driver' 5 | require_relative '../capybara_ext/session' 6 | 7 | module Kimurai::BrowserBuilder 8 | class SeleniumFirefoxBuilder 9 | class << self 10 | attr_accessor :virtual_display 11 | end 12 | 13 | attr_reader :logger, :spider 14 | 15 | def initialize(config, spider:) 16 | @config = config 17 | @spider = spider 18 | @logger = spider.logger 19 | end 20 | 21 | def build 22 | # Register driver 23 | Capybara.register_driver :selenium_firefox do |app| 24 | # Create driver options 25 | driver_options = Selenium::WebDriver::Firefox::Options.new 26 | driver_options.profile = Selenium::WebDriver::Firefox::Profile.new 27 | driver_options.profile["browser.link.open_newwindow"] = 3 # open windows in tabs 28 | driver_options.profile["media.peerconnection.enabled"] = false # disable web rtc 29 | 30 | # Proxy 31 | if proxy = @config[:proxy].presence 32 | proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip 33 | ip, port, type, user, password = proxy_string.split(":") 34 | 35 | if user.nil? && password.nil? 36 | driver_options.profile["network.proxy.type"] = 1 37 | if type == "http" 38 | driver_options.profile["network.proxy.http"] = ip 39 | driver_options.profile["network.proxy.http_port"] = port.to_i 40 | driver_options.profile["network.proxy.ssl"] = ip 41 | driver_options.profile["network.proxy.ssl_port"] = port.to_i 42 | 43 | logger.debug "BrowserBuilder (selenium_firefox): enabled http proxy, ip: #{ip}, port: #{port}" 44 | elsif type == "socks5" 45 | driver_options.profile["network.proxy.socks"] = ip 46 | driver_options.profile["network.proxy.socks_port"] = port.to_i 47 | driver_options.profile["network.proxy.socks_version"] = 5 48 | driver_options.profile["network.proxy.socks_remote_dns"] = true 49 | 50 | logger.debug "BrowserBuilder (selenium_firefox): enabled socks5 proxy, ip: #{ip}, port: #{port}" 51 | else 52 | logger.error "BrowserBuilder (selenium_firefox): wrong type of proxy: #{type}, skipped" 53 | end 54 | else 55 | logger.error "BrowserBuilder (selenium_firefox): proxy with authentication isn't supported by Selenium, skipped" 56 | end 57 | end 58 | 59 | if proxy_bypass_list = @config[:proxy_bypass_list].presence 60 | if proxy 61 | driver_options.profile["network.proxy.no_proxies_on"] = proxy_bypass_list.join(", ") 62 | logger.debug "BrowserBuilder (selenium_firefox): enabled proxy_bypass_list" 63 | else 64 | logger.error "BrowserBuilder (selenium_firefox): provide `proxy` to set proxy_bypass_list, skipped" 65 | end 66 | end 67 | 68 | # SSL 69 | if @config[:ignore_ssl_errors].present? 70 | driver_options.profile.secure_ssl = false 71 | driver_options.profile.assume_untrusted_certificate_issuer = true 72 | logger.debug "BrowserBuilder (selenium_firefox): enabled ignore_ssl_errors" 73 | end 74 | 75 | # Disable images 76 | if @config[:disable_images].present? 77 | driver_options.profile["permissions.default.image"] = 2 78 | logger.debug "BrowserBuilder (selenium_firefox): enabled disable_images" 79 | end 80 | 81 | # Headers 82 | if @config[:headers].present? 83 | logger.warn "BrowserBuilder (selenium_firefox): custom headers aren't supported by Selenium, skipped" 84 | end 85 | 86 | if user_agent = @config[:user_agent].presence 87 | user_agent_string = (user_agent.class == Proc ?
user_agent.call : user_agent).strip 88 | driver_options.profile["general.useragent.override"] = user_agent_string 89 | logger.debug "BrowserBuilder (selenium_firefox): enabled custom user_agent" 90 | end 91 | 92 | # Headless mode 93 | if ENV["HEADLESS"] != "false" 94 | if @config[:headless_mode] == :virtual_display 95 | if Gem::Platform.local.os == "linux" 96 | unless self.class.virtual_display 97 | require 'headless' 98 | self.class.virtual_display = Headless.new(reuse: true, destroy_at_exit: false) 99 | self.class.virtual_display.start 100 | end 101 | 102 | logger.debug "BrowserBuilder (selenium_firefox): enabled virtual_display headless_mode" 103 | else 104 | logger.error "BrowserBuilder (selenium_firefox): virtual_display headless_mode works only " \ 105 | "on Linux platform. Browser will run in normal mode. Set `native` mode instead." 106 | end 107 | else 108 | driver_options.args << "--headless" 109 | logger.debug "BrowserBuilder (selenium_firefox): enabled native headless_mode" 110 | end 111 | end 112 | 113 | Capybara::Selenium::Driver.new(app, browser: :firefox, options: driver_options) 114 | end 115 | 116 | # Create browser instance (Capybara session) 117 | @browser = Capybara::Session.new(:selenium_firefox) 118 | @browser.spider = spider 119 | logger.debug "BrowserBuilder (selenium_firefox): created browser instance" 120 | 121 | if @config[:extensions].present? 122 | logger.error "BrowserBuilder (selenium_firefox): `extensions` option not supported by Selenium, skipped" 123 | end 124 | 125 | # Window size 126 | if size = @config[:window_size].presence 127 | @browser.current_window.resize_to(*size) 128 | logger.debug "BrowserBuilder (selenium_firefox): enabled window_size" 129 | end 130 | 131 | # Cookies 132 | if cookies = @config[:cookies].presence 133 | @browser.config.cookies = cookies 134 | logger.debug "BrowserBuilder (selenium_firefox): enabled custom cookies" 135 | end 136 | 137 | # Browser instance options 138 | # skip_request_errors 139 | if skip_errors = @config[:skip_request_errors].presence 140 | @browser.config.skip_request_errors = skip_errors 141 | logger.debug "BrowserBuilder (selenium_firefox): enabled skip_request_errors" 142 | end 143 | 144 | # retry_request_errors 145 | if retry_errors = @config[:retry_request_errors].presence 146 | @browser.config.retry_request_errors = retry_errors 147 | logger.debug "BrowserBuilder (selenium_firefox): enabled retry_request_errors" 148 | end 149 | 150 | # restart_if 151 | if requests_limit = @config.dig(:restart_if, :requests_limit).presence 152 | @browser.config.restart_if[:requests_limit] = requests_limit 153 | logger.debug "BrowserBuilder (selenium_firefox): enabled restart_if.requests_limit >= #{requests_limit}" 154 | end 155 | 156 | if memory_limit = @config.dig(:restart_if, :memory_limit).presence 157 | @browser.config.restart_if[:memory_limit] = memory_limit 158 | logger.debug "BrowserBuilder (selenium_firefox): enabled restart_if.memory_limit >= #{memory_limit}" 159 | end 160 | 161 | # before_request clear_cookies 162 | if @config.dig(:before_request, :clear_cookies) 163 | @browser.config.before_request[:clear_cookies] = true 164 | logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.clear_cookies" 165 | end 166 | 167 | # before_request clear_and_set_cookies 168 | if @config.dig(:before_request, :clear_and_set_cookies) 169 | if cookies = @config[:cookies].presence 170 | @browser.config.cookies = cookies 171 | @browser.config.before_request[:clear_and_set_cookies] = true 172 | logger.debug 
"BrowserBuilder (selenium_firefox): enabled before_request.clear_and_set_cookies" 173 | else 174 | logger.error "BrowserBuilder (selenium_firefox): cookies should be present to enable before_request.clear_and_set_cookies, skipped" 175 | end 176 | end 177 | 178 | # before_request change_user_agent 179 | if @config.dig(:before_request, :change_user_agent) 180 | logger.error "BrowserBuilder (selenium_firefox): before_request.change_user_agent option not supported by Selenium, skipped" 181 | end 182 | 183 | # before_request change_proxy 184 | if @config.dig(:before_request, :change_proxy) 185 | logger.error "BrowserBuilder (selenium_firefox): before_request.change_proxy option not supported by Selenium, skipped" 186 | end 187 | 188 | # before_request delay 189 | if delay = @config.dig(:before_request, :delay).presence 190 | @browser.config.before_request[:delay] = delay 191 | logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.delay" 192 | end 193 | 194 | # encoding 195 | if encoding = @config[:encoding] 196 | @browser.config.encoding = encoding 197 | logger.debug "BrowserBuilder (selenium_firefox): enabled encoding: #{encoding}" 198 | end 199 | 200 | # return Capybara session instance 201 | @browser 202 | end 203 | end 204 | end 205 | -------------------------------------------------------------------------------- /lib/kimurai/capybara_configuration.rb: -------------------------------------------------------------------------------- 1 | require 'capybara' 2 | 3 | Capybara.configure do |config| 4 | config.run_server = false 5 | config.default_selector = :xpath 6 | config.save_path = "tmp" 7 | config.default_max_wait_time = 10 8 | config.ignore_hidden_elements = false 9 | config.threadsafe = true 10 | end 11 | -------------------------------------------------------------------------------- /lib/kimurai/capybara_ext/driver/base.rb: -------------------------------------------------------------------------------- 1 | require 'pathname' 2 | 3 | class Capybara::Driver::Base 4 | attr_accessor :visited 5 | attr_writer :requests, :responses 6 | 7 | def requests 8 | @requests ||= 0 9 | end 10 | 11 | def responses 12 | @responses ||= 0 13 | end 14 | 15 | def current_memory 16 | driver_pid = pid 17 | 18 | all = (get_descendant_processes(driver_pid) << driver_pid).uniq 19 | all.map { |pid| get_process_memory(pid) }.sum 20 | end 21 | 22 | private 23 | 24 | def get_descendant_processes(base) 25 | descendants = Hash.new { |ht, k| ht[k] = [k] } 26 | Hash[*`ps -eo pid,ppid`.scan(/\d+/).map(&:to_i)].each do |pid, ppid| 27 | descendants[ppid] << descendants[pid] 28 | end 29 | 30 | descendants[base].flatten - [base] 31 | end 32 | 33 | # https://github.com/schneems/get_process_mem 34 | # Note: for Linux takes PSS (not RSS) memory (I think PSS better fits in this case) 35 | def get_process_memory(pid) 36 | case @platform ||= Gem::Platform.local.os 37 | when "linux" 38 | begin 39 | file = Pathname.new "/proc/#{pid}/smaps" 40 | return 0 unless file.exist? 41 | 42 | lines = file.each_line.select { |line| line.match(/^Pss/) } 43 | return 0 if lines.empty? 44 | 45 | lines.reduce(0) do |sum, line| 46 | line.match(/(?(\d*\.{0,1}\d+))\s+(?\w\w)/) do |m| 47 | sum += m[:value].to_i 48 | end 49 | 50 | sum 51 | end 52 | rescue Errno::EACCES 53 | 0 54 | end 55 | when "darwin" 56 | mem = `ps -o rss= -p #{pid}`.strip 57 | mem.empty? ? 
0 : mem.to_i 58 | else 59 | raise "Can't check process memory, unsupported platform: #{@platform}" 60 | end 61 | end 62 | end 63 | -------------------------------------------------------------------------------- /lib/kimurai/capybara_ext/mechanize/driver.rb: -------------------------------------------------------------------------------- 1 | require 'mechanize' 2 | require_relative '../driver/base' 3 | 4 | class Capybara::Mechanize::Driver 5 | # Extend capybara-mechanize to support Poltergeist-like methods 6 | # https://www.rubydoc.info/gems/poltergeist/Capybara/Poltergeist/Driver 7 | 8 | def set_proxy(ip, port, type, user = nil, password = nil) 9 | # type is always "http", "socks" is not supported (yet) 10 | browser.agent.set_proxy(ip, port, user, password) 11 | end 12 | 13 | ### 14 | 15 | def headers 16 | browser.agent.request_headers 17 | end 18 | 19 | def headers=(headers) 20 | browser.agent.request_headers = headers 21 | end 22 | 23 | def add_header(name, value) 24 | browser.agent.request_headers[name] = value 25 | end 26 | 27 | ### 28 | 29 | def get_cookies 30 | browser.agent.cookies 31 | end 32 | 33 | def set_cookie(name, value, options = {}) 34 | options[:name] ||= name 35 | options[:value] ||= value 36 | 37 | cookie = Mechanize::Cookie.new(options.merge path: "/") 38 | browser.agent.cookie_jar << cookie 39 | end 40 | 41 | def set_cookies(cookies) 42 | cookies.each do |cookie| 43 | set_cookie(cookie[:name], cookie[:value], cookie) 44 | end 45 | end 46 | 47 | def clear_cookies 48 | browser.agent.cookie_jar.clear! 49 | end 50 | 51 | ### 52 | 53 | def quit 54 | browser.agent.shutdown 55 | end 56 | 57 | ### 58 | 59 | # Override parent method `current_memory` for Mechanize (we can't measure the memory of the Mechanize driver) 60 | def current_memory 61 | nil 62 | end 63 | 64 | def pid 65 | nil 66 | end 67 | 68 | def port 69 | nil 70 | end 71 | end 72 | -------------------------------------------------------------------------------- /lib/kimurai/capybara_ext/poltergeist/driver.rb: -------------------------------------------------------------------------------- 1 | require_relative '../driver/base' 2 | 3 | module Capybara::Poltergeist 4 | class Driver 5 | def pid 6 | client_pid 7 | end 8 | 9 | def port 10 | server.port 11 | end 12 | end 13 | end 14 | -------------------------------------------------------------------------------- /lib/kimurai/capybara_ext/selenium/driver.rb: -------------------------------------------------------------------------------- 1 | require_relative '../driver/base' 2 | 3 | class Capybara::Selenium::Driver 4 | def get_cookies 5 | browser.manage.all_cookies 6 | end 7 | 8 | def set_cookie(name, value, options = {}) 9 | options[:name] ||= name 10 | options[:value] ||= value 11 | 12 | browser.manage.add_cookie(options) 13 | end 14 | 15 | def set_cookies(cookies) 16 | cookies.each do |cookie| 17 | set_cookie(cookie[:name], cookie[:value], cookie) 18 | end 19 | end 20 | 21 | def clear_cookies 22 | browser.manage.delete_all_cookies 23 | end 24 | 25 | ### 26 | 27 | def pid 28 | @pid ||= `lsof -i tcp:#{port} -t`.strip.to_i 29 | end 30 | 31 | def port 32 | @port ||= browser.send(:bridge).instance_variable_get("@http").instance_variable_get("@server_url").port 33 | end 34 | end 35 | -------------------------------------------------------------------------------- /lib/kimurai/capybara_ext/session.rb: -------------------------------------------------------------------------------- 1 | require 'capybara' 2 | require 'nokogiri' 3 | require 'json' 4 | require_relative 'session/config' 5 |
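`Session#visit` below drives its retry/skip logic through `config.retry_request_errors` and `config.skip_request_errors`. A spider config sketch (the error classes and message pattern are examples; a Hash entry may carry `error:`, `message:` and, for retries, `skip_on_failure:`):

```ruby
@config = {
  # retry these up to max_retries; the Hash form gives up silently after the last retry
  retry_request_errors: [Net::ReadTimeout, { error: RuntimeError, skip_on_failure: true }],
  # skip the request entirely when the error message matches
  skip_request_errors: [{ error: RuntimeError, message: /404|410/ }]
}
```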
6 | module Capybara 7 | class Session 8 | attr_accessor :spider 9 | 10 | alias_method :original_visit, :visit 11 | def visit(visit_uri, delay: config.before_request[:delay], skip_request_options: false, max_retries: 3) 12 | if spider 13 | process_delay(delay) if delay 14 | retries, sleep_interval = 0, 0 15 | 16 | begin 17 | check_request_options(visit_uri) unless skip_request_options 18 | driver.requests += 1 and logger.info "Browser: started get request to: #{visit_uri}" 19 | spider.class.update(:visits, :requests) if spider.with_info 20 | 21 | original_visit(visit_uri) 22 | rescue => e 23 | if match_error?(e, type: :to_skip) 24 | logger.error "Browser: skip request error: #{e.inspect}, url: #{visit_uri}" 25 | spider.add_event(:requests_errors, e.inspect) if spider.with_info 26 | false 27 | elsif match_error?(e, type: :to_retry) 28 | logger.error "Browser: retry request error: #{e.inspect}, url: #{visit_uri}" 29 | spider.add_event(:requests_errors, e.inspect) if spider.with_info 30 | 31 | if (retries += 1) <= max_retries 32 | logger.info "Browser: sleep #{(sleep_interval += 15)} seconds and process retry № #{retries} to the url: #{visit_uri}" 33 | sleep sleep_interval and retry 34 | else 35 | logger.error "Browser: all retries (#{retries - 1}) to the url #{visit_uri} have failed" 36 | raise e unless skip_error_on_failure?(e) 37 | end 38 | else 39 | raise e 40 | end 41 | else 42 | driver.responses += 1 and logger.info "Browser: finished get request to: #{visit_uri}" 43 | spider.class.update(:visits, :responses) if spider.with_info 44 | driver.visited = true unless driver.visited 45 | true 46 | ensure 47 | if spider.with_info 48 | logger.info "Info: visits: requests: #{spider.class.visits[:requests]}, responses: #{spider.class.visits[:responses]}" 49 | end 50 | 51 | if memory = driver.current_memory 52 | logger.debug "Browser: driver.current_memory: #{memory}" 53 | end 54 | end 55 | else 56 | original_visit(visit_uri) 57 | end 58 | end 59 | 60 | def destroy_driver! 61 | if @driver 62 | begin 63 | @driver.quit 64 | # handle Net::ReadTimeout error for Selenium-like drivers 65 | rescue Net::ReadTimeout 66 | @driver.quit 67 | end 68 | 69 | @driver = nil 70 | logger.info "Browser: driver #{mode} has been destroyed" 71 | else 72 | logger.warn "Browser: driver #{mode} is not present" 73 | end 74 | end 75 | 76 | def restart! 77 | if mode.match?(/poltergeist/) 78 | @driver.browser.restart 79 | @driver.requests, @driver.responses = 0, 0 80 | else 81 | destroy_driver!
82 | driver 83 | end 84 | 85 | logger.info "Browser: driver has been restarted: name: #{mode}, pid: #{driver.pid}, port: #{driver.port}" 86 | end 87 | 88 | def current_response(response_type = :html) 89 | case response_type 90 | when :html 91 | if config.encoding 92 | if config.encoding == :auto 93 | charset = body.force_encoding("ISO-8859-1").encode("UTF-8")[/<meta.*?charset=["']?([\w-]+)/i, 1] 94 | Nokogiri::HTML(body, nil, charset) 95 | else 96 | Nokogiri::HTML(body, nil, config.encoding) 97 | end 98 | else 99 | Nokogiri::HTML(body) 100 | end 101 | when :json 102 | JSON.parse(body) 103 | end 104 | end 105 | 106 | ### 107 | 108 | # Handy method to perform some action in a new browser window (tab), 109 | # process the given block within this window, and then automatically 110 | # close the window and return back to the first tab. 111 | # You can provide either a `url:` to visit in the new window, 112 | # or an `action:` (lambda) which opens a new window when called. 113 | # 114 | # Usage example: 115 | # action = -> { browser.find("//some/element/path").click } 116 | # browser.within_new_window_by(action: action) do 117 | # do some stuff and then automatically close this tab and return back to the first tab 118 | # end 119 | def within_new_window_by(action: nil, url: nil) 120 | case 121 | when action 122 | opened_window = window_opened_by { action.call } 123 | within_window(opened_window) do 124 | yield 125 | current_window.close 126 | end 127 | when url 128 | within_window(open_new_window) do 129 | visit(url) 130 | 131 | yield 132 | current_window.close 133 | end 134 | end 135 | end 136 | 137 | ### 138 | 139 | def scroll_to_bottom 140 | execute_script("window.scrollBy(0,10000)") 141 | end 142 | 143 | private 144 | 145 | def skip_error_on_failure?(e) 146 | config.retry_request_errors.any? do |error| 147 | error[:skip_on_failure] && e.class.ancestors.include?(error[:error]) if error.kind_of?(Hash) 148 | end 149 | end 150 | 151 | def match_error?(e, type:) 152 | errors = 153 | case type 154 | when :to_retry then config.retry_request_errors 155 | when :to_skip then config.skip_request_errors 156 | end 157 | 158 | errors.any? do |error| 159 | if error.kind_of?(Hash) 160 | match_class = e.class.ancestors.include?(error[:error]) 161 | if error[:message].present? 162 | if error[:message].kind_of?(Regexp) 163 | e.message&.match?(error[:message]) 164 | else 165 | e.message&.include?(error[:message]) 166 | end && match_class 167 | else 168 | match_class 169 | end 170 | else 171 | e.class.ancestors.include?(error) 172 | end 173 | end 174 | end 175 | 176 | def process_delay(delay) 177 | interval = (delay.class == Range ? rand(delay) : delay) 178 | logger.debug "Browser: sleep #{interval.round(2)} #{'second'.pluralize(interval)} before request..." 179 | sleep interval 180 | end 181 | 182 | def check_request_options(url_to_visit) 183 | # restart_if 184 | if memory_limit = config.restart_if[:memory_limit] 185 | memory = driver.current_memory 186 | if memory && memory >= memory_limit 187 | logger.warn "Browser: memory_limit #{memory_limit} of driver.current_memory (#{memory}) is exceeded (engine: #{mode})" 188 | restart! 189 | end 190 | end 191 | 192 | if requests_limit = config.restart_if[:requests_limit] 193 | requests = driver.requests 194 | if requests >= requests_limit 195 | logger.warn "Browser: requests_limit #{requests_limit} of driver.requests (#{requests}) is exceeded (engine: #{mode})" 196 | restart! 197 | end 198 | end 199 | 200 | # cookies 201 | # (Selenium only) if config.cookies are present and the browser was just created, 202 | # visit url_to_visit first and only then set cookies: 203 | if driver.visited.nil?
&& config.cookies && mode.match?(/selenium/) 204 | visit(url_to_visit, skip_request_options: true) 205 | config.cookies.each do |cookie| 206 | driver.set_cookie(cookie[:name], cookie[:value], cookie) 207 | end 208 | end 209 | 210 | if config.before_request[:clear_cookies] 211 | driver.clear_cookies 212 | logger.debug "Browser: cleared cookies before request" 213 | end 214 | 215 | if config.before_request[:clear_and_set_cookies] 216 | driver.clear_cookies 217 | 218 | # (Selenium only) if the browser hasn't visited any page yet, visit url_to_visit 219 | # first and then set cookies (needed after a browser restart): 220 | if driver.visited.nil? && mode.match?(/selenium/) 221 | visit(url_to_visit, skip_request_options: true) 222 | end 223 | 224 | config.cookies.each do |cookie| 225 | driver.set_cookie(cookie[:name], cookie[:value], cookie) 226 | end 227 | 228 | logger.debug "Browser: cleared and set cookies before request" 229 | end 230 | 231 | # user_agent 232 | if config.before_request[:change_user_agent] 233 | driver.add_header("User-Agent", config.user_agent.call) 234 | logger.debug "Browser: changed user_agent before request" 235 | end 236 | 237 | # proxy 238 | if config.before_request[:change_proxy] 239 | proxy_string = config.proxy.call 240 | driver.set_proxy(*proxy_string.split(":")) 241 | logger.debug "Browser: changed proxy before request" 242 | end 243 | end 244 | 245 | def logger 246 | spider.logger 247 | end 248 | end 249 | end 250 | -------------------------------------------------------------------------------- /lib/kimurai/capybara_ext/session/config.rb: -------------------------------------------------------------------------------- 1 | module Capybara 2 | class SessionConfig 3 | attr_accessor :cookies, :proxy, :user_agent, :encoding 4 | attr_writer :retry_request_errors, :skip_request_errors 5 | 6 | def retry_request_errors 7 | @retry_request_errors ||= [] 8 | end 9 | 10 | def skip_request_errors 11 | @skip_request_errors ||= [] 12 | end 13 | 14 | def restart_if 15 | @restart_if ||= {} 16 | end 17 | 18 | def before_request 19 | @before_request ||= {} 20 | end 21 | end 22 | end 23 | -------------------------------------------------------------------------------- /lib/kimurai/cli.rb: -------------------------------------------------------------------------------- 1 | require 'thor' 2 | 3 | module Kimurai 4 | class CLI < Thor 5 | map %w[--version -v] => :__print_version 6 | 7 | desc "generate", "Generator, available types: project, spider, schedule" 8 | def generate(generator_type, *args) 9 | case generator_type 10 | when "project" 11 | project_name = args.shift 12 | raise "Provide project name to generate a new project" unless project_name.present? 13 | Generator.new.generate_project(project_name) 14 | when "spider" 15 | spider_name = args.shift 16 | raise "Provide spider name to generate a spider" unless spider_name.present? 17 | Generator.new.generate_spider(spider_name, in_project: inside_project?)
18 | when "schedule" 19 | Generator.new.generate_schedule 20 | else 21 | raise "Don't know this generator type: #{generator_type}" 22 | end 23 | end 24 | 25 | ### 26 | 27 | desc "setup", "Setup server" 28 | option :port, aliases: :p, type: :string, banner: "Port for ssh connection" 29 | option "ask-sudo", type: :boolean, banner: "Provide sudo password for a user to install system-wide packages" 30 | option "ask-auth-pass", type: :boolean, banner: "Auth using password" 31 | option "ssh-key-path", type: :string, banner: "Auth using ssh key" 32 | option :local, type: :boolean, banner: "Run setup on a local machine (Ubuntu only)" 33 | def setup(user_host) 34 | command = AnsibleCommandBuilder.new(user_host, options, playbook: "setup").get 35 | 36 | pid = spawn *command 37 | Process.wait pid 38 | end 39 | 40 | desc "deploy", "Deploy project to the server and update cron schedule" 41 | option :port, aliases: :p, type: :string, banner: "Port for ssh connection" 42 | option "ask-auth-pass", type: :boolean, banner: "Auth using password" 43 | option "ssh-key-path", type: :string, banner: "Auth using ssh key" 44 | option "repo-url", type: :string, banner: "Repo url" 45 | option "repo-key-path", type: :string, banner: "SSH key for a git repo" 46 | option "skip-check", type: :boolean, default: false, banner: "Skip git repository checks" 47 | def deploy(user_host) 48 | unless options["skip-check"] 49 | if !`git status --short`.empty? 50 | raise "Deploy: Please commit your changes first" 51 | elsif `git remote`.empty? 52 | raise "Deploy: Please add remote origin repository to your repo first" 53 | elsif !`git rev-list master...origin/master`.empty? 54 | raise "Deploy: Please push your commits to the remote origin repo first" 55 | end 56 | end 57 | 58 | repo_url = options["repo-url"] ? options["repo-url"] : `git remote get-url origin`.strip 59 | repo_name = repo_url[/\/([^\/]*)\.git/i, 1] 60 | 61 | command = AnsibleCommandBuilder.new(user_host, options, playbook: "deploy", 62 | vars: { repo_url: repo_url, repo_name: repo_name, repo_key_path: options["repo-key-path"] } 63 | ).get 64 | 65 | pid = spawn *command 66 | Process.wait pid 67 | end 68 | 69 | ### 70 | 71 | desc "crawl", "Run a particular spider by it's name" 72 | def crawl(spider_name) 73 | raise "Can't find Kimurai project" unless inside_project? 74 | require './config/boot' 75 | 76 | unless klass = Kimurai.find_by_name(spider_name) 77 | raise "Can't find spider with name `#{spider_name}` in the project. " \ 78 | "To list all available spiders, run: `$ bundle exec kimurai list`" 79 | end 80 | 81 | # Set time_zone if exists 82 | if time_zone = Kimurai.configuration.time_zone 83 | Kimurai.time_zone = time_zone 84 | end 85 | 86 | klass.crawl! 87 | end 88 | 89 | desc "parse", "Parse url in the particular spider method" 90 | option :url, type: :string, required: true, banner: "Url to pass to the method" 91 | def parse(spider_name, method_name) 92 | raise "Can't find Kimurai project" unless inside_project? 93 | require './config/boot' 94 | 95 | unless klass = Kimurai.find_by_name(spider_name) 96 | raise "Can't find spider with name `#{spider_name}` in the project. 
" \ 97 | "To list all available spiders, run: `$ bundle exec kimurai list`" 98 | end 99 | 100 | klass.parse!(method_name, url: options["url"]) 101 | end 102 | 103 | desc "console", "Start Kimurai console" 104 | option :engine, type: :string, banner: "Engine to use" 105 | option :url, type: :string, banner: "Url to process" 106 | def console(spider_name = nil) 107 | require 'pry' 108 | require './config/boot' if inside_project? 109 | 110 | if spider_name 111 | raise "Can't find Kimurai project" unless inside_project? 112 | 113 | unless klass = Kimurai.find_by_name(spider_name) 114 | raise "Can't find spider with name `#{spider_name}` in the project. " \ 115 | "To list all available spiders, run: `$ bundle exec kimurai list`" 116 | end 117 | else 118 | klass = inside_project? ? ApplicationSpider : ::Kimurai::Base 119 | end 120 | 121 | engine = options["engine"]&.delete(":")&.to_sym 122 | if url = options["url"] 123 | klass.new(engine).request_to(:console, url: options["url"]) 124 | else 125 | klass.new(engine).public_send(:console) 126 | end 127 | end 128 | 129 | desc "list", "List all available spiders in the current project" 130 | def list 131 | raise "Can't find Kimurai project" unless inside_project? 132 | require './config/boot' 133 | 134 | Kimurai.list.keys.sort.each { |name| puts name } 135 | end 136 | 137 | desc "runner", "Run all spiders in the project in queue" 138 | option :include, type: :array, default: [], banner: "List of spiders to run" 139 | option :exclude, type: :array, default: [], banner: "List of spiders to exclude from run" 140 | option :jobs, aliases: :j, type: :numeric, default: 1, banner: "The number of concurrent jobs" 141 | def runner 142 | raise "Can't find Kimurai project" unless inside_project? 143 | 144 | jobs = options["jobs"] 145 | raise "Jobs count can't be 0" if jobs == 0 146 | 147 | require './config/boot' 148 | require 'kimurai/runner' 149 | 150 | spiders = options["include"].presence || Kimurai.list.keys 151 | spiders -= options["exclude"] 152 | 153 | Runner.new(spiders, jobs).run! 154 | end 155 | 156 | desc "--version, -v", "Print the version" 157 | def __print_version 158 | puts VERSION 159 | end 160 | 161 | desc "dashboard", "Run dashboard" 162 | def dashboard 163 | raise "Can't find Kimurai project" unless inside_project? 164 | 165 | require './config/boot' 166 | if Object.const_defined?("Kimurai::Dashboard") 167 | require 'kimurai/dashboard/app' 168 | Kimurai::Dashboard::App.run! 169 | else 170 | raise "Kimurai::Dashboard is not defined" 171 | end 172 | end 173 | 174 | private 175 | 176 | def inside_project? 
177 | Dir.exists?("spiders") && File.exists?("./config/boot.rb") 178 | end 179 | end 180 | end 181 | 182 | require_relative 'cli/generator' 183 | require_relative 'cli/ansible_command_builder' 184 | -------------------------------------------------------------------------------- /lib/kimurai/cli/ansible_command_builder.rb: -------------------------------------------------------------------------------- 1 | require 'cliver' 2 | 3 | module Kimurai 4 | class CLI 5 | class AnsibleCommandBuilder 6 | def initialize(user_host, options, playbook:, vars: {}) 7 | @user_host = user_host 8 | @options = options 9 | @playbook = playbook 10 | @vars = vars 11 | end 12 | 13 | def get 14 | unless Cliver.detect("ansible-playbook") 15 | raise "Can't find `ansible-playbook` executable, to install: " \ 16 | "Mac OS X: `$ brew install ansible`, Ubuntu: `$ sudo apt install ansible`" 17 | end 18 | 19 | user = @user_host[/(.*?)\@/, 1] 20 | host = @user_host[/\@(.+)/, 1] || @user_host 21 | inventory = @options["port"] ? "#{host}:#{@options['port']}," : "#{host}," 22 | 23 | gem_dir = Gem::Specification.find_by_name("kimurai").gem_dir 24 | playbook_path = gem_dir + "/lib/kimurai/automation/" + "#{@playbook}.yml" 25 | 26 | command = [ 27 | "ansible-playbook", playbook_path, 28 | "--inventory", inventory, 29 | "--ssh-extra-args", "-oForwardAgent=yes", 30 | "--connection", @options["local"] ? "local" : "smart", 31 | "--extra-vars", "ansible_python_interpreter=/usr/bin/python3" 32 | ] 33 | 34 | if File.exists? "config/automation.yml" 35 | require 'yaml' 36 | if config = YAML.load_file("config/automation.yml").dig(@playbook) 37 | config.each { |key, value| @vars[key] = value unless @vars[key] } 38 | end 39 | end 40 | 41 | @vars.each do |key, value| 42 | next unless value.present? 43 | command.push "--extra-vars", "#{key}=#{value}" 44 | end 45 | 46 | if user 47 | command.push "--user", user 48 | end 49 | 50 | if @options["ask-sudo"] 51 | command.push "--ask-become-pass" 52 | end 53 | 54 | if @options["ask-auth-pass"] 55 | unless Cliver.detect("sshpass") 56 | raise "Can't find `sshpass` executable for password authentication, to install: " \ 57 | "Mac OS X: `$ brew install http://git.io/sshpass.rb`, Ubuntu: `$ sudo apt install sshpass`" 58 | end 59 | 60 | command.push "--ask-pass" 61 | end 62 | 63 | if ssh_key_path = @options["ssh-key-path"] 64 | command.push "--private-key", ssh_key_path 65 | end 66 | 67 | command 68 | end 69 | end 70 | end 71 | end 72 | -------------------------------------------------------------------------------- /lib/kimurai/cli/generator.rb: -------------------------------------------------------------------------------- 1 | module Kimurai 2 | class CLI 3 | class Generator < Thor::Group 4 | include Thor::Actions 5 | 6 | def self.source_root 7 | File.dirname(File.expand_path('..', __FILE__)) 8 | end 9 | 10 | def generate_project(project_name) 11 | directory "template", project_name 12 | inside(project_name) do 13 | run "bundle install" 14 | run "git init" 15 | end 16 | end 17 | 18 | def generate_spider(spider_name, in_project:) 19 | spider_path = in_project ? "spiders/#{spider_name}.rb" : "./#{spider_name}.rb" 20 | raise "Spider #{spider_path} already exists" if File.exists? spider_path 21 | 22 | spider_class = to_spider_class(spider_name) 23 | create_file spider_path do 24 | <<~RUBY 25 | class #{spider_class} < #{in_project ? 
'ApplicationSpider' : 'Kimurai::Base'} 26 | @name = "#{spider_name}" 27 | @start_urls = [] 28 | @config = {} 29 | 30 | def parse(response, url:, data: {}) 31 | end 32 | end 33 | RUBY 34 | end 35 | 36 | unless in_project 37 | insert_into_file spider_path, "  @engine = :mechanize\n", after: "@name = \"#{spider_name}\"\n" 38 | prepend_to_file spider_path, "require 'kimurai'\n\n" 39 | append_to_file spider_path, "\n#{spider_class}.crawl!" 40 | end 41 | end 42 | 43 | def generate_schedule 44 | copy_file "template/config/schedule.rb", "./schedule.rb" 45 | end 46 | 47 | private 48 | 49 | def to_spider_class(string) 50 | string.sub(/^./) { $&.capitalize } 51 | .gsub(/(?:_|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" } 52 | .gsub(/(?:-|(\/))([a-z\d]*)/) { "Dash#{$2.capitalize}" } 53 | .gsub(/(?:\.|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" } 54 | end 55 | end 56 | end 57 | end 58 | -------------------------------------------------------------------------------- /lib/kimurai/core_ext/array.rb: -------------------------------------------------------------------------------- 1 | class Array 2 | def in_sorted_groups(number, fill_with = nil) 3 | sorted_groups = Array.new(number) { [] } 4 | 5 | self.in_groups_of(number, fill_with).each do |group| 6 | number.times do |i| 7 | group.fetch(i) rescue next 8 | sorted_groups[i] << group[i] 9 | end 10 | end 11 | 12 | sorted_groups 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /lib/kimurai/core_ext/hash.rb: -------------------------------------------------------------------------------- 1 | class Hash 2 | def deep_merge_excl(second, exclude) 3 | self.merge(second.slice(*exclude)).deep_merge(second.except(*exclude)) 4 | end 5 | end 6 | -------------------------------------------------------------------------------- /lib/kimurai/core_ext/numeric.rb: -------------------------------------------------------------------------------- 1 | class Numeric 2 | # https://stackoverflow.com/a/1679963 3 | def duration 4 | secs = self.to_int 5 | mins = secs / 60 6 | hours = mins / 60 7 | days = hours / 24 8 | 9 | if days > 0 10 | "#{days}d, #{hours % 24}h" 11 | elsif hours > 0 12 | "#{hours}h, #{mins % 60}m" 13 | elsif mins > 0 14 | "#{mins}m, #{secs % 60}s" 15 | elsif secs >= 0 16 | "#{secs}s" 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /lib/kimurai/core_ext/string.rb: -------------------------------------------------------------------------------- 1 | require 'murmurhash3' 2 | 3 | class String 4 | def to_id 5 | MurmurHash3::V32.str_hash(self) 6 | end 7 | end 8 | -------------------------------------------------------------------------------- /lib/kimurai/pipeline.rb: -------------------------------------------------------------------------------- 1 | module Kimurai 2 | class Pipeline 3 | class DropItemError < StandardError; end 4 | def self.name 5 | self.to_s.sub(/.*?::/, "").underscore.to_sym 6 | end 7 | 8 | include BaseHelper 9 | attr_accessor :spider 10 | 11 | def name 12 | self.class.name 13 | end 14 | 15 | ### 16 | 17 | def storage 18 | spider.storage 19 | end 20 | 21 | def unique?(scope, value) 22 | spider.unique?(scope, value) 23 | end 24 | 25 | def save_to(path, item, format:, position: true, append: false) 26 | spider.save_to(path, item, format: format, position: position, append: append) 27 | end 28 | 29 | def logger 30 | spider.logger 31 | end 32 | end 33 | end 34 |
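`Kimurai::Pipeline` above defines the shared helper surface (`storage`, `unique?`, `save_to`, `logger`) that concrete pipelines inherit. For reference, a minimal sketch of a custom pipeline built on these helpers; the `ProductValidator` name, `:sku` field, and output path are illustrative assumptions, not part of the gem:

```ruby
# pipelines/product_validator.rb (hypothetical example)
class ProductValidator < Kimurai::Pipeline
  def process_item(item, options: {})
    # Drop duplicates via the spider's storage-backed `unique?` helper:
    raise DropItemError, "Duplicate sku: #{item[:sku]}" unless unique?(:sku, item[:sku])

    # Persist the valid item through the spider's `save_to` helper:
    save_to "db/products.json", item, format: :pretty_json

    # Return the item so it continues to the next pipeline:
    item
  end
end
```

Because `Pipeline.name` converts the class name to an underscored symbol, a spider would enable this pipeline with `@pipelines = [:product_validator]`, and each pipeline in that list receives the return value of the previous one.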
-------------------------------------------------------------------------------- /lib/kimurai/runner.rb: -------------------------------------------------------------------------------- 1 | require 'pmap' 2 | 3 | module Kimurai 4 | class Runner 5 | attr_reader :jobs, :spiders, :session_info 6 | 7 | def initialize(spiders, parallel_jobs) 8 | @jobs = parallel_jobs 9 | @spiders = spiders 10 | @start_time = Time.now 11 | 12 | @session_info = { 13 | id: @start_time.to_i, 14 | status: :processing, 15 | start_time: @start_time, 16 | stop_time: nil, 17 | environment: Kimurai.env, 18 | concurrent_jobs: @jobs, 19 | spiders: @spiders 20 | } 21 | 22 | if time_zone = Kimurai.configuration.time_zone 23 | Kimurai.time_zone = time_zone 24 | end 25 | 26 | ENV.store("SESSION_ID", @start_time.to_i.to_s) 27 | ENV.store("RBCAT_COLORIZER", "false") 28 | end 29 | 30 | def run!(exception_on_fail: true) 31 | puts ">>> Runner: started: #{session_info}" 32 | if at_start_callback = Kimurai.configuration.runner_at_start_callback 33 | at_start_callback.call(session_info) 34 | end 35 | 36 | running = true 37 | spiders.peach_with_index(jobs) do |spider, i| 38 | next unless running 39 | 40 | puts "> Runner: started spider: #{spider}, index: #{i}" 41 | pid = spawn("bundle", "exec", "kimurai", "crawl", spider, [:out, :err] => "log/#{spider}.log") 42 | Process.wait pid 43 | 44 | puts "< Runner: stopped spider: #{spider}, index: #{i}" 45 | end 46 | rescue StandardError, SignalException, SystemExit => e 47 | running = false 48 | 49 | session_info.merge!(status: :failed, error: e.inspect, stop_time: Time.now) 50 | exception_on_fail ? raise(e) : [session_info, e] 51 | else 52 | session_info.merge!(status: :completed, stop_time: Time.now) 53 | ensure 54 | if at_stop_callback = Kimurai.configuration.runner_at_stop_callback 55 | at_stop_callback.call(session_info) 56 | end 57 | puts "<<< Runner: stopped: #{session_info}" 58 | end 59 | end 60 | end 61 | -------------------------------------------------------------------------------- /lib/kimurai/template/.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle 2 | /cache 3 | /node_modules 4 | 5 | /log/* 6 | !/log/.keep 7 | 8 | /tmp/* 9 | !/tmp/.keep 10 | 11 | /db/* 12 | !/db/.keep 13 | 14 | .byebug_history 15 | *.swp 16 | .env 17 | 18 | capybara-*.png 19 | -------------------------------------------------------------------------------- /lib/kimurai/template/Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | git_source(:github) { |repo| "https://github.com/#{repo}.git" } 3 | 4 | ruby '>= 2.5' 5 | 6 | # Framework 7 | gem 'kimurai', '~> 1.4' 8 | 9 | # Require files in directory and child directories recursively 10 | gem 'require_all' 11 | 12 | # Dotenv 13 | gem 'dotenv' 14 | 15 | # To debug spiders: 16 | group :development do 17 | gem 'byebug', platforms: :mri 18 | gem 'pry' 19 | end 20 | 21 | # If you want to save items to the database, require one of these gems: 22 | # gem 'sqlite3' 23 | # gem 'pg' 24 | # gem 'mysql2' 25 | 26 | # And use your preferred ORM/database connector: 27 | # gem 'activerecord', require: 'active_record' 28 | # gem 'sequel' 29 | -------------------------------------------------------------------------------- /lib/kimurai/template/README.md: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | New Kimurai project readme 4 | 
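The template Gemfile above leaves the database layer to the user. As one hedged sketch of wiring it up: with `gem 'sqlite3'` and `gem 'sequel'` uncommented, a project initializer (required by `config/boot.rb`, shown below) could establish the connection. The file name, `DB` constant, and table layout here are illustrative assumptions:

```ruby
# config/initializers/database.rb (hypothetical example)
require 'sequel'

# Initializers are required by config/boot.rb before pipelines and spiders,
# so the DB constant is available inside both:
DB = Sequel.sqlite("db/kimurai_#{Kimurai.env}.sqlite3")

# Create the items table on first run (no-op if it already exists):
DB.create_table?(:items) do
  primary_key :id
  String :url
  String :title
end
```

A pipeline's `process_item` could then call `DB[:items].insert(item.slice(:url, :title))` instead of (or in addition to) `save_to`.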
-------------------------------------------------------------------------------- /lib/kimurai/template/config/application.rb: -------------------------------------------------------------------------------- 1 | Kimurai.configure do |config| 2 | # Default logger has colored mode in development. 3 | # If you would like to disable it, set `colorize_logger` to false. 4 | # config.colorize_logger = false 5 | 6 | # Logger level for default logger: 7 | # config.log_level = :info 8 | 9 | # Custom logger: 10 | # config.logger = Logger.new(STDOUT) 11 | 12 | # Custom time zone (for logs): 13 | # config.time_zone = "UTC" 14 | # config.time_zone = "Europe/Moscow" 15 | 16 | # At-start callback for a runner. Accepts an info hash argument with 17 | # keys: id, status, start_time, environment, concurrent_jobs, spiders list. 18 | # For example, you can use this callback to send a notification when the runner starts: 19 | # config.runner_at_start_callback = lambda do |info| 20 | # json = JSON.pretty_generate(info) 21 | # Sender.send_notification("Started session: #{json}") 22 | # end 23 | 24 | # At-stop callback for a runner. Accepts an info hash argument with 25 | # all `runner_at_start_callback` keys plus an additional `stop_time` key. Also, `status` contains 26 | # the stop status of the runner (completed or failed). 27 | # You can use this callback to send a notification when the runner has stopped: 28 | # config.runner_at_stop_callback = lambda do |info| 29 | # json = JSON.pretty_generate(info) 30 | # Sender.send_notification("Stopped session: #{json}") 31 | # end 32 | 33 | # Provide custom chrome binary path (default is any available chrome/chromium in the PATH): 34 | # config.selenium_chrome_path = "/usr/bin/chromium-browser" 35 | # Provide custom selenium chromedriver path (default is "/usr/local/bin/chromedriver"): 36 | # config.chromedriver_path = "/usr/local/bin/chromedriver" 37 | end 38 | -------------------------------------------------------------------------------- /lib/kimurai/template/config/automation.yml: -------------------------------------------------------------------------------- 1 | # software versions to install for `setup` command 2 | setup: 3 | ruby: 2.5.1 4 | # check latest here http://phantomjs.org/download.html 5 | phantomjs: 2.1.1 6 | # check latest here https://github.com/mozilla/geckodriver/releases/ 7 | geckodriver: 0.21.0 8 | # check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads 9 | chromedriver: 2.39 10 | # settings for deploy command, you can use cli options as well (--repo-url, --repo-key-path) 11 | deploy: 12 | # repo_url: git@bitbucket.org:username/repo_name.git 13 | # repo_key_path: ~/.ssh/id_rsa 14 | -------------------------------------------------------------------------------- /lib/kimurai/template/config/boot.rb: -------------------------------------------------------------------------------- 1 | # require project gems 2 | require 'bundler/setup' 3 | Bundler.require(:default, Kimurai.env) 4 | 5 | # require custom ENV variables located in .env file 6 | require 'dotenv/load' 7 | 8 | # require initializers 9 | Dir.glob(File.join("./config/initializers", "*.rb"), &method(:require)) 10 | 11 | # require helpers 12 | Dir.glob(File.join("./helpers", "*.rb"), &method(:require)) 13 | 14 | # require pipelines 15 | Dir.glob(File.join("./pipelines", "*.rb"), &method(:require)) 16 | 17 | # require spiders recursively in the `spiders/` folder 18 | require_relative '../spiders/application_spider' 19 | require_all "spiders" 20 | 21 | # require Kimurai
configuration 22 | require_relative 'application' 23 | -------------------------------------------------------------------------------- /lib/kimurai/template/config/initializers/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vifreefly/kimuraframework/a5d47c26fffe2a3c10cc346b7dcf9ac06b4ccd2f/lib/kimurai/template/config/initializers/.keep -------------------------------------------------------------------------------- /lib/kimurai/template/config/schedule.rb: -------------------------------------------------------------------------------- 1 | ### Settings ### 2 | require 'tzinfo' 3 | 4 | # Export current PATH to the cron 5 | env :PATH, ENV["PATH"] 6 | 7 | # Use 24 hour format when using `at:` option 8 | set :chronic_options, hours24: true 9 | 10 | # Use the local_to_utc helper to set up execution time using your local timezone instead 11 | # of the server's timezone (which probably is, and should be, UTC; to check, run `$ timedatectl`). 12 | # You may also want to set the same timezone in Kimurai (use `Kimurai.configuration.time_zone =` for that), 13 | # to have spider logs in a specific time zone format. 14 | # Example usage of helper: 15 | # every 1.day, at: local_to_utc("7:00", zone: "Europe/Moscow") do 16 | # crawl "google_spider.com", output: "log/google_spider.com.log" 17 | # end 18 | def local_to_utc(time_string, zone:) 19 | TZInfo::Timezone.get(zone).local_to_utc(Time.parse(time_string)) 20 | end 21 | 22 | # Note: by default Whenever exports cron commands with :environment == "production". 23 | # Note: Whenever can only append log data to a log file (>>). If you want 24 | # to overwrite (>) the log file before each run, pass a lambda: 25 | # crawl "google_spider.com", output: -> { "> log/google_spider.com.log 2>&1" } 26 | 27 | # Project job types 28 | job_type :crawl, "cd :path && KIMURAI_ENV=:environment bundle exec kimurai crawl :task :output" 29 | job_type :runner, "cd :path && KIMURAI_ENV=:environment bundle exec kimurai runner --jobs :task :output" 30 | 31 | # Single file job type 32 | job_type :single, "cd :path && KIMURAI_ENV=:environment ruby :task :output" 33 | # Single with bundle exec 34 | job_type :single_bundle, "cd :path && KIMURAI_ENV=:environment bundle exec ruby :task :output" 35 | 36 | ### Schedule ### 37 | # Usage (check examples here https://github.com/javan/whenever#example-schedulerb-file): 38 | # every 1.day do 39 | # Example to schedule a single spider in the project: 40 | # crawl "google_spider.com", output: "log/google_spider.com.log" 41 | 42 | # Example to schedule all spiders in the project using runner. Each spider will write 43 | # its own output to the `log/spider_name.log` file (handled by the runner itself). 44 | # Runner output will be written to the log/runner.log file. 45 | # The number argument is the count of concurrent jobs: 46 | # runner 3, output: "log/runner.log" 47 | 48 | # Example to schedule a single spider (without a project): 49 | # single "single_spider.rb", output: "single_spider.log" 50 | # end 51 | 52 | ### How to set a cron schedule ### 53 | # Run: `$ whenever --update-crontab --load-file config/schedule.rb`. 54 | # If you don't have the whenever command, install the gem: `$ gem install whenever`. 55 | 56 | ### How to cancel a schedule ### 57 | # Run: `$ whenever --clear-crontab --load-file config/schedule.rb`.
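# For clarity, here is one way the job types and helper above could be
# combined into a working schedule; the spider names and times below are
# illustrative examples, not part of the template:
#
# every 1.day, at: local_to_utc("7:00", zone: "Europe/Moscow") do
#   crawl "example_spider.com", output: "log/example_spider.com.log"
# end
#
# every :sunday, at: "2:30" do
#   runner 2, output: "log/runner.log"
# end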
58 | -------------------------------------------------------------------------------- /lib/kimurai/template/db/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vifreefly/kimuraframework/a5d47c26fffe2a3c10cc346b7dcf9ac06b4ccd2f/lib/kimurai/template/db/.keep -------------------------------------------------------------------------------- /lib/kimurai/template/helpers/application_helper.rb: -------------------------------------------------------------------------------- 1 | module ApplicationHelper 2 | # Put here custom methods which will be available for any spider 3 | end 4 | -------------------------------------------------------------------------------- /lib/kimurai/template/lib/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vifreefly/kimuraframework/a5d47c26fffe2a3c10cc346b7dcf9ac06b4ccd2f/lib/kimurai/template/lib/.keep -------------------------------------------------------------------------------- /lib/kimurai/template/log/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vifreefly/kimuraframework/a5d47c26fffe2a3c10cc346b7dcf9ac06b4ccd2f/lib/kimurai/template/log/.keep -------------------------------------------------------------------------------- /lib/kimurai/template/pipelines/saver.rb: -------------------------------------------------------------------------------- 1 | class Saver < Kimurai::Pipeline 2 | def process_item(item, options: {}) 3 | # Here you can save the item to the database, send it to a remote API, or 4 | # simply save the item to a file using the `save_to` helper: 5 | 6 | # To get the name of the current spider: `spider.class.name` 7 | # save_to "db/#{spider.class.name}.json", item, format: :pretty_json 8 | 9 | item 10 | end 11 | end 12 | -------------------------------------------------------------------------------- /lib/kimurai/template/pipelines/validator.rb: -------------------------------------------------------------------------------- 1 | class Validator < Kimurai::Pipeline 2 | def process_item(item, options: {}) 3 | # Here you can validate the item and raise `DropItemError` 4 | # if one of the validations fails. Examples: 5 | 6 | # Check item sku for uniqueness using the built-in `unique?` helper: 7 | # unless unique?(:sku, item[:sku]) 8 | # raise DropItemError, "Item sku is not unique" 9 | # end 10 | 11 | # Drop the item if its title is shorter than 5 characters: 12 | # if item[:title].size < 5 13 | # raise DropItemError, "Item title is short" 14 | # end 15 | 16 | # Drop the item if it doesn't contain any images: 17 | # unless item[:images].present? 18 | # raise DropItemError, "Item images are not present" 19 | # end 20 | 21 | # Pass the item to the next pipeline (if it wasn't dropped) 22 | item 23 | end 24 | end 25 | -------------------------------------------------------------------------------- /lib/kimurai/template/spiders/application_spider.rb: -------------------------------------------------------------------------------- 1 | # ApplicationSpider is the default base spider class. Here you can set 2 | # default settings for all spiders inherited from ApplicationSpider.
3 | # To generate a new spider, run: `$ kimurai generate spider spider_name` 4 | 5 | class ApplicationSpider < Kimurai::Base 6 | include ApplicationHelper 7 | 8 | # Default engine for spiders (available engines: :mechanize, :poltergeist_phantomjs, 9 | # :selenium_firefox, :selenium_chrome) 10 | @engine = :poltergeist_phantomjs 11 | 12 | # Pipelines list, by order. 13 | # To process an item through the pipelines, pass it to the `send_item` method 14 | @pipelines = [:validator, :saver] 15 | 16 | # Default config. Set here options which are default for all spiders inherited 17 | # from ApplicationSpider. A child class's config will be deep merged with this one 18 | @config = { 19 | # Custom headers, format: hash. Example: { "some header" => "some value", "another header" => "another value" } 20 | # Works only for :mechanize and :poltergeist_phantomjs engines (Selenium doesn't allow setting/getting headers) 21 | # headers: {}, 22 | 23 | # Custom User Agent, format: string or lambda. 24 | # Use lambda if you want to rotate user agents before each run: 25 | # user_agent: -> { ARRAY_OF_USER_AGENTS.sample } 26 | # Works for all engines 27 | # user_agent: "Mozilla/5.0 Firefox/61.0", 28 | 29 | # Custom cookies, format: array of hashes. 30 | # Format for a single cookie: { name: "cookie name", value: "cookie value", domain: ".example.com" } 31 | # Works for all engines 32 | # cookies: [], 33 | 34 | # Proxy, format: string or lambda. Format of a proxy string: "ip:port:protocol:user:password" 35 | # `protocol` can be http or socks5. User and password are optional. 36 | # Use lambda if you want to rotate proxies before each run: 37 | # proxy: -> { ARRAY_OF_PROXIES.sample } 38 | # Works for all engines, but keep in mind that Selenium drivers don't support proxies 39 | # with authorization. Also, Mechanize doesn't support socks5 proxy format (only http) 40 | # proxy: "3.4.5.6:3128:http:user:pass", 41 | 42 | # If enabled, the browser will ignore any https errors. It's handy while using a proxy 43 | # with a self-signed SSL cert (for example Crawlera or Mitmproxy). 44 | # It also allows visiting webpages with an expired SSL certificate. 45 | # Works for all engines 46 | ignore_ssl_errors: true, 47 | 48 | # Custom window size, works for all engines 49 | # window_size: [1366, 768], 50 | 51 | # Skip downloading images if true, works for all engines 52 | disable_images: true, 53 | 54 | # Selenium engines only: headless mode, `:native` or `:virtual_display` (default is :native) 55 | # Although native mode has better performance, virtual display mode 56 | # can sometimes be useful. For example, some websites can detect (and block) 57 | # headless chrome, so you can use virtual_display mode instead 58 | # headless_mode: :native, 59 | 60 | # This option tells the browser not to use a proxy for the provided list of domains or IP addresses. 61 | # Format: array of strings. Works only for :selenium_firefox and :selenium_chrome 62 | # proxy_bypass_list: [], 63 | 64 | # Option to provide a custom SSL certificate. Works only for :poltergeist_phantomjs and :mechanize 65 | # ssl_cert_path: "path/to/ssl_cert", 66 | 67 | # Inject some JavaScript code into the browser. 68 | # Format: array of strings, where each string is a path to a JS file. 69 | # Works only for the poltergeist_phantomjs engine (Selenium doesn't support JS code injection) 70 | # extensions: ["lib/code_to_inject.js"], 71 | 72 | # Automatically skip duplicated (already visited) urls when using the `request_to` method. 73 | # Possible values: `true` or `hash` with options.
74 | # In case of `true`, all visited urls will be added to the storage's scope `:requests_urls` 75 | # and if a url is already contained in this scope, the request will be skipped. 76 | # You can configure this setting by providing additional options as a hash: 77 | # `skip_duplicate_requests: { scope: :custom_scope, check_only: true }`, where: 78 | # `scope:` - use a custom scope instead of `:requests_urls` 79 | # `check_only:` - if true, then the scope will only be checked for the url; the url will not 80 | # be added to the scope if the scope doesn't contain it. 81 | # Works for all engines 82 | # skip_duplicate_requests: true, 83 | 84 | # Automatically skip provided errors while requesting a page. 85 | # If a raised error matches one of the errors in the list, then this error will be caught, 86 | # and the request will be skipped. 87 | # It is a good idea to skip errors like NotFound(404), etc. 88 | # Format: array where elements are error classes and/or hashes. You can use the hash format 89 | # for more flexibility: `{ error: "RuntimeError", message: "404 => Net::HTTPNotFound" }`. 90 | # The provided `message:` will be compared with the full error message using `String#include?`. Also, 91 | # you can use a regex instead: `{ error: "RuntimeError", message: /404|403/ }`. 92 | # skip_request_errors: [{ error: RuntimeError, message: "404 => Net::HTTPNotFound" }], 93 | 94 | # Automatically retry provided errors with a few attempts while requesting a page. 95 | # If a raised error matches one of the errors in the list, then this error will be caught 96 | # and the request will be processed again after a delay. There are 3 attempts: 97 | # first: delay 15 sec, second: delay 30 sec, third: delay 45 sec. 98 | # If after 3 attempts there is still an exception, then the exception will be raised. 99 | # It is a good idea to retry errors like `ReadTimeout`, `HTTPBadGateway`, etc. 100 | # Format: same as for the `skip_request_errors` option. 101 | # retry_request_errors: [Net::ReadTimeout], 102 | 103 | # Handle page encoding while parsing the html response using Nokogiri. There are two modes: 104 | # Auto (`:auto`) (try to fetch the correct encoding from <meta http-equiv="Content-Type"> or <meta charset> tags) 105 | # Or set the required encoding manually, for example: `encoding: "GB2312"` 106 | # By default this option is unset. 107 | # encoding: nil, 108 | 109 | # Restart browser if one of the options is true: 110 | restart_if: { 111 | # Restart browser if the provided memory limit (in kilobytes) is exceeded (works for all engines) 112 | # memory_limit: 350_000, 113 | 114 | # Restart browser if the provided requests limit is exceeded (works for all engines) 115 | # requests_limit: 100 116 | }, 117 | 118 | # Perform several actions before each request: 119 | before_request: { 120 | # Change proxy before each request. The `proxy:` option above should be present 121 | # and have lambda format. Works only for poltergeist and mechanize engines 122 | # (Selenium doesn't support proxy rotation). 123 | # change_proxy: true, 124 | 125 | # Change user agent before each request. The `user_agent:` option above should be present 126 | # and have lambda format. Works only for poltergeist and mechanize engines 127 | # (Selenium doesn't support getting/setting headers).
128 | # change_user_agent: true, 129 | 130 | # Clear all cookies before each request, works for all engines 131 | # clear_cookies: true, 132 | 133 | # If you want to clear all cookies + set custom cookies (the `cookies:` option above should be present), 134 | # use this option instead (works for all engines) 135 | # clear_and_set_cookies: true, 136 | 137 | # Global option to set a delay between requests. 138 | # Delay can be `Integer`, `Float` or `Range` (`2..5`). In case of a range, 139 | # the delay number will be chosen randomly for each request: `rand(2..5) # => 3` 140 | # delay: 1..3 141 | } 142 | } 143 | end 144 | -------------------------------------------------------------------------------- /lib/kimurai/template/tmp/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vifreefly/kimuraframework/a5d47c26fffe2a3c10cc346b7dcf9ac06b4ccd2f/lib/kimurai/template/tmp/.keep -------------------------------------------------------------------------------- /lib/kimurai/version.rb: -------------------------------------------------------------------------------- 1 | module Kimurai 2 | VERSION = "1.4.0" 3 | end 4 | -------------------------------------------------------------------------------- /test/kimurai_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class KimuraiTest < Minitest::Test 4 | def test_that_it_has_a_version_number 5 | refute_nil ::Kimurai::VERSION 6 | end 7 | 8 | def test_it_does_something_useful 9 | assert false 10 | end 11 | end 12 | -------------------------------------------------------------------------------- /test/test_helper.rb: -------------------------------------------------------------------------------- 1 | $LOAD_PATH.unshift File.expand_path("../../lib", __FILE__) 2 | require "kimurai" 3 | 4 | require "minitest/autorun" 5 | --------------------------------------------------------------------------------
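Pulling the pieces together — the generator's single-file template, the engine choice, and the config options documented in application_spider.rb — a standalone (no-project) spider is one self-contained file. A minimal sketch, assuming a hypothetical target page and selector (`https://example.com/`, `h1`); everything else follows the generated template:

```ruby
require 'kimurai'

class ExampleSpider < Kimurai::Base
  @name = "example_spider"
  @engine = :mechanize
  @start_urls = ["https://example.com/"]
  @config = {
    user_agent: "Mozilla/5.0 Firefox/61.0",
    before_request: { delay: 1..3 }
  }

  def parse(response, url:, data: {})
    # `response` is a parsed Nokogiri document of the fetched page:
    item = { url: url, title: response.css("h1").text.strip }

    # Write the item to a local file using the built-in saver:
    save_to "results.json", item, format: :pretty_json
  end
end

ExampleSpider.crawl!
```

Run it directly with `$ ruby example_spider.rb`; inside a project, the same class (minus the `require` and the trailing `crawl!` call) would inherit from ApplicationSpider and be started with `$ bundle exec kimurai crawl example_spider`.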