├── .gitignore ├── .travis.yml ├── CHANGELOG.md ├── Gemfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── bin ├── console └── setup ├── exe └── kimurai ├── kimurai.gemspec ├── lib ├── kimurai.rb └── kimurai │ ├── automation │ ├── deploy.yml │ ├── setup.yml │ └── setup │ │ ├── chromium_chromedriver.yml │ │ ├── firefox_geckodriver.yml │ │ ├── phantomjs.yml │ │ └── ruby_environment.yml │ ├── base.rb │ ├── base │ ├── saver.rb │ └── storage.rb │ ├── base_helper.rb │ ├── browser_builder.rb │ ├── browser_builder │ ├── mechanize_builder.rb │ ├── poltergeist_phantomjs_builder.rb │ ├── selenium_chrome_builder.rb │ └── selenium_firefox_builder.rb │ ├── capybara_configuration.rb │ ├── capybara_ext │ ├── driver │ │ └── base.rb │ ├── mechanize │ │ └── driver.rb │ ├── poltergeist │ │ └── driver.rb │ ├── selenium │ │ └── driver.rb │ ├── session.rb │ └── session │ │ └── config.rb │ ├── cli.rb │ ├── cli │ ├── ansible_command_builder.rb │ └── generator.rb │ ├── core_ext │ ├── array.rb │ ├── hash.rb │ ├── numeric.rb │ └── string.rb │ ├── pipeline.rb │ ├── runner.rb │ ├── template │ ├── .gitignore │ ├── Gemfile │ ├── README.md │ ├── config │ │ ├── application.rb │ │ ├── automation.yml │ │ ├── boot.rb │ │ ├── initializers │ │ │ └── .keep │ │ └── schedule.rb │ ├── db │ │ └── .keep │ ├── helpers │ │ └── application_helper.rb │ ├── lib │ │ └── .keep │ ├── log │ │ └── .keep │ ├── pipelines │ │ ├── saver.rb │ │ └── validator.rb │ ├── spiders │ │ └── application_spider.rb │ └── tmp │ │ └── .keep │ └── version.rb └── test ├── kimurai_test.rb └── test_helper.rb /.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /_yardoc/ 4 | /coverage/ 5 | /doc/ 6 | /pkg/ 7 | /spec/reports/ 8 | /tmp/ 9 | Gemfile.lock 10 | 11 | *.retry 12 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: ruby 3 | rvm: 4 | - 2.5.1 5 | before_install: gem install bundler -v 1.16.2 6 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # CHANGELOG 2 | ## 1.4.0 3 | ### New 4 | * Add `encoding` config option (see [All available config options](https://github.com/vifreefly/kimuraframework#all-available-config-options)) 5 | * Validate url before processing a request (Base#request_to) 6 | 7 | ### Fixes 8 | * Fix console command bug (see [issue 21](https://github.com/vifreefly/kimuraframework/issues/21)) 9 | 10 | ## 1.3.2 11 | ### Fixes 12 | * In the project template, set Ruby version as >= 2.5 (before was hard-coded to 2.5.1) 13 | * Remove .ruby-version file (was hard-coded to 2.5.1) from the project template 14 | 15 | ## 1.3.1 16 | ### Fixes 17 | * Fixed bug in Base#save_to 18 | 19 | ## 1.3.0 20 | ### Breaking changes 1.3.0 21 | * Remove persistence database feature (because it's slow and makes things complicated) 22 | 23 | ### New 24 | * Add `--include` and `--exclude` options to CLI#runner 25 | * Add Base `#create_browser` method to easily create additional browser instances 26 | * Add Capybara::Session `#scroll_to_bottom` 27 | * Add skip_on_failure feature to `retry_request_errors` config option 28 | * Add info about `add_event` method to the README 29 | 30 | ### Fixes and improvements 31 | * Improve Runner 32 | * Fix time helper in schedule.rb 33 | * Add proxy validation to browser builders 34 | * 
Allow passing different arguments to the `Base.parse` method

## 1.2.0
### New
* Add the ability to add an array of values to the storage (`Base::Storage#add`)
* Add `exception_on_fail` option to `Base.crawl!`
* Add the ability to pass request hashes to `start_urls` (you can use an array of hashes as well, like: `@start_urls = [{ url: "https://example.com/cat?id=1", data: { category: "First Category" } }]`)
* Implement `skip_request_errors` config feature. Added [Handle request errors](https://github.com/vifreefly/kimuraframework#handle-request-errors) chapter to the README.
* Add option to choose the response type for `Session#current_response` (`:html` default, or `:json`)
* Add option to provide custom chrome and chromedriver paths

### Improvements
* Refactor `Runner`

### Fixes
* Fix `Base#Saver` (automatically create the file if it doesn't exist in case of persistence database)
* Do not deep merge config's `headers:` option

## 1.1.0
### Breaking changes 1.1.0
`browser` config option deprecated. Now all sub-options inside `browser` should be placed directly into the `@config` hash, without the `browser` parent key. Example:

```ruby
# Was:
@config = {
  browser: {
    retry_request_errors: [Net::ReadTimeout],
    restart_if: {
      memory_limit: 350_000,
      requests_limit: 100
    },
    before_request: {
      change_proxy: true,
      change_user_agent: true,
      clear_cookies: true,
      clear_and_set_cookies: true,
      delay: 1..3
    }
  }
}

# Now:
@config = {
  retry_request_errors: [Net::ReadTimeout],
  restart_if: {
    memory_limit: 350_000,
    requests_limit: 100
  },
  before_request: {
    change_proxy: true,
    change_user_agent: true,
    clear_cookies: true,
    clear_and_set_cookies: true,
    delay: 1..3
  }
}
```

### New
* Add `storage` object with additional methods and persistence database feature
* Add events feature to `run_info`
* Add `skip_duplicate_requests` config option to automatically skip already visited urls when using `request_to`
* Add `extensions` config option to allow injecting JS code into the browser (supported only by the poltergeist_phantomjs engine)
* Add `Capybara::Session#within_new_window_by` method

### Improvements
* Add the last backtrace line to pipeline output when an item is dropped
* Do not destroy the driver if it doesn't exist (for the Base.parse!
method)
* Handle possible Net::ReadTimeout error while trying to #quit the driver

### Fixes
* Fix Mechanize::Driver#proxy (there was a bug when using a proxy with the mechanize engine without authorization)
* Fix request retries logic


## 1.0.1
* Add missing `logger` method to pipeline
* Fix `set_proxy` in Mechanize and Poltergeist builders
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
source "https://rubygems.org"

git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }

# Specify your gem's dependencies in kimurai.gemspec
gemspec
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2018 Victor Afanasev

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Kimurai

> UPD: I will soon have time to work on issues for the current 1.4 version, and I also plan to release a new 2.0 version with the https://github.com/twalpole/apparition engine.

Kimurai is a modern web scraping framework written in Ruby which **works out of the box with Headless Chromium/Firefox, PhantomJS**, or simple HTTP requests, and **allows you to scrape and interact with JavaScript-rendered websites.**

Kimurai is based on the well-known [Capybara](https://github.com/teamcapybara/capybara) and [Nokogiri](https://github.com/sparklemotion/nokogiri) gems, so you don't have to learn anything new.
Let's see:

```ruby
# github_spider.rb
require 'kimurai'

class GithubSpider < Kimurai::Base
  @name = "github_spider"
  @engine = :selenium_chrome
  @start_urls = ["https://github.com/search?q=Ruby%20Web%20Scraping"]
  @config = {
    user_agent: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36",
    before_request: { delay: 4..7 }
  }

  def parse(response, url:, data: {})
    response.xpath("//ul[@class='repo-list']/div//h3/a").each do |a|
      request_to :parse_repo_page, url: absolute_url(a[:href], base: url)
    end

    if next_page = response.at_xpath("//a[@class='next_page']")
      request_to :parse, url: absolute_url(next_page[:href], base: url)
    end
  end

  def parse_repo_page(response, url:, data: {})
    item = {}

    item[:owner] = response.xpath("//h1//a[@rel='author']").text
    item[:repo_name] = response.xpath("//h1/strong[@itemprop='name']/a").text
    item[:repo_url] = url
    item[:description] = response.xpath("//span[@itemprop='about']").text.squish
    item[:tags] = response.xpath("//div[@id='topics-list-container']/div/a").map { |a| a.text.squish }
    item[:watch_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Watch')]/a[2]").text.squish
    item[:star_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Star')]/a[2]").text.squish
    item[:fork_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Fork')]/a[2]").text.squish
    item[:last_commit] = response.xpath("//span[@itemprop='dateModified']/*").text

    save_to "results.json", item, format: :pretty_json
  end
end

GithubSpider.crawl!
```
53 | Run: $ ruby github_spider.rb 54 | 55 | ``` 56 | I, [2018-08-22 13:08:03 +0400#15477] [M: 47377500980720] INFO -- github_spider: Spider: started: github_spider 57 | D, [2018-08-22 13:08:03 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: BrowserBuilder (selenium_chrome): created browser instance 58 | D, [2018-08-22 13:08:03 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: BrowserBuilder (selenium_chrome): enabled `browser before_request delay` 59 | D, [2018-08-22 13:08:03 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: Browser: sleep 7 seconds before request... 60 | D, [2018-08-22 13:08:10 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: BrowserBuilder (selenium_chrome): enabled custom user-agent 61 | D, [2018-08-22 13:08:10 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: BrowserBuilder (selenium_chrome): enabled native headless_mode 62 | I, [2018-08-22 13:08:10 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: started get request to: https://github.com/search?q=Ruby%20Web%20Scraping 63 | I, [2018-08-22 13:08:26 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: finished get request to: https://github.com/search?q=Ruby%20Web%20Scraping 64 | I, [2018-08-22 13:08:26 +0400#15477] [M: 47377500980720] INFO -- github_spider: Info: visits: requests: 1, responses: 1 65 | D, [2018-08-22 13:08:27 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: Browser: driver.current_memory: 107968 66 | D, [2018-08-22 13:08:27 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: Browser: sleep 5 seconds before request... 67 | I, [2018-08-22 13:08:32 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: started get request to: https://github.com/lorien/awesome-web-scraping 68 | I, [2018-08-22 13:08:33 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: finished get request to: https://github.com/lorien/awesome-web-scraping 69 | I, [2018-08-22 13:08:33 +0400#15477] [M: 47377500980720] INFO -- github_spider: Info: visits: requests: 2, responses: 2 70 | D, [2018-08-22 13:08:33 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: Browser: driver.current_memory: 212542 71 | D, [2018-08-22 13:08:33 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: Browser: sleep 4 seconds before request... 72 | I, [2018-08-22 13:08:37 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: started get request to: https://github.com/jaimeiniesta/metainspector 73 | 74 | ... 
75 | 76 | I, [2018-08-22 13:23:07 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: started get request to: https://github.com/preston/idclight 77 | I, [2018-08-22 13:23:08 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: finished get request to: https://github.com/preston/idclight 78 | I, [2018-08-22 13:23:08 +0400#15477] [M: 47377500980720] INFO -- github_spider: Info: visits: requests: 140, responses: 140 79 | D, [2018-08-22 13:23:08 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: Browser: driver.current_memory: 204198 80 | I, [2018-08-22 13:23:08 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: driver selenium_chrome has been destroyed 81 | 82 | I, [2018-08-22 13:23:08 +0400#15477] [M: 47377500980720] INFO -- github_spider: Spider: stopped: {:spider_name=>"github_spider", :status=>:completed, :environment=>"development", :start_time=>2018-08-22 13:08:03 +0400, :stop_time=>2018-08-22 13:23:08 +0400, :running_time=>"15m, 5s", :visits=>{:requests=>140, :responses=>140}, :error=>nil} 83 | ``` 84 |
85 | 86 |
87 | results.json 88 | 89 | ```json 90 | [ 91 | { 92 | "owner": "lorien", 93 | "repo_name": "awesome-web-scraping", 94 | "repo_url": "https://github.com/lorien/awesome-web-scraping", 95 | "description": "List of libraries, tools and APIs for web scraping and data processing.", 96 | "tags": [ 97 | "awesome", 98 | "awesome-list", 99 | "web-scraping", 100 | "data-processing", 101 | "python", 102 | "javascript", 103 | "php", 104 | "ruby" 105 | ], 106 | "watch_count": "159", 107 | "star_count": "2,423", 108 | "fork_count": "358", 109 | "last_commit": "4 days ago", 110 | "position": 1 111 | }, 112 | 113 | ... 114 | 115 | { 116 | "owner": "preston", 117 | "repo_name": "idclight", 118 | "repo_url": "https://github.com/preston/idclight", 119 | "description": "A Ruby gem for accessing the freely available IDClight (IDConverter Light) web service, which convert between different types of gene IDs such as Hugo and Entrez. Queries are screen scraped from http://idclight.bioinfo.cnio.es.", 120 | "tags": [ 121 | 122 | ], 123 | "watch_count": "6", 124 | "star_count": "1", 125 | "fork_count": "0", 126 | "last_commit": "on Apr 12, 2012", 127 | "position": 127 128 | } 129 | ] 130 | ``` 131 |

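The spider above is not tied to Chrome, by the way. Spider code is engine-agnostic, so the same class could be pointed at a lighter engine whenever the target pages don't depend on JavaScript; a minimal sketch (assuming GitHub's markup here is rendered server-side):

```ruby
# Hypothetical variation: plain HTTP requests instead of Headless Chrome.
# Everything else (@name, @start_urls, the parse methods) stays exactly the same.
class GithubSpider < Kimurai::Base
  @engine = :mechanize
  # ...
end
```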
Okay, that was easy. How about JavaScript-rendered websites with dynamic HTML? Let's scrape a page with infinite scroll:

```ruby
# infinite_scroll_spider.rb
require 'kimurai'

class InfiniteScrollSpider < Kimurai::Base
  @name = "infinite_scroll_spider"
  @engine = :selenium_chrome
  @start_urls = ["https://infinite-scroll.com/demo/full-page/"]

  def parse(response, url:, data: {})
    posts_headers_path = "//article/h2"
    count = response.xpath(posts_headers_path).count

    loop do
      browser.execute_script("window.scrollBy(0,10000)") ; sleep 2
      response = browser.current_response

      new_count = response.xpath(posts_headers_path).count
      if count == new_count
        logger.info "> Pagination is done" and break
      else
        count = new_count
        logger.info "> Continue scrolling, current count is #{count}..."
      end
    end

    posts_headers = response.xpath(posts_headers_path).map(&:text)
    logger.info "> All posts from page: #{posts_headers.join('; ')}"
  end
end

InfiniteScrollSpider.crawl!
```
170 | Run: $ ruby infinite_scroll_spider.rb 171 | 172 | ``` 173 | I, [2018-08-22 13:32:57 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: Spider: started: infinite_scroll_spider 174 | D, [2018-08-22 13:32:57 +0400#23356] [M: 47375890851320] DEBUG -- infinite_scroll_spider: BrowserBuilder (selenium_chrome): created browser instance 175 | D, [2018-08-22 13:32:57 +0400#23356] [M: 47375890851320] DEBUG -- infinite_scroll_spider: BrowserBuilder (selenium_chrome): enabled native headless_mode 176 | I, [2018-08-22 13:32:57 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: Browser: started get request to: https://infinite-scroll.com/demo/full-page/ 177 | I, [2018-08-22 13:33:03 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: Browser: finished get request to: https://infinite-scroll.com/demo/full-page/ 178 | I, [2018-08-22 13:33:03 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: Info: visits: requests: 1, responses: 1 179 | D, [2018-08-22 13:33:03 +0400#23356] [M: 47375890851320] DEBUG -- infinite_scroll_spider: Browser: driver.current_memory: 95463 180 | I, [2018-08-22 13:33:05 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: > Continue scrolling, current count is 5... 181 | I, [2018-08-22 13:33:18 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: > Continue scrolling, current count is 9... 182 | I, [2018-08-22 13:33:20 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: > Continue scrolling, current count is 11... 183 | I, [2018-08-22 13:33:26 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: > Continue scrolling, current count is 13... 184 | I, [2018-08-22 13:33:28 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: > Continue scrolling, current count is 15... 185 | I, [2018-08-22 13:33:30 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: > Pagination is done 186 | I, [2018-08-22 13:33:30 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: > All posts from page: 1a - Infinite Scroll full page demo; 1b - RGB Schemes logo in Computer Arts; 2a - RGB Schemes logo; 2b - Masonry gets horizontalOrder; 2c - Every vector 2016; 3a - Logo Pizza delivered; 3b - Some CodePens; 3c - 365daysofmusic.com; 3d - Holograms; 4a - Huebee: 1-click color picker; 4b - Word is Flickity is good; Flickity v2 released: groupCells, adaptiveHeight, parallax; New tech gets chatter; Isotope v3 released: stagger in, IE8 out; Packery v2 released 187 | I, [2018-08-22 13:33:30 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: Browser: driver selenium_chrome has been destroyed 188 | I, [2018-08-22 13:33:30 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: Spider: stopped: {:spider_name=>"infinite_scroll_spider", :status=>:completed, :environment=>"development", :start_time=>2018-08-22 13:32:57 +0400, :stop_time=>2018-08-22 13:33:30 +0400, :running_time=>"33s", :visits=>{:requests=>1, :responses=>1}, :error=>nil} 189 | 190 | ``` 191 |


## Features
* Scrape JavaScript-rendered websites out of the box
* Supported engines: [Headless Chrome](https://developers.google.com/web/updates/2017/04/headless-chrome), [Headless Firefox](https://developer.mozilla.org/en-US/docs/Mozilla/Firefox/Headless_mode), [PhantomJS](https://github.com/ariya/phantomjs) or simple HTTP requests ([mechanize](https://github.com/sparklemotion/mechanize) gem)
* Write spider code once, and use it with any supported engine later
* All the power of [Capybara](https://github.com/teamcapybara/capybara): use methods like `click_on`, `fill_in`, `select`, `choose`, `set`, `go_back`, etc. to interact with web pages
* Rich [configuration](#spider-config): **set default headers, cookies, delay between requests, enable proxy/user-agent rotation**
* Built-in helpers to make scraping easy, like [save_to](#save_to-helper) (save items to JSON, JSON lines, or CSV formats) or [unique?](#skip-duplicates) to skip duplicates
* Automatically [handle request errors](#handle-request-errors)
* Automatically restart browsers when reaching a memory limit [**(memory control)**](#spider-config) or a requests limit
* Easily [schedule spiders](#schedule-spiders-using-cron) within cron using [Whenever](https://github.com/javan/whenever) (no need to know cron syntax)
* [Parallel scraping](#parallel-crawling-using-in_parallel) using the simple method `in_parallel`
* **Two modes:** use a single file for a simple spider, or [generate](#project-mode) a Scrapy-like **project**
* Convenient development mode with [console](#interactive-console), colorized logger and debugger ([Pry](https://github.com/pry/pry), [Byebug](https://github.com/deivid-rodriguez/byebug))
* Automated [server environment setup](#setup) (for Ubuntu 18.04) and [deploy](#deploy) using the commands `kimurai setup` and `kimurai deploy` ([Ansible](https://github.com/ansible/ansible) under the hood)
* Command-line [runner](#runner) to run all project spiders one by one or in parallel

## Table of Contents
* [Kimurai](#kimurai)
  * [Features](#features)
  * [Table of Contents](#table-of-contents)
  * [Installation](#installation)
  * [Getting to Know](#getting-to-know)
    * [Interactive console](#interactive-console)
    * [Available engines](#available-engines)
    * [Minimum required spider structure](#minimum-required-spider-structure)
    * [Method arguments response, url and data](#method-arguments-response-url-and-data)
    * [browser object](#browser-object)
    * [request_to method](#request_to-method)
    * [save_to helper](#save_to-helper)
    * [Skip duplicates](#skip-duplicates)
      * [Automatically skip all duplicated requests urls](#automatically-skip-all-duplicated-requests-urls)
      * [Storage object](#storage-object)
    * [Handle request errors](#handle-request-errors)
      * [skip_request_errors](#skip_request_errors)
      * [retry_request_errors](#retry_request_errors)
    * [Logging custom events](#logging-custom-events)
    * [open_spider and close_spider callbacks](#open_spider-and-close_spider-callbacks)
    * [KIMURAI_ENV](#kimurai_env)
    * [Parallel crawling using in_parallel](#parallel-crawling-using-in_parallel)
    * [Active Support included](#active-support-included)
    * [Schedule spiders using Cron](#schedule-spiders-using-cron)
    * [Configuration options](#configuration-options)
    * [Using Kimurai inside existing Ruby application](#using-kimurai-inside-existing-ruby-application)
      * [crawl! method](#crawl-method)
      * [parse! method](#parsemethod_name-url-method)
      * [Kimurai.list and Kimurai.find_by_name](#kimurailist-and-kimuraifind_by_name)
    * [Automated server setup and deployment](#automated-sever-setup-and-deployment)
      * [Setup](#setup)
      * [Deploy](#deploy)
    * [Spider @config](#spider-config)
      * [All available @config options](#all-available-config-options)
      * [@config settings inheritance](#config-settings-inheritance)
  * [Project mode](#project-mode)
    * [Generate new spider](#generate-new-spider)
    * [Crawl](#crawl)
    * [List](#list)
    * [Parse](#parse)
    * [Pipelines, send_item method](#pipelines-send_item-method)
    * [Runner](#runner)
      * [Runner callbacks](#runner-callbacks)
  * [Chat Support and Feedback](#chat-support-and-feedback)
  * [License](#license)


## Installation
Kimurai requires Ruby version `>= 2.5.0`. Supported platforms: `Linux` and `Mac OS X`.

1) If your system doesn't have an appropriate Ruby version, install it:

264 | Ubuntu 18.04 265 | 266 | ```bash 267 | # Install required packages for ruby-build 268 | sudo apt update 269 | sudo apt install git-core curl zlib1g-dev build-essential libssl-dev libreadline-dev libreadline6-dev libyaml-dev libxml2-dev libxslt1-dev libcurl4-openssl-dev libffi-dev 270 | 271 | # Install rbenv and ruby-build 272 | cd && git clone https://github.com/rbenv/rbenv.git ~/.rbenv 273 | echo 'export PATH="$HOME/.rbenv/bin:$PATH"' >> ~/.bashrc 274 | echo 'eval "$(rbenv init -)"' >> ~/.bashrc 275 | exec $SHELL 276 | 277 | git clone https://github.com/rbenv/ruby-build.git ~/.rbenv/plugins/ruby-build 278 | echo 'export PATH="$HOME/.rbenv/plugins/ruby-build/bin:$PATH"' >> ~/.bashrc 279 | exec $SHELL 280 | 281 | # Install latest Ruby 282 | rbenv install 2.5.3 283 | rbenv global 2.5.3 284 | 285 | gem install bundler 286 | ``` 287 |
288 | 289 |
290 | Mac OS X 291 | 292 | ```bash 293 | # Install homebrew if you don't have it https://brew.sh/ 294 | # Install rbenv and ruby-build: 295 | brew install rbenv ruby-build 296 | 297 | # Add rbenv to bash so that it loads every time you open a terminal 298 | echo 'if which rbenv > /dev/null; then eval "$(rbenv init -)"; fi' >> ~/.bash_profile 299 | source ~/.bash_profile 300 | 301 | # Install latest Ruby 302 | rbenv install 2.5.3 303 | rbenv global 2.5.3 304 | 305 | gem install bundler 306 | ``` 307 |
308 | 309 | 2) Install Kimurai gem: `$ gem install kimurai` 310 | 311 | 3) Install browsers with webdrivers: 312 | 313 |
Ubuntu 18.04

Note: for Ubuntu 16.04-18.04, automatic installation is available using the `setup` command:
```bash
$ kimurai setup localhost --local --ask-sudo
```
It works using [Ansible](https://github.com/ansible/ansible), so you need to install it first: `$ sudo apt install ansible`. You can check the playbooks it uses [here](lib/kimurai/automation).

If you chose automatic installation, you can skip the following and go to the "Getting to Know" part. In case you want to install everything manually:

```bash
# Install basic tools
sudo apt install -q -y unzip wget tar openssl

# Install xvfb (for virtual_display headless mode, in addition to native)
sudo apt install -q -y xvfb

# Install chromium-browser and firefox
sudo apt install -q -y chromium-browser firefox

# Install chromedriver (version 2.44)
# All versions are located here: https://sites.google.com/a/chromium.org/chromedriver/downloads
cd /tmp && wget https://chromedriver.storage.googleapis.com/2.44/chromedriver_linux64.zip
sudo unzip chromedriver_linux64.zip -d /usr/local/bin
rm -f chromedriver_linux64.zip

# Install geckodriver (version 0.23.0)
# All versions are located here: https://github.com/mozilla/geckodriver/releases/
cd /tmp && wget https://github.com/mozilla/geckodriver/releases/download/v0.23.0/geckodriver-v0.23.0-linux64.tar.gz
sudo tar -xvzf geckodriver-v0.23.0-linux64.tar.gz -C /usr/local/bin
rm -f geckodriver-v0.23.0-linux64.tar.gz

# Install PhantomJS (2.1.1)
# All versions are located here: http://phantomjs.org/download.html
sudo apt install -q -y chrpath libxft-dev libfreetype6 libfreetype6-dev libfontconfig1 libfontconfig1-dev
cd /tmp && wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2
tar -xvjf phantomjs-2.1.1-linux-x86_64.tar.bz2
sudo mv phantomjs-2.1.1-linux-x86_64 /usr/local/lib
sudo ln -s /usr/local/lib/phantomjs-2.1.1-linux-x86_64/bin/phantomjs /usr/local/bin
rm -f phantomjs-2.1.1-linux-x86_64.tar.bz2
```
357 | 358 |
359 | Mac OS X 360 | 361 | ```bash 362 | # Install chrome and firefox 363 | brew cask install google-chrome firefox 364 | 365 | # Install chromedriver (latest) 366 | brew cask install chromedriver 367 | 368 | # Install geckodriver (latest) 369 | brew install geckodriver 370 | 371 | # Install PhantomJS (latest) 372 | brew install phantomjs 373 | ``` 374 |

375 | 376 | Also, if you want to save scraped items to the database (using [ActiveRecord](https://github.com/rails/rails/tree/master/activerecord), [Sequel](https://github.com/jeremyevans/sequel) or [MongoDB Ruby Driver](https://github.com/mongodb/mongo-ruby-driver)/[Mongoid](https://github.com/mongodb/mongoid)), you need to install database clients/servers: 377 | 378 |
Ubuntu 18.04

SQLite: `$ sudo apt -q -y install libsqlite3-dev sqlite3`.

If you want to connect to a remote database, you don't need a database server on the local machine (only a client):
```bash
# Install MySQL client
sudo apt -q -y install mysql-client libmysqlclient-dev

# Install Postgres client
sudo apt install -q -y postgresql-client libpq-dev

# Install MongoDB client
sudo apt install -q -y mongodb-clients
```

But if you want to save items to a local database, a database server is required as well:
```bash
# Install MySQL client and server
sudo apt -q -y install mysql-server mysql-client libmysqlclient-dev

# Install Postgres client and server
sudo apt install -q -y postgresql postgresql-contrib libpq-dev

# Install MongoDB client and server
# version 4.0 (check here https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/)
sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 9DA31620334BD75D9DCB49F368818C72E52529D4
# for 16.04:
# echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu xenial/mongodb-org/4.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-4.0.list
# for 18.04:
echo "deb [ arch=amd64 ] https://repo.mongodb.org/apt/ubuntu bionic/mongodb-org/4.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-4.0.list
sudo apt update
sudo apt install -q -y mongodb-org
sudo service mongod start
```
415 | 416 |
Mac OS X

SQLite: `$ brew install sqlite3`

```bash
# Install MySQL client and server
brew install mysql
# Start the server if you need it: brew services start mysql

# Install Postgres client and server
brew install postgresql
# Start the server if you need it: brew services start postgresql

# Install MongoDB client and server
brew install mongodb
# Start the server if you need it: brew services start mongodb
```


## Getting to Know
### Interactive console
Before you get to know all of Kimurai's features, there is the `$ kimurai console` command: an interactive console where you can try out and debug your scraping code very quickly, without having to run any spider (yes, it's like [Scrapy shell](https://doc.scrapy.org/en/latest/topics/shell.html#topics-shell)).

```bash
$ kimurai console --engine selenium_chrome --url https://github.com/vifreefly/kimuraframework
```
446 | Show output 447 | 448 | ``` 449 | $ kimurai console --engine selenium_chrome --url https://github.com/vifreefly/kimuraframework 450 | 451 | D, [2018-08-22 13:42:32 +0400#26079] [M: 47461994677760] DEBUG -- : BrowserBuilder (selenium_chrome): created browser instance 452 | D, [2018-08-22 13:42:32 +0400#26079] [M: 47461994677760] DEBUG -- : BrowserBuilder (selenium_chrome): enabled native headless_mode 453 | I, [2018-08-22 13:42:32 +0400#26079] [M: 47461994677760] INFO -- : Browser: started get request to: https://github.com/vifreefly/kimuraframework 454 | I, [2018-08-22 13:42:35 +0400#26079] [M: 47461994677760] INFO -- : Browser: finished get request to: https://github.com/vifreefly/kimuraframework 455 | D, [2018-08-22 13:42:35 +0400#26079] [M: 47461994677760] DEBUG -- : Browser: driver.current_memory: 201701 456 | 457 | From: /home/victor/code/kimurai/lib/kimurai/base.rb @ line 189 Kimurai::Base#console: 458 | 459 | 188: def console(response = nil, url: nil, data: {}) 460 | => 189: binding.pry 461 | 190: end 462 | 463 | [1] pry(#)> response.xpath("//title").text 464 | => "GitHub - vifreefly/kimuraframework: Modern web scraping framework written in Ruby which works out of box with Headless Chromium/Firefox, PhantomJS, or simple HTTP requests and allows to scrape and interact with JavaScript rendered websites" 465 | 466 | [2] pry(#)> ls 467 | Kimurai::Base#methods: browser console logger request_to save_to unique? 468 | instance variables: @browser @config @engine @logger @pipelines 469 | locals: _ __ _dir_ _ex_ _file_ _in_ _out_ _pry_ data response url 470 | 471 | [3] pry(#)> ls response 472 | Nokogiri::XML::PP::Node#methods: inspect pretty_print 473 | Nokogiri::XML::Searchable#methods: % / at at_css at_xpath css search xpath 474 | Enumerable#methods: 475 | all? collect drop each_with_index find_all grep_v lazy member? none? reject slice_when take_while without 476 | any? collect_concat drop_while each_with_object find_index group_by many? min one? reverse_each sort to_a zip 477 | as_json count each_cons entries first include? map min_by partition select sort_by to_h 478 | chunk cycle each_entry exclude? flat_map index_by max minmax pluck slice_after sum to_set 479 | chunk_while detect each_slice find grep inject max_by minmax_by reduce slice_before take uniq 480 | Nokogiri::XML::Node#methods: 481 | <=> append_class classes document? has_attribute? matches? node_name= processing_instruction? to_str 482 | == attr comment? each html? name= node_type read_only? to_xhtml 483 | > attribute content elem? inner_html namespace= parent= remove traverse 484 | [] attribute_nodes content= element? inner_html= namespace_scopes parse remove_attribute unlink 485 | []= attribute_with_ns create_external_subset element_children inner_text namespaced_key? path remove_class values 486 | accept before create_internal_subset elements internal_subset native_content= pointer_id replace write_html_to 487 | add_class blank? css_path encode_special_chars key? next prepend_child set_attribute write_to 488 | add_next_sibling cdata? decorate! external_subset keys next= previous text write_xhtml_to 489 | add_previous_sibling child delete first_element_child lang next_element previous= text? write_xml_to 490 | after children description fragment? lang= next_sibling previous_element to_html xml? 
491 | ancestors children= do_xinclude get_attribute last_element_child node_name previous_sibling to_s 492 | Nokogiri::XML::Document#methods: 493 | << canonicalize collect_namespaces create_comment create_entity decorate document encoding errors name remove_namespaces! root= to_java url version 494 | add_child clone create_cdata create_element create_text_node decorators dup encoding= errors= namespaces root slop! to_xml validate 495 | Nokogiri::HTML::Document#methods: fragment meta_encoding meta_encoding= serialize title title= type 496 | instance variables: @decorators @errors @node_cache 497 | 498 | [4] pry(#)> exit 499 | I, [2018-08-22 13:43:47 +0400#26079] [M: 47461994677760] INFO -- : Browser: driver selenium_chrome has been destroyed 500 | $ 501 | ``` 502 |


CLI options:
* `--engine` (optional) [engine](#available-engines) to use. Default is `mechanize`
* `--url` (optional) url to process. If the url is omitted, the `response` and `url` objects inside the console will be `nil` (use the [browser](#browser-object) object to navigate to any webpage).

### Available engines
Kimurai supports the following engines, and you can mostly switch between them without needing to rewrite any code:

* `:mechanize` - [pure Ruby fake http browser](https://github.com/sparklemotion/mechanize). Mechanize can't render JavaScript and doesn't know what the DOM is; it can only parse the original HTML code of a page. Because of this, mechanize is much faster, takes much less memory, and is in general much more stable than any real browser. Use mechanize when you can, i.e. when the website doesn't use JavaScript to render any meaningful parts of its structure. Still, because mechanize tries to mimic a real browser, it supports almost all of Capybara's [methods to interact with a web page](http://cheatrags.com/capybara) (filling forms, clicking buttons, checkboxes, etc).
* `:poltergeist_phantomjs` - [PhantomJS headless browser](https://github.com/ariya/phantomjs), can render JavaScript. In general, PhantomJS is still faster than Headless Chrome (and Headless Firefox). PhantomJS has memory leakage, but Kimurai has a [memory control feature](#spider-config), so you shouldn't consider it a problem. Also, some websites can recognize PhantomJS and block access. Like mechanize (and unlike the selenium engines), `:poltergeist_phantomjs` can freely rotate proxies and change headers _on the fly_ (see the [config section](#all-available-config-options)).
* `:selenium_chrome` Chrome in headless mode driven by selenium. A modern headless browser solution with proper JavaScript rendering.
* `:selenium_firefox` Firefox in headless mode driven by selenium. Usually takes more memory than the other drivers, but sometimes can be useful.

**Tip:** add the `HEADLESS=false` ENV variable before the command (`$ HEADLESS=false ruby spider.rb`) to run the browser in normal (not headless) mode and see its window (only for selenium-like engines). It works for the [console](#interactive-console) command as well.


### Minimum required spider structure
> You can manually create a spider file, or use the generator instead: `$ kimurai generate spider simple_spider`

```ruby
require 'kimurai'

class SimpleSpider < Kimurai::Base
  @name = "simple_spider"
  @engine = :selenium_chrome
  @start_urls = ["https://example.com/"]

  def parse(response, url:, data: {})
  end
end

SimpleSpider.crawl!
```

Where:
* `@name` name of the spider. You can omit the name when using a single-file spider
* `@engine` engine for the spider
* `@start_urls` array of start urls to process one by one inside the `parse` method
* Method `parse` is the start method and should always be present in a spider class


### Method arguments `response`, `url` and `data`

```ruby
def parse(response, url:, data: {})
end
```

* `response` ([Nokogiri::HTML::Document](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document) object) Contains the parsed HTML code of the processed webpage
* `url` (String) url of the processed webpage
* `data` (Hash) used to pass data between requests
Example of how to use data

Imagine there is a product page which doesn't contain the product category. The category name is present only on the category page with pagination. This is a case where we can use `data` to pass the category name from `parse` to the `parse_product` method:

```ruby
class ProductsSpider < Kimurai::Base
  @engine = :selenium_chrome
  @start_urls = ["https://example-shop.com/example-product-category"]

  def parse(response, url:, data: {})
    category_name = response.xpath("//path/to/category/name").text
    response.xpath("//path/to/products/urls").each do |product_url|
      # Merge category_name with the current data hash and pass it on to the parse_product method
      request_to(:parse_product, url: product_url[:href], data: data.merge(category_name: category_name))
    end

    # ...
  end

  def parse_product(response, url:, data: {})
    item = {}
    # Assign the item's category_name from data[:category_name]
    item[:category_name] = data[:category_name]

    # ...
  end
end

```

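As the 1.2.0 changelog entry above mentions, `data` can also be seeded directly from `@start_urls` by using request hashes instead of plain url strings. A short sketch (the urls are hypothetical):

```ruby
class CategoriesSpider < Kimurai::Base
  @engine = :mechanize
  # Each request hash attaches data to its start url, so data[:category]
  # is already filled in on the very first request:
  @start_urls = [
    { url: "https://example-shop.com/cat?id=1", data: { category: "First Category" } },
    { url: "https://example-shop.com/cat?id=2", data: { category: "Second Category" } }
  ]

  def parse(response, url:, data: {})
    # data[:category] is available here without any extra request_to hop
  end
end
```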

**You can query `response` using [XPath or CSS selectors](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/Searchable)**. Check these Nokogiri tutorials to understand how to work with `response`:
* [Parsing HTML with Nokogiri](http://ruby.bastardsbook.com/chapters/html-parsing/) - ruby.bastardsbook.com
* [HOWTO parse HTML with Ruby & Nokogiri](https://readysteadycode.com/howto-parse-html-with-ruby-and-nokogiri) - readysteadycode.com
* [Class: Nokogiri::HTML::Document](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document) (documentation) - rubydoc.info


### `browser` object

From any spider instance method, the `browser` object is available. It is a [Capybara::Session](https://www.rubydoc.info/github/jnicklas/capybara/Capybara/Session) object and is used to process requests and get the page response (`current_response` method). Usually you don't need to touch it directly, because there is `response` (see above) which contains the page response after it was loaded.

But if you need to interact with a page (like filling in form fields, clicking elements, checkboxes, etc.), `browser` is ready for you:

```ruby
class GoogleSpider < Kimurai::Base
  @name = "google_spider"
  @engine = :selenium_chrome
  @start_urls = ["https://www.google.com/"]

  def parse(response, url:, data: {})
    browser.fill_in "q", with: "Kimurai web scraping framework"
    browser.click_button "Google Search"

    # Update response to the current response after interacting with the browser
    response = browser.current_response

    # Collect results
    results = response.xpath("//div[@class='g']//h3/a").map do |a|
      { title: a.text, url: a[:href] }
    end

    # ...
  end
end
```

Check out these **Capybara cheat sheets**, where you can see all the available methods **to interact with the browser**:
* [UI Testing with RSpec and Capybara [cheat sheet]](http://cheatrags.com/capybara) - cheatrags.com
* [Capybara Cheatsheet PDF](https://thoughtbot.com/upcase/test-driven-rails-resources/capybara.pdf) - thoughtbot.com
* [Class: Capybara::Session](https://www.rubydoc.info/github/jnicklas/capybara/Capybara/Session) (documentation) - rubydoc.info

### `request_to` method

For making requests to a particular method there is `request_to`. It requires at minimum two arguments: `:method_name` and `url:`. An optional argument is `data:` (see above for what it's for). Example:

```ruby
class Spider < Kimurai::Base
  @engine = :selenium_chrome
  @start_urls = ["https://example.com/"]

  def parse(response, url:, data: {})
    # Process a request to the `parse_product` method with the `https://example.com/some_product` url:
    request_to :parse_product, url: "https://example.com/some_product"
  end

  def parse_product(response, url:, data: {})
    puts "From page https://example.com/some_product !"
  end
end
```

Under the hood, `request_to` simply calls [#visit](https://www.rubydoc.info/github/jnicklas/capybara/Capybara%2FSession:visit) (`browser.visit(url)`) and then the required method with arguments:
650 | request_to 651 | 652 | ```ruby 653 | def request_to(handler, url:, data: {}) 654 | request_data = { url: url, data: data } 655 | 656 | browser.visit(url) 657 | public_send(handler, browser.current_response, request_data) 658 | end 659 | ``` 660 |

661 | 662 | `request_to` just makes things simpler, and without it we could do something like: 663 | 664 |
665 | Check the code 666 | 667 | ```ruby 668 | class Spider < Kimurai::Base 669 | @engine = :selenium_chrome 670 | @start_urls = ["https://example.com/"] 671 | 672 | def parse(response, url:, data: {}) 673 | url_to_process = "https://example.com/some_product" 674 | 675 | browser.visit(url_to_process) 676 | parse_product(browser.current_response, url: url_to_process) 677 | end 678 | 679 | def parse_product(response, url:, data: {}) 680 | puts "From page https://example.com/some_product !" 681 | end 682 | end 683 | ``` 684 |

### `save_to` helper

Sometimes all you need is to simply save scraped data to a file format, like JSON or CSV. You can use `save_to` for that:

```ruby
class ProductsSpider < Kimurai::Base
  @engine = :selenium_chrome
  @start_urls = ["https://example-shop.com/"]

  # ...

  def parse_product(response, url:, data: {})
    item = {}

    item[:title] = response.xpath("//title/path").text
    item[:description] = response.xpath("//desc/path").text.squish
    item[:price] = response.xpath("//price/path").text[/\d+/]&.to_f

    # Add each new item to the `scraped_products.json` file:
    save_to "scraped_products.json", item, format: :json
  end
end
```

Supported formats:
* `:json` JSON
* `:pretty_json` "pretty" JSON (`JSON.pretty_generate`)
* `:jsonlines` [JSON Lines](http://jsonlines.org/)
* `:csv` CSV

Note: `save_to` requires the data (the item to save) to be a `Hash`.

By default, `save_to` adds a position key to the item hash. You can disable it with `position: false`: `save_to "scraped_products.json", item, format: :json, position: false`.

**How the helper works:**

While the spider is running, each new item is appended to the file. On the next run, the helper will clear the file's content first, and then start appending items to it again.

> If you don't want the file to be cleared before each run, add the option `append: true`: `save_to "scraped_products.json", item, format: :json, append: true`

### Skip duplicates

It's pretty common for websites to have duplicated pages, for example when an e-commerce shop has the same products in different categories. To skip duplicates, there is the simple `unique?` helper:

```ruby
class ProductsSpider < Kimurai::Base
  @engine = :selenium_chrome
  @start_urls = ["https://example-shop.com/"]

  def parse(response, url:, data: {})
    response.xpath("//categories/path").each do |category|
      request_to :parse_category, url: category[:href]
    end
  end

  # Check products for uniqueness using the product url inside parse_category:
  def parse_category(response, url:, data: {})
    response.xpath("//products/path").each do |product|
      # Skip the url if it's not unique:
      next unless unique?(:product_url, product[:href])
      # Otherwise process it:
      request_to :parse_product, url: product[:href]
    end
  end

  # Or/and check products for uniqueness using the product sku inside parse_product:
  def parse_product(response, url:, data: {})
    item = {}
    item[:sku] = response.xpath("//product/sku/path").text.strip.upcase
    # Don't save the product and return from the method if there is already a saved item with the same sku:
    return unless unique?(:sku, item[:sku])

    # ...
    save_to "results.json", item, format: :json
  end
end
```

The `unique?` helper works pretty simply:

```ruby
# Check the string "http://example.com" in the scope `url` for the first time:
unique?(:url, "http://example.com")
# => true

# Try again:
unique?(:url, "http://example.com")
# => false
```

To check something for uniqueness, you need to provide a scope:

```ruby
# `product_url` scope
unique?(:product_url, "http://example.com/product_1")

# `id` scope
unique?(:id, 324234232)

# `custom` scope
unique?(:custom, "Lorem Ipsum")
```

#### Automatically skip all duplicated requests urls

It is possible to automatically skip all already visited urls when calling the `request_to` method, using the [@config](#all-available-config-options) option `skip_duplicate_requests: true`. With this option enabled, all already visited urls will be skipped automatically. Also check the [@config](#all-available-config-options) for additional options of this setting.

#### `storage` object

The `unique?` method is just an alias for `storage#unique?`. Storage has several methods:

* `#all` - returns the storage hash, where keys are the existing scopes.
* `#include?(scope, value)` - returns `true` if the value exists in the scope, and `false` if not
* `#add(scope, value)` - adds a value to the scope
* `#unique?(scope, value)` - the method already described above; returns `false` if the value already exists in the scope, or returns `true` and adds the value to the scope if it doesn't exist yet.
* `#clear!` - resets the whole storage by deleting all values from all scopes.


### Handle request errors
It is quite common that some pages of a website being crawled return a response code other than `200 OK`. In such cases, the method `request_to` (or `browser.visit`) can raise an exception. Kimurai provides the `skip_request_errors` and `retry_request_errors` [config](#spider-config) options to handle such errors:

#### skip_request_errors
You can automatically skip some errors while requesting a page using the `skip_request_errors` [config](#spider-config) option. If a raised error matches one of the errors in the list, then the error will be caught and the request will be skipped. It is a good idea to skip errors like NotFound (404), etc.

Format for the option: an array whose elements are error classes or/and hashes. You can use the _hash_ format for more flexibility:

```
@config = {
  skip_request_errors: [{ error: RuntimeError, message: "404 => Net::HTTPNotFound" }]
}
```
In this case, the provided `message:` will be compared with the full error message using `String#include?`. You can also use a regex instead: `{ error: RuntimeError, message: /404|403/ }`.

#### retry_request_errors
You can automatically retry some errors with a few attempts while requesting a page using the `retry_request_errors` [config](#spider-config) option. If a raised error matches one of the errors in the list, then the error will be caught and the request will be processed again after a delay.

There are 3 attempts: first with a delay of _15 sec_, second with a delay of _30 sec_, and third with a delay of _45 sec_. If after 3 attempts there is still an exception, then the exception will be raised. It is a good idea to retry errors like `ReadTimeout`, `HTTPBadGateway`, etc.

Format for the option: the same as for the `skip_request_errors` option.

If you would like to skip (not raise) the error after all retries have failed, you can specify the `skip_on_failure: true` option:

```ruby
@config = {
  retry_request_errors: [{ error: RuntimeError, skip_on_failure: true }]
}
```

### Logging custom events

It is possible to save custom messages to the [run_info](#open_spider-and-close_spider-callbacks) hash using the `add_event('Some message')` method. This feature helps you keep track of important things that happened during crawling without checking the whole spider log (in case you're also logging these messages using `logger`). Example:

```ruby
def parse_product(response, url:, data: {})
  unless response.at_xpath("//path/to/add_to_card_button")
    add_event("Product is sold") and return
  end

  # ...
end
```

```
...
I, [2018-11-28 22:20:19 +0400#7402] [M: 47156576560640] INFO -- example_spider: Spider: new event (scope: custom): Product is sold
...
I, [2018-11-28 22:20:19 +0400#7402] [M: 47156576560640] INFO -- example_spider: Spider: stopped: {:events=>{:custom=>{"Product is sold"=>1}}}
```

### `open_spider` and `close_spider` callbacks

You can define `.open_spider` and `.close_spider` callbacks (class methods) to perform some action before the spider starts or after the spider has been stopped:

```ruby
require 'kimurai'

class ExampleSpider < Kimurai::Base
  @name = "example_spider"
  @engine = :selenium_chrome
  @start_urls = ["https://example.com/"]

  def self.open_spider
    logger.info "> Starting..."
  end

  def self.close_spider
    logger.info "> Stopped!"
  end

  def parse(response, url:, data: {})
    logger.info "> Scraping..."
  end
end

ExampleSpider.crawl!
```
884 | Output 885 | 886 | ``` 887 | I, [2018-08-22 14:26:32 +0400#6001] [M: 46996522083840] INFO -- example_spider: Spider: started: example_spider 888 | I, [2018-08-22 14:26:32 +0400#6001] [M: 46996522083840] INFO -- example_spider: > Starting... 889 | D, [2018-08-22 14:26:32 +0400#6001] [M: 46996522083840] DEBUG -- example_spider: BrowserBuilder (selenium_chrome): created browser instance 890 | D, [2018-08-22 14:26:32 +0400#6001] [M: 46996522083840] DEBUG -- example_spider: BrowserBuilder (selenium_chrome): enabled native headless_mode 891 | I, [2018-08-22 14:26:32 +0400#6001] [M: 46996522083840] INFO -- example_spider: Browser: started get request to: https://example.com/ 892 | I, [2018-08-22 14:26:34 +0400#6001] [M: 46996522083840] INFO -- example_spider: Browser: finished get request to: https://example.com/ 893 | I, [2018-08-22 14:26:34 +0400#6001] [M: 46996522083840] INFO -- example_spider: Info: visits: requests: 1, responses: 1 894 | D, [2018-08-22 14:26:34 +0400#6001] [M: 46996522083840] DEBUG -- example_spider: Browser: driver.current_memory: 82415 895 | I, [2018-08-22 14:26:34 +0400#6001] [M: 46996522083840] INFO -- example_spider: > Scraping... 896 | I, [2018-08-22 14:26:34 +0400#6001] [M: 46996522083840] INFO -- example_spider: Browser: driver selenium_chrome has been destroyed 897 | I, [2018-08-22 14:26:34 +0400#6001] [M: 46996522083840] INFO -- example_spider: > Stopped! 898 | I, [2018-08-22 14:26:34 +0400#6001] [M: 46996522083840] INFO -- example_spider: Spider: stopped: {:spider_name=>"example_spider", :status=>:completed, :environment=>"development", :start_time=>2018-08-22 14:26:32 +0400, :stop_time=>2018-08-22 14:26:34 +0400, :running_time=>"1s", :visits=>{:requests=>1, :responses=>1}, :error=>nil} 899 | ``` 900 |


Inside the `open_spider` and `close_spider` class methods, the `run_info` method is available, which contains useful information about the spider's state:

```ruby
11: def self.open_spider
=> 12:   binding.pry
13: end

[1] pry(example_spider)> run_info
=> {
  :spider_name=>"example_spider",
  :status=>:running,
  :environment=>"development",
  :start_time=>2018-08-05 23:32:00 +0400,
  :stop_time=>nil,
  :running_time=>nil,
  :visits=>{:requests=>0, :responses=>0},
  :error=>nil
}
```

Inside `close_spider`, `run_info` will be updated:

```ruby
15: def self.close_spider
=> 16:   binding.pry
17: end

[1] pry(example_spider)> run_info
=> {
  :spider_name=>"example_spider",
  :status=>:completed,
  :environment=>"development",
  :start_time=>2018-08-05 23:32:00 +0400,
  :stop_time=>2018-08-05 23:32:06 +0400,
  :running_time=>6.214,
  :visits=>{:requests=>1, :responses=>1},
  :error=>nil
}
```

`run_info[:status]` helps to determine whether the spider finished successfully or failed (possible values: `:completed`, `:failed`):

```ruby
class ExampleSpider < Kimurai::Base
  @name = "example_spider"
  @engine = :selenium_chrome
  @start_urls = ["https://example.com/"]

  def self.close_spider
    puts ">>> run info: #{run_info}"
  end

  def parse(response, url:, data: {})
    logger.info "> Scraping..."
    # Let's try to strip nil:
    nil.strip
  end
end
```
963 | Output
964 | 
965 | ```
966 | I, [2018-08-22 14:34:24 +0400#8459] [M: 47020523644400]  INFO -- example_spider: Spider: started: example_spider
967 | D, [2018-08-22 14:34:25 +0400#8459] [M: 47020523644400] DEBUG -- example_spider: BrowserBuilder (selenium_chrome): created browser instance
968 | D, [2018-08-22 14:34:25 +0400#8459] [M: 47020523644400] DEBUG -- example_spider: BrowserBuilder (selenium_chrome): enabled native headless_mode
969 | I, [2018-08-22 14:34:25 +0400#8459] [M: 47020523644400]  INFO -- example_spider: Browser: started get request to: https://example.com/
970 | I, [2018-08-22 14:34:26 +0400#8459] [M: 47020523644400]  INFO -- example_spider: Browser: finished get request to: https://example.com/
971 | I, [2018-08-22 14:34:26 +0400#8459] [M: 47020523644400]  INFO -- example_spider: Info: visits: requests: 1, responses: 1
972 | D, [2018-08-22 14:34:26 +0400#8459] [M: 47020523644400] DEBUG -- example_spider: Browser: driver.current_memory: 83351
973 | I, [2018-08-22 14:34:26 +0400#8459] [M: 47020523644400]  INFO -- example_spider: > Scraping...
974 | I, [2018-08-22 14:34:26 +0400#8459] [M: 47020523644400]  INFO -- example_spider: Browser: driver selenium_chrome has been destroyed
975 | 
976 | >>> run info: {:spider_name=>"example_spider", :status=>:failed, :environment=>"development", :start_time=>2018-08-22 14:34:24 +0400, :stop_time=>2018-08-22 14:34:26 +0400, :running_time=>2.01, :visits=>{:requests=>1, :responses=>1}, :error=>"#<NoMethodError: undefined method `strip' for nil:NilClass>"}
977 | 
978 | F, [2018-08-22 14:34:26 +0400#8459] [M: 47020523644400] FATAL -- example_spider: Spider: stopped: {:spider_name=>"example_spider", :status=>:failed, :environment=>"development", :start_time=>2018-08-22 14:34:24 +0400, :stop_time=>2018-08-22 14:34:26 +0400, :running_time=>"2s", :visits=>{:requests=>1, :responses=>1}, :error=>"#<NoMethodError: undefined method `strip' for nil:NilClass>"}
979 | Traceback (most recent call last):
980 |         6: from example_spider.rb:19:in `<main>'
981 |         5: from /home/victor/code/kimurai/lib/kimurai/base.rb:127:in `crawl!'
982 |         4: from /home/victor/code/kimurai/lib/kimurai/base.rb:127:in `each'
983 |         3: from /home/victor/code/kimurai/lib/kimurai/base.rb:128:in `block in crawl!'
984 |         2: from /home/victor/code/kimurai/lib/kimurai/base.rb:185:in `request_to'
985 |         1: from /home/victor/code/kimurai/lib/kimurai/base.rb:185:in `public_send'
986 | example_spider.rb:15:in `parse': undefined method `strip' for nil:NilClass (NoMethodError)
987 | ```
988 | 
989 | 
990 | **Usage example:** if a spider finishes successfully, send a JSON file with the scraped items to a remote FTP location; otherwise (if the spider failed), skip the incomplete results and send an email/notification to Slack about it:
991 | 
992 | 
993 | Example
994 | 
995 | You can also use the additional helper methods `completed?` and `failed?`:
996 | 
997 | ```ruby
998 | class Spider < Kimurai::Base
999 |   @engine = :selenium_chrome
1000 |   @start_urls = ["https://example.com/"]
1001 | 
1002 |   def self.close_spider
1003 |     if completed?
1004 |       send_file_to_ftp("results.json")
1005 |     else
1006 |       send_error_notification(run_info[:error])
1007 |     end
1008 |   end
1009 | 
1010 |   def self.send_file_to_ftp(file_path)
1011 |     # ...
1012 |   end
1013 | 
1014 |   def self.send_error_notification(error)
1015 |     # ...
1016 |   end
1017 | 
1018 |   # ...
1019 | 
1020 |   def parse_item(response, url:, data: {})
1021 |     item = {}
1022 |     # ...
1023 | 
1024 |     save_to "results.json", item, format: :json
1025 |   end
1026 | end
1027 | ```
1028 | 
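One way to fill in those stubs using only the Ruby standard library; a minimal sketch, where the FTP host, credentials, and the Slack webhook URL are placeholders (not part of Kimurai):

```ruby
require 'net/ftp'
require 'net/http'
require 'json'

class Spider < Kimurai::Base
  # Upload the results file to a remote FTP server.
  # Host and credentials below are placeholders:
  def self.send_file_to_ftp(file_path)
    Net::FTP.open("ftp.example.com", "username", "password") do |ftp|
      ftp.putbinaryfile(file_path)
    end
  end

  # Post the error message to a Slack incoming webhook (placeholder URL):
  def self.send_error_notification(error)
    uri = URI("https://hooks.slack.com/services/YOUR/WEBHOOK/URL")
    payload = { text: "Spider #{name} failed: #{error}" }.to_json
    Net::HTTP.post(uri, payload, "Content-Type" => "application/json")
  end
end
```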
1029 | 
1030 | 
1031 | ### `KIMURAI_ENV`
1032 | Kimurai supports environments; the default is `development`. To set a custom environment, pass the `KIMURAI_ENV` ENV variable before the command: `$ KIMURAI_ENV=production ruby spider.rb`. To access the current environment, use the `Kimurai.env` method.
1033 | 
1034 | Usage example:
1035 | ```ruby
1036 | class Spider < Kimurai::Base
1037 |   @engine = :selenium_chrome
1038 |   @start_urls = ["https://example.com/"]
1039 | 
1040 |   def self.close_spider
1041 |     if failed? && Kimurai.env == "production"
1042 |       send_error_notification(run_info[:error])
1043 |     else
1044 |       # Do nothing
1045 |     end
1046 |   end
1047 | 
1048 |   # ...
1049 | end
1050 | ```
1051 | 
1052 | ### Parallel crawling using `in_parallel`
1053 | Kimurai can process web pages concurrently in one single line: `in_parallel(:parse_product, urls, threads: 3)`, where `:parse_product` is the method to process, `urls` is an array of urls to crawl, and `threads:` is the number of threads:
1054 | 
1055 | ```ruby
1056 | # amazon_spider.rb
1057 | require 'kimurai'
1058 | 
1059 | class AmazonSpider < Kimurai::Base
1060 |   @name = "amazon_spider"
1061 |   @engine = :mechanize
1062 |   @start_urls = ["https://www.amazon.com/"]
1063 | 
1064 |   def parse(response, url:, data: {})
1065 |     browser.fill_in "field-keywords", with: "Web Scraping Books"
1066 |     browser.click_on "Go"
1067 | 
1068 |     # Walk through pagination and collect products urls:
1069 |     urls = []
1070 |     loop do
1071 |       response = browser.current_response
1072 |       response.xpath("//li//a[contains(@class, 's-access-detail-page')]").each do |a|
1073 |         urls << a[:href].sub(/ref=.+/, "")
1074 |       end
1075 | 
1076 |       browser.find(:xpath, "//a[@id='pagnNextLink']", wait: 1).click rescue break
1077 |     end
1078 | 
1079 |     # Process all collected urls concurrently within 3 threads:
1080 |     in_parallel(:parse_book_page, urls, threads: 3)
1081 |   end
1082 | 
1083 |   def parse_book_page(response, url:, data: {})
1084 |     item = {}
1085 | 
1086 |     item[:title] = response.xpath("//h1/span[@id]").text.squish
1087 |     item[:url] = url
1088 |     item[:price] = response.xpath("(//span[contains(@class, 'a-color-price')])[1]").text.squish.presence
1089 |     item[:publisher] = response.xpath("//h2[text()='Product details']/following::b[text()='Publisher:']/following-sibling::text()[1]").text.squish.presence
1090 | 
1091 |     save_to "books.json", item, format: :pretty_json
1092 |   end
1093 | end
1094 | 
1095 | AmazonSpider.crawl!
1096 | ```
1097 | 
1098 | 
1099 | Run: $ ruby amazon_spider.rb 1100 | 1101 | ``` 1102 | I, [2018-08-22 14:48:37 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Spider: started: amazon_spider 1103 | D, [2018-08-22 14:48:37 +0400#13033] [M: 46982297486840] DEBUG -- amazon_spider: BrowserBuilder (mechanize): created browser instance 1104 | I, [2018-08-22 14:48:37 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/ 1105 | I, [2018-08-22 14:48:38 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/ 1106 | I, [2018-08-22 14:48:38 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Info: visits: requests: 1, responses: 1 1107 | 1108 | I, [2018-08-22 14:48:43 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Spider: in_parallel: starting processing 52 urls within 3 threads 1109 | D, [2018-08-22 14:48:43 +0400#13033] [C: 46982320219020] DEBUG -- amazon_spider: BrowserBuilder (mechanize): created browser instance 1110 | I, [2018-08-22 14:48:43 +0400#13033] [C: 46982320219020] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Practical-Web-Scraping-Data-Science/dp/1484235819/ 1111 | D, [2018-08-22 14:48:44 +0400#13033] [C: 46982320189640] DEBUG -- amazon_spider: BrowserBuilder (mechanize): created browser instance 1112 | I, [2018-08-22 14:48:44 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Python-Web-Scraping-Cookbook-scraping/dp/1787285219/ 1113 | D, [2018-08-22 14:48:44 +0400#13033] [C: 46982319187320] DEBUG -- amazon_spider: BrowserBuilder (mechanize): created browser instance 1114 | I, [2018-08-22 14:48:44 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Scraping-Python-Community-Experience-Distilled/dp/1782164367/ 1115 | I, [2018-08-22 14:48:45 +0400#13033] [C: 46982320219020] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/Practical-Web-Scraping-Data-Science/dp/1484235819/ 1116 | I, [2018-08-22 14:48:45 +0400#13033] [C: 46982320219020] INFO -- amazon_spider: Info: visits: requests: 4, responses: 2 1117 | I, [2018-08-22 14:48:45 +0400#13033] [C: 46982320219020] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Web-Scraping-Python-Collecting-Modern/dp/1491910291/ 1118 | I, [2018-08-22 14:48:46 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/Python-Web-Scraping-Cookbook-scraping/dp/1787285219/ 1119 | I, [2018-08-22 14:48:46 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Info: visits: requests: 5, responses: 3 1120 | I, [2018-08-22 14:48:46 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Web-Scraping-Python-Collecting-Modern/dp/1491985577/ 1121 | I, [2018-08-22 14:48:46 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/Scraping-Python-Community-Experience-Distilled/dp/1782164367/ 1122 | I, [2018-08-22 14:48:46 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Info: visits: requests: 6, responses: 4 1123 | I, [2018-08-22 14:48:46 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Web-Scraping-Excel-Effective-Scrapes-ebook/dp/B01CMMJGZ8/ 1124 | 1125 | ... 
1126 | 1127 | I, [2018-08-22 14:49:10 +0400#13033] [C: 46982320219020] INFO -- amazon_spider: Info: visits: requests: 51, responses: 49 1128 | I, [2018-08-22 14:49:10 +0400#13033] [C: 46982320219020] INFO -- amazon_spider: Browser: driver mechanize has been destroyed 1129 | I, [2018-08-22 14:49:11 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/Scraping-Ice-Life-Bill-Rayburn-ebook/dp/B00C0NF1L8/ 1130 | I, [2018-08-22 14:49:11 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Info: visits: requests: 51, responses: 50 1131 | I, [2018-08-22 14:49:11 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Instant-Scraping-Jacob-Ward-2013-07-26/dp/B01FJ1G3G4/ 1132 | I, [2018-08-22 14:49:11 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/Php-architects-Guide-Scraping-Author/dp/B010DTKYY4/ 1133 | I, [2018-08-22 14:49:11 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Info: visits: requests: 52, responses: 51 1134 | I, [2018-08-22 14:49:11 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Ship-Tracking-Maritime-Domain-Awareness/dp/B001J5MTOK/ 1135 | I, [2018-08-22 14:49:12 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/Instant-Scraping-Jacob-Ward-2013-07-26/dp/B01FJ1G3G4/ 1136 | I, [2018-08-22 14:49:12 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Info: visits: requests: 53, responses: 52 1137 | I, [2018-08-22 14:49:12 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Browser: driver mechanize has been destroyed 1138 | I, [2018-08-22 14:49:12 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/Ship-Tracking-Maritime-Domain-Awareness/dp/B001J5MTOK/ 1139 | I, [2018-08-22 14:49:12 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Info: visits: requests: 53, responses: 53 1140 | I, [2018-08-22 14:49:12 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Browser: driver mechanize has been destroyed 1141 | 1142 | I, [2018-08-22 14:49:12 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Spider: in_parallel: stopped processing 52 urls within 3 threads, total time: 29s 1143 | I, [2018-08-22 14:49:12 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Browser: driver mechanize has been destroyed 1144 | 1145 | I, [2018-08-22 14:49:12 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Spider: stopped: {:spider_name=>"amazon_spider", :status=>:completed, :environment=>"development", :start_time=>2018-08-22 14:48:37 +0400, :stop_time=>2018-08-22 14:49:12 +0400, :running_time=>"35s", :visits=>{:requests=>53, :responses=>53}, :error=>nil} 1146 | 1147 | ``` 1148 |
1149 | 1150 |
1151 | books.json 1152 | 1153 | ```json 1154 | [ 1155 | { 1156 | "title": "Web Scraping with Python: Collecting More Data from the Modern Web2nd Edition", 1157 | "url": "https://www.amazon.com/Web-Scraping-Python-Collecting-Modern/dp/1491985577/", 1158 | "price": "$26.94", 1159 | "publisher": "O'Reilly Media; 2 edition (April 14, 2018)", 1160 | "position": 1 1161 | }, 1162 | { 1163 | "title": "Python Web Scraping Cookbook: Over 90 proven recipes to get you scraping with Python, micro services, Docker and AWS", 1164 | "url": "https://www.amazon.com/Python-Web-Scraping-Cookbook-scraping/dp/1787285219/", 1165 | "price": "$39.99", 1166 | "publisher": "Packt Publishing - ebooks Account (February 9, 2018)", 1167 | "position": 2 1168 | }, 1169 | { 1170 | "title": "Web Scraping with Python: Collecting Data from the Modern Web1st Edition", 1171 | "url": "https://www.amazon.com/Web-Scraping-Python-Collecting-Modern/dp/1491910291/", 1172 | "price": "$15.75", 1173 | "publisher": "O'Reilly Media; 1 edition (July 24, 2015)", 1174 | "position": 3 1175 | }, 1176 | 1177 | ... 1178 | 1179 | { 1180 | "title": "Instant Web Scraping with Java by Ryan Mitchell (2013-08-26)", 1181 | "url": "https://www.amazon.com/Instant-Scraping-Java-Mitchell-2013-08-26/dp/B01FEM76X2/", 1182 | "price": "$35.82", 1183 | "publisher": "Packt Publishing (2013-08-26) (1896)", 1184 | "position": 52 1185 | } 1186 | ] 1187 | ``` 1188 |
1189 | 
1190 | > Note that the [save_to](#save_to-helper) and [unique?](#skip-duplicates-unique-helper) helpers are thread-safe (protected by a [Mutex](https://ruby-doc.org/core-2.5.1/Mutex.html)) and can be freely used inside threads.
1191 | 
1192 | `in_parallel` can take additional options:
1193 | * `data:` pass a custom data hash along with the urls: `in_parallel(:method, urls, threads: 3, data: { category: "Scraping" })`
1194 | * `delay:` set a delay between requests: `in_parallel(:method, urls, threads: 3, delay: 2)`. Delay can be an `Integer`, `Float` or `Range` (`2..5`). In case of a range, the delay will be chosen randomly for each request: `rand(2..5) # => 3`
1195 | * `engine:` set a custom engine instead of the default one: `in_parallel(:method, urls, threads: 3, engine: :poltergeist_phantomjs)`
1196 | * `config:` pass custom config options (see [config section](#crawler-config))
1197 | 
1198 | ### Active Support included
1199 | 
1200 | You can use all the power of familiar [Rails core-ext methods](https://guides.rubyonrails.org/active_support_core_extensions.html#loading-all-core-extensions) for scraping inside Kimurai. Especially take a look at [squish](https://apidock.com/rails/String/squish), [truncate_words](https://apidock.com/rails/String/truncate_words), [titleize](https://apidock.com/rails/String/titleize), [remove](https://apidock.com/rails/String/remove), [present?](https://guides.rubyonrails.org/active_support_core_extensions.html#blank-questionmark-and-present-questionmark) and [presence](https://guides.rubyonrails.org/active_support_core_extensions.html#presence).
1201 | 
1202 | ### Schedule spiders using Cron
1203 | 
1204 | 1) Inside the spider directory, generate a [Whenever](https://github.com/javan/whenever) config: `$ kimurai generate schedule`.
1205 | 
1206 | 
1207 | schedule.rb
1208 | 
1209 | ```ruby
1210 | ### Settings ###
1211 | require 'tzinfo'
1212 | 
1213 | # Export current PATH to the cron
1214 | env :PATH, ENV["PATH"]
1215 | 
1216 | # Use 24 hour format when using `at:` option
1217 | set :chronic_options, hours24: true
1218 | 
1219 | # Use the local_to_utc helper to set execution times in your local timezone instead
1220 | # of the server's timezone (which probably is, and should be, UTC; to check, run `$ timedatectl`).
1221 | # You may also want to set the same timezone in Kimurai (use `Kimurai.configuration.time_zone =` for that),
1222 | # to have spider logs in a specific time zone format.
1223 | # Example usage of the helper:
1224 | # every 1.day, at: local_to_utc("7:00", zone: "Europe/Moscow") do
1225 | #   crawl "google_spider.com", output: "log/google_spider.com.log"
1226 | # end
1227 | def local_to_utc(time_string, zone:)
1228 |   TZInfo::Timezone.get(zone).local_to_utc(Time.parse(time_string))
1229 | end
1230 | 
1231 | # Note: by default Whenever exports cron commands with :environment == "production".
1232 | # Note: Whenever can only append log data to a log file (>>). If you want
1233 | # to overwrite (>) the log file before each run, pass a lambda:
1234 | # crawl "google_spider.com", output: -> { "> log/google_spider.com.log 2>&1" }
1235 | 
1236 | # Project job types
1237 | job_type :crawl,  "cd :path && KIMURAI_ENV=:environment bundle exec kimurai crawl :task :output"
1238 | job_type :runner, "cd :path && KIMURAI_ENV=:environment bundle exec kimurai runner --jobs :task :output"
1239 | 
1240 | # Single file job type
1241 | job_type :single, "cd :path && KIMURAI_ENV=:environment ruby :task :output"
1242 | # Single with bundle exec
1243 | job_type :single_bundle, "cd :path && KIMURAI_ENV=:environment bundle exec ruby :task :output"
1244 | 
1245 | ### Schedule ###
1246 | # Usage (check examples here https://github.com/javan/whenever#example-schedulerb-file):
1247 | # every 1.day do
1248 | #   Example to schedule a single spider in the project:
1249 | #   crawl "google_spider.com", output: "log/google_spider.com.log"
1250 | 
1251 | #   Example to schedule all spiders in the project using the runner. Each spider will write
1252 | #   its own output to a `log/spider_name.log` file (handled by the runner itself).
1253 | #   Runner output will be written to the log/runner.log file.
1254 | #   The numeric argument is the count of concurrent jobs:
1255 | #   runner 3, output: "log/runner.log"
1256 | 
1257 | #   Example to schedule a single spider (without a project):
1258 | #   single "single_spider.rb", output: "single_spider.log"
1259 | # end
1260 | 
1261 | ### How to set a cron schedule ###
1262 | # Run: `$ whenever --update-crontab --load-file config/schedule.rb`.
1263 | # If you don't have the whenever command, install the gem: `$ gem install whenever`.
1264 | 
1265 | ### How to cancel a schedule ###
1266 | # Run: `$ whenever --clear-crontab --load-file config/schedule.rb`.
1267 | ```
1268 | 
1269 | 
1270 | 2) Add the following code at the bottom of `schedule.rb`:
1271 | 
1272 | ```ruby
1273 | every 1.day, at: "7:00" do
1274 |   single "example_spider.rb", output: "example_spider.log"
1275 | end
1276 | ```
1277 | 
1278 | 3) Run: `$ whenever --update-crontab --load-file schedule.rb`. Done!
1279 | 
1280 | You can check Whenever examples [here](https://github.com/javan/whenever#example-schedulerb-file). To cancel the schedule, run: `$ whenever --clear-crontab --load-file schedule.rb`.
1281 | 
1282 | ### Configuration options
1283 | You can configure several options using a `configure` block:
1284 | 
1285 | ```ruby
1286 | Kimurai.configure do |config|
1287 |   # The default logger has colored mode in development.
1288 |   # If you would like to disable it, set `colorize_logger` to false.
1289 |   # config.colorize_logger = false
1290 | 
1291 |   # Logger level for the default logger:
1292 |   # config.log_level = :info
1293 | 
1294 |   # Custom logger:
1295 |   # config.logger = Logger.new(STDOUT)
1296 | 
1297 |   # Custom time zone (for logs):
1298 |   # config.time_zone = "UTC"
1299 |   # config.time_zone = "Europe/Moscow"
1300 | 
1301 |   # Provide a custom chrome binary path (default is any available chrome/chromium in the PATH):
1302 |   # config.selenium_chrome_path = "/usr/bin/chromium-browser"
1303 |   # Provide a custom selenium chromedriver path (default is "/usr/local/bin/chromedriver"):
1304 |   # config.chromedriver_path = "~/.local/bin/chromedriver"
1305 | end
1306 | ```
1307 | 
1308 | ### Using Kimurai inside existing Ruby application
1309 | 
1310 | You can integrate Kimurai spiders (which are just Ruby classes) into an existing Ruby application like Rails or Sinatra, and run them using background jobs, for example. Check the following to understand how spiders run:
1311 | 
1312 | #### `.crawl!` method
1313 | 
1314 | `.crawl!` (class method) performs a _full run_ of a particular spider. It returns `run_info` if the run was successful, or raises an exception if something went wrong.
1315 | 
1316 | ```ruby
1317 | class ExampleSpider < Kimurai::Base
1318 |   @name = "example_spider"
1319 |   @engine = :mechanize
1320 |   @start_urls = ["https://example.com/"]
1321 | 
1322 |   def parse(response, url:, data: {})
1323 |     title = response.xpath("//title").text.squish
1324 |   end
1325 | end
1326 | 
1327 | ExampleSpider.crawl!
1328 | # => { :spider_name => "example_spider", :status => :completed, :environment => "development", :start_time => 2018-08-22 18:20:16 +0400, :stop_time => 2018-08-22 18:20:17 +0400, :running_time => 1.216, :visits => { :requests => 1, :responses => 1 }, :items => { :sent => 0, :processed => 0 }, :error => nil }
1329 | ```
1330 | 
1331 | You can't `.crawl!` a spider in a different thread while it's still running (spider instances store shared data in the `@run_info` class variable while crawling):
1332 | 
1333 | ```ruby
1334 | 2.times do |i|
1335 |   Thread.new { p i, ExampleSpider.crawl! }
1336 | end # =>
1337 | 
1338 | # 1
1339 | # false
1340 | 
1341 | # 0
1342 | # {:spider_name=>"example_spider", :status=>:completed, :environment=>"development", :start_time=>2018-08-22 18:49:22 +0400, :stop_time=>2018-08-22 18:49:23 +0400, :running_time=>0.801, :visits=>{:requests=>1, :responses=>1}, :items=>{:sent=>0, :processed=>0}, :error=>nil}
1343 | ```
1344 | 
1345 | So what if you don't care about stats and just want to send a request to a particular spider method and get that method's return value?
Use `.parse!` instead:
1346 | 
1347 | #### `.parse!(:method_name, url:)` method
1348 | 
1349 | `.parse!` (class method) creates a new spider instance and performs a request to the given method with the given url. The method's return value is passed back:
1350 | 
1351 | ```ruby
1352 | class ExampleSpider < Kimurai::Base
1353 |   @name = "example_spider"
1354 |   @engine = :mechanize
1355 |   @start_urls = ["https://example.com/"]
1356 | 
1357 |   def parse(response, url:, data: {})
1358 |     title = response.xpath("//title").text.squish
1359 |   end
1360 | end
1361 | 
1362 | ExampleSpider.parse!(:parse, url: "https://example.com/")
1363 | # => "Example Domain"
1364 | ```
1365 | 
1366 | Like `.crawl!`, `.parse!` takes care of the browser instance and kills it (`browser.destroy_driver!`) before returning the value. Unlike `.crawl!`, `.parse!` can be called from different threads at the same time:
1367 | 
1368 | ```ruby
1369 | urls = ["https://www.google.com/", "https://www.reddit.com/", "https://en.wikipedia.org/"]
1370 | 
1371 | urls.each do |url|
1372 |   Thread.new { p ExampleSpider.parse!(:parse, url: url) }
1373 | end # =>
1374 | 
1375 | # "Google"
1376 | # "Wikipedia, the free encyclopedia"
1377 | # "reddit: the front page of the internetHotHot"
1378 | ```
1379 | 
1380 | Keep in mind that the [save_to](#save_to-helper) and [unique?](#skip-duplicates) helpers are not thread-safe when using the `.parse!` method.
1381 | 
1382 | #### `Kimurai.list` and `Kimurai.find_by_name()`
1383 | 
1384 | ```ruby
1385 | class GoogleSpider < Kimurai::Base
1386 |   @name = "google_spider"
1387 | end
1388 | 
1389 | class RedditSpider < Kimurai::Base
1390 |   @name = "reddit_spider"
1391 | end
1392 | 
1393 | class WikipediaSpider < Kimurai::Base
1394 |   @name = "wikipedia_spider"
1395 | end
1396 | 
1397 | # To get the list of all available spider classes:
1398 | Kimurai.list
1399 | # => {"google_spider"=>GoogleSpider, "reddit_spider"=>RedditSpider, "wikipedia_spider"=>WikipediaSpider}
1400 | 
1401 | # To find a particular spider class by its name:
1402 | Kimurai.find_by_name("reddit_spider")
1403 | # => RedditSpider
1404 | ```
1405 | 
1406 | 
1407 | ### Automated server setup and deployment
1408 | > **EXPERIMENTAL**
1409 | 
1410 | #### Setup
1411 | You can automatically set up the [required environment](#installation) for Kimurai on a remote server (currently only Ubuntu Server 18.04 is supported) using the `$ kimurai setup` command. `setup` installs: the latest Ruby with Rbenv, browsers with webdrivers, and, in addition, database clients (clients only) for MySQL, Postgres and MongoDB (so you can connect to a remote database from Ruby).
1412 | 
1413 | > To perform a remote server setup, [Ansible](https://github.com/ansible/ansible) is required **on the desktop** machine (to install on Ubuntu: `$ sudo apt install ansible`; on Mac OS X: `$ brew install ansible`)
1414 | 
1415 | > It's recommended to use a regular user to set up the server, not `root`. To create a new user, log in to the server with `$ ssh root@your_server_ip`, type `$ adduser username` to create the user, and `$ gpasswd -a username sudo` to add the new user to the sudo group.
1416 | 
1417 | Example:
1418 | 
1419 | ```bash
1420 | $ kimurai setup deploy@123.123.123.123 --ask-sudo --ssh-key-path path/to/private_key
1421 | ```
1422 | 
1423 | CLI options:
1424 | * `--ask-sudo` pass this option to prompt for the sudo (user) password, needed for system-wide installation of packages (`apt install`)
1425 | * `--ssh-key-path path/to/private_key` authorize on the server using a private ssh key.
You can omit it if the required key is already [added to the keychain](https://help.github.com/articles/generating-a-new-ssh-key-and-adding-it-to-the-ssh-agent/#adding-your-ssh-key-to-the-ssh-agent) on your desktop (Ansible uses [SSH agent forwarding](https://developer.github.com/v3/guides/using-ssh-agent-forwarding/))
1426 | * `--ask-auth-pass` authorize on the server using a user password; an alternative to `--ssh-key-path`.
1427 | * `-p port_number` custom port for the ssh connection (`-p 2222`)
1428 | 
1429 | > You can check the setup playbook [here](lib/kimurai/automation/setup.yml)
1430 | 
1431 | #### Deploy
1432 | 
1433 | After a successful `setup`, you can deploy a spider to the remote server using the `$ kimurai deploy` command. Each deploy performs several tasks: 1) pull the repo from the remote origin to the `~/repo_name` user directory, 2) run `bundle install`, 3) update the crontab with `whenever --update-crontab` (to refresh the spider schedule from the schedule.rb file).
1434 | 
1435 | Before `deploy`, make sure the spider directory contains: 1) a git repository with a remote origin (Bitbucket, GitHub, etc.), 2) a `Gemfile`, 3) schedule.rb inside the `config` subfolder (`config/schedule.rb`).
1436 | 
1437 | Example:
1438 | 
1439 | ```bash
1440 | $ kimurai deploy deploy@123.123.123.123 --ssh-key-path path/to/private_key --repo-key-path path/to/repo_private_key
1441 | ```
1442 | 
1443 | CLI options: _same as for the [setup](#setup) command_ (except `--ask-sudo`), plus
1444 | * `--repo-url` provide a custom repo url (`--repo-url git@bitbucket.org:username/repo_name.git`), otherwise the current `origin/master` will be taken (the output of `$ git remote get-url origin`)
1445 | * `--repo-key-path` if the git repository is private, authorization is required to pull the code on the remote server. Use this option to provide a private repository SSH key. You can omit it if the required key is already added to the keychain on your desktop (same as with the `--ssh-key-path` option)
1446 | 
1447 | > You can check the deploy playbook [here](lib/kimurai/automation/deploy.yml)
1448 | 
1449 | ## Spider `@config`
1450 | 
1451 | Using `@config` you can set several options for a spider: a proxy, user-agent, default cookies/headers, delay between requests, browser **memory control**, and so on:
1452 | 
1453 | ```ruby
1454 | class Spider < Kimurai::Base
1455 |   USER_AGENTS = ["Chrome", "Firefox", "Safari", "Opera"]
1456 |   PROXIES = ["2.3.4.5:8080:http:username:password", "3.4.5.6:3128:http", "1.2.3.4:3000:socks5"]
1457 | 
1458 |   @engine = :poltergeist_phantomjs
1459 |   @start_urls = ["https://example.com/"]
1460 |   @config = {
1461 |     headers: { "custom_header" => "custom_value" },
1462 |     cookies: [{ name: "cookie_name", value: "cookie_value", domain: ".example.com" }],
1463 |     user_agent: -> { USER_AGENTS.sample },
1464 |     proxy: -> { PROXIES.sample },
1465 |     window_size: [1366, 768],
1466 |     disable_images: true,
1467 |     restart_if: {
1468 |       # Restart browser if the provided memory limit (in kilobytes) is exceeded:
1469 |       memory_limit: 350_000
1470 |     },
1471 |     before_request: {
1472 |       # Change user agent before each request:
1473 |       change_user_agent: true,
1474 |       # Change proxy before each request:
1475 |       change_proxy: true,
1476 |       # Clear all cookies and set default cookies (if provided) before each request:
1477 |       clear_and_set_cookies: true,
1478 |       # Process delay before each request:
1479 |       delay: 1..3
1480 |     }
1481 |   }
1482 | 
1483 |   def parse(response, url:, data: {})
1484 |     # ...
1485 |   end
1486 | end
1487 | ```
1488 | 
1489 | ### All available `@config` options
1490 | 
1491 | ```ruby
1492 | @config = {
1493 |   # Custom headers, format: hash. Example: { "some header" => "some value", "another header" => "another value" }
1494 |   # Works only for :mechanize and :poltergeist_phantomjs engines (Selenium doesn't allow setting/getting headers)
1495 |   headers: {},
1496 | 
1497 |   # Custom User Agent, format: string or lambda.
1498 |   # Use a lambda if you want to rotate user agents before each run:
1499 |   # user_agent: -> { ARRAY_OF_USER_AGENTS.sample }
1500 |   # Works for all engines
1501 |   user_agent: "Mozilla/5.0 Firefox/61.0",
1502 | 
1503 |   # Custom cookies, format: array of hashes.
1504 |   # Format for a single cookie: { name: "cookie name", value: "cookie value", domain: ".example.com" }
1505 |   # Works for all engines
1506 |   cookies: [],
1507 | 
1508 |   # Proxy, format: string or lambda. Format of a proxy string: "ip:port:protocol:user:password"
1509 |   # `protocol` can be http or socks5. User and password are optional.
1510 |   # Use a lambda if you want to rotate proxies before each run:
1511 |   # proxy: -> { ARRAY_OF_PROXIES.sample }
1512 |   # Works for all engines, but keep in mind that Selenium drivers don't support proxies
1513 |   # with authorization. Also, Mechanize doesn't support the socks5 proxy format (only http)
1514 |   proxy: "3.4.5.6:3128:http:user:pass",
1515 | 
1516 |   # If enabled, the browser will ignore any https errors. It's handy while using a proxy
1517 |   # with a self-signed SSL cert (for example Crawlera or Mitmproxy).
1518 |   # It also allows visiting webpages with an expired SSL certificate.
1519 |   # Works for all engines
1520 |   ignore_ssl_errors: true,
1521 | 
1522 |   # Custom window size, works for all engines
1523 |   window_size: [1366, 768],
1524 | 
1525 |   # Skip downloading images if true, works for all engines
1526 |   disable_images: true,
1527 | 
1528 |   # Selenium engines only: headless mode, `:native` or `:virtual_display` (default is :native)
1529 |   # Although native mode has better performance, virtual display mode
1530 |   # can sometimes be useful. For example, some websites can detect (and block)
1531 |   # headless chrome, so you can use virtual_display mode instead
1532 |   headless_mode: :native,
1533 | 
1534 |   # This option tells the browser not to use a proxy for the provided list of domains or IP addresses.
1535 |   # Format: array of strings. Works only for :selenium_firefox and :selenium_chrome
1536 |   proxy_bypass_list: [],
1537 | 
1538 |   # Option to provide a custom SSL certificate. Works only for :poltergeist_phantomjs and :mechanize
1539 |   ssl_cert_path: "path/to/ssl_cert",
1540 | 
1541 |   # Inject some JavaScript code into the browser.
1542 |   # Format: array of strings, where each string is a path to a JS file.
1543 |   # Works only for the poltergeist_phantomjs engine (Selenium doesn't support JS code injection)
1544 |   extensions: ["lib/code_to_inject.js"],
1545 | 
1546 |   # Automatically skip duplicated (already visited) urls when using the `request_to` method.
1547 |   # Possible values: `true` or a hash with options.
1548 |   # In case of `true`, all visited urls will be added to the storage's scope `:requests_urls`,
1549 |   # and if a url is already contained in this scope, the request will be skipped.
1550 |   # You can configure this setting by providing additional options as a hash:
1551 |   # `skip_duplicate_requests: { scope: :custom_scope, check_only: true }`, where:
1552 |   # `scope:` - use a custom scope instead of `:requests_urls`
1553 |   # `check_only:` - if true, the scope will only be checked for the url; the url will not
1554 |   # be added to the scope.
1555 |   # Works for all engines
1556 |   skip_duplicate_requests: true,
1557 | 
1558 |   # Automatically skip the provided errors while requesting a page.
1559 |   # If a raised error matches one of the errors in the list, then the error will be caught,
1560 |   # and the request will be skipped.
1561 |   # It is a good idea to skip errors like NotFound(404), etc.
1562 |   # Format: array whose elements are error classes or/and hashes. You can use the hash format
1563 |   # for more flexibility: `{ error: "RuntimeError", message: "404 => Net::HTTPNotFound" }`.
1564 |   # The provided `message:` will be compared with the full error message using `String#include?`. Also,
1565 |   # you can use a regex instead: `{ error: "RuntimeError", message: /404|403/ }`.
1566 |   skip_request_errors: [{ error: RuntimeError, message: "404 => Net::HTTPNotFound" }],
1567 | 
1568 |   # Automatically retry the provided errors with a few attempts while requesting a page.
1569 |   # If a raised error matches one of the errors in the list, then the error will be caught
1570 |   # and the request will be processed again after a delay. There are 3 attempts:
1571 |   # first: delay 15 sec, second: delay 30 sec, third: delay 45 sec.
1572 |   # If there is still an exception after 3 attempts, then the exception will be raised.
1573 |   # It is a good idea to retry errors like `ReadTimeout`, `HTTPBadGateway`, etc.
1574 |   # Format: same as for the `skip_request_errors` option.
1575 |   retry_request_errors: [Net::ReadTimeout],
1576 | 
1577 |   # Handle page encoding while parsing an html response using Nokogiri. There are two modes:
1578 |   # auto (`:auto`): try to fetch the correct encoding from `<meta>` or `<xml>` tags;
1579 |   # or set the required encoding manually, example: `encoding: "GB2312"`.
1580 |   # By default this option is unset.
1581 |   encoding: nil,
1582 | 
1583 |   # Restart the browser if one of the options is true:
1584 |   restart_if: {
1585 |     # Restart the browser if the provided memory limit (in kilobytes) is exceeded (works for all engines)
1586 |     memory_limit: 350_000,
1587 | 
1588 |     # Restart the browser if the provided requests limit is exceeded (works for all engines)
1589 |     requests_limit: 100
1590 |   },
1591 | 
1592 |   # Perform several actions before each request:
1593 |   before_request: {
1594 |     # Change the proxy before each request. The `proxy:` option above should be present
1595 |     # and have lambda format. Works only for poltergeist and mechanize engines
1596 |     # (Selenium doesn't support proxy rotation).
1597 |     change_proxy: true,
1598 | 
1599 |     # Change the user agent before each request. The `user_agent:` option above should be present
1600 |     # and have lambda format. Works only for poltergeist and mechanize engines
1601 |     # (Selenium doesn't support getting/setting headers).
1602 |     change_user_agent: true,
1603 | 
1604 |     # Clear all cookies before each request, works for all engines
1605 |     clear_cookies: true,
1606 | 
1607 |     # If you want to clear all cookies + set custom cookies (the `cookies:` option above should be present),
1608 |     # use this option instead (works for all engines)
1609 |     clear_and_set_cookies: true,
1610 | 
1611 |     # Global option to set a delay between requests.
1612 |     # Delay can be an `Integer`, `Float` or `Range` (`2..5`). In case of a range,
1613 |     # the delay will be chosen randomly for each request: `rand(2..5) # => 3`
1614 |     delay: 1..3
1615 |   }
1616 | }
1617 | ```
1618 | 
1619 | As you can see, most of the options are universal for any engine.
1620 | 
1621 | ### `@config` settings inheritance
1622 | Settings can be inherited:
1623 | 
1624 | ```ruby
1625 | class ApplicationSpider < Kimurai::Base
1626 |   @engine = :poltergeist_phantomjs
1627 |   @config = {
1628 |     user_agent: "Firefox",
1629 |     disable_images: true,
1630 |     restart_if: { memory_limit: 350_000 },
1631 |     before_request: { delay: 1..2 }
1632 |   }
1633 | end
1634 | 
1635 | class CustomSpider < ApplicationSpider
1636 |   @name = "custom_spider"
1637 |   @start_urls = ["https://example.com/"]
1638 |   @config = {
1639 |     before_request: { delay: 4..6 }
1640 |   }
1641 | 
1642 |   def parse(response, url:, data: {})
1643 |     # ...
1644 |   end
1645 | end
1646 | ```
1647 | 
1648 | Here, the `@config` of `CustomSpider` will be _[deep merged](https://apidock.com/rails/Hash/deep_merge)_ with the `ApplicationSpider` config, so `CustomSpider` will keep all inherited options, with only `delay` updated.
1649 | 
1650 | ## Project mode
1651 | 
1652 | Kimurai can work in project mode ([like Scrapy](https://doc.scrapy.org/en/latest/intro/tutorial.html#creating-a-project)). To generate a new project, run: `$ kimurai generate project web_spiders` (where `web_spiders` is the name of the project).
1653 | 
1654 | Structure of the project:
1655 | 
1656 | ```bash
1657 | .
1658 | ├── config/
1659 | │   ├── initializers/
1660 | │   ├── application.rb
1661 | │   ├── automation.yml
1662 | │   ├── boot.rb
1663 | │   └── schedule.rb
1664 | ├── spiders/
1665 | │   └── application_spider.rb
1666 | ├── db/
1667 | ├── helpers/
1668 | │   └── application_helper.rb
1669 | ├── lib/
1670 | ├── log/
1671 | ├── pipelines/
1672 | │   ├── validator.rb
1673 | │   └── saver.rb
1674 | ├── tmp/
1675 | ├── .env
1676 | ├── Gemfile
1677 | ├── Gemfile.lock
1678 | └── README.md
1679 | ```
1680 | 
1681 | 
1682 | Description
1683 | 
1684 | * `config/` folder for configuration files
1685 | * `config/initializers` [Rails-like initializers](https://guides.rubyonrails.org/configuring.html#using-initializer-files) to load custom code at framework start
1686 | * `config/application.rb` configuration settings for Kimurai (the `Kimurai.configure do` block)
1687 | * `config/automation.yml` settings for [setup and deploy](#automated-server-setup-and-deployment)
1688 | * `config/boot.rb` loads the framework and project
1689 | * `config/schedule.rb` Cron [schedule for spiders](#schedule-spiders-using-cron)
1690 | * `spiders/` folder for spiders
1691 | * `spiders/application_spider.rb` base parent class for all spiders
1692 | * `db/` store all database files here (`sqlite`, `json`, `csv`, etc.)
1693 | * `helpers/` Rails-like helpers for spiders
1694 | * `helpers/application_helper.rb` all methods inside the ApplicationHelper module will be available to all spiders (see the sketch after this list)
1695 | * `lib/` put custom Ruby code here
1696 | * `log/` folder for logs
1697 | * `pipelines/` folder for [Scrapy-like](https://doc.scrapy.org/en/latest/topics/item-pipeline.html) pipelines. One file = one pipeline
1698 | * `pipelines/validator.rb` example pipeline to validate an item
1699 | * `pipelines/saver.rb` example pipeline to save an item
1700 | * `tmp/` folder for temporary files
1701 | * `.env` file to store ENV variables for the project, loaded by [Dotenv](https://github.com/bkeepers/dotenv)
1702 | * `Gemfile` dependency file
1703 | * `README.md` example project readme
1704 | 
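For instance, a project-wide helper might look like the sketch below. The `clean_price` method is hypothetical (not part of the generated template); the point is that any method defined in `ApplicationHelper` becomes available inside every project spider:

```ruby
# helpers/application_helper.rb
module ApplicationHelper
  # Hypothetical example helper: normalize a scraped price string
  # like "$1,299.00" into a Float (1299.0). String#remove comes
  # from Active Support, which Kimurai already loads:
  def clean_price(string)
    string.remove("$", ",").to_f
  end
end
```

Inside any spider's parse method you could then write something like `item[:price] = clean_price(response.xpath("//span[@class='price']").text)`.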
1705 | 
1706 | 
1707 | ### Generate new spider
1708 | To generate a new spider in the project, run:
1709 | 
1710 | ```bash
1711 | $ kimurai generate spider example_spider
1712 |       create  spiders/example_spider.rb
1713 | ```
1714 | 
1715 | The command will generate a new spider class inherited from `ApplicationSpider`:
1716 | 
1717 | ```ruby
1718 | class ExampleSpider < ApplicationSpider
1719 |   @name = "example_spider"
1720 |   @start_urls = []
1721 |   @config = {}
1722 | 
1723 |   def parse(response, url:, data: {})
1724 |   end
1725 | end
1726 | ```
1727 | 
1728 | ### Crawl
1729 | To run a particular spider in the project, run: `$ bundle exec kimurai crawl example_spider`. Don't forget to add `bundle exec` before the command to load the required environment.
1730 | 
1731 | ### List
1732 | To list all project spiders, run: `$ bundle exec kimurai list`
1733 | 
1734 | ### Parse
1735 | For project spiders you can use the `$ kimurai parse` command, which helps to debug spiders:
1736 | 
1737 | ```bash
1738 | $ bundle exec kimurai parse example_spider parse_product --url https://example-shop.com/product-1
1739 | ```
1740 | 
1741 | where `example_spider` is the spider to run, `parse_product` is the spider method to process, and `--url` is the url to open inside the processing method.
1742 | 
1743 | ### Pipelines, `send_item` method
1744 | You can use item pipelines to organize item-processing logic for all project spiders in one place (also check Scrapy's [description of pipelines](https://doc.scrapy.org/en/latest/topics/item-pipeline.html#item-pipeline)).
1745 | 
1746 | Imagine you have three spiders, each crawling a different e-commerce shop and saving only shoe listings. For each spider, you want to save only items with the "shoe" category, a unique sku, a valid title/price, and existing images. To avoid code duplication between spiders, use pipelines:
1747 | 
1748 | 
1749 | Example
1750 | 
1751 | pipelines/validator.rb
1752 | ```ruby
1753 | class Validator < Kimurai::Pipeline
1754 |   def process_item(item, options: {})
1755 |     # Here you can validate the item and raise `DropItemError`
1756 |     # if one of the validations fails. Examples:
1757 | 
1758 |     # Drop the item if its category is not "shoe":
1759 |     if item[:category] != "shoe"
1760 |       raise DropItemError, "Wrong item category"
1761 |     end
1762 | 
1763 |     # Check the item sku for uniqueness using the built-in unique? helper:
1764 |     unless unique?(:sku, item[:sku])
1765 |       raise DropItemError, "Item sku is not unique"
1766 |     end
1767 | 
1768 |     # Drop the item if its title is shorter than 5 characters:
1769 |     if item[:title].size < 5
1770 |       raise DropItemError, "Item title is short"
1771 |     end
1772 | 
1773 |     # Drop the item if the price is not present:
1774 |     unless item[:price].present?
1775 |       raise DropItemError, "Item price is not present"
1776 |     end
1777 | 
1778 |     # Drop the item if it doesn't contain any images:
1779 |     unless item[:images].present?
1780 |       raise DropItemError, "Item images are not present"
1781 |     end
1782 | 
1783 |     # Pass the item to the next pipeline (if it wasn't dropped):
1784 |     item
1785 |   end
1786 | end
1787 | ```
1788 | 
1789 | 
1790 | pipelines/saver.rb
1791 | ```ruby
1792 | class Saver < Kimurai::Pipeline
1793 |   def process_item(item, options: {})
1794 |     # Here you can save the item to a database, send it to a remote API, or
1795 |     # simply save it to a file format using the `save_to` helper:
1796 | 
1797 |     # To get the name of the current spider: `spider.class.name`
1798 |     save_to "db/#{spider.class.name}.json", item, format: :json
1799 | 
1800 |     item
1801 |   end
1802 | end
1803 | ```
1804 | 
1805 | spiders/application_spider.rb
1806 | ```ruby
1807 | class ApplicationSpider < Kimurai::Base
1808 |   @engine = :selenium_chrome
1809 |   # Define pipelines (in order) for all spiders:
1810 |   @pipelines = [:validator, :saver]
1811 | end
1812 | ```
1813 | 
1814 | spiders/shop_spider_1.rb
1815 | ```ruby
1816 | class ShopSpiderOne < ApplicationSpider
1817 |   @name = "shop_spider_1"
1818 |   @start_urls = ["https://shop-1.com"]
1819 | 
1820 |   # ...
1821 | 
1822 |   def parse_product(response, url:, data: {})
1823 |     # ...
1824 | 
1825 |     # Send the item to the pipelines:
1826 |     send_item item
1827 |   end
1828 | end
1829 | ```
1830 | 
1831 | spiders/shop_spider_2.rb
1832 | ```ruby
1833 | class ShopSpiderTwo < ApplicationSpider
1834 |   @name = "shop_spider_2"
1835 |   @start_urls = ["https://shop-2.com"]
1836 | 
1837 |   def parse_product(response, url:, data: {})
1838 |     # ...
1839 | 
1840 |     # Send the item to the pipelines:
1841 |     send_item item
1842 |   end
1843 | end
1844 | ```
1845 | 
1846 | spiders/shop_spider_3.rb
1847 | ```ruby
1848 | class ShopSpiderThree < ApplicationSpider
1849 |   @name = "shop_spider_3"
1850 |   @start_urls = ["https://shop-3.com"]
1851 | 
1852 |   def parse_product(response, url:, data: {})
1853 |     # ...
1854 | 
1855 |     # Send the item to the pipelines:
1856 |     send_item item
1857 |   end
1858 | end
1859 | ```
1860 | 
1861 | 
1862 | Once you start using pipelines, item stats appear as well:
1863 | 
1864 | 
1865 | Example 1866 | 1867 | pipelines/validator.rb 1868 | ```ruby 1869 | class Validator < Kimurai::Pipeline 1870 | def process_item(item, options: {}) 1871 | if item[:star_count] < 10 1872 | raise DropItemError, "Repository doesn't have enough stars" 1873 | end 1874 | 1875 | item 1876 | end 1877 | end 1878 | ``` 1879 | 1880 | spiders/github_spider.rb 1881 | ```ruby 1882 | class GithubSpider < ApplicationSpider 1883 | @name = "github_spider" 1884 | @engine = :selenium_chrome 1885 | @pipelines = [:validator] 1886 | @start_urls = ["https://github.com/search?q=Ruby%20Web%20Scraping"] 1887 | @config = { 1888 | user_agent: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36", 1889 | before_request: { delay: 4..7 } 1890 | } 1891 | 1892 | def parse(response, url:, data: {}) 1893 | response.xpath("//ul[@class='repo-list']/div//h3/a").each do |a| 1894 | request_to :parse_repo_page, url: absolute_url(a[:href], base: url) 1895 | end 1896 | 1897 | if next_page = response.at_xpath("//a[@class='next_page']") 1898 | request_to :parse, url: absolute_url(next_page[:href], base: url) 1899 | end 1900 | end 1901 | 1902 | def parse_repo_page(response, url:, data: {}) 1903 | item = {} 1904 | 1905 | item[:owner] = response.xpath("//h1//a[@rel='author']").text 1906 | item[:repo_name] = response.xpath("//h1/strong[@itemprop='name']/a").text 1907 | item[:repo_url] = url 1908 | item[:description] = response.xpath("//span[@itemprop='about']").text.squish 1909 | item[:tags] = response.xpath("//div[@id='topics-list-container']/div/a").map { |a| a.text.squish } 1910 | item[:watch_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Watch')]/a[2]").text.squish.delete(",").to_i 1911 | item[:star_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Star')]/a[2]").text.squish.delete(",").to_i 1912 | item[:fork_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Fork')]/a[2]").text.squish.delete(",").to_i 1913 | item[:last_commit] = response.xpath("//span[@itemprop='dateModified']/*").text 1914 | 1915 | send_item item 1916 | end 1917 | end 1918 | ``` 1919 | 1920 | ``` 1921 | $ bundle exec kimurai crawl github_spider 1922 | 1923 | I, [2018-08-22 15:56:35 +0400#1358] [M: 47347279209980] INFO -- github_spider: Spider: started: github_spider 1924 | D, [2018-08-22 15:56:35 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: BrowserBuilder (selenium_chrome): created browser instance 1925 | I, [2018-08-22 15:56:40 +0400#1358] [M: 47347279209980] INFO -- github_spider: Browser: started get request to: https://github.com/search?q=Ruby%20Web%20Scraping 1926 | I, [2018-08-22 15:56:44 +0400#1358] [M: 47347279209980] INFO -- github_spider: Browser: finished get request to: https://github.com/search?q=Ruby%20Web%20Scraping 1927 | I, [2018-08-22 15:56:44 +0400#1358] [M: 47347279209980] INFO -- github_spider: Info: visits: requests: 1, responses: 1 1928 | D, [2018-08-22 15:56:44 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: Browser: driver.current_memory: 116182 1929 | D, [2018-08-22 15:56:44 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: Browser: sleep 5 seconds before request... 
1930 | 
1931 | I, [2018-08-22 15:56:49 +0400#1358] [M: 47347279209980]  INFO -- github_spider: Browser: started get request to: https://github.com/lorien/awesome-web-scraping
1932 | I, [2018-08-22 15:56:50 +0400#1358] [M: 47347279209980]  INFO -- github_spider: Browser: finished get request to: https://github.com/lorien/awesome-web-scraping
1933 | I, [2018-08-22 15:56:50 +0400#1358] [M: 47347279209980]  INFO -- github_spider: Info: visits: requests: 2, responses: 2
1934 | D, [2018-08-22 15:56:50 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: Browser: driver.current_memory: 217432
1935 | D, [2018-08-22 15:56:50 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: Pipeline: starting processing item through 1 pipeline...
1936 | I, [2018-08-22 15:56:50 +0400#1358] [M: 47347279209980]  INFO -- github_spider: Pipeline: processed: {"owner":"lorien","repo_name":"awesome-web-scraping","repo_url":"https://github.com/lorien/awesome-web-scraping","description":"List of libraries, tools and APIs for web scraping and data processing.","tags":["awesome","awesome-list","web-scraping","data-processing","python","javascript","php","ruby"],"watch_count":159,"star_count":2423,"fork_count":358,"last_commit":"4 days ago"}
1937 | I, [2018-08-22 15:56:50 +0400#1358] [M: 47347279209980]  INFO -- github_spider: Info: items: sent: 1, processed: 1
1938 | D, [2018-08-22 15:56:50 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: Browser: sleep 6 seconds before request...
1939 | 
1940 | ...
1941 | 
1942 | I, [2018-08-22 16:11:50 +0400#1358] [M: 47347279209980]  INFO -- github_spider: Browser: started get request to: https://github.com/preston/idclight
1943 | I, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980]  INFO -- github_spider: Browser: finished get request to: https://github.com/preston/idclight
1944 | I, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980]  INFO -- github_spider: Info: visits: requests: 140, responses: 140
1945 | D, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: Browser: driver.current_memory: 211713
1946 | 
1947 | D, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: Pipeline: starting processing item through 1 pipeline...
1948 | E, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980] ERROR -- github_spider: Pipeline: dropped: #<Kimurai::Pipeline::DropItemError: Repository doesn't have enough stars>, item: {:owner=>"preston", :repo_name=>"idclight", :repo_url=>"https://github.com/preston/idclight", :description=>"A Ruby gem for accessing the freely available IDClight (IDConverter Light) web service, which convert between different types of gene IDs such as Hugo and Entrez. Queries are screen scraped from http://idclight.bioinfo.cnio.es.", :tags=>[], :watch_count=>6, :star_count=>1, :fork_count=>0, :last_commit=>"on Apr 12, 2012"}
1949 | 
1950 | I, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980]  INFO -- github_spider: Info: items: sent: 127, processed: 12
1951 | 
1952 | I, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980]  INFO -- github_spider: Browser: driver selenium_chrome has been destroyed
1953 | I, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980]  INFO -- github_spider: Spider: stopped: {:spider_name=>"github_spider", :status=>:completed, :environment=>"development", :start_time=>2018-08-22 15:56:35 +0400, :stop_time=>2018-08-22 16:11:51 +0400, :running_time=>"15m, 16s", :visits=>{:requests=>140, :responses=>140}, :items=>{:sent=>127, :processed=>12}, :error=>nil}
1954 | ```
1955 | 
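These item stats are also available programmatically: the `:items` key of `run_info` (and of the hash returned by `.crawl!`) holds the `sent` and `processed` counters, so you can, for example, report them from `close_spider`. A minimal sketch:

```ruby
class GithubSpider < ApplicationSpider
  # ...

  def self.close_spider
    stats = run_info[:items]
    # `sent` is how many items were passed to send_item;
    # `processed` is how many made it through all pipelines without being dropped:
    puts "Items sent: #{stats[:sent]}, processed: #{stats[:processed]}"
  end
end
```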
1956 | 
1957 | Also, you can pass custom options to a pipeline from a particular spider if you want to change the pipeline's behavior for that spider:
1958 | 
1959 | 
1960 | Example
1961 | 
1962 | spiders/custom_spider.rb
1963 | ```ruby
1964 | class CustomSpider < ApplicationSpider
1965 |   @name = "custom_spider"
1966 |   @start_urls = ["https://example.com"]
1967 |   @pipelines = [:validator]
1968 | 
1969 |   # ...
1970 | 
1971 |   def parse_item(response, url:, data: {})
1972 |     # ...
1973 | 
1974 |     # Pass the custom option `skip_uniq_checking` to the Validator pipeline:
1975 |     send_item item, validator: { skip_uniq_checking: true }
1976 |   end
1977 | end
1978 | ```
1979 | 
1980 | pipelines/validator.rb
1981 | ```ruby
1982 | class Validator < Kimurai::Pipeline
1983 |   def process_item(item, options: {})
1984 |     # Do not check the item sku for uniqueness if options[:skip_uniq_checking] is true:
1985 |     if options[:skip_uniq_checking] != true
1986 |       raise DropItemError, "Item sku is not unique" unless unique?(:sku, item[:sku])
1987 |     end
1988 | 
1989 |     # Don't forget to pass the item on (pipelines should return the item):
1990 |     item
1991 |   end
1992 | end
1993 | ```
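The options hash is keyed by pipeline name, so, following the same pattern, a spider that uses several pipelines could pass separate options to each of them in one `send_item` call. A hypothetical sketch; the `path:` option and the way Saver reads it are assumptions for illustration, not part of the template:

```ruby
# In a spider with @pipelines = [:validator, :saver];
# each pipeline reads only its own options hash:
send_item item, validator: { skip_uniq_checking: true },
                saver: { path: "db/custom_items.json" }
```

```ruby
# pipelines/saver.rb, adjusted to honor the hypothetical `path:` option:
class Saver < Kimurai::Pipeline
  def process_item(item, options: {})
    # Use the per-spider path if one was passed, otherwise fall back
    # to a file named after the current spider:
    path = options[:path] || "db/#{spider.class.name}.json"
    save_to path, item, format: :json

    item
  end
end
```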
1994 | 
1995 | 
1996 | ### Runner
1997 | 
1998 | You can run project spiders one by one or in parallel using the `$ kimurai runner` command:
1999 | 
2000 | ```
2001 | $ bundle exec kimurai list
2002 | custom_spider
2003 | example_spider
2004 | github_spider
2005 | 
2006 | $ bundle exec kimurai runner -j 3
2007 | >>> Runner: started: {:id=>1533727423, :status=>:processing, :start_time=>2018-08-08 15:23:43 +0400, :stop_time=>nil, :environment=>"development", :concurrent_jobs=>3, :spiders=>["custom_spider", "github_spider", "example_spider"]}
2008 | > Runner: started spider: custom_spider, index: 0
2009 | > Runner: started spider: github_spider, index: 1
2010 | > Runner: started spider: example_spider, index: 2
2011 | < Runner: stopped spider: custom_spider, index: 0
2012 | < Runner: stopped spider: example_spider, index: 2
2013 | < Runner: stopped spider: github_spider, index: 1
2014 | <<< Runner: stopped: {:id=>1533727423, :status=>:completed, :start_time=>2018-08-08 15:23:43 +0400, :stop_time=>2018-08-08 15:25:11 +0400, :environment=>"development", :concurrent_jobs=>3, :spiders=>["custom_spider", "github_spider", "example_spider"]}
2015 | ```
2016 | 
2017 | Each spider runs in a separate process. Spider logs are available in the `log/` folder. Pass the `-j` option to specify how many spiders should be processed at the same time (default is 1).
2018 | 
2019 | You can provide additional arguments like `--include` or `--exclude` to specify which spiders to run:
2020 | 
2021 | ```bash
2022 | # Run only custom_spider and example_spider:
2023 | $ bundle exec kimurai runner --include custom_spider example_spider
2024 | 
2025 | # Run all except github_spider:
2026 | $ bundle exec kimurai runner --exclude github_spider
2027 | ```
2028 | 
2029 | #### Runner callbacks
2030 | 
2031 | You can perform custom actions before the runner starts and after the runner stops using `config.runner_at_start_callback` and `config.runner_at_stop_callback`. Check [config/application.rb](lib/kimurai/template/config/application.rb) to see an example.
2032 | 
2033 | 
2034 | ## Chat Support and Feedback
2035 | Will be updated
2036 | 
2037 | ## License
2038 | The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
2039 | 
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | require "bundler/gem_tasks"
2 | require "rake/testtask"
3 | 
4 | Rake::TestTask.new(:test) do |t|
5 |   t.libs << "test"
6 |   t.libs << "lib"
7 |   t.test_files = FileList["test/**/*_test.rb"]
8 | end
9 | 
10 | task :default => :test
--------------------------------------------------------------------------------
/bin/console:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | 
3 | require "bundler/setup"
4 | require "kimurai"
5 | 
6 | # You can add fixtures and/or initialization code here to make experimenting
7 | # with your gem easier. You can also use a different console, if you like.
8 | 
9 | # (If you use this, don't forget to add pry to your Gemfile!)
10 | # require "pry" 11 | # Pry.start 12 | 13 | require "irb" 14 | IRB.start(__FILE__) 15 | -------------------------------------------------------------------------------- /bin/setup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | set -vx 5 | 6 | bundle install 7 | 8 | # Do any other automated setup that you need to do here 9 | -------------------------------------------------------------------------------- /exe/kimurai: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'kimurai' 4 | require 'kimurai/cli' 5 | 6 | Kimurai::CLI.start(ARGV) 7 | -------------------------------------------------------------------------------- /kimurai.gemspec: -------------------------------------------------------------------------------- 1 | 2 | lib = File.expand_path("../lib", __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require "kimurai/version" 5 | 6 | Gem::Specification.new do |spec| 7 | spec.name = "kimurai" 8 | spec.version = Kimurai::VERSION 9 | spec.authors = ["Victor Afanasev"] 10 | spec.email = ["vicfreefly@gmail.com"] 11 | 12 | spec.summary = "Modern web scraping framework written in Ruby and based on Capybara/Nokogiri" 13 | spec.homepage = "https://github.com/vifreefly/kimuraframework" 14 | spec.license = "MIT" 15 | 16 | # Specify which files should be added to the gem when it is released. 17 | # The `git ls-files -z` loads the files in the RubyGem that have been added into git. 18 | spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do 19 | `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) } 20 | end 21 | spec.bindir = "exe" 22 | spec.executables = "kimurai" 23 | spec.require_paths = ["lib"] 24 | spec.required_ruby_version = ">= 2.5.0" 25 | 26 | spec.add_dependency "thor" 27 | spec.add_dependency "cliver" 28 | spec.add_dependency "activesupport" 29 | spec.add_dependency "murmurhash3" 30 | spec.add_dependency "nokogiri" 31 | 32 | spec.add_dependency "capybara", ">= 2.15", "< 4.0" 33 | spec.add_dependency "capybara-mechanize" 34 | spec.add_dependency "poltergeist" 35 | spec.add_dependency "selenium-webdriver" 36 | 37 | spec.add_dependency "headless" 38 | spec.add_dependency "pmap" 39 | 40 | spec.add_dependency "whenever" 41 | 42 | spec.add_dependency "rbcat", "~> 0.2" 43 | spec.add_dependency "pry" 44 | 45 | spec.add_development_dependency "bundler", "~> 1.16" 46 | spec.add_development_dependency "rake", "~> 10.0" 47 | spec.add_development_dependency "minitest", "~> 5.0" 48 | end 49 | -------------------------------------------------------------------------------- /lib/kimurai.rb: -------------------------------------------------------------------------------- 1 | require 'ostruct' 2 | require 'logger' 3 | require 'json' 4 | require 'active_support' 5 | require 'active_support/core_ext' 6 | require 'rbcat' 7 | 8 | require_relative 'kimurai/version' 9 | 10 | require_relative 'kimurai/core_ext/numeric' 11 | require_relative 'kimurai/core_ext/string' 12 | require_relative 'kimurai/core_ext/array' 13 | require_relative 'kimurai/core_ext/hash' 14 | 15 | require_relative 'kimurai/browser_builder' 16 | require_relative 'kimurai/base_helper' 17 | require_relative 'kimurai/pipeline' 18 | require_relative 'kimurai/base' 19 | 20 | module Kimurai 21 | class << self 22 | def configuration 23 | @configuration ||= OpenStruct.new 24 | end 25 | 26 | def configure 27 | 
yield(configuration) 28 | end 29 | 30 | def env 31 | ENV.fetch("KIMURAI_ENV") { "development" } 32 | end 33 | 34 | def time_zone 35 | ENV["TZ"] 36 | end 37 | 38 | def time_zone=(value) 39 | ENV.store("TZ", value) 40 | end 41 | 42 | def list 43 | Base.descendants.map do |klass| 44 | next unless klass.name 45 | [klass.name, klass] 46 | end.compact.to_h 47 | end 48 | 49 | def find_by_name(name) 50 | return unless name 51 | Base.descendants.find { |klass| klass.name == name } 52 | end 53 | end 54 | end 55 | -------------------------------------------------------------------------------- /lib/kimurai/automation/deploy.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | vars: 4 | rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv 5 | rbenv_shims_path: "{{ rbenv_root_path }}/shims" 6 | repo_url: 7 | repo_name: 8 | repo_key_path: 9 | 10 | tasks: 11 | - name: Copy custom git ssh key to /tmp/private_key (if provided) 12 | when: repo_key_path is not none 13 | copy: 14 | src: "{{ repo_key_path }}" 15 | dest: /tmp/private_key 16 | mode: 0600 17 | 18 | - name: Clone/pull project repo to ~/{{ repo_name }} user directory (using ssh-agent forwarding or https) 19 | when: repo_key_path is none 20 | git: 21 | repo: "{{ repo_url }}" 22 | dest: "~/{{ repo_name }}" 23 | force: true 24 | accept_hostkey: true 25 | 26 | - name: Clone/pull project repo to ~/{{ repo_name }} user directory (using custom git ssh key) 27 | when: repo_key_path is not none 28 | git: 29 | repo: "{{ repo_url }}" 30 | dest: "~/{{ repo_name }}" 31 | force: true 32 | accept_hostkey: true 33 | key_file: /tmp/private_key 34 | 35 | - name: Delete custom git ssh key from /tmp/private_key (if provided) 36 | when: repo_key_path is not none 37 | file: 38 | state: absent 39 | path: /tmp/private_key 40 | 41 | - name: Run bundle install 42 | command: bundle install 43 | args: 44 | chdir: ~/{{ repo_name }} 45 | environment: 46 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}" 47 | 48 | - name: Run whenever to update crontab 49 | command: whenever --update-crontab 50 | args: 51 | chdir: ~/{{ repo_name }} 52 | environment: 53 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}" 54 | 55 | -------------------------------------------------------------------------------- /lib/kimurai/automation/setup.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | vars: 4 | ruby: 2.5.3 5 | rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv 6 | rbenv_shims_path: "{{ rbenv_root_path }}/shims" 7 | ruby_versions_path: "{{ rbenv_root_path }}/versions" 8 | # check latest here http://phantomjs.org/download.html 9 | phantomjs: 2.1.1 10 | # check latest here https://github.com/mozilla/geckodriver/releases/ 11 | geckodriver: 0.23.0 12 | # check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads 13 | chromedriver: 2.44 14 | 15 | tasks: 16 | - name: Update apt cache 17 | become: true 18 | apt: update_cache=yes cache_valid_time=86400 19 | 20 | - name: Install base packages 21 | become: true 22 | apt: 23 | pkg: "{{ item }}" 24 | state: present 25 | with_items: 26 | - git 27 | - xvfb 28 | - libsqlite3-dev 29 | - sqlite3 30 | - mongodb-clients 31 | - mysql-client 32 | - libmysqlclient-dev 33 | - postgresql-client 34 | - libpq-dev 35 | 36 | - import_tasks: setup/ruby_environment.yml 37 | 38 | - import_tasks: setup/phantomjs.yml 39 | become: true 40 | 41 | - import_tasks: 
setup/firefox_geckodriver.yml 42 | become: true 43 | 44 | - import_tasks: setup/chromium_chromedriver.yml 45 | become: true 46 | -------------------------------------------------------------------------------- /lib/kimurai/automation/setup/chromium_chromedriver.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install chromium browser 3 | apt: 4 | pkg: chromium-browser 5 | state: present 6 | 7 | - name: Get current chromedriver version 8 | shell: chromedriver --version 9 | args: 10 | executable: /bin/bash 11 | register: current_chromedriver_version 12 | changed_when: false 13 | ignore_errors: true 14 | 15 | - name: Install unzip tool to unarchive chromedriver archive 16 | apt: 17 | pkg: unzip 18 | state: present 19 | 20 | - name: Download chromedriver binary archive and unarchive it to /usr/local/bin 21 | unarchive: 22 | src: https://chromedriver.storage.googleapis.com/{{ chromedriver }}/chromedriver_linux64.zip 23 | dest: /usr/local/bin 24 | remote_src: true 25 | mode: a+x 26 | when: chromedriver not in current_chromedriver_version.stdout # substring match against the full version output, like the geckodriver/phantomjs checks (stdout_lines would require an exact line match and re-download every run) 27 | -------------------------------------------------------------------------------- /lib/kimurai/automation/setup/firefox_geckodriver.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install firefox 3 | apt: 4 | pkg: firefox 5 | state: present 6 | 7 | - name: Get current geckodriver version 8 | shell: geckodriver --version 9 | args: 10 | executable: /bin/bash 11 | register: current_geckodriver_version 12 | changed_when: false 13 | ignore_errors: true 14 | 15 | - name: Download geckodriver binary archive and unarchive it to /usr/local/bin 16 | unarchive: 17 | src: https://github.com/mozilla/geckodriver/releases/download/v{{ geckodriver }}/geckodriver-v{{ geckodriver }}-linux64.tar.gz 18 | dest: /usr/local/bin 19 | remote_src: true 20 | when: geckodriver not in current_geckodriver_version.stdout 21 | -------------------------------------------------------------------------------- /lib/kimurai/automation/setup/phantomjs.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install dependencies for PhantomJS 3 | apt: 4 | pkg: "{{ item }}" 5 | state: present 6 | with_items: 7 | - chrpath 8 | - libxft-dev 9 | - libfreetype6 10 | - libfreetype6-dev 11 | - libfontconfig1 12 | - libfontconfig1-dev 13 | 14 | - name: Get current phantomjs version 15 | shell: phantomjs -v 16 | args: 17 | executable: /bin/bash 18 | register: current_phantomjs_version 19 | changed_when: false 20 | ignore_errors: true 21 | 22 | - name: Download PhantomJS archive and unarchive it to /usr/local/lib 23 | unarchive: 24 | src: https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-{{ phantomjs }}-linux-x86_64.tar.bz2 25 | dest: /usr/local/lib 26 | remote_src: true 27 | when: phantomjs not in current_phantomjs_version.stdout 28 | 29 | - name: Link PhantomJS binary to /usr/local/bin/phantomjs 30 | file: 31 | src: /usr/local/lib/phantomjs-{{ phantomjs }}-linux-x86_64/bin/phantomjs 32 | dest: /usr/local/bin/phantomjs 33 | state: link 34 | -------------------------------------------------------------------------------- /lib/kimurai/automation/setup/ruby_environment.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Install dependencies for ruby-build 3 | become: true 4 | apt: 5 | pkg: "{{ item }}" 6 | state: present 7 | with_items: 8 | - zlib1g-dev 9 | -
build-essential 10 | - libssl-dev 11 | - libreadline-dev 12 | - libreadline6-dev 13 | - libyaml-dev 14 | - libxml2-dev 15 | - libxslt1-dev 16 | - libcurl4-openssl-dev 17 | - libffi-dev 18 | 19 | - name: Clone Rbenv repository to the {{ ansible_user_id }} user directory 20 | git: 21 | repo: https://github.com/sstephenson/rbenv.git 22 | dest: "{{ rbenv_root_path }}" 23 | 24 | - name: Clone ruby-build repo to the {{ ansible_user_id }} user directory 25 | git: 26 | repo: https://github.com/sstephenson/ruby-build.git 27 | dest: "{{ rbenv_root_path }}/plugins/ruby-build" 28 | 29 | - name: Add Rbenv path to the .bashrc 30 | lineinfile: 31 | dest: ~/.bashrc 32 | regexp: '^export PATH="\$HOME\/\.rbenv' 33 | line: export PATH="$HOME/.rbenv/bin:$PATH" 34 | state: present 35 | 36 | - name: Add Rbenv init to the .bashrc 37 | lineinfile: 38 | dest: ~/.bashrc 39 | regexp: '^eval "\$\(rbenv' 40 | line: eval "$(rbenv init -)" 41 | state: present 42 | 43 | - name: Check if desired Ruby version already installed 44 | stat: 45 | path: "{{ ruby_versions_path }}/{{ ruby }}" 46 | register: ruby_present 47 | 48 | - name: Install desired Ruby version using ruby-build (this can take a while) 49 | command: rbenv install {{ ruby }} 50 | when: not ruby_present.stat.exists 51 | environment: 52 | CONFIGURE_OPTS: "--disable-install-doc" 53 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}" 54 | 55 | - name: Get current Ruby version 56 | command: "ruby -v" 57 | register: current_ruby_version 58 | changed_when: false 59 | ignore_errors: true 60 | environment: 61 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}" 62 | 63 | - name: Set desired Ruby version as a global version 64 | command: "rbenv global {{ ruby }}" 65 | when: ruby not in current_ruby_version.stdout 66 | environment: 67 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}" 68 | register: set_ruby 69 | 70 | - name: Execute `rbenv rehash` command 71 | command: rbenv rehash 72 | when: set_ruby.changed 73 | environment: 74 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}" 75 | 76 | - name: Create ~/.gemrc file to skip docs 77 | copy: 78 | dest: ~/.gemrc 79 | content: "gem: --no-ri --no-rdoc" 80 | 81 | - name: Create ~/.bundle directory 82 | file: 83 | dest: ~/.bundle 84 | state: directory 85 | 86 | - name: Create ~/.bundle/config file with default settings `BUNDLE_GIT__ALLOW_INSECURE true` and `BUNDLE_JOBS 4` 87 | copy: 88 | dest: ~/.bundle/config 89 | content: | 90 | BUNDLE_GIT__ALLOW_INSECURE: "true" 91 | BUNDLE_JOBS: "4" 92 | 93 | - name: Check if Bundler gem installed 94 | stat: 95 | path: "{{ ruby_versions_path }}/{{ ruby }}/bin/bundler" 96 | register: bundler_gem_present 97 | 98 | - name: Install Bundler gem 99 | command: gem install bundler 100 | when: not bundler_gem_present.stat.exists 101 | environment: 102 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}" 103 | 104 | - name: Check if Whenever gem installed 105 | stat: 106 | path: "{{ ruby_versions_path }}/{{ ruby }}/bin/whenever" 107 | register: whenever_gem_present 108 | 109 | - name: Install Whenever gem 110 | command: gem install whenever 111 | when: not whenever_gem_present.stat.exists 112 | environment: 113 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}" 114 | 115 | - name: Check if Kimurai gem installed 116 | stat: 117 | path: "{{ ruby_versions_path }}/{{ ruby }}/bin/kimurai" 118 | 
register: kimurai_gem_present 119 | 120 | - name: Install Kimurai gem 121 | command: gem install kimurai 122 | when: not kimurai_gem_present.stat.exists 123 | environment: 124 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}" 125 | -------------------------------------------------------------------------------- /lib/kimurai/base.rb: -------------------------------------------------------------------------------- 1 | require_relative 'base/saver' 2 | require_relative 'base/storage' 3 | 4 | module Kimurai 5 | class Base 6 | class InvalidUrlError < StandardError; end 7 | 8 | # don't deep merge config's headers hash option 9 | DMERGE_EXCLUDE = [:headers] 10 | 11 | LoggerFormatter = proc do |severity, datetime, progname, msg| 12 | current_thread_id = Thread.current.object_id 13 | thread_type = Thread.main == Thread.current ? "M" : "C" 14 | output = "%s, [%s#%d] [%s: %s] %5s -- %s: %s\n" 15 | .freeze % [severity[0..0], datetime, $$, thread_type, current_thread_id, severity, progname, msg] 16 | 17 | if Kimurai.configuration.colorize_logger != false && Kimurai.env == "development" 18 | Rbcat.colorize(output, predefined: [:jsonhash, :logger]) 19 | else 20 | output 21 | end 22 | end 23 | 24 | include BaseHelper 25 | 26 | ### 27 | 28 | class << self 29 | attr_reader :run_info, :savers, :storage 30 | end 31 | 32 | def self.running? 33 | @run_info && @run_info[:status] == :running 34 | end 35 | 36 | def self.completed? 37 | @run_info && @run_info[:status] == :completed 38 | end 39 | 40 | def self.failed? 41 | @run_info && @run_info[:status] == :failed 42 | end 43 | 44 | def self.visits 45 | @run_info && @run_info[:visits] 46 | end 47 | 48 | def self.items 49 | @run_info && @run_info[:items] 50 | end 51 | 52 | def self.update(type, subtype) 53 | return unless @run_info 54 | @update_mutex.synchronize { @run_info[type][subtype] += 1 } 55 | end 56 | 57 | def self.add_event(scope, event) 58 | return unless @run_info 59 | @update_mutex.synchronize { @run_info[:events][scope][event] += 1 } 60 | end 61 | 62 | ### 63 | 64 | @engine = :mechanize 65 | @pipelines = [] 66 | @config = {} 67 | 68 | def self.name 69 | @name 70 | end 71 | 72 | def self.engine 73 | @engine ||= superclass.engine 74 | end 75 | 76 | def self.pipelines 77 | @pipelines ||= superclass.pipelines 78 | end 79 | 80 | def self.start_urls 81 | @start_urls 82 | end 83 | 84 | def self.config 85 | if superclass.equal?(::Object) 86 | @config 87 | else 88 | superclass.config.deep_merge_excl(@config || {}, DMERGE_EXCLUDE) 89 | end 90 | end 91 | 92 | ### 93 | 94 | def self.logger 95 | @logger ||= Kimurai.configuration.logger || begin 96 | log_level = (ENV["LOG_LEVEL"] || Kimurai.configuration.log_level || "DEBUG").to_s.upcase 97 | log_level = "Logger::#{log_level}".constantize 98 | Logger.new(STDOUT, formatter: LoggerFormatter, level: log_level, progname: name) 99 | end 100 | end 101 | 102 | def self.crawl!(exception_on_fail: true) 103 | logger.error "Spider: already running: #{name}" and return false if running? 
104 | 105 | @storage = Storage.new 106 | @savers = {} 107 | @update_mutex = Mutex.new 108 | 109 | @run_info = { 110 | spider_name: name, status: :running, error: nil, environment: Kimurai.env, 111 | start_time: Time.new, stop_time: nil, running_time: nil, 112 | visits: { requests: 0, responses: 0 }, items: { sent: 0, processed: 0 }, 113 | events: { requests_errors: Hash.new(0), drop_items_errors: Hash.new(0), custom: Hash.new(0) } 114 | } 115 | 116 | ### 117 | 118 | logger.info "Spider: started: #{name}" 119 | open_spider if self.respond_to? :open_spider 120 | 121 | spider = self.new 122 | spider.with_info = true 123 | if start_urls 124 | start_urls.each do |start_url| 125 | if start_url.class == Hash 126 | spider.request_to(:parse, start_url) 127 | else 128 | spider.request_to(:parse, url: start_url) 129 | end 130 | end 131 | else 132 | spider.parse 133 | end 134 | rescue StandardError, SignalException, SystemExit => e 135 | @run_info.merge!(status: :failed, error: e.inspect) 136 | exception_on_fail ? raise(e) : [@run_info, e] 137 | else 138 | @run_info.merge!(status: :completed) 139 | ensure 140 | if spider 141 | spider.browser.destroy_driver! if spider.instance_variable_get("@browser") 142 | 143 | stop_time = Time.now 144 | total_time = (stop_time - @run_info[:start_time]).round(3) 145 | @run_info.merge!(stop_time: stop_time, running_time: total_time) 146 | 147 | close_spider if self.respond_to? :close_spider 148 | 149 | message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}" 150 | failed? ? logger.fatal(message) : logger.info(message) 151 | 152 | @run_info, @storage, @savers, @update_mutex = nil 153 | end 154 | end 155 | 156 | def self.parse!(handler, *args, **request) 157 | spider = self.new 158 | 159 | if args.present? 160 | spider.public_send(handler, *args) 161 | elsif request.present? 162 | spider.request_to(handler, request) 163 | else 164 | spider.public_send(handler) 165 | end 166 | ensure 167 | spider.browser.destroy_driver! if spider.instance_variable_get("@browser") 168 | end 169 | 170 | ### 171 | 172 | attr_reader :logger 173 | attr_accessor :with_info 174 | 175 | def initialize(engine = self.class.engine, config: {}) 176 | @engine = engine || self.class.engine 177 | @config = self.class.config.deep_merge_excl(config, DMERGE_EXCLUDE) 178 | @pipelines = self.class.pipelines.map do |pipeline_name| 179 | klass = Pipeline.descendants.find { |kl| kl.name == pipeline_name } 180 | instance = klass.new 181 | instance.spider = self 182 | [pipeline_name, instance] 183 | end.to_h 184 | 185 | @logger = self.class.logger 186 | @savers = {} 187 | end 188 | 189 | def browser 190 | @browser ||= BrowserBuilder.build(@engine, @config, spider: self) 191 | end 192 |
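The request flow implemented by `request_to` below is easiest to see from a spider's point of view. A minimal sketch (the spider class, selectors and urls are hypothetical; `@name`, `@engine`, `@start_urls`, `request_to`, `send_item` and the handler signature all come from this file and BaseHelper):

```ruby
# Hypothetical spider illustrating the Base API above (not part of the gem)
class ProductsSpider < Kimurai::Base
  @name = "products_spider"
  @engine = :mechanize
  @start_urls = ["https://example.com/products"]

  def parse(response, url:, data: {})
    response.xpath("//a[@class='product']/@href").each do |href|
      # schedules a visit; :parse_product receives the parsed product page
      request_to(:parse_product, url: absolute_url(href.value, base: url))
    end
  end

  def parse_product(response, url:, data: {})
    send_item(title: response.xpath("//h1").text)
  end
end

ProductsSpider.crawl!
```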
193 | def request_to(handler, delay = nil, url:, data: {}, response_type: :html) 194 | raise InvalidUrlError, "Requested url is invalid: #{url}" unless URI.parse(url).kind_of?(URI::HTTP) 195 | 196 | if @config[:skip_duplicate_requests] && !unique_request?(url) 197 | add_event(:duplicate_requests) if self.with_info 198 | logger.warn "Spider: request_to: not unique url: #{url}, skipped" and return 199 | end 200 | 201 | visited = delay ? browser.visit(url, delay: delay) : browser.visit(url) 202 | return unless visited 203 | 204 | public_send(handler, browser.current_response(response_type), { url: url, data: data }) 205 | end 206 | 207 | def console(response = nil, url: nil, data: {}) 208 | binding.pry 209 | end 210 | 211 | ### 212 | 213 | def storage 214 | # Note: `.crawl!` uses a shared, thread-safe Storage instance; 215 | # otherwise, each spider instance has its own Storage 216 | @storage ||= self.with_info ? self.class.storage : Storage.new 217 | end 218 | 219 | def unique?(scope, value) 220 | storage.unique?(scope, value) 221 | end 222 | 223 | def save_to(path, item, format:, position: true, append: false) 224 | @savers[path] ||= begin 225 | options = { format: format, position: position, append: append } 226 | if self.with_info 227 | self.class.savers[path] ||= Saver.new(path, options) 228 | else 229 | Saver.new(path, options) 230 | end 231 | end 232 | 233 | @savers[path].save(item) 234 | end 235 | 236 | ### 237 | 238 | def add_event(scope = :custom, event) 239 | if self.with_info 240 | self.class.add_event(scope, event) 241 | end 242 | 243 | logger.info "Spider: new event (scope: #{scope}): #{event}" if scope == :custom 244 | end 245 | 246 | ### 247 | 248 | private 249 | 250 | def create_browser(engine, config = {}) 251 | Kimurai::BrowserBuilder.build(engine, config, spider: self) 252 | end 253 | 254 | def unique_request?(url) 255 | options = @config[:skip_duplicate_requests] 256 | if options.class == Hash 257 | scope = options[:scope] || :requests_urls 258 | if options[:check_only] 259 | storage.include?(scope, url) ? false : true 260 | else 261 | storage.unique?(scope, url) ? true : false 262 | end 263 | else 264 | storage.unique?(:requests_urls, url) ? true : false 265 | end 266 | end 267 | 268 | def send_item(item, options = {}) 269 | logger.debug "Pipeline: starting processing item through #{@pipelines.size} #{'pipeline'.pluralize(@pipelines.size)}..." 270 | self.class.update(:items, :sent) if self.with_info 271 | 272 | @pipelines.each do |name, instance| 273 | item = options[name] ? instance.process_item(item, options: options[name]) : instance.process_item(item) 274 | end 275 | rescue => e 276 | logger.error "Pipeline: dropped: #{e.inspect} (#{e.backtrace.first}), item: #{item}" 277 | add_event(:drop_items_errors, e.inspect) if self.with_info 278 | false 279 | else 280 | self.class.update(:items, :processed) if self.with_info 281 | logger.info "Pipeline: processed: #{JSON.generate(item)}" 282 | true 283 | ensure 284 | if self.with_info 285 | logger.info "Info: items: sent: #{self.class.items[:sent]}, processed: #{self.class.items[:processed]}" 286 | end 287 | end 288 |
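`in_parallel` below fans the given urls out across several spider instances, each with its own browser, and joins the threads at the end. A usage sketch from inside a handler (selectors and the handler name are hypothetical):

```ruby
# Inside a hypothetical spider: crawl category pages with 3 browser threads
def parse(response, url:, data: {})
  urls = response.xpath("//a[@class='category']/@href").map(&:value)
  # :parse_category is called as the handler for every visited url;
  # `data` is passed through to each request
  in_parallel(:parse_category, urls, threads: 3, data: { source: url })
end
```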
289 | def in_parallel(handler, urls, threads:, data: {}, delay: nil, engine: @engine, config: {}) 290 | parts = urls.in_sorted_groups(threads, false) 291 | urls_count = urls.size 292 | 293 | all = [] 294 | start_time = Time.now 295 | logger.info "Spider: in_parallel: starting processing #{urls_count} urls within #{threads} threads" 296 | 297 | parts.each do |part| 298 | all << Thread.new(part) do |part| 299 | Thread.current.abort_on_exception = true 300 | 301 | spider = self.class.new(engine, config: @config.deep_merge_excl(config, DMERGE_EXCLUDE)) 302 | spider.with_info = true if self.with_info 303 | 304 | part.each do |url_data| 305 | if url_data.class == Hash 306 | if url_data[:url].present? && url_data[:data].present? 307 | spider.request_to(handler, delay, url_data) 308 | else 309 | spider.public_send(handler, url_data) 310 | end 311 | else 312 | spider.request_to(handler, delay, url: url_data, data: data) 313 | end 314 | end 315 | ensure 316 | spider.browser.destroy_driver! if spider.instance_variable_get("@browser") 317 | end 318 | 319 | sleep 0.5 320 | end 321 | 322 | all.each(&:join) 323 | logger.info "Spider: in_parallel: stopped processing #{urls_count} urls within #{threads} threads, total time: #{(Time.now - start_time).duration}" 324 | end 325 | end 326 | end 327 | -------------------------------------------------------------------------------- /lib/kimurai/base/saver.rb: -------------------------------------------------------------------------------- 1 | require 'json' 2 | require 'csv' 3 | 4 | module Kimurai 5 | class Base 6 | class Saver 7 | attr_reader :format, :path, :position, :append 8 | 9 | def initialize(path, format:, position: true, append: false) 10 | unless %i(json pretty_json jsonlines csv).include?(format) 11 | raise "Saver: wrong type of format: #{format}" 12 | end 13 | 14 | @path = path 15 | @format = format 16 | @position = position 17 | @index = 0 18 | @append = append 19 | @mutex = Mutex.new 20 | end 21 | 22 | def save(item) 23 | @mutex.synchronize do 24 | @index += 1 25 | item[:position] = @index if position 26 | 27 | case format 28 | when :json 29 | save_to_json(item) 30 | when :pretty_json 31 | save_to_pretty_json(item) 32 | when :jsonlines 33 | save_to_jsonlines(item) 34 | when :csv 35 | save_to_csv(item) 36 | end 37 | end 38 | end 39 | 40 | private 41 | 42 | def save_to_json(item) 43 | data = JSON.generate([item]) 44 | 45 | if @index > 1 || (append && File.exist?(path)) 46 | file_content = File.read(path).sub(/\}\]\Z/, "\}\,") 47 | File.open(path, "w") do |f| 48 | f.write(file_content + data.sub(/\A\[/, "")) 49 | end 50 | else 51 | File.open(path, "w") { |f| f.write(data) } 52 | end 53 | end 54 | 55 | def save_to_pretty_json(item) 56 | data = JSON.pretty_generate([item]) 57 | 58 | if @index > 1 || (append && File.exist?(path)) 59 | file_content = File.read(path).sub(/\}\n\]\Z/, "\}\,\n") 60 | File.open(path, "w") do |f| 61 | f.write(file_content + data.sub(/\A\[\n/, "")) 62 | end 63 | else 64 | File.open(path, "w") { |f| f.write(data) } 65 | end 66 | end 67 | 68 | def save_to_jsonlines(item) 69 | data = JSON.generate(item) 70 | 71 | if @index > 1 || (append && File.exist?(path)) 72 | File.open(path, "a") { |file| file.write("\n" + data) } 73 | else 74 | File.open(path, "w") { |file| file.write(data) } 75 | end 76 | end 77 | 78 | def save_to_csv(item) 79 | data = flatten_hash(item) 80 | 81 | if @index > 1 || (append && File.exist?(path)) 82 | CSV.open(path, "a+", force_quotes: true) do |csv| 83 | csv << data.values 84 | end 85 | else 86 | CSV.open(path, "w", force_quotes: true) do |csv| 87 | csv << data.keys 88 | csv << data.values 89 | end 90 | end 91 | end 92 | 93 | def flatten_hash(hash) 94 | hash.each_with_object({}) do |(k, v), h| 95 | if v.is_a?
Hash 96 | flatten_hash(v).map { |h_k, h_v| h["#{k}.#{h_k}"] = h_v } 97 | else 98 | h[k&.to_s] = v 99 | end 100 | end 101 | end 102 | end 103 | end 104 | end 105 | 106 | 107 | -------------------------------------------------------------------------------- /lib/kimurai/base/storage.rb: -------------------------------------------------------------------------------- 1 | module Kimurai 2 | class Base 3 | class Storage 4 | attr_reader :database 5 | 6 | def initialize 7 | @mutex = Mutex.new 8 | @database = {} 9 | end 10 | 11 | def all(scope = nil) 12 | @mutex.synchronize do 13 | scope ? database.fetch(scope, []) : database 14 | end 15 | end 16 | 17 | def include?(scope, value) 18 | @mutex.synchronize do 19 | database[scope] ||= [] 20 | database[scope].include?(value) 21 | end 22 | end 23 | 24 | def add(scope, value) 25 | @mutex.synchronize do 26 | database[scope] ||= [] 27 | if value.kind_of?(Array) 28 | database[scope] += value 29 | database[scope].uniq! 30 | else 31 | database[scope].push(value) unless database[scope].include?(value) 32 | end 33 | end 34 | end 35 | 36 | ### 37 | 38 | def unique?(scope, value) 39 | @mutex.synchronize do 40 | database[scope] ||= [] 41 | database[scope].include?(value) ? false : database[scope].push(value) and true # returns false if the value was already seen, otherwise stores it and returns true 42 | end 43 | end 44 | 45 | ### 46 | 47 | def clear! 48 | @mutex.synchronize do 49 | @database = {} 50 | end 51 | end 52 | end 53 | end 54 | end 55 | -------------------------------------------------------------------------------- /lib/kimurai/base_helper.rb: -------------------------------------------------------------------------------- 1 | module Kimurai 2 | module BaseHelper 3 | private 4 | 5 | def absolute_url(url, base:) 6 | return unless url 7 | URI.join(base, URI.escape(url)).to_s 8 | end 9 | 10 | def escape_url(url) 11 | uri = URI.parse(url) 12 | rescue URI::InvalidURIError => e 13 | URI.parse(URI.escape url).to_s rescue url 14 | else 15 | url 16 | end 17 | 18 | def normalize_url(url, base:) 19 | escape_url(absolute_url(url, base: base)) 20 | end 21 | end 22 | end 23 |
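How the three helpers above compose, sketched at a console (the relative href is a made-up example):

```ruby
include Kimurai::BaseHelper

# absolute_url resolves a (possibly relative) href against the current page;
# escape_url percent-escapes the result only if it isn't already a valid URI
normalize_url("/catalog?page=2", base: "https://example.com/shop")
# => "https://example.com/catalog?page=2"
```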
-------------------------------------------------------------------------------- /lib/kimurai/browser_builder.rb: -------------------------------------------------------------------------------- 1 | module Kimurai 2 | module BrowserBuilder 3 | def self.build(engine, config = {}, spider:) 4 | if config[:browser].present? 5 | raise "++++++ BrowserBuilder: browser option is deprecated. Now all sub-options inside " \ 6 | "`browser` should be placed right into `@config` hash, without `browser` parent key.\n" \ 7 | "See more here: https://github.com/vifreefly/kimuraframework/blob/master/CHANGELOG.md#breaking-changes-110 ++++++" 8 | end 9 | 10 | begin 11 | require "kimurai/browser_builder/#{engine}_builder" 12 | rescue LoadError => e 13 | end 14 | 15 | builder_class_name = "#{engine}_builder".classify 16 | builder = "Kimurai::BrowserBuilder::#{builder_class_name}".constantize 17 | builder.new(config, spider: spider).build 18 | end 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /lib/kimurai/browser_builder/mechanize_builder.rb: -------------------------------------------------------------------------------- 1 | require 'capybara' 2 | require 'capybara/mechanize' 3 | require_relative '../capybara_configuration' 4 | require_relative '../capybara_ext/mechanize/driver' 5 | require_relative '../capybara_ext/session' 6 | 7 | module Kimurai::BrowserBuilder 8 | class MechanizeBuilder 9 | attr_reader :logger, :spider 10 | 11 | def initialize(config, spider:) 12 | @config = config 13 | @spider = spider 14 | @logger = spider.logger 15 | end 16 | 17 | def build 18 | # Register driver 19 | Capybara.register_driver :mechanize do |app| 20 | driver = Capybara::Mechanize::Driver.new("app") 21 | # keep the history as small as possible (by default it's unlimited) 22 | driver.configure { |a| a.history.max_size = 2 } 23 | driver 24 | end 25 | 26 | # Create browser instance (Capybara session) 27 | @browser = Capybara::Session.new(:mechanize) 28 | @browser.spider = spider 29 | logger.debug "BrowserBuilder (mechanize): created browser instance" 30 | 31 | if @config[:extensions].present? 32 | logger.error "BrowserBuilder (mechanize): `extensions` option not supported, skipped" 33 | end 34 | 35 | # Proxy 36 | if proxy = @config[:proxy].presence 37 | proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip 38 | ip, port, type = proxy_string.split(":") 39 | 40 | if type == "http" 41 | @browser.driver.set_proxy(*proxy_string.split(":")) 42 | logger.debug "BrowserBuilder (mechanize): enabled http proxy, ip: #{ip}, port: #{port}" 43 | else 44 | logger.error "BrowserBuilder (mechanize): can't set #{type} proxy (not supported), skipped" 45 | end 46 | end 47 | 48 | # SSL 49 | if ssl_cert_path = @config[:ssl_cert_path].presence 50 | @browser.driver.browser.agent.http.ca_file = ssl_cert_path 51 | logger.debug "BrowserBuilder (mechanize): enabled custom ssl_cert" 52 | end 53 | 54 | if @config[:ignore_ssl_errors].present? 55 | @browser.driver.browser.agent.verify_mode = OpenSSL::SSL::VERIFY_NONE 56 | logger.debug "BrowserBuilder (mechanize): enabled ignore_ssl_errors" 57 | end 58 | 59 | # Headers 60 | if headers = @config[:headers].presence 61 | @browser.driver.headers = headers 62 | logger.debug "BrowserBuilder (mechanize): enabled custom headers" 63 | end 64 | 65 | if user_agent = @config[:user_agent].presence 66 | user_agent_string = (user_agent.class == Proc ?
user_agent.call : user_agent).strip 67 | 68 | @browser.driver.add_header("User-Agent", user_agent_string) 69 | logger.debug "BrowserBuilder (mechanize): enabled custom user_agent" 70 | end 71 | 72 | # Cookies 73 | if cookies = @config[:cookies].presence 74 | cookies.each do |cookie| 75 | @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie) 76 | end 77 | 78 | logger.debug "BrowserBuilder (mechanize): enabled custom cookies" 79 | end 80 | 81 | # Browser instance options 82 | # skip_request_errors 83 | if skip_errors = @config[:skip_request_errors].presence 84 | @browser.config.skip_request_errors = skip_errors 85 | logger.debug "BrowserBuilder (mechanize): enabled skip_request_errors" 86 | end 87 | 88 | # retry_request_errors 89 | if retry_errors = @config[:retry_request_errors].presence 90 | @browser.config.retry_request_errors = retry_errors 91 | logger.debug "BrowserBuilder (mechanize): enabled retry_request_errors" 92 | end 93 | 94 | # restart_if 95 | if @config[:restart_if].present? 96 | logger.warn "BrowserBuilder (mechanize): restart_if options not supported by Mechanize, skipped" 97 | end 98 | 99 | # before_request clear_cookies 100 | if @config.dig(:before_request, :clear_cookies) 101 | @browser.config.before_request[:clear_cookies] = true 102 | logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_cookies" 103 | end 104 | 105 | # before_request clear_and_set_cookies 106 | if @config.dig(:before_request, :clear_and_set_cookies) 107 | if cookies = @config[:cookies].presence 108 | @browser.config.cookies = cookies 109 | @browser.config.before_request[:clear_and_set_cookies] = true 110 | logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_and_set_cookies" 111 | else 112 | logger.error "BrowserBuilder (mechanize): cookies should be present to enable before_request.clear_and_set_cookies, skipped" 113 | end 114 | end 115 | 116 | # before_request change_user_agent 117 | if @config.dig(:before_request, :change_user_agent) 118 | if @config[:user_agent].present? && @config[:user_agent].class == Proc 119 | @browser.config.user_agent = @config[:user_agent] 120 | @browser.config.before_request[:change_user_agent] = true 121 | logger.debug "BrowserBuilder (mechanize): enabled before_request.change_user_agent" 122 | else 123 | logger.error "BrowserBuilder (mechanize): user_agent should be present and should be a Proc (lambda) to enable before_request.change_user_agent, skipped" 124 | end 125 | end 126 | 127 | # before_request change_proxy 128 | if @config.dig(:before_request, :change_proxy) 129 | if @config[:proxy].present?
&& @config[:proxy].class == Proc 130 | @browser.config.proxy = @config[:proxy] 131 | @browser.config.before_request[:change_proxy] = true 132 | logger.debug "BrowserBuilder (mechanize): enabled before_request.change_proxy" 133 | else 134 | logger.error "BrowserBuilder (mechanize): proxy should be present and should be a Proc (lambda) to enable before_request.change_proxy, skipped" 135 | end 136 | end 137 | 138 | # before_request delay 139 | if delay = @config.dig(:before_request, :delay).presence 140 | @browser.config.before_request[:delay] = delay 141 | logger.debug "BrowserBuilder (mechanize): enabled before_request.delay" 142 | end 143 | 144 | # encoding 145 | if encoding = @config[:encoding] 146 | @browser.config.encoding = encoding 147 | logger.debug "BrowserBuilder (mechanize): enabled encoding: #{encoding}" 148 | end 149 | 150 | # return Capybara session instance 151 | @browser 152 | end 153 | end 154 | end 155 | -------------------------------------------------------------------------------- /lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb: -------------------------------------------------------------------------------- 1 | require 'capybara' 2 | require 'capybara/poltergeist' 3 | require_relative '../capybara_configuration' 4 | require_relative '../capybara_ext/poltergeist/driver' 5 | require_relative '../capybara_ext/session' 6 | 7 | module Kimurai::BrowserBuilder 8 | class PoltergeistPhantomjsBuilder 9 | attr_reader :logger, :spider 10 | 11 | def initialize(config, spider:) 12 | @config = config 13 | @spider = spider 14 | @logger = spider.logger 15 | end 16 | 17 | def build 18 | # Register driver 19 | Capybara.register_driver :poltergeist_phantomjs do |app| 20 | # Create driver options 21 | driver_options = { 22 | js_errors: false, debug: false, inspector: false, phantomjs_options: [] 23 | } 24 | 25 | if extensions = @config[:extensions].presence 26 | driver_options[:extensions] = extensions 27 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled extensions" 28 | end 29 | 30 | # Window size 31 | if size = @config[:window_size].presence 32 | driver_options[:window_size] = size 33 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled window_size" 34 | end 35 | 36 | # SSL 37 | if ssl_cert_path = @config[:ssl_cert_path].presence 38 | driver_options[:phantomjs_options] << "--ssl-certificates-path=#{ssl_cert_path}" 39 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom ssl_cert" 40 | end 41 | 42 | if @config[:ignore_ssl_errors].present? 43 | driver_options[:phantomjs_options].push("--ignore-ssl-errors=yes", "--ssl-protocol=any") 44 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled ignore_ssl_errors" 45 | end 46 | 47 | # Disable images 48 | if @config[:disable_images].present? 49 | driver_options[:phantomjs_options] << "--load-images=no" 50 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled disable_images" 51 | end 52 | 53 | Capybara::Poltergeist::Driver.new(app, driver_options) 54 | end 55 | 56 | # Create browser instance (Capybara session) 57 | @browser = Capybara::Session.new(:poltergeist_phantomjs) 58 | @browser.spider = spider 59 | logger.debug "BrowserBuilder (poltergeist_phantomjs): created browser instance" 60 | 61 | # Proxy 62 | if proxy = @config[:proxy].presence 63 | proxy_string = (proxy.class == Proc ?
proxy.call : proxy).strip 64 | ip, port, type = proxy_string.split(":") 65 | 66 | if %w(http socks5).include?(type) 67 | @browser.driver.set_proxy(*proxy_string.split(":")) 68 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled #{type} proxy, ip: #{ip}, port: #{port}" 69 | else 70 | logger.error "BrowserBuilder (poltergeist_phantomjs): wrong type of proxy: #{type}, skipped" 71 | end 72 | end 73 | 74 | # Headers 75 | if headers = @config[:headers].presence 76 | @browser.driver.headers = headers 77 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom headers" 78 | end 79 | 80 | if user_agent = @config[:user_agent].presence 81 | user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip 82 | 83 | @browser.driver.add_header("User-Agent", user_agent_string) 84 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom user_agent" 85 | end 86 | 87 | # Cookies 88 | if cookies = @config[:cookies].presence 89 | cookies.each do |cookie| 90 | @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie) 91 | end 92 | 93 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom cookies" 94 | end 95 | 96 | # Browser instance options 97 | # skip_request_errors 98 | if skip_errors = @config[:skip_request_errors].presence 99 | @browser.config.skip_request_errors = skip_errors 100 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled skip_request_errors" 101 | end 102 | 103 | # retry_request_errors 104 | if retry_errors = @config[:retry_request_errors].presence 105 | @browser.config.retry_request_errors = retry_errors 106 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled retry_request_errors" 107 | end 108 | 109 | # restart_if 110 | if requests_limit = @config.dig(:restart_if, :requests_limit).presence 111 | @browser.config.restart_if[:requests_limit] = requests_limit 112 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.requests_limit >= #{requests_limit}" 113 | end 114 | 115 | if memory_limit = @config.dig(:restart_if, :memory_limit).presence 116 | @browser.config.restart_if[:memory_limit] = memory_limit 117 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.memory_limit >= #{memory_limit}" 118 | end 119 | 120 | # before_request clear_cookies 121 | if @config.dig(:before_request, :clear_cookies) 122 | @browser.config.before_request[:clear_cookies] = true 123 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_cookies" 124 | end 125 | 126 | # before_request clear_and_set_cookies 127 | if @config.dig(:before_request, :clear_and_set_cookies) 128 | if cookies = @config[:cookies].presence 129 | @browser.config.cookies = cookies 130 | @browser.config.before_request[:clear_and_set_cookies] = true 131 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_and_set_cookies" 132 | else 133 | logger.error "BrowserBuilder (poltergeist_phantomjs): cookies should be present to enable before_request.clear_and_set_cookies, skipped" 134 | end 135 | end 136 | 137 | # before_request change_user_agent 138 | if @config.dig(:before_request, :change_user_agent) 139 | if @config[:user_agent].present? 
&& @config[:user_agent].class == Proc 140 | @browser.config.user_agent = @config[:user_agent] 141 | @browser.config.before_request[:change_user_agent] = true 142 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_user_agent" 143 | else 144 | logger.error "BrowserBuilder (poltergeist_phantomjs): user_agent should be present and should be a Proc (lambda) to enable before_request.change_user_agent, skipped" 145 | end 146 | end 147 | 148 | # before_request change_proxy 149 | if @config.dig(:before_request, :change_proxy) 150 | if @config[:proxy].present? && @config[:proxy].class == Proc 151 | @browser.config.proxy = @config[:proxy] 152 | @browser.config.before_request[:change_proxy] = true 153 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_proxy" 154 | else 155 | logger.error "BrowserBuilder (poltergeist_phantomjs): proxy should be present and should be a Proc (lambda) to enable before_request.change_proxy, skipped" 156 | end 157 | end 158 | 159 | # before_request delay 160 | if delay = @config.dig(:before_request, :delay).presence 161 | @browser.config.before_request[:delay] = delay 162 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.delay" 163 | end 164 | 165 | # encoding 166 | if encoding = @config[:encoding] 167 | @browser.config.encoding = encoding 168 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled encoding: #{encoding}" 169 | end 170 | 171 | # return Capybara session instance 172 | @browser 173 | end 174 | end 175 | end 176 |
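The `change_proxy`/`change_user_agent` options handled above expect Procs, so a fresh value can be picked before every request. A config sketch (the proxy list and user agents are placeholders; the `ip:port:type` format follows the `proxy_string.split(":")` parsing in these builders):

```ruby
# Hypothetical values, inside a spider class definition
PROXIES = ["11.22.33.44:3128:http", "55.66.77.88:1080:socks5"]
USER_AGENTS = ["Mozilla/5.0 (X11; Linux x86_64)", "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"]

@config = {
  proxy: -> { PROXIES.sample },           # called again before each request
  user_agent: -> { USER_AGENTS.sample },
  before_request: { change_proxy: true, change_user_agent: true }
}
```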
-------------------------------------------------------------------------------- /lib/kimurai/browser_builder/selenium_chrome_builder.rb: -------------------------------------------------------------------------------- 1 | require 'capybara' 2 | require 'selenium-webdriver' 3 | require_relative '../capybara_configuration' 4 | require_relative '../capybara_ext/selenium/driver' 5 | require_relative '../capybara_ext/session' 6 | 7 | module Kimurai::BrowserBuilder 8 | class SeleniumChromeBuilder 9 | class << self 10 | attr_accessor :virtual_display 11 | end 12 | 13 | attr_reader :logger, :spider 14 | 15 | def initialize(config, spider:) 16 | @config = config 17 | @spider = spider 18 | @logger = spider.logger 19 | end 20 | 21 | def build 22 | # Register driver 23 | Capybara.register_driver :selenium_chrome do |app| 24 | # Create driver options 25 | opts = { args: %w[--disable-gpu --no-sandbox --disable-translate] } 26 | 27 | # Provide custom chrome browser path: 28 | if chrome_path = Kimurai.configuration.selenium_chrome_path 29 | opts.merge!(binary: chrome_path) 30 | end 31 | 32 | # See all options here: https://seleniumhq.github.io/selenium/docs/api/rb/Selenium/WebDriver/Chrome/Options.html 33 | driver_options = Selenium::WebDriver::Chrome::Options.new(opts) 34 | 35 | # Window size 36 | if size = @config[:window_size].presence 37 | driver_options.args << "--window-size=#{size.join(',')}" 38 | logger.debug "BrowserBuilder (selenium_chrome): enabled window_size" 39 | end 40 | 41 | # Proxy 42 | if proxy = @config[:proxy].presence 43 | proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip 44 | ip, port, type, user, password = proxy_string.split(":") 45 | 46 | if %w(http socks5).include?(type) 47 | if user.nil? && password.nil? 48 | driver_options.args << "--proxy-server=#{type}://#{ip}:#{port}" 49 | logger.debug "BrowserBuilder (selenium_chrome): enabled #{type} proxy, ip: #{ip}, port: #{port}" 50 | else 51 | logger.error "BrowserBuilder (selenium_chrome): proxy with authentication isn't supported by Selenium, skipped" 52 | end 53 | else 54 | logger.error "BrowserBuilder (selenium_chrome): wrong type of proxy: #{type}, skipped" 55 | end 56 | end 57 | 58 | if proxy_bypass_list = @config[:proxy_bypass_list].presence 59 | if proxy 60 | driver_options.args << "--proxy-bypass-list=#{proxy_bypass_list.join(';')}" 61 | logger.debug "BrowserBuilder (selenium_chrome): enabled proxy_bypass_list" 62 | else 63 | logger.error "BrowserBuilder (selenium_chrome): provide `proxy` to set proxy_bypass_list, skipped" 64 | end 65 | end 66 | 67 | # SSL 68 | if @config[:ignore_ssl_errors].present? 69 | driver_options.args << "--ignore-certificate-errors" 70 | driver_options.args << "--allow-insecure-localhost" 71 | logger.debug "BrowserBuilder (selenium_chrome): enabled ignore_ssl_errors" 72 | end 73 | 74 | # Disable images 75 | if @config[:disable_images].present? 76 | driver_options.prefs["profile.managed_default_content_settings.images"] = 2 77 | logger.debug "BrowserBuilder (selenium_chrome): enabled disable_images" 78 | end 79 | 80 | # Headers 81 | if @config[:headers].present? 82 | logger.warn "BrowserBuilder (selenium_chrome): custom headers aren't supported by Selenium, skipped" 83 | end 84 | 85 | if user_agent = @config[:user_agent].presence 86 | user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip 87 | driver_options.args << "--user-agent=#{user_agent_string}" # no shell quoting: args are passed as a list, so literal quotes would end up inside the user agent 88 | logger.debug "BrowserBuilder (selenium_chrome): enabled custom user_agent" 89 | end 90 | 91 | # Headless mode 92 | if ENV["HEADLESS"] != "false" 93 | if @config[:headless_mode] == :virtual_display 94 | if Gem::Platform.local.os == "linux" 95 | unless self.class.virtual_display 96 | require 'headless' 97 | self.class.virtual_display = Headless.new(reuse: true, destroy_at_exit: false) 98 | self.class.virtual_display.start 99 | end 100 | 101 | logger.debug "BrowserBuilder (selenium_chrome): enabled virtual_display headless_mode" 102 | else 103 | logger.error "BrowserBuilder (selenium_chrome): virtual_display headless_mode works only " \ 104 | "on Linux platform. Browser will run in normal mode. Set `native` mode instead." 105 | end 106 | else 107 | driver_options.args << "--headless" 108 | logger.debug "BrowserBuilder (selenium_chrome): enabled native headless_mode" 109 | end 110 | end 111 | 112 | chromedriver_path = Kimurai.configuration.chromedriver_path || "/usr/local/bin/chromedriver" 113 | service = Selenium::WebDriver::Service.chrome(path: chromedriver_path) 114 | Capybara::Selenium::Driver.new(app, browser: :chrome, options: driver_options, service: service) 115 | end 116 | 117 | # Create browser instance (Capybara session) 118 | @browser = Capybara::Session.new(:selenium_chrome) 119 | @browser.spider = spider 120 | logger.debug "BrowserBuilder (selenium_chrome): created browser instance" 121 | 122 | if @config[:extensions].present?
123 | logger.error "BrowserBuilder (selenium_chrome): `extensions` option not supported by Selenium, skipped" 124 | end 125 | 126 | # Cookies 127 | if cookies = @config[:cookies].presence 128 | @browser.config.cookies = cookies 129 | logger.debug "BrowserBuilder (selenium_chrome): enabled custom cookies" 130 | end 131 | 132 | # Browser instance options 133 | # skip_request_errors 134 | if skip_errors = @config[:skip_request_errors].presence 135 | @browser.config.skip_request_errors = skip_errors 136 | logger.debug "BrowserBuilder (selenium_chrome): enabled skip_request_errors" 137 | end 138 | 139 | # retry_request_errors 140 | if retry_errors = @config[:retry_request_errors].presence 141 | @browser.config.retry_request_errors = retry_errors 142 | logger.debug "BrowserBuilder (selenium_chrome): enabled retry_request_errors" 143 | end 144 | 145 | # restart_if 146 | if requests_limit = @config.dig(:restart_if, :requests_limit).presence 147 | @browser.config.restart_if[:requests_limit] = requests_limit 148 | logger.debug "BrowserBuilder (selenium_chrome): enabled restart_if.requests_limit >= #{requests_limit}" 149 | end 150 | 151 | if memory_limit = @config.dig(:restart_if, :memory_limit).presence 152 | @browser.config.restart_if[:memory_limit] = memory_limit 153 | logger.debug "BrowserBuilder (selenium_chrome): enabled restart_if.memory_limit >= #{memory_limit}" 154 | end 155 | 156 | # before_request clear_cookies 157 | if @config.dig(:before_request, :clear_cookies) 158 | @browser.config.before_request[:clear_cookies] = true 159 | logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.clear_cookies" 160 | end 161 | 162 | # before_request clear_and_set_cookies 163 | if @config.dig(:before_request, :clear_and_set_cookies) 164 | if cookies = @config[:cookies].presence 165 | @browser.config.cookies = cookies 166 | @browser.config.before_request[:clear_and_set_cookies] = true 167 | logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.clear_and_set_cookies" 168 | else 169 | logger.error "BrowserBuilder (selenium_chrome): cookies should be present to enable before_request.clear_and_set_cookies, skipped" 170 | end 171 | end 172 | 173 | # before_request change_user_agent 174 | if @config.dig(:before_request, :change_user_agent) 175 | logger.error "BrowserBuilder (selenium_chrome): before_request.change_user_agent option not supported by Selenium, skipped" 176 | end 177 | 178 | # before_request change_proxy 179 | if @config.dig(:before_request, :change_proxy) 180 | logger.error "BrowserBuilder (selenium_chrome): before_request.change_proxy option not supported by Selenium, skipped" 181 | end 182 | 183 | # before_request delay 184 | if delay = @config.dig(:before_request, :delay).presence 185 | @browser.config.before_request[:delay] = delay 186 | logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.delay" 187 | end 188 | 189 | # encoding 190 | if encoding = @config[:encoding] 191 | @browser.config.encoding = encoding 192 | logger.debug "BrowserBuilder (selenium_chrome): enabled encoding: #{encoding}" 193 | end 194 | 195 | # return Capybara session instance 196 | @browser 197 | end 198 | end 199 | end 200 |
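The chrome builder above consults two optional global settings, `selenium_chrome_path` and `chromedriver_path`. A sketch of providing them via `Kimurai.configure` (the paths are examples):

```ruby
Kimurai.configure do |config|
  config.selenium_chrome_path = "/usr/bin/chromium-browser"   # custom browser binary
  config.chromedriver_path    = "/usr/local/bin/chromedriver" # custom driver binary
end
```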
-------------------------------------------------------------------------------- /lib/kimurai/browser_builder/selenium_firefox_builder.rb: -------------------------------------------------------------------------------- 1 | require 'capybara' 2 | require 'selenium-webdriver' 3 | require_relative '../capybara_configuration' 4 | require_relative '../capybara_ext/selenium/driver' 5 | require_relative '../capybara_ext/session' 6 | 7 | module Kimurai::BrowserBuilder 8 | class SeleniumFirefoxBuilder 9 | class << self 10 | attr_accessor :virtual_display 11 | end 12 | 13 | attr_reader :logger, :spider 14 | 15 | def initialize(config, spider:) 16 | @config = config 17 | @spider = spider 18 | @logger = spider.logger 19 | end 20 | 21 | def build 22 | # Register driver 23 | Capybara.register_driver :selenium_firefox do |app| 24 | # Create driver options 25 | driver_options = Selenium::WebDriver::Firefox::Options.new 26 | driver_options.profile = Selenium::WebDriver::Firefox::Profile.new 27 | driver_options.profile["browser.link.open_newwindow"] = 3 # open windows in tabs 28 | driver_options.profile["media.peerconnection.enabled"] = false # disable web rtc 29 | 30 | # Proxy 31 | if proxy = @config[:proxy].presence 32 | proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip 33 | ip, port, type, user, password = proxy_string.split(":") 34 | 35 | if user.nil? && password.nil? 36 | driver_options.profile["network.proxy.type"] = 1 37 | if type == "http" 38 | driver_options.profile["network.proxy.http"] = ip 39 | driver_options.profile["network.proxy.http_port"] = port.to_i 40 | driver_options.profile["network.proxy.ssl"] = ip 41 | driver_options.profile["network.proxy.ssl_port"] = port.to_i 42 | 43 | logger.debug "BrowserBuilder (selenium_firefox): enabled http proxy, ip: #{ip}, port: #{port}" 44 | elsif type == "socks5" 45 | driver_options.profile["network.proxy.socks"] = ip 46 | driver_options.profile["network.proxy.socks_port"] = port.to_i 47 | driver_options.profile["network.proxy.socks_version"] = 5 48 | driver_options.profile["network.proxy.socks_remote_dns"] = true 49 | 50 | logger.debug "BrowserBuilder (selenium_firefox): enabled socks5 proxy, ip: #{ip}, port: #{port}" 51 | else 52 | logger.error "BrowserBuilder (selenium_firefox): wrong type of proxy: #{type}, skipped" 53 | end 54 | else 55 | logger.error "BrowserBuilder (selenium_firefox): proxy with authentication isn't supported by Selenium, skipped" 56 | end 57 | end 58 | 59 | if proxy_bypass_list = @config[:proxy_bypass_list].presence 60 | if proxy 61 | driver_options.profile["network.proxy.no_proxies_on"] = proxy_bypass_list.join(", ") 62 | logger.debug "BrowserBuilder (selenium_firefox): enabled proxy_bypass_list" 63 | else 64 | logger.error "BrowserBuilder (selenium_firefox): provide `proxy` to set proxy_bypass_list, skipped" 65 | end 66 | end 67 | 68 | # SSL 69 | if @config[:ignore_ssl_errors].present? 70 | driver_options.profile.secure_ssl = false 71 | driver_options.profile.assume_untrusted_certificate_issuer = true 72 | logger.debug "BrowserBuilder (selenium_firefox): enabled ignore_ssl_errors" 73 | end 74 | 75 | # Disable images 76 | if @config[:disable_images].present? 77 | driver_options.profile["permissions.default.image"] = 2 78 | logger.debug "BrowserBuilder (selenium_firefox): enabled disable_images" 79 | end 80 | 81 | # Headers 82 | if @config[:headers].present? 83 | logger.warn "BrowserBuilder (selenium_firefox): custom headers aren't supported by Selenium, skipped" 84 | end 85 | 86 | if user_agent = @config[:user_agent].presence 87 | user_agent_string = (user_agent.class == Proc ?
user_agent.call : user_agent).strip 88 | driver_options.profile["general.useragent.override"] = user_agent_string 89 | logger.debug "BrowserBuilder (selenium_firefox): enabled custom user_agent" 90 | end 91 | 92 | # Headless mode 93 | if ENV["HEADLESS"] != "false" 94 | if @config[:headless_mode] == :virtual_display 95 | if Gem::Platform.local.os == "linux" 96 | unless self.class.virtual_display 97 | require 'headless' 98 | self.class.virtual_display = Headless.new(reuse: true, destroy_at_exit: false) 99 | self.class.virtual_display.start 100 | end 101 | 102 | logger.debug "BrowserBuilder (selenium_firefox): enabled virtual_display headless_mode" 103 | else 104 | logger.error "BrowserBuilder (selenium_firefox): virtual_display headless_mode works only " \ 105 | "on Linux platform. Browser will run in normal mode. Set `native` mode instead." 106 | end 107 | else 108 | driver_options.args << "--headless" 109 | logger.debug "BrowserBuilder (selenium_firefox): enabled native headless_mode" 110 | end 111 | end 112 | 113 | Capybara::Selenium::Driver.new(app, browser: :firefox, options: driver_options) 114 | end 115 | 116 | # Create browser instance (Capybara session) 117 | @browser = Capybara::Session.new(:selenium_firefox) 118 | @browser.spider = spider 119 | logger.debug "BrowserBuilder (selenium_firefox): created browser instance" 120 | 121 | if @config[:extensions].present? 122 | logger.error "BrowserBuilder (selenium_firefox): `extensions` option not supported by Selenium, skipped" 123 | end 124 | 125 | # Window size 126 | if size = @config[:window_size].presence 127 | @browser.current_window.resize_to(*size) 128 | logger.debug "BrowserBuilder (selenium_firefox): enabled window_size" 129 | end 130 | 131 | # Cookies 132 | if cookies = @config[:cookies].presence 133 | @browser.config.cookies = cookies 134 | logger.debug "BrowserBuilder (selenium_firefox): enabled custom cookies" 135 | end 136 | 137 | # Browser instance options 138 | # skip_request_errors 139 | if skip_errors = @config[:skip_request_errors].presence 140 | @browser.config.skip_request_errors = skip_errors 141 | logger.debug "BrowserBuilder (selenium_firefox): enabled skip_request_errors" 142 | end 143 | 144 | # retry_request_errors 145 | if retry_errors = @config[:retry_request_errors].presence 146 | @browser.config.retry_request_errors = retry_errors 147 | logger.debug "BrowserBuilder (selenium_firefox): enabled retry_request_errors" 148 | end 149 | 150 | # restart_if 151 | if requests_limit = @config.dig(:restart_if, :requests_limit).presence 152 | @browser.config.restart_if[:requests_limit] = requests_limit 153 | logger.debug "BrowserBuilder (selenium_firefox): enabled restart_if.requests_limit >= #{requests_limit}" 154 | end 155 | 156 | if memory_limit = @config.dig(:restart_if, :memory_limit).presence 157 | @browser.config.restart_if[:memory_limit] = memory_limit 158 | logger.debug "BrowserBuilder (selenium_firefox): enabled restart_if.memory_limit >= #{memory_limit}" 159 | end 160 | 161 | # before_request clear_cookies 162 | if @config.dig(:before_request, :clear_cookies) 163 | @browser.config.before_request[:clear_cookies] = true 164 | logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.clear_cookies" 165 | end 166 | 167 | # before_request clear_and_set_cookies 168 | if @config.dig(:before_request, :clear_and_set_cookies) 169 | if cookies = @config[:cookies].presence 170 | @browser.config.cookies = cookies 171 | @browser.config.before_request[:clear_and_set_cookies] = true 172 | logger.debug 
"BrowserBuilder (selenium_firefox): enabled before_request.clear_and_set_cookies" 173 | else 174 | logger.error "BrowserBuilder (selenium_firefox): cookies should be present to enable before_request.clear_and_set_cookies, skipped" 175 | end 176 | end 177 | 178 | # before_request change_user_agent 179 | if @config.dig(:before_request, :change_user_agent) 180 | logger.error "BrowserBuilder (selenium_firefox): before_request.change_user_agent option not supported by Selenium, skipped" 181 | end 182 | 183 | # before_request change_proxy 184 | if @config.dig(:before_request, :change_proxy) 185 | logger.error "BrowserBuilder (selenium_firefox): before_request.change_proxy option not supported by Selenium, skipped" 186 | end 187 | 188 | # before_request delay 189 | if delay = @config.dig(:before_request, :delay).presence 190 | @browser.config.before_request[:delay] = delay 191 | logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.delay" 192 | end 193 | 194 | # encoding 195 | if encoding = @config[:encoding] 196 | @browser.config.encoding = encoding 197 | logger.debug "BrowserBuilder (selenium_firefox): enabled encoding: #{encoding}" 198 | end 199 | 200 | # return Capybara session instance 201 | @browser 202 | end 203 | end 204 | end 205 | -------------------------------------------------------------------------------- /lib/kimurai/capybara_configuration.rb: -------------------------------------------------------------------------------- 1 | require 'capybara' 2 | 3 | Capybara.configure do |config| 4 | config.run_server = false 5 | config.default_selector = :xpath 6 | config.save_path = "tmp" 7 | config.default_max_wait_time = 10 8 | config.ignore_hidden_elements = false 9 | config.threadsafe = true 10 | end 11 | -------------------------------------------------------------------------------- /lib/kimurai/capybara_ext/driver/base.rb: -------------------------------------------------------------------------------- 1 | require 'pathname' 2 | 3 | class Capybara::Driver::Base 4 | attr_accessor :visited 5 | attr_writer :requests, :responses 6 | 7 | def requests 8 | @requests ||= 0 9 | end 10 | 11 | def responses 12 | @responses ||= 0 13 | end 14 | 15 | def current_memory 16 | driver_pid = pid 17 | 18 | all = (get_descendant_processes(driver_pid) << driver_pid).uniq 19 | all.map { |pid| get_process_memory(pid) }.sum 20 | end 21 | 22 | private 23 | 24 | def get_descendant_processes(base) 25 | descendants = Hash.new { |ht, k| ht[k] = [k] } 26 | Hash[*`ps -eo pid,ppid`.scan(/\d+/).map(&:to_i)].each do |pid, ppid| 27 | descendants[ppid] << descendants[pid] 28 | end 29 | 30 | descendants[base].flatten - [base] 31 | end 32 | 33 | # https://github.com/schneems/get_process_mem 34 | # Note: for Linux takes PSS (not RSS) memory (I think PSS better fits in this case) 35 | def get_process_memory(pid) 36 | case @platform ||= Gem::Platform.local.os 37 | when "linux" 38 | begin 39 | file = Pathname.new "/proc/#{pid}/smaps" 40 | return 0 unless file.exist? 41 | 42 | lines = file.each_line.select { |line| line.match(/^Pss/) } 43 | return 0 if lines.empty? 44 | 45 | lines.reduce(0) do |sum, line| 46 | line.match(/(?(\d*\.{0,1}\d+))\s+(?\w\w)/) do |m| 47 | sum += m[:value].to_i 48 | end 49 | 50 | sum 51 | end 52 | rescue Errno::EACCES 53 | 0 54 | end 55 | when "darwin" 56 | mem = `ps -o rss= -p #{pid}`.strip 57 | mem.empty? ? 
0 : mem.to_i 58 | else 59 | raise "Can't check process memory, unsupported platform: #{@platform}" 60 | end 61 | end 62 | end 63 | -------------------------------------------------------------------------------- /lib/kimurai/capybara_ext/mechanize/driver.rb: -------------------------------------------------------------------------------- 1 | require 'mechanize' 2 | require_relative '../driver/base' 3 | 4 | class Capybara::Mechanize::Driver 5 | # Extend capybara-mechanize to support Poltergeist-like methods 6 | # https://www.rubydoc.info/gems/poltergeist/Capybara/Poltergeist/Driver 7 | 8 | def set_proxy(ip, port, type, user = nil, password = nil) 9 | # type is always "http", "socks" is not supported (yet) 10 | browser.agent.set_proxy(ip, port, user, password) 11 | end 12 | 13 | ### 14 | 15 | def headers 16 | browser.agent.request_headers 17 | end 18 | 19 | def headers=(headers) 20 | browser.agent.request_headers = headers 21 | end 22 | 23 | def add_header(name, value) 24 | browser.agent.request_headers[name] = value 25 | end 26 | 27 | ### 28 | 29 | def get_cookies 30 | browser.agent.cookies 31 | end 32 | 33 | def set_cookie(name, value, options = {}) 34 | options[:name] ||= name 35 | options[:value] ||= value 36 | 37 | cookie = Mechanize::Cookie.new(options.merge path: "/") 38 | browser.agent.cookie_jar << cookie 39 | end 40 | 41 | def set_cookies(cookies) 42 | cookies.each do |cookie| 43 | set_cookie(cookie[:name], cookie[:value], cookie) 44 | end 45 | end 46 | 47 | def clear_cookies 48 | browser.agent.cookie_jar.clear! 49 | end 50 | 51 | ### 52 | 53 | def quit 54 | browser.agent.shutdown 55 | end 56 | 57 | ### 58 | 59 | # Override parent method `current_memory` for Mechanize (we can't measure the memory of the Mechanize driver) 60 | def current_memory 61 | nil 62 | end 63 | 64 | def pid 65 | nil 66 | end 67 | 68 | def port 69 | nil 70 | end 71 | end 72 | -------------------------------------------------------------------------------- /lib/kimurai/capybara_ext/poltergeist/driver.rb: -------------------------------------------------------------------------------- 1 | require_relative '../driver/base' 2 | 3 | module Capybara::Poltergeist 4 | class Driver 5 | def pid 6 | client_pid 7 | end 8 | 9 | def port 10 | server.port 11 | end 12 | end 13 | end 14 | -------------------------------------------------------------------------------- /lib/kimurai/capybara_ext/selenium/driver.rb: -------------------------------------------------------------------------------- 1 | require_relative '../driver/base' 2 | 3 | class Capybara::Selenium::Driver 4 | def get_cookies 5 | browser.manage.all_cookies 6 | end 7 | 8 | def set_cookie(name, value, options = {}) 9 | options[:name] ||= name 10 | options[:value] ||= value 11 | 12 | browser.manage.add_cookie(options) 13 | end 14 | 15 | def set_cookies(cookies) 16 | cookies.each do |cookie| 17 | set_cookie(cookie[:name], cookie[:value], cookie) 18 | end 19 | end 20 | 21 | def clear_cookies 22 | browser.manage.delete_all_cookies 23 | end 24 | 25 | ### 26 | 27 | def pid 28 | @pid ||= `lsof -i tcp:#{port} -t`.strip.to_i 29 | end 30 | 31 | def port 32 | @port ||= browser.send(:bridge).instance_variable_get("@http").instance_variable_get("@server_url").port 33 | end 34 | end 35 | -------------------------------------------------------------------------------- /lib/kimurai/capybara_ext/session.rb: -------------------------------------------------------------------------------- 1 | require 'capybara' 2 | require 'nokogiri' 3 | require 'json' 4 | require_relative 'session/config' 5 |
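`Session#visit` below drives its retry/skip logic through `config.retry_request_errors` and `config.skip_request_errors`. A spider config sketch (the error classes and message pattern are examples; a Hash entry may carry `error:`, `message:` and, for retries, `skip_on_failure:`):

```ruby
@config = {
  # retry these up to max_retries; the Hash form gives up silently after the last retry
  retry_request_errors: [Net::ReadTimeout, { error: RuntimeError, skip_on_failure: true }],
  # skip the request entirely when the error message matches
  skip_request_errors: [{ error: RuntimeError, message: /404|410/ }]
}
```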
6 | module Capybara 7 | class Session 8 | attr_accessor :spider 9 | 10 | alias_method :original_visit, :visit 11 | def visit(visit_uri, delay: config.before_request[:delay], skip_request_options: false, max_retries: 3) 12 | if spider 13 | process_delay(delay) if delay 14 | retries, sleep_interval = 0, 0 15 | 16 | begin 17 | check_request_options(visit_uri) unless skip_request_options 18 | driver.requests += 1 and logger.info "Browser: started get request to: #{visit_uri}" 19 | spider.class.update(:visits, :requests) if spider.with_info 20 | 21 | original_visit(visit_uri) 22 | rescue => e 23 | if match_error?(e, type: :to_skip) 24 | logger.error "Browser: skip request error: #{e.inspect}, url: #{visit_uri}" 25 | spider.add_event(:requests_errors, e.inspect) if spider.with_info 26 | false 27 | elsif match_error?(e, type: :to_retry) 28 | logger.error "Browser: retry request error: #{e.inspect}, url: #{visit_uri}" 29 | spider.add_event(:requests_errors, e.inspect) if spider.with_info 30 | 31 | if (retries += 1) <= max_retries 32 | logger.info "Browser: sleep #{(sleep_interval += 15)} seconds and process retry № #{retries} to the url: #{visit_uri}" 33 | sleep sleep_interval and retry 34 | else 35 | logger.error "Browser: all retries (#{retries - 1}) to the url #{visit_uri} have failed" 36 | raise e unless skip_error_on_failure?(e) 37 | end 38 | else 39 | raise e 40 | end 41 | else 42 | driver.responses += 1 and logger.info "Browser: finished get request to: #{visit_uri}" 43 | spider.class.update(:visits, :responses) if spider.with_info 44 | driver.visited = true unless driver.visited 45 | true 46 | ensure 47 | if spider.with_info 48 | logger.info "Info: visits: requests: #{spider.class.visits[:requests]}, responses: #{spider.class.visits[:responses]}" 49 | end 50 | 51 | if memory = driver.current_memory 52 | logger.debug "Browser: driver.current_memory: #{memory}" 53 | end 54 | end 55 | else 56 | original_visit(visit_uri) 57 | end 58 | end 59 | 60 | def destroy_driver! 61 | if @driver 62 | begin 63 | @driver.quit 64 | # handle Net::ReadTimeout error for Selenium-like drivers 65 | rescue Net::ReadTimeout 66 | @driver.quit 67 | end 68 | 69 | @driver = nil 70 | logger.info "Browser: driver #{mode} has been destroyed" 71 | else 72 | logger.warn "Browser: driver #{mode} is not present" 73 | end 74 | end 75 | 76 | def restart! 77 | if mode.match?(/poltergeist/) 78 | @driver.browser.restart 79 | @driver.requests, @driver.responses = 0, 0 80 | else 81 | destroy_driver!
82 | driver 83 | end 84 | 85 | logger.info "Browser: driver has been restarted: name: #{mode}, pid: #{driver.pid}, port: #{driver.port}" 86 | end 87 | 88 | def current_response(response_type = :html) 89 | case response_type 90 | when :html 91 | if config.encoding 92 | if config.encoding == :auto 93 | charset = body.force_encoding("ISO-8859-1").encode("UTF-8")[/<meta.*?charset=["']?([\w-]+)/i, 1] 94 | Nokogiri::HTML(body, nil, charset) 95 | else 96 | Nokogiri::HTML(body, nil, config.encoding) 97 | end 98 | else 99 | Nokogiri::HTML(body) 100 | end 101 | when :json 102 | JSON.parse(body) 103 | end 104 | end 105 | 106 | ### 107 | 108 | # Handy method to perform some action in a new browser window (tab), 109 | # process the given block within this window, and then automatically 110 | # close the window and return back to the first tab. 111 | # You can provide either a `url:` to visit in the new window, 112 | # or an `action:` (lambda) which opens a new window when called. 113 | # 114 | # Usage example: 115 | # action = -> { browser.find("//some/element/path").click } 116 | # browser.within_new_window_by(action: action) do 117 | # do some stuff and then automatically close this tab and return back to the first tab 118 | # end 119 | def within_new_window_by(action: nil, url: nil) 120 | case 121 | when action 122 | opened_window = window_opened_by { action.call } 123 | within_window(opened_window) do 124 | yield 125 | current_window.close 126 | end 127 | when url 128 | within_window(open_new_window) do 129 | visit(url) 130 | 131 | yield 132 | current_window.close 133 | end 134 | end 135 | end 136 | 137 | ### 138 | 139 | def scroll_to_bottom 140 | execute_script("window.scrollBy(0,10000)") 141 | end 142 | 143 | private 144 | 145 | def skip_error_on_failure?(e) 146 | config.retry_request_errors.any? do |error| 147 | error[:skip_on_failure] && e.class.ancestors.include?(error[:error]) if error.kind_of?(Hash) 148 | end 149 | end 150 | 151 | def match_error?(e, type:) 152 | errors = 153 | case type 154 | when :to_retry then config.retry_request_errors 155 | when :to_skip then config.skip_request_errors 156 | end 157 | 158 | errors.any? do |error| 159 | if error.kind_of?(Hash) 160 | match_class = e.class.ancestors.include?(error[:error]) 161 | if error[:message].present? 162 | if error[:message].kind_of?(Regexp) 163 | e.message&.match?(error[:message]) 164 | else 165 | e.message&.include?(error[:message]) 166 | end && match_class 167 | else 168 | match_class 169 | end 170 | else 171 | e.class.ancestors.include?(error) 172 | end 173 | end 174 | end 175 | 176 | def process_delay(delay) 177 | interval = (delay.class == Range ? rand(delay) : delay) 178 | logger.debug "Browser: sleep #{interval.round(2)} #{'second'.pluralize(interval)} before request..." 179 | sleep interval 180 | end 181 | 182 | def check_request_options(url_to_visit) 183 | # restart_if 184 | if memory_limit = config.restart_if[:memory_limit] 185 | memory = driver.current_memory 186 | if memory && memory >= memory_limit 187 | logger.warn "Browser: memory_limit #{memory_limit} of driver.current_memory (#{memory}) is exceeded (engine: #{mode})" 188 | restart! 189 | end 190 | end 191 | 192 | if requests_limit = config.restart_if[:requests_limit] 193 | requests = driver.requests 194 | if requests >= requests_limit 195 | logger.warn "Browser: requests_limit #{requests_limit} of driver.requests (#{requests}) is exceeded (engine: #{mode})" 196 | restart! 197 | end 198 | end 199 | 200 | # cookies 201 | # (Selenium only) if config.cookies are present and the browser was just created, 202 | # visit url_to_visit first and only then set cookies: 203 | if driver.visited.nil?
&& config.cookies && mode.match?(/selenium/) 204 | visit(url_to_visit, skip_request_options: true) 205 | config.cookies.each do |cookie| 206 | driver.set_cookie(cookie[:name], cookie[:value], cookie) 207 | end 208 | end 209 | 210 | if config.before_request[:clear_cookies] 211 | driver.clear_cookies 212 | logger.debug "Browser: cleared cookies before request" 213 | end 214 | 215 | if config.before_request[:clear_and_set_cookies] 216 | driver.clear_cookies 217 | 218 | # (Selenium only) if the browser hasn't visited any page yet, visit url_to_visit 219 | # first and then set cookies (needed after a browser restart): 220 | if driver.visited.nil? && mode.match?(/selenium/) 221 | visit(url_to_visit, skip_request_options: true) 222 | end 223 | 224 | config.cookies.each do |cookie| 225 | driver.set_cookie(cookie[:name], cookie[:value], cookie) 226 | end 227 | 228 | logger.debug "Browser: cleared and set cookies before request" 229 | end 230 | 231 | # user_agent 232 | if config.before_request[:change_user_agent] 233 | driver.add_header("User-Agent", config.user_agent.call) 234 | logger.debug "Browser: changed user_agent before request" 235 | end 236 | 237 | # proxy 238 | if config.before_request[:change_proxy] 239 | proxy_string = config.proxy.call 240 | driver.set_proxy(*proxy_string.split(":")) 241 | logger.debug "Browser: changed proxy before request" 242 | end 243 | end 244 | 245 | def logger 246 | spider.logger 247 | end 248 | end 249 | end 250 | -------------------------------------------------------------------------------- /lib/kimurai/capybara_ext/session/config.rb: -------------------------------------------------------------------------------- 1 | module Capybara 2 | class SessionConfig 3 | attr_accessor :cookies, :proxy, :user_agent, :encoding 4 | attr_writer :retry_request_errors, :skip_request_errors 5 | 6 | def retry_request_errors 7 | @retry_request_errors ||= [] 8 | end 9 | 10 | def skip_request_errors 11 | @skip_request_errors ||= [] 12 | end 13 | 14 | def restart_if 15 | @restart_if ||= {} 16 | end 17 | 18 | def before_request 19 | @before_request ||= {} 20 | end 21 | end 22 | end 23 | -------------------------------------------------------------------------------- /lib/kimurai/cli.rb: -------------------------------------------------------------------------------- 1 | require 'thor' 2 | 3 | module Kimurai 4 | class CLI < Thor 5 | map %w[--version -v] => :__print_version 6 | 7 | desc "generate", "Generator, available types: project, spider, schedule" 8 | def generate(generator_type, *args) 9 | case generator_type 10 | when "project" 11 | project_name = args.shift 12 | raise "Provide project name to generate a new project" unless project_name.present? 13 | Generator.new.generate_project(project_name) 14 | when "spider" 15 | spider_name = args.shift 16 | raise "Provide spider name to generate a spider" unless spider_name.present? 17 | Generator.new.generate_spider(spider_name, in_project: inside_project?)
18 | when "schedule" 19 | Generator.new.generate_schedule 20 | else 21 | raise "Don't know this generator type: #{generator_type}" 22 | end 23 | end 24 | 25 | ### 26 | 27 | desc "setup", "Setup server" 28 | option :port, aliases: :p, type: :string, banner: "Port for ssh connection" 29 | option "ask-sudo", type: :boolean, banner: "Provide sudo password for a user to install system-wide packages" 30 | option "ask-auth-pass", type: :boolean, banner: "Auth using password" 31 | option "ssh-key-path", type: :string, banner: "Auth using ssh key" 32 | option :local, type: :boolean, banner: "Run setup on a local machine (Ubuntu only)" 33 | def setup(user_host) 34 | command = AnsibleCommandBuilder.new(user_host, options, playbook: "setup").get 35 | 36 | pid = spawn *command 37 | Process.wait pid 38 | end 39 | 40 | desc "deploy", "Deploy project to the server and update cron schedule" 41 | option :port, aliases: :p, type: :string, banner: "Port for ssh connection" 42 | option "ask-auth-pass", type: :boolean, banner: "Auth using password" 43 | option "ssh-key-path", type: :string, banner: "Auth using ssh key" 44 | option "repo-url", type: :string, banner: "Repo url" 45 | option "repo-key-path", type: :string, banner: "SSH key for a git repo" 46 | option "skip-check", type: :boolean, default: false, banner: "Skip git repository checks" 47 | def deploy(user_host) 48 | unless options["skip-check"] 49 | if !`git status --short`.empty? 50 | raise "Deploy: Please commit your changes first" 51 | elsif `git remote`.empty? 52 | raise "Deploy: Please add remote origin repository to your repo first" 53 | elsif !`git rev-list master...origin/master`.empty? 54 | raise "Deploy: Please push your commits to the remote origin repo first" 55 | end 56 | end 57 | 58 | repo_url = options["repo-url"] ? options["repo-url"] : `git remote get-url origin`.strip 59 | repo_name = repo_url[/\/([^\/]*)\.git/i, 1] 60 | 61 | command = AnsibleCommandBuilder.new(user_host, options, playbook: "deploy", 62 | vars: { repo_url: repo_url, repo_name: repo_name, repo_key_path: options["repo-key-path"] } 63 | ).get 64 | 65 | pid = spawn *command 66 | Process.wait pid 67 | end 68 | 69 | ### 70 | 71 | desc "crawl", "Run a particular spider by it's name" 72 | def crawl(spider_name) 73 | raise "Can't find Kimurai project" unless inside_project? 74 | require './config/boot' 75 | 76 | unless klass = Kimurai.find_by_name(spider_name) 77 | raise "Can't find spider with name `#{spider_name}` in the project. " \ 78 | "To list all available spiders, run: `$ bundle exec kimurai list`" 79 | end 80 | 81 | # Set time_zone if exists 82 | if time_zone = Kimurai.configuration.time_zone 83 | Kimurai.time_zone = time_zone 84 | end 85 | 86 | klass.crawl! 87 | end 88 | 89 | desc "parse", "Parse url in the particular spider method" 90 | option :url, type: :string, required: true, banner: "Url to pass to the method" 91 | def parse(spider_name, method_name) 92 | raise "Can't find Kimurai project" unless inside_project? 93 | require './config/boot' 94 | 95 | unless klass = Kimurai.find_by_name(spider_name) 96 | raise "Can't find spider with name `#{spider_name}` in the project. 
" \ 97 | "To list all available spiders, run: `$ bundle exec kimurai list`" 98 | end 99 | 100 | klass.parse!(method_name, url: options["url"]) 101 | end 102 | 103 | desc "console", "Start Kimurai console" 104 | option :engine, type: :string, banner: "Engine to use" 105 | option :url, type: :string, banner: "Url to process" 106 | def console(spider_name = nil) 107 | require 'pry' 108 | require './config/boot' if inside_project? 109 | 110 | if spider_name 111 | raise "Can't find Kimurai project" unless inside_project? 112 | 113 | unless klass = Kimurai.find_by_name(spider_name) 114 | raise "Can't find spider with name `#{spider_name}` in the project. " \ 115 | "To list all available spiders, run: `$ bundle exec kimurai list`" 116 | end 117 | else 118 | klass = inside_project? ? ApplicationSpider : ::Kimurai::Base 119 | end 120 | 121 | engine = options["engine"]&.delete(":")&.to_sym 122 | if url = options["url"] 123 | klass.new(engine).request_to(:console, url: options["url"]) 124 | else 125 | klass.new(engine).public_send(:console) 126 | end 127 | end 128 | 129 | desc "list", "List all available spiders in the current project" 130 | def list 131 | raise "Can't find Kimurai project" unless inside_project? 132 | require './config/boot' 133 | 134 | Kimurai.list.keys.sort.each { |name| puts name } 135 | end 136 | 137 | desc "runner", "Run all spiders in the project in queue" 138 | option :include, type: :array, default: [], banner: "List of spiders to run" 139 | option :exclude, type: :array, default: [], banner: "List of spiders to exclude from run" 140 | option :jobs, aliases: :j, type: :numeric, default: 1, banner: "The number of concurrent jobs" 141 | def runner 142 | raise "Can't find Kimurai project" unless inside_project? 143 | 144 | jobs = options["jobs"] 145 | raise "Jobs count can't be 0" if jobs == 0 146 | 147 | require './config/boot' 148 | require 'kimurai/runner' 149 | 150 | spiders = options["include"].presence || Kimurai.list.keys 151 | spiders -= options["exclude"] 152 | 153 | Runner.new(spiders, jobs).run! 154 | end 155 | 156 | desc "--version, -v", "Print the version" 157 | def __print_version 158 | puts VERSION 159 | end 160 | 161 | desc "dashboard", "Run dashboard" 162 | def dashboard 163 | raise "Can't find Kimurai project" unless inside_project? 164 | 165 | require './config/boot' 166 | if Object.const_defined?("Kimurai::Dashboard") 167 | require 'kimurai/dashboard/app' 168 | Kimurai::Dashboard::App.run! 169 | else 170 | raise "Kimurai::Dashboard is not defined" 171 | end 172 | end 173 | 174 | private 175 | 176 | def inside_project? 
177 | Dir.exists?("spiders") && File.exists?("./config/boot.rb") 178 | end 179 | end 180 | end 181 | 182 | require_relative 'cli/generator' 183 | require_relative 'cli/ansible_command_builder' 184 | -------------------------------------------------------------------------------- /lib/kimurai/cli/ansible_command_builder.rb: -------------------------------------------------------------------------------- 1 | require 'cliver' 2 | 3 | module Kimurai 4 | class CLI 5 | class AnsibleCommandBuilder 6 | def initialize(user_host, options, playbook:, vars: {}) 7 | @user_host = user_host 8 | @options = options 9 | @playbook = playbook 10 | @vars = vars 11 | end 12 | 13 | def get 14 | unless Cliver.detect("ansible-playbook") 15 | raise "Can't find `ansible-playbook` executable, to install: " \ 16 | "Mac OS X: `$ brew install ansible`, Ubuntu: `$ sudo apt install ansible`" 17 | end 18 | 19 | user = @user_host[/(.*?)\@/, 1] 20 | host = @user_host[/\@(.+)/, 1] || @user_host 21 | inventory = @options["port"] ? "#{host}:#{@options['port']}," : "#{host}," 22 | 23 | gem_dir = Gem::Specification.find_by_name("kimurai").gem_dir 24 | playbook_path = gem_dir + "/lib/kimurai/automation/" + "#{@playbook}.yml" 25 | 26 | command = [ 27 | "ansible-playbook", playbook_path, 28 | "--inventory", inventory, 29 | "--ssh-extra-args", "-oForwardAgent=yes", 30 | "--connection", @options["local"] ? "local" : "smart", 31 | "--extra-vars", "ansible_python_interpreter=/usr/bin/python3" 32 | ] 33 | 34 | if File.exists? "config/automation.yml" 35 | require 'yaml' 36 | if config = YAML.load_file("config/automation.yml").dig(@playbook) 37 | config.each { |key, value| @vars[key] = value unless @vars[key] } 38 | end 39 | end 40 | 41 | @vars.each do |key, value| 42 | next unless value.present? 43 | command.push "--extra-vars", "#{key}=#{value}" 44 | end 45 | 46 | if user 47 | command.push "--user", user 48 | end 49 | 50 | if @options["ask-sudo"] 51 | command.push "--ask-become-pass" 52 | end 53 | 54 | if @options["ask-auth-pass"] 55 | unless Cliver.detect("sshpass") 56 | raise "Can't find `sshpass` executable for password authentication, to install: " \ 57 | "Mac OS X: `$ brew install http://git.io/sshpass.rb`, Ubuntu: `$ sudo apt install sshpass`" 58 | end 59 | 60 | command.push "--ask-pass" 61 | end 62 | 63 | if ssh_key_path = @options["ssh-key-path"] 64 | command.push "--private-key", ssh_key_path 65 | end 66 | 67 | command 68 | end 69 | end 70 | end 71 | end 72 | -------------------------------------------------------------------------------- /lib/kimurai/cli/generator.rb: -------------------------------------------------------------------------------- 1 | module Kimurai 2 | class CLI 3 | class Generator < Thor::Group 4 | include Thor::Actions 5 | 6 | def self.source_root 7 | File.dirname(File.expand_path('..', __FILE__)) 8 | end 9 | 10 | def generate_project(project_name) 11 | directory "template", project_name 12 | inside(project_name) do 13 | run "bundle install" 14 | run "git init" 15 | end 16 | end 17 | 18 | def generate_spider(spider_name, in_project:) 19 | spider_path = in_project ? "spiders/#{spider_name}.rb" : "./#{spider_name}.rb" 20 | raise "Spider #{spider_path} already exists" if File.exists? spider_path 21 | 22 | spider_class = to_spider_class(spider_name) 23 | create_file spider_path do 24 | <<~RUBY 25 | class #{spider_class} < #{in_project ? 
'ApplicationSpider' : 'Kimurai::Base'} 26 | @name = "#{spider_name}" 27 | @start_urls = [] 28 | @config = {} 29 | 30 | def parse(response, url:, data: {}) 31 | end 32 | end 33 | RUBY 34 | end 35 | 36 | unless in_project 37 | insert_into_file spider_path, "  @engine = :mechanize\n", after: "@name = \"#{spider_name}\"\n" 38 | prepend_to_file spider_path, "require 'kimurai'\n\n" 39 | append_to_file spider_path, "\n#{spider_class}.crawl!" 40 | end 41 | end 42 | 43 | def generate_schedule 44 | copy_file "template/config/schedule.rb", "./schedule.rb" 45 | end 46 | 47 | private 48 | 49 | def to_spider_class(string) 50 | string.sub(/^./) { $&.capitalize } 51 | .gsub(/(?:_|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" } 52 | .gsub(/(?:-|(\/))([a-z\d]*)/) { "Dash#{$2.capitalize}" } 53 | .gsub(/(?:\.|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" } 54 | end 55 | end 56 | end 57 | end 58 | -------------------------------------------------------------------------------- /lib/kimurai/core_ext/array.rb: -------------------------------------------------------------------------------- 1 | class Array 2 | def in_sorted_groups(number, fill_with = nil) 3 | sorted_groups = Array.new(number) { [] } 4 | 5 | self.in_groups_of(number, fill_with).each do |group| 6 | number.times do |i| 7 | group.fetch(i) rescue next 8 | sorted_groups[i] << group[i] 9 | end 10 | end 11 | 12 | sorted_groups 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /lib/kimurai/core_ext/hash.rb: -------------------------------------------------------------------------------- 1 | class Hash 2 | def deep_merge_excl(second, exclude) 3 | self.merge(second.slice(*exclude)).deep_merge(second.except(*exclude)) 4 | end 5 | end 6 | -------------------------------------------------------------------------------- /lib/kimurai/core_ext/numeric.rb: -------------------------------------------------------------------------------- 1 | class Numeric 2 | # https://stackoverflow.com/a/1679963 3 | def duration 4 | secs = self.to_int 5 | mins = secs / 60 6 | hours = mins / 60 7 | days = hours / 24 8 | 9 | if days > 0 10 | "#{days}d, #{hours % 24}h" 11 | elsif hours > 0 12 | "#{hours}h, #{mins % 60}m" 13 | elsif mins > 0 14 | "#{mins}m, #{secs % 60}s" 15 | elsif secs >= 0 16 | "#{secs}s" 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /lib/kimurai/core_ext/string.rb: -------------------------------------------------------------------------------- 1 | require 'murmurhash3' 2 | 3 | class String 4 | def to_id 5 | MurmurHash3::V32.str_hash(self) 6 | end 7 | end 8 | -------------------------------------------------------------------------------- /lib/kimurai/pipeline.rb: -------------------------------------------------------------------------------- 1 | module Kimurai 2 | class Pipeline 3 | class DropItemError < StandardError; end 4 | def self.name 5 | self.to_s.sub(/.*?::/, "").underscore.to_sym 6 | end 7 | 8 | include BaseHelper 9 | attr_accessor :spider 10 | 11 | def name 12 | self.class.name 13 | end 14 | 15 | ### 16 | 17 | def storage 18 | spider.storage 19 | end 20 | 21 | def unique?(scope, value) 22 | spider.unique?(scope, value) 23 | end 24 | 25 | def save_to(path, item, format:, position: true, append: false) 26 | spider.save_to(path, item, format: format, position: position, append: append) 27 | end 28 | 29 | def logger 30 | spider.logger 31 | end 32 | end 33 | end 34 |
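`Kimurai::Pipeline` above defines the shared helper surface (`storage`, `unique?`, `save_to`, `logger`) that concrete pipelines inherit. For reference, a minimal sketch of a custom pipeline built on these helpers; the `ProductValidator` name, `:sku` field, and output path are illustrative assumptions, not part of the gem:

```ruby
# pipelines/product_validator.rb (hypothetical example)
class ProductValidator < Kimurai::Pipeline
  def process_item(item, options: {})
    # Drop duplicates via the spider's storage-backed `unique?` helper:
    raise DropItemError, "Duplicate sku: #{item[:sku]}" unless unique?(:sku, item[:sku])

    # Persist the valid item through the spider's `save_to` helper:
    save_to "db/products.json", item, format: :pretty_json

    # Return the item so it continues to the next pipeline:
    item
  end
end
```

Because `Pipeline.name` converts the class name to an underscored symbol, a spider would enable this pipeline with `@pipelines = [:product_validator]`, and each pipeline in that list receives the return value of the previous one.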
-------------------------------------------------------------------------------- /lib/kimurai/runner.rb: -------------------------------------------------------------------------------- 1 | require 'pmap' 2 | 3 | module Kimurai 4 | class Runner 5 | attr_reader :jobs, :spiders, :session_info 6 | 7 | def initialize(spiders, parallel_jobs) 8 | @jobs = parallel_jobs 9 | @spiders = spiders 10 | @start_time = Time.now 11 | 12 | @session_info = { 13 | id: @start_time.to_i, 14 | status: :processing, 15 | start_time: @start_time, 16 | stop_time: nil, 17 | environment: Kimurai.env, 18 | concurrent_jobs: @jobs, 19 | spiders: @spiders 20 | } 21 | 22 | if time_zone = Kimurai.configuration.time_zone 23 | Kimurai.time_zone = time_zone 24 | end 25 | 26 | ENV.store("SESSION_ID", @start_time.to_i.to_s) 27 | ENV.store("RBCAT_COLORIZER", "false") 28 | end 29 | 30 | def run!(exception_on_fail: true) 31 | puts ">>> Runner: started: #{session_info}" 32 | if at_start_callback = Kimurai.configuration.runner_at_start_callback 33 | at_start_callback.call(session_info) 34 | end 35 | 36 | running = true 37 | spiders.peach_with_index(jobs) do |spider, i| 38 | next unless running 39 | 40 | puts "> Runner: started spider: #{spider}, index: #{i}" 41 | pid = spawn("bundle", "exec", "kimurai", "crawl", spider, [:out, :err] => "log/#{spider}.log") 42 | Process.wait pid 43 | 44 | puts "< Runner: stopped spider: #{spider}, index: #{i}" 45 | end 46 | rescue StandardError, SignalException, SystemExit => e 47 | running = false 48 | 49 | session_info.merge!(status: :failed, error: e.inspect, stop_time: Time.now) 50 | exception_on_fail ? raise(e) : [session_info, e] 51 | else 52 | session_info.merge!(status: :completed, stop_time: Time.now) 53 | ensure 54 | if at_stop_callback = Kimurai.configuration.runner_at_stop_callback 55 | at_stop_callback.call(session_info) 56 | end 57 | puts "<<< Runner: stopped: #{session_info}" 58 | end 59 | end 60 | end 61 | -------------------------------------------------------------------------------- /lib/kimurai/template/.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle 2 | /cache 3 | /node_modules 4 | 5 | /log/* 6 | !/log/.keep 7 | 8 | /tmp/* 9 | !/tmp/.keep 10 | 11 | /db/* 12 | !/db/.keep 13 | 14 | .byebug_history 15 | *.swp 16 | .env 17 | 18 | capybara-*.png 19 | -------------------------------------------------------------------------------- /lib/kimurai/template/Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | git_source(:github) { |repo| "https://github.com/#{repo}.git" } 3 | 4 | ruby '>= 2.5' 5 | 6 | # Framework 7 | gem 'kimurai', '~> 1.4' 8 | 9 | # Require files in directory and child directories recursively 10 | gem 'require_all' 11 | 12 | # Dotenv 13 | gem 'dotenv' 14 | 15 | # To debug spiders: 16 | group :development do 17 | gem 'byebug', platforms: :mri 18 | gem 'pry' 19 | end 20 | 21 | # If you want to save items to the database, require one of these gems: 22 | # gem 'sqlite3' 23 | # gem 'pg' 24 | # gem 'mysql2' 25 | 26 | # And use your preferred ORM/database connector: 27 | # gem 'activerecord', require: 'active_record' 28 | # gem 'sequel' 29 | -------------------------------------------------------------------------------- /lib/kimurai/template/README.md: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | New Kimurai project readme 4 | 
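The template Gemfile above leaves the database layer to the user. As one hedged sketch of wiring it up: with `gem 'sqlite3'` and `gem 'sequel'` uncommented, a project initializer (required by `config/boot.rb`, shown below) could establish the connection. The file name, `DB` constant, and table layout here are illustrative assumptions:

```ruby
# config/initializers/database.rb (hypothetical example)
require 'sequel'

# Initializers are required by config/boot.rb before pipelines and spiders,
# so the DB constant is available inside both:
DB = Sequel.sqlite("db/kimurai_#{Kimurai.env}.sqlite3")

# Create the items table on first run (no-op if it already exists):
DB.create_table?(:items) do
  primary_key :id
  String :url
  String :title
end
```

A pipeline's `process_item` could then call `DB[:items].insert(item.slice(:url, :title))` instead of (or in addition to) `save_to`.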
-------------------------------------------------------------------------------- /lib/kimurai/template/config/application.rb: -------------------------------------------------------------------------------- 1 | Kimurai.configure do |config| 2 | # Default logger has colored mode in development. 3 | # If you would like to disable it, set `colorize_logger` to false. 4 | # config.colorize_logger = false 5 | 6 | # Logger level for default logger: 7 | # config.log_level = :info 8 | 9 | # Custom logger: 10 | # config.logger = Logger.new(STDOUT) 11 | 12 | # Custom time zone (for logs): 13 | # config.time_zone = "UTC" 14 | # config.time_zone = "Europe/Moscow" 15 | 16 | # At-start callback for a runner. Accepts an info hash argument with 17 | # keys: id, status, start_time, environment, concurrent_jobs, spiders list. 18 | # For example, you can use this callback to send a notification when the runner starts: 19 | # config.runner_at_start_callback = lambda do |info| 20 | # json = JSON.pretty_generate(info) 21 | # Sender.send_notification("Started session: #{json}") 22 | # end 23 | 24 | # At-stop callback for a runner. Accepts an info hash argument with 25 | # all `runner_at_start_callback` keys plus an additional `stop_time` key. Also, `status` contains 26 | # the stop status of the runner (completed or failed). 27 | # You can use this callback to send a notification when the runner has stopped: 28 | # config.runner_at_stop_callback = lambda do |info| 29 | # json = JSON.pretty_generate(info) 30 | # Sender.send_notification("Stopped session: #{json}") 31 | # end 32 | 33 | # Provide custom chrome binary path (default is any available chrome/chromium in the PATH): 34 | # config.selenium_chrome_path = "/usr/bin/chromium-browser" 35 | # Provide custom selenium chromedriver path (default is "/usr/local/bin/chromedriver"): 36 | # config.chromedriver_path = "/usr/local/bin/chromedriver" 37 | end 38 | -------------------------------------------------------------------------------- /lib/kimurai/template/config/automation.yml: -------------------------------------------------------------------------------- 1 | # software versions to install for `setup` command 2 | setup: 3 | ruby: 2.5.1 4 | # check latest here http://phantomjs.org/download.html 5 | phantomjs: 2.1.1 6 | # check latest here https://github.com/mozilla/geckodriver/releases/ 7 | geckodriver: 0.21.0 8 | # check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads 9 | chromedriver: 2.39 10 | # settings for deploy command, you can use cli options as well (--repo-url, --repo-key-path) 11 | deploy: 12 | # repo_url: git@bitbucket.org:username/repo_name.git 13 | # repo_key_path: ~/.ssh/id_rsa 14 | -------------------------------------------------------------------------------- /lib/kimurai/template/config/boot.rb: -------------------------------------------------------------------------------- 1 | # require project gems 2 | require 'bundler/setup' 3 | Bundler.require(:default, Kimurai.env) 4 | 5 | # require custom ENV variables located in .env file 6 | require 'dotenv/load' 7 | 8 | # require initializers 9 | Dir.glob(File.join("./config/initializers", "*.rb"), &method(:require)) 10 | 11 | # require helpers 12 | Dir.glob(File.join("./helpers", "*.rb"), &method(:require)) 13 | 14 | # require pipelines 15 | Dir.glob(File.join("./pipelines", "*.rb"), &method(:require)) 16 | 17 | # require spiders recursively in the `spiders/` folder 18 | require_relative '../spiders/application_spider' 19 | require_all "spiders" 20 | 21 | # require Kimurai
configuration 22 | require_relative 'application' 23 | -------------------------------------------------------------------------------- /lib/kimurai/template/config/initializers/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vifreefly/kimuraframework/a5d47c26fffe2a3c10cc346b7dcf9ac06b4ccd2f/lib/kimurai/template/config/initializers/.keep -------------------------------------------------------------------------------- /lib/kimurai/template/config/schedule.rb: -------------------------------------------------------------------------------- 1 | ### Settings ### 2 | require 'tzinfo' 3 | 4 | # Export current PATH to the cron 5 | env :PATH, ENV["PATH"] 6 | 7 | # Use 24 hour format when using `at:` option 8 | set :chronic_options, hours24: true 9 | 10 | # Use the local_to_utc helper to set up execution time using your local timezone instead 11 | # of the server's timezone (which probably is, and should be, UTC; to check, run `$ timedatectl`). 12 | # You may also want to set the same timezone in Kimurai (use `Kimurai.configuration.time_zone =` for that), 13 | # to have spider logs in a specific time zone format. 14 | # Example usage of helper: 15 | # every 1.day, at: local_to_utc("7:00", zone: "Europe/Moscow") do 16 | # crawl "google_spider.com", output: "log/google_spider.com.log" 17 | # end 18 | def local_to_utc(time_string, zone:) 19 | TZInfo::Timezone.get(zone).local_to_utc(Time.parse(time_string)) 20 | end 21 | 22 | # Note: by default Whenever exports cron commands with :environment == "production". 23 | # Note: Whenever can only append log data to a log file (>>). If you want 24 | # to overwrite (>) the log file before each run, pass a lambda: 25 | # crawl "google_spider.com", output: -> { "> log/google_spider.com.log 2>&1" } 26 | 27 | # Project job types 28 | job_type :crawl, "cd :path && KIMURAI_ENV=:environment bundle exec kimurai crawl :task :output" 29 | job_type :runner, "cd :path && KIMURAI_ENV=:environment bundle exec kimurai runner --jobs :task :output" 30 | 31 | # Single file job type 32 | job_type :single, "cd :path && KIMURAI_ENV=:environment ruby :task :output" 33 | # Single with bundle exec 34 | job_type :single_bundle, "cd :path && KIMURAI_ENV=:environment bundle exec ruby :task :output" 35 | 36 | ### Schedule ### 37 | # Usage (check examples here https://github.com/javan/whenever#example-schedulerb-file): 38 | # every 1.day do 39 | # Example to schedule a single spider in the project: 40 | # crawl "google_spider.com", output: "log/google_spider.com.log" 41 | 42 | # Example to schedule all spiders in the project using runner. Each spider will write 43 | # its own output to the `log/spider_name.log` file (handled by the runner itself). 44 | # Runner output will be written to the log/runner.log file. 45 | # The number argument is the count of concurrent jobs: 46 | # runner 3, output: "log/runner.log" 47 | 48 | # Example to schedule a single spider (without a project): 49 | # single "single_spider.rb", output: "single_spider.log" 50 | # end 51 | 52 | ### How to set a cron schedule ### 53 | # Run: `$ whenever --update-crontab --load-file config/schedule.rb`. 54 | # If you don't have the whenever command, install the gem: `$ gem install whenever`. 55 | 56 | ### How to cancel a schedule ### 57 | # Run: `$ whenever --clear-crontab --load-file config/schedule.rb`.
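# For clarity, here is one way the job types and helper above could be
# combined into a working schedule; the spider names and times below are
# illustrative examples, not part of the template:
#
# every 1.day, at: local_to_utc("7:00", zone: "Europe/Moscow") do
#   crawl "example_spider.com", output: "log/example_spider.com.log"
# end
#
# every :sunday, at: "2:30" do
#   runner 2, output: "log/runner.log"
# end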
58 | -------------------------------------------------------------------------------- /lib/kimurai/template/db/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vifreefly/kimuraframework/a5d47c26fffe2a3c10cc346b7dcf9ac06b4ccd2f/lib/kimurai/template/db/.keep -------------------------------------------------------------------------------- /lib/kimurai/template/helpers/application_helper.rb: -------------------------------------------------------------------------------- 1 | module ApplicationHelper 2 | # Put here custom methods which will be available for any spider 3 | end 4 | -------------------------------------------------------------------------------- /lib/kimurai/template/lib/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vifreefly/kimuraframework/a5d47c26fffe2a3c10cc346b7dcf9ac06b4ccd2f/lib/kimurai/template/lib/.keep -------------------------------------------------------------------------------- /lib/kimurai/template/log/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vifreefly/kimuraframework/a5d47c26fffe2a3c10cc346b7dcf9ac06b4ccd2f/lib/kimurai/template/log/.keep -------------------------------------------------------------------------------- /lib/kimurai/template/pipelines/saver.rb: -------------------------------------------------------------------------------- 1 | class Saver < Kimurai::Pipeline 2 | def process_item(item, options: {}) 3 | # Here you can save the item to the database, send it to a remote API, or 4 | # simply save the item to a file using the `save_to` helper: 5 | 6 | # To get the name of the current spider: `spider.class.name` 7 | # save_to "db/#{spider.class.name}.json", item, format: :pretty_json 8 | 9 | item 10 | end 11 | end 12 | -------------------------------------------------------------------------------- /lib/kimurai/template/pipelines/validator.rb: -------------------------------------------------------------------------------- 1 | class Validator < Kimurai::Pipeline 2 | def process_item(item, options: {}) 3 | # Here you can validate the item and raise `DropItemError` 4 | # if one of the validations fails. Examples: 5 | 6 | # Check item sku for uniqueness using the built-in `unique?` helper: 7 | # unless unique?(:sku, item[:sku]) 8 | # raise DropItemError, "Item sku is not unique" 9 | # end 10 | 11 | # Drop the item if its title is shorter than 5 characters: 12 | # if item[:title].size < 5 13 | # raise DropItemError, "Item title is short" 14 | # end 15 | 16 | # Drop the item if it doesn't contain any images: 17 | # unless item[:images].present? 18 | # raise DropItemError, "Item images are not present" 19 | # end 20 | 21 | # Pass the item to the next pipeline (if it wasn't dropped) 22 | item 23 | end 24 | end 25 | -------------------------------------------------------------------------------- /lib/kimurai/template/spiders/application_spider.rb: -------------------------------------------------------------------------------- 1 | # ApplicationSpider is the default base spider class. Here you can set 2 | # default settings for all spiders inherited from ApplicationSpider.
3 | # To generate a new spider, run: `$ kimurai generate spider spider_name` 4 | 5 | class ApplicationSpider < Kimurai::Base 6 | include ApplicationHelper 7 | 8 | # Default engine for spiders (available engines: :mechanize, :poltergeist_phantomjs, 9 | # :selenium_firefox, :selenium_chrome) 10 | @engine = :poltergeist_phantomjs 11 | 12 | # Pipelines list, by order. 13 | # To process an item through the pipelines, pass it to the `send_item` method 14 | @pipelines = [:validator, :saver] 15 | 16 | # Default config. Set here options which are default for all spiders inherited 17 | # from ApplicationSpider. A child class's config will be deep merged with this one 18 | @config = { 19 | # Custom headers, format: hash. Example: { "some header" => "some value", "another header" => "another value" } 20 | # Works only for :mechanize and :poltergeist_phantomjs engines (Selenium doesn't allow setting/getting headers) 21 | # headers: {}, 22 | 23 | # Custom User Agent, format: string or lambda. 24 | # Use lambda if you want to rotate user agents before each run: 25 | # user_agent: -> { ARRAY_OF_USER_AGENTS.sample } 26 | # Works for all engines 27 | # user_agent: "Mozilla/5.0 Firefox/61.0", 28 | 29 | # Custom cookies, format: array of hashes. 30 | # Format for a single cookie: { name: "cookie name", value: "cookie value", domain: ".example.com" } 31 | # Works for all engines 32 | # cookies: [], 33 | 34 | # Proxy, format: string or lambda. Format of a proxy string: "ip:port:protocol:user:password" 35 | # `protocol` can be http or socks5. User and password are optional. 36 | # Use lambda if you want to rotate proxies before each run: 37 | # proxy: -> { ARRAY_OF_PROXIES.sample } 38 | # Works for all engines, but keep in mind that Selenium drivers don't support proxies 39 | # with authorization. Also, Mechanize doesn't support socks5 proxy format (only http) 40 | # proxy: "3.4.5.6:3128:http:user:pass", 41 | 42 | # If enabled, the browser will ignore any https errors. It's handy while using a proxy 43 | # with a self-signed SSL cert (for example Crawlera or Mitmproxy). 44 | # It also allows visiting webpages with an expired SSL certificate. 45 | # Works for all engines 46 | ignore_ssl_errors: true, 47 | 48 | # Custom window size, works for all engines 49 | # window_size: [1366, 768], 50 | 51 | # Skip downloading images if true, works for all engines 52 | disable_images: true, 53 | 54 | # Selenium engines only: headless mode, `:native` or `:virtual_display` (default is :native) 55 | # Although native mode has better performance, virtual display mode 56 | # can sometimes be useful. For example, some websites can detect (and block) 57 | # headless chrome, so you can use virtual_display mode instead 58 | # headless_mode: :native, 59 | 60 | # This option tells the browser not to use a proxy for the provided list of domains or IP addresses. 61 | # Format: array of strings. Works only for :selenium_firefox and :selenium_chrome 62 | # proxy_bypass_list: [], 63 | 64 | # Option to provide a custom SSL certificate. Works only for :poltergeist_phantomjs and :mechanize 65 | # ssl_cert_path: "path/to/ssl_cert", 66 | 67 | # Inject some JavaScript code into the browser. 68 | # Format: array of strings, where each string is a path to a JS file. 69 | # Works only for the poltergeist_phantomjs engine (Selenium doesn't support JS code injection) 70 | # extensions: ["lib/code_to_inject.js"], 71 | 72 | # Automatically skip duplicated (already visited) urls when using the `request_to` method. 73 | # Possible values: `true` or `hash` with options.
74 | # In case of `true`, all visited urls will be added to the storage's scope `:requests_urls` 75 | # and if a url is already contained in this scope, the request will be skipped. 76 | # You can configure this setting by providing additional options as a hash: 77 | # `skip_duplicate_requests: { scope: :custom_scope, check_only: true }`, where: 78 | # `scope:` - use a custom scope instead of `:requests_urls` 79 | # `check_only:` - if true, then the scope will only be checked for the url; the url will not 80 | # be added to the scope if the scope doesn't contain it. 81 | # Works for all engines 82 | # skip_duplicate_requests: true, 83 | 84 | # Automatically skip provided errors while requesting a page. 85 | # If a raised error matches one of the errors in the list, then this error will be caught, 86 | # and the request will be skipped. 87 | # It is a good idea to skip errors like NotFound(404), etc. 88 | # Format: array where elements are error classes and/or hashes. You can use the hash format 89 | # for more flexibility: `{ error: "RuntimeError", message: "404 => Net::HTTPNotFound" }`. 90 | # The provided `message:` will be compared with the full error message using `String#include?`. Also, 91 | # you can use a regex instead: `{ error: "RuntimeError", message: /404|403/ }`. 92 | # skip_request_errors: [{ error: RuntimeError, message: "404 => Net::HTTPNotFound" }], 93 | 94 | # Automatically retry provided errors with a few attempts while requesting a page. 95 | # If a raised error matches one of the errors in the list, then this error will be caught 96 | # and the request will be processed again after a delay. There are 3 attempts: 97 | # first: delay 15 sec, second: delay 30 sec, third: delay 45 sec. 98 | # If after 3 attempts there is still an exception, then the exception will be raised. 99 | # It is a good idea to retry errors like `ReadTimeout`, `HTTPBadGateway`, etc. 100 | # Format: same as for the `skip_request_errors` option. 101 | # retry_request_errors: [Net::ReadTimeout], 102 | 103 | # Handle page encoding while parsing the html response using Nokogiri. There are two modes: 104 | # Auto (`:auto`) (try to fetch the correct encoding from <meta http-equiv="Content-Type"> or <meta charset> tags) 105 | # Or set the required encoding manually, for example: `encoding: "GB2312"` 106 | # By default this option is unset. 107 | # encoding: nil, 108 | 109 | # Restart browser if one of the options is true: 110 | restart_if: { 111 | # Restart browser if the provided memory limit (in kilobytes) is exceeded (works for all engines) 112 | # memory_limit: 350_000, 113 | 114 | # Restart browser if the provided requests limit is exceeded (works for all engines) 115 | # requests_limit: 100 116 | }, 117 | 118 | # Perform several actions before each request: 119 | before_request: { 120 | # Change proxy before each request. The `proxy:` option above should be present 121 | # and have lambda format. Works only for poltergeist and mechanize engines 122 | # (Selenium doesn't support proxy rotation). 123 | # change_proxy: true, 124 | 125 | # Change user agent before each request. The `user_agent:` option above should be present 126 | # and have lambda format. Works only for poltergeist and mechanize engines 127 | # (Selenium doesn't support getting/setting headers).
128 | # change_user_agent: true, 129 | 130 | # Clear all cookies before each request, works for all engines 131 | # clear_cookies: true, 132 | 133 | # If you want to clear all cookies + set custom cookies (the `cookies:` option above should be present), 134 | # use this option instead (works for all engines) 135 | # clear_and_set_cookies: true, 136 | 137 | # Global option to set a delay between requests. 138 | # Delay can be `Integer`, `Float` or `Range` (`2..5`). In case of a range, 139 | # the delay number will be chosen randomly for each request: `rand(2..5) # => 3` 140 | # delay: 1..3 141 | } 142 | } 143 | end 144 | -------------------------------------------------------------------------------- /lib/kimurai/template/tmp/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vifreefly/kimuraframework/a5d47c26fffe2a3c10cc346b7dcf9ac06b4ccd2f/lib/kimurai/template/tmp/.keep -------------------------------------------------------------------------------- /lib/kimurai/version.rb: -------------------------------------------------------------------------------- 1 | module Kimurai 2 | VERSION = "1.4.0" 3 | end 4 | -------------------------------------------------------------------------------- /test/kimurai_test.rb: -------------------------------------------------------------------------------- 1 | require "test_helper" 2 | 3 | class KimuraiTest < Minitest::Test 4 | def test_that_it_has_a_version_number 5 | refute_nil ::Kimurai::VERSION 6 | end 7 | 8 | def test_it_does_something_useful 9 | assert false 10 | end 11 | end 12 | -------------------------------------------------------------------------------- /test/test_helper.rb: -------------------------------------------------------------------------------- 1 | $LOAD_PATH.unshift File.expand_path("../../lib", __FILE__) 2 | require "kimurai" 3 | 4 | require "minitest/autorun" 5 | --------------------------------------------------------------------------------
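Pulling the pieces together — the generator's single-file template, the engine choice, and the config options documented in application_spider.rb — a standalone (no-project) spider is one self-contained file. A minimal sketch, assuming a hypothetical target page and selector (`https://example.com/`, `h1`); everything else follows the generated template:

```ruby
require 'kimurai'

class ExampleSpider < Kimurai::Base
  @name = "example_spider"
  @engine = :mechanize
  @start_urls = ["https://example.com/"]
  @config = {
    user_agent: "Mozilla/5.0 Firefox/61.0",
    before_request: { delay: 1..3 }
  }

  def parse(response, url:, data: {})
    # `response` is a parsed Nokogiri document of the fetched page:
    item = { url: url, title: response.css("h1").text.strip }

    # Write the item to a local file using the built-in saver:
    save_to "results.json", item, format: :pretty_json
  end
end

ExampleSpider.crawl!
```

Run it directly with `$ ruby example_spider.rb`; inside a project, the same class (minus the `require` and the trailing `crawl!` call) would inherit from ApplicationSpider and be started with `$ bundle exec kimurai crawl example_spider`.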