├── .gitignore
├── .travis.yml
├── CHANGELOG.md
├── Gemfile
├── LICENSE.txt
├── README.md
├── Rakefile
├── bin
│   ├── console
│   └── setup
├── exe
│   └── kimurai
├── kimurai.gemspec
├── lib
│   ├── kimurai.rb
│   └── kimurai
│       ├── automation
│       │   ├── deploy.yml
│       │   ├── setup.yml
│       │   └── setup
│       │       ├── chromium_chromedriver.yml
│       │       ├── firefox_geckodriver.yml
│       │       ├── phantomjs.yml
│       │       └── ruby_environment.yml
│       ├── base.rb
│       ├── base
│       │   ├── saver.rb
│       │   └── storage.rb
│       ├── base_helper.rb
│       ├── browser_builder.rb
│       ├── browser_builder
│       │   ├── mechanize_builder.rb
│       │   ├── poltergeist_phantomjs_builder.rb
│       │   ├── selenium_chrome_builder.rb
│       │   └── selenium_firefox_builder.rb
│       ├── capybara_configuration.rb
│       ├── capybara_ext
│       │   ├── driver
│       │   │   └── base.rb
│       │   ├── mechanize
│       │   │   └── driver.rb
│       │   ├── poltergeist
│       │   │   └── driver.rb
│       │   ├── selenium
│       │   │   └── driver.rb
│       │   ├── session.rb
│       │   └── session
│       │       └── config.rb
│       ├── cli.rb
│       ├── cli
│       │   ├── ansible_command_builder.rb
│       │   └── generator.rb
│       ├── core_ext
│       │   ├── array.rb
│       │   ├── hash.rb
│       │   ├── numeric.rb
│       │   └── string.rb
│       ├── pipeline.rb
│       ├── runner.rb
│       ├── template
│       │   ├── .gitignore
│       │   ├── Gemfile
│       │   ├── README.md
│       │   ├── config
│       │   │   ├── application.rb
│       │   │   ├── automation.yml
│       │   │   ├── boot.rb
│       │   │   ├── initializers
│       │   │   │   └── .keep
│       │   │   └── schedule.rb
│       │   ├── db
│       │   │   └── .keep
│       │   ├── helpers
│       │   │   └── application_helper.rb
│       │   ├── lib
│       │   │   └── .keep
│       │   ├── log
│       │   │   └── .keep
│       │   ├── pipelines
│       │   │   ├── saver.rb
│       │   │   └── validator.rb
│       │   ├── spiders
│       │   │   └── application_spider.rb
│       │   └── tmp
│       │       └── .keep
│       └── version.rb
└── test
    ├── kimurai_test.rb
    └── test_helper.rb
/.gitignore:
--------------------------------------------------------------------------------
1 | /.bundle/
2 | /.yardoc
3 | /_yardoc/
4 | /coverage/
5 | /doc/
6 | /pkg/
7 | /spec/reports/
8 | /tmp/
9 | Gemfile.lock
10 |
11 | *.retry
12 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: false
2 | language: ruby
3 | rvm:
4 | - 2.5.1
5 | before_install: gem install bundler -v 1.16.2
6 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # CHANGELOG
2 | ## 1.4.0
3 | ### New
4 | * Add `encoding` config option (see [All available config options](https://github.com/vifreefly/kimuraframework#all-available-config-options))
5 | * Validate url before processing a request (Base#request_to)
6 |
7 | ### Fixes
8 | * Fix console command bug (see [issue 21](https://github.com/vifreefly/kimuraframework/issues/21))
9 |
10 | ## 1.3.2
11 | ### Fixes
12 | * In the project template, set the required Ruby version to >= 2.5 (it was previously hard-coded to 2.5.1)
13 | * Remove the .ruby-version file (it was hard-coded to 2.5.1) from the project template
14 |
15 | ## 1.3.1
16 | ### Fixes
17 | * Fixed bug in Base#save_to
18 |
19 | ## 1.3.0
20 | ### Breaking changes 1.3.0
21 | * Remove persistence database feature (because it's slow and makes things complicated)
22 |
23 | ### New
24 | * Add `--include` and `--exclude` options to CLI#runner
25 | * Add Base `#create_browser` method to easily create additional browser instances
26 | * Add Capybara::Session `#scroll_to_bottom`
27 | * Add skip_on_failure feature to `retry_request_errors` config option
28 | * Add info about `add_event` method to the README
29 |
30 | ### Fixes and improvements
31 | * Improve Runner
32 | * Fix time helper in schedule.rb
33 | * Add proxy validation to browser builders
34 | * Allow passing different arguments to the `Base.parse` method
35 |
36 | ## 1.2.0
37 | ### New
38 | * Add the ability to add an array of values to the storage (`Base::Storage#add`)
39 | * Add `exception_on_fail` option to `Base.crawl!`
40 | * Add the ability to pass a request hash to `start_urls` (you can use an array of hashes as well, like: `@start_urls = [{ url: "https://example.com/cat?id=1", data: { category: "First Category" } }]`)
41 | * Implement `skip_request_errors` config feature. Added [Handle request errors](https://github.com/vifreefly/kimuraframework#handle-request-errors) chapter to the README.
42 | * Add option to choose response type for `Session#current_response` (`:html` default, or `:json`)
43 | * Add option to provide custom chrome and chromedriver paths
44 |
45 | ### Improvements
46 | * Refactor `Runner`
47 |
48 | ### Fixes
49 | * Fix `Base#Saver` (automatically create the file if it doesn't exist, in the case of a persistence database)
50 | * Do not deep merge config's `headers:` option
51 |
52 | ## 1.1.0
53 | ### Breaking changes 1.1.0
54 | The `browser` config option is deprecated. All sub-options that used to be nested inside `browser` should now be placed directly into the `@config` hash, without the `browser` parent key. Example:
55 |
56 | ```ruby
57 | # Was:
58 | @config = {
59 | browser: {
60 | retry_request_errors: [Net::ReadTimeout],
61 | restart_if: {
62 | memory_limit: 350_000,
63 | requests_limit: 100
64 | },
65 | before_request: {
66 | change_proxy: true,
67 | change_user_agent: true,
68 | clear_cookies: true,
69 | clear_and_set_cookies: true,
70 | delay: 1..3
71 | }
72 | }
73 | }
74 |
75 | # Now:
76 | @config = {
77 | retry_request_errors: [Net::ReadTimeout],
78 | restart_if: {
79 | memory_limit: 350_000,
80 | requests_limit: 100
81 | },
82 | before_request: {
83 | change_proxy: true,
84 | change_user_agent: true,
85 | clear_cookies: true,
86 | clear_and_set_cookies: true,
87 | delay: 1..3
88 | }
89 | }
90 | ```
91 |
92 | ### New
93 | * Add `storage` object with additional methods and persistence database feature
94 | * Add events feature to `run_info`
95 | * Add `skip_duplicate_requests` config option to automatically skip already visited urls when using request_to
96 | * Add `extensions` config option to allow injecting JS code into the browser (supported only by the poltergeist_phantomjs engine)
97 | * Add Capybara::Session#within_new_window_by method
98 |
99 | ### Improvements
100 | * Add the last backtrace line to the pipeline output when an item was dropped
101 | * Do not destroy the driver if it doesn't exist (for the Base.parse! method)
102 | * Handle a possible Net::ReadTimeout error while trying to #quit the driver
103 |
104 | ### Fixes
105 | * Fix Mechanize::Driver#proxy (there was a bug when using a proxy without authorization with the mechanize engine)
106 | * Fix requests retries logic
107 |
108 |
109 | ## 1.0.1
110 | * Add missing `logger` method to pipeline
111 | * Fix `set_proxy` in Mechanize and Poltergeist builders
112 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source "https://rubygems.org"
2 |
3 | git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4 |
5 | # Specify your gem's dependencies in kimurai.gemspec
6 | gemspec
7 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2018 Victor Afanasev
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Kimurai
2 |
3 | > UPD: I will soon have time to work on issues for the current 1.4 version, and I also plan to release a new 2.0 version with the https://github.com/twalpole/apparition engine.
4 |
5 | Kimurai is a modern web scraping framework written in Ruby which **works out of the box with Headless Chromium/Firefox, PhantomJS**, or simple HTTP requests, and **allows you to scrape and interact with JavaScript-rendered websites.**
6 |
7 | Kimurai is based on the well-known [Capybara](https://github.com/teamcapybara/capybara) and [Nokogiri](https://github.com/sparklemotion/nokogiri) gems, so you don't have to learn anything new. Let's see it in action:
8 |
9 | ```ruby
10 | # github_spider.rb
11 | require 'kimurai'
12 |
13 | class GithubSpider < Kimurai::Base
14 | @name = "github_spider"
15 | @engine = :selenium_chrome
16 | @start_urls = ["https://github.com/search?q=Ruby%20Web%20Scraping"]
17 | @config = {
18 | user_agent: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36",
19 | before_request: { delay: 4..7 }
20 | }
21 |
22 | def parse(response, url:, data: {})
23 | response.xpath("//ul[@class='repo-list']/div//h3/a").each do |a|
24 | request_to :parse_repo_page, url: absolute_url(a[:href], base: url)
25 | end
26 |
27 | if next_page = response.at_xpath("//a[@class='next_page']")
28 | request_to :parse, url: absolute_url(next_page[:href], base: url)
29 | end
30 | end
31 |
32 | def parse_repo_page(response, url:, data: {})
33 | item = {}
34 |
35 | item[:owner] = response.xpath("//h1//a[@rel='author']").text
36 | item[:repo_name] = response.xpath("//h1/strong[@itemprop='name']/a").text
37 | item[:repo_url] = url
38 | item[:description] = response.xpath("//span[@itemprop='about']").text.squish
39 | item[:tags] = response.xpath("//div[@id='topics-list-container']/div/a").map { |a| a.text.squish }
40 | item[:watch_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Watch')]/a[2]").text.squish
41 | item[:star_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Star')]/a[2]").text.squish
42 | item[:fork_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Fork')]/a[2]").text.squish
43 | item[:last_commit] = response.xpath("//span[@itemprop='dateModified']/*").text
44 |
45 | save_to "results.json", item, format: :pretty_json
46 | end
47 | end
48 |
49 | GithubSpider.crawl!
50 | ```
51 |
52 |
53 | Run: $ ruby github_spider.rb
54 |
55 | ```
56 | I, [2018-08-22 13:08:03 +0400#15477] [M: 47377500980720] INFO -- github_spider: Spider: started: github_spider
57 | D, [2018-08-22 13:08:03 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: BrowserBuilder (selenium_chrome): created browser instance
58 | D, [2018-08-22 13:08:03 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: BrowserBuilder (selenium_chrome): enabled `browser before_request delay`
59 | D, [2018-08-22 13:08:03 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: Browser: sleep 7 seconds before request...
60 | D, [2018-08-22 13:08:10 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: BrowserBuilder (selenium_chrome): enabled custom user-agent
61 | D, [2018-08-22 13:08:10 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: BrowserBuilder (selenium_chrome): enabled native headless_mode
62 | I, [2018-08-22 13:08:10 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: started get request to: https://github.com/search?q=Ruby%20Web%20Scraping
63 | I, [2018-08-22 13:08:26 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: finished get request to: https://github.com/search?q=Ruby%20Web%20Scraping
64 | I, [2018-08-22 13:08:26 +0400#15477] [M: 47377500980720] INFO -- github_spider: Info: visits: requests: 1, responses: 1
65 | D, [2018-08-22 13:08:27 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: Browser: driver.current_memory: 107968
66 | D, [2018-08-22 13:08:27 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: Browser: sleep 5 seconds before request...
67 | I, [2018-08-22 13:08:32 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: started get request to: https://github.com/lorien/awesome-web-scraping
68 | I, [2018-08-22 13:08:33 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: finished get request to: https://github.com/lorien/awesome-web-scraping
69 | I, [2018-08-22 13:08:33 +0400#15477] [M: 47377500980720] INFO -- github_spider: Info: visits: requests: 2, responses: 2
70 | D, [2018-08-22 13:08:33 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: Browser: driver.current_memory: 212542
71 | D, [2018-08-22 13:08:33 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: Browser: sleep 4 seconds before request...
72 | I, [2018-08-22 13:08:37 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: started get request to: https://github.com/jaimeiniesta/metainspector
73 |
74 | ...
75 |
76 | I, [2018-08-22 13:23:07 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: started get request to: https://github.com/preston/idclight
77 | I, [2018-08-22 13:23:08 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: finished get request to: https://github.com/preston/idclight
78 | I, [2018-08-22 13:23:08 +0400#15477] [M: 47377500980720] INFO -- github_spider: Info: visits: requests: 140, responses: 140
79 | D, [2018-08-22 13:23:08 +0400#15477] [M: 47377500980720] DEBUG -- github_spider: Browser: driver.current_memory: 204198
80 | I, [2018-08-22 13:23:08 +0400#15477] [M: 47377500980720] INFO -- github_spider: Browser: driver selenium_chrome has been destroyed
81 |
82 | I, [2018-08-22 13:23:08 +0400#15477] [M: 47377500980720] INFO -- github_spider: Spider: stopped: {:spider_name=>"github_spider", :status=>:completed, :environment=>"development", :start_time=>2018-08-22 13:08:03 +0400, :stop_time=>2018-08-22 13:23:08 +0400, :running_time=>"15m, 5s", :visits=>{:requests=>140, :responses=>140}, :error=>nil}
83 | ```
84 |
85 |
86 |
87 | results.json
88 |
89 | ```json
90 | [
91 | {
92 | "owner": "lorien",
93 | "repo_name": "awesome-web-scraping",
94 | "repo_url": "https://github.com/lorien/awesome-web-scraping",
95 | "description": "List of libraries, tools and APIs for web scraping and data processing.",
96 | "tags": [
97 | "awesome",
98 | "awesome-list",
99 | "web-scraping",
100 | "data-processing",
101 | "python",
102 | "javascript",
103 | "php",
104 | "ruby"
105 | ],
106 | "watch_count": "159",
107 | "star_count": "2,423",
108 | "fork_count": "358",
109 | "last_commit": "4 days ago",
110 | "position": 1
111 | },
112 |
113 | ...
114 |
115 | {
116 | "owner": "preston",
117 | "repo_name": "idclight",
118 | "repo_url": "https://github.com/preston/idclight",
119 | "description": "A Ruby gem for accessing the freely available IDClight (IDConverter Light) web service, which convert between different types of gene IDs such as Hugo and Entrez. Queries are screen scraped from http://idclight.bioinfo.cnio.es.",
120 | "tags": [
121 |
122 | ],
123 | "watch_count": "6",
124 | "star_count": "1",
125 | "fork_count": "0",
126 | "last_commit": "on Apr 12, 2012",
127 | "position": 127
128 | }
129 | ]
130 | ```
131 |
132 |
133 | Okay, that was easy. How about JavaScript-rendered websites with dynamic HTML? Let's scrape a page with infinite scroll:
134 |
135 | ```ruby
136 | # infinite_scroll_spider.rb
137 | require 'kimurai'
138 |
139 | class InfiniteScrollSpider < Kimurai::Base
140 | @name = "infinite_scroll_spider"
141 | @engine = :selenium_chrome
142 | @start_urls = ["https://infinite-scroll.com/demo/full-page/"]
143 |
144 | def parse(response, url:, data: {})
145 | posts_headers_path = "//article/h2"
146 | count = response.xpath(posts_headers_path).count
147 |
148 | loop do
149 | browser.execute_script("window.scrollBy(0,10000)") ; sleep 2
150 | response = browser.current_response
151 |
152 | new_count = response.xpath(posts_headers_path).count
153 | if count == new_count
154 | logger.info "> Pagination is done" and break
155 | else
156 | count = new_count
157 | logger.info "> Continue scrolling, current count is #{count}..."
158 | end
159 | end
160 |
161 | posts_headers = response.xpath(posts_headers_path).map(&:text)
162 | logger.info "> All posts from page: #{posts_headers.join('; ')}"
163 | end
164 | end
165 |
166 | InfiniteScrollSpider.crawl!
167 | ```
168 |
169 |
170 | Run: $ ruby infinite_scroll_spider.rb
171 |
172 | ```
173 | I, [2018-08-22 13:32:57 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: Spider: started: infinite_scroll_spider
174 | D, [2018-08-22 13:32:57 +0400#23356] [M: 47375890851320] DEBUG -- infinite_scroll_spider: BrowserBuilder (selenium_chrome): created browser instance
175 | D, [2018-08-22 13:32:57 +0400#23356] [M: 47375890851320] DEBUG -- infinite_scroll_spider: BrowserBuilder (selenium_chrome): enabled native headless_mode
176 | I, [2018-08-22 13:32:57 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: Browser: started get request to: https://infinite-scroll.com/demo/full-page/
177 | I, [2018-08-22 13:33:03 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: Browser: finished get request to: https://infinite-scroll.com/demo/full-page/
178 | I, [2018-08-22 13:33:03 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: Info: visits: requests: 1, responses: 1
179 | D, [2018-08-22 13:33:03 +0400#23356] [M: 47375890851320] DEBUG -- infinite_scroll_spider: Browser: driver.current_memory: 95463
180 | I, [2018-08-22 13:33:05 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: > Continue scrolling, current count is 5...
181 | I, [2018-08-22 13:33:18 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: > Continue scrolling, current count is 9...
182 | I, [2018-08-22 13:33:20 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: > Continue scrolling, current count is 11...
183 | I, [2018-08-22 13:33:26 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: > Continue scrolling, current count is 13...
184 | I, [2018-08-22 13:33:28 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: > Continue scrolling, current count is 15...
185 | I, [2018-08-22 13:33:30 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: > Pagination is done
186 | I, [2018-08-22 13:33:30 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: > All posts from page: 1a - Infinite Scroll full page demo; 1b - RGB Schemes logo in Computer Arts; 2a - RGB Schemes logo; 2b - Masonry gets horizontalOrder; 2c - Every vector 2016; 3a - Logo Pizza delivered; 3b - Some CodePens; 3c - 365daysofmusic.com; 3d - Holograms; 4a - Huebee: 1-click color picker; 4b - Word is Flickity is good; Flickity v2 released: groupCells, adaptiveHeight, parallax; New tech gets chatter; Isotope v3 released: stagger in, IE8 out; Packery v2 released
187 | I, [2018-08-22 13:33:30 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: Browser: driver selenium_chrome has been destroyed
188 | I, [2018-08-22 13:33:30 +0400#23356] [M: 47375890851320] INFO -- infinite_scroll_spider: Spider: stopped: {:spider_name=>"infinite_scroll_spider", :status=>:completed, :environment=>"development", :start_time=>2018-08-22 13:32:57 +0400, :stop_time=>2018-08-22 13:33:30 +0400, :running_time=>"33s", :visits=>{:requests=>1, :responses=>1}, :error=>nil}
189 |
190 | ```
191 |
192 |
193 |
194 | ## Features
195 | * Scrape JavaScript-rendered websites out of the box
196 | * Supported engines: [Headless Chrome](https://developers.google.com/web/updates/2017/04/headless-chrome), [Headless Firefox](https://developer.mozilla.org/en-US/docs/Mozilla/Firefox/Headless_mode), [PhantomJS](https://github.com/ariya/phantomjs) or simple HTTP requests ([mechanize](https://github.com/sparklemotion/mechanize) gem)
197 | * Write spider code once, and use it with any supported engine later
198 | * All the power of [Capybara](https://github.com/teamcapybara/capybara): use methods like `click_on`, `fill_in`, `select`, `choose`, `set`, `go_back`, etc. to interact with web pages
199 | * Rich [configuration](#spider-config): **set default headers, cookies, delay between requests, enable proxy/user-agents rotation**
200 | * Built-in helpers to make scraping easy, like [save_to](#save_to-helper) (save items to JSON, JSON lines, or CSV formats) or [unique?](#skip-duplicates-unique-helper) to skip duplicates
201 | * Automatically [handle request errors](#handle-request-errors)
202 | * Automatically restart browsers when reaching a memory limit [**(memory control)**](#spider-config) or a requests limit
203 | * Easily [schedule spiders](#schedule-spiders-using-cron) within cron using [Whenever](https://github.com/javan/whenever) (no need to know cron syntax)
204 | * [Parallel scraping](#parallel-crawling-using-in_parallel) using simple method `in_parallel`
205 | * **Two modes:** use single file for a simple spider, or [generate](#project-mode) Scrapy-like **project**
206 | * Convenient development mode with [console](#interactive-console), colorized logger and debugger ([Pry](https://github.com/pry/pry), [Byebug](https://github.com/deivid-rodriguez/byebug))
207 | * Automated [server environment setup](#setup) (for Ubuntu 18.04) and [deploy](#deploy) using the commands `kimurai setup` and `kimurai deploy` ([Ansible](https://github.com/ansible/ansible) under the hood)
208 | * Command-line [runner](#runner) to run all project spiders one by one or in parallel
209 |
210 | ## Table of Contents
211 | * [Kimurai](#kimurai)
212 | * [Features](#features)
213 | * [Table of Contents](#table-of-contents)
214 | * [Installation](#installation)
215 | * [Getting to Know](#getting-to-know)
216 | * [Interactive console](#interactive-console)
217 | * [Available engines](#available-engines)
218 | * [Minimum required spider structure](#minimum-required-spider-structure)
219 | * [Method arguments response, url and data](#method-arguments-response-url-and-data)
220 | * [browser object](#browser-object)
221 | * [request_to method](#request_to-method)
222 | * [save_to helper](#save_to-helper)
223 | * [Skip duplicates](#skip-duplicates)
224 | * [Automatically skip all duplicated requests urls](#automatically-skip-all-duplicated-requests-urls)
225 | * [Storage object](#storage-object)
226 | * [Handle request errors](#handle-request-errors)
227 | * [skip_request_errors](#skip_request_errors)
228 | * [retry_request_errors](#retry_request_errors)
229 | * [Logging custom events](#logging-custom-events)
230 | * [open_spider and close_spider callbacks](#open_spider-and-close_spider-callbacks)
231 | * [KIMURAI_ENV](#kimurai_env)
232 | * [Parallel crawling using in_parallel](#parallel-crawling-using-in_parallel)
233 | * [Active Support included](#active-support-included)
234 | * [Schedule spiders using Cron](#schedule-spiders-using-cron)
235 | * [Configuration options](#configuration-options)
236 | * [Using Kimurai inside existing Ruby application](#using-kimurai-inside-existing-ruby-application)
237 | * [crawl! method](#crawl-method)
238 | * [parse! method](#parsemethod_name-url-method)
239 | * [Kimurai.list and Kimurai.find_by_name](#kimurailist-and-kimuraifind_by_name)
240 | * [Automated server setup and deployment](#automated-sever-setup-and-deployment)
241 | * [Setup](#setup)
242 | * [Deploy](#deploy)
243 | * [Spider @config](#spider-config)
244 | * [All available @config options](#all-available-config-options)
245 | * [@config settings inheritance](#config-settings-inheritance)
246 | * [Project mode](#project-mode)
247 | * [Generate new spider](#generate-new-spider)
248 | * [Crawl](#crawl)
249 | * [List](#list)
250 | * [Parse](#parse)
251 | * [Pipelines, send_item method](#pipelines-send_item-method)
252 | * [Runner](#runner)
253 | * [Runner callbacks](#runner-callbacks)
254 | * [Chat Support and Feedback](#chat-support-and-feedback)
255 | * [License](#license)
256 |
257 |
258 | ## Installation
259 | Kimurai requires Ruby version `>= 2.5.0`. Supported platforms: `Linux` and `Mac OS X`.
260 |
261 | 1) If your system doesn't have appropriate Ruby version, install it:
262 |
263 |
264 | Ubuntu 18.04
265 |
266 | ```bash
267 | # Install required packages for ruby-build
268 | sudo apt update
269 | sudo apt install git-core curl zlib1g-dev build-essential libssl-dev libreadline-dev libreadline6-dev libyaml-dev libxml2-dev libxslt1-dev libcurl4-openssl-dev libffi-dev
270 |
271 | # Install rbenv and ruby-build
272 | cd && git clone https://github.com/rbenv/rbenv.git ~/.rbenv
273 | echo 'export PATH="$HOME/.rbenv/bin:$PATH"' >> ~/.bashrc
274 | echo 'eval "$(rbenv init -)"' >> ~/.bashrc
275 | exec $SHELL
276 |
277 | git clone https://github.com/rbenv/ruby-build.git ~/.rbenv/plugins/ruby-build
278 | echo 'export PATH="$HOME/.rbenv/plugins/ruby-build/bin:$PATH"' >> ~/.bashrc
279 | exec $SHELL
280 |
281 | # Install latest Ruby
282 | rbenv install 2.5.3
283 | rbenv global 2.5.3
284 |
285 | gem install bundler
286 | ```
287 |
288 |
289 |
290 | Mac OS X
291 |
292 | ```bash
293 | # Install homebrew if you don't have it https://brew.sh/
294 | # Install rbenv and ruby-build:
295 | brew install rbenv ruby-build
296 |
297 | # Add rbenv to bash so that it loads every time you open a terminal
298 | echo 'if which rbenv > /dev/null; then eval "$(rbenv init -)"; fi' >> ~/.bash_profile
299 | source ~/.bash_profile
300 |
301 | # Install latest Ruby
302 | rbenv install 2.5.3
303 | rbenv global 2.5.3
304 |
305 | gem install bundler
306 | ```
307 |
308 |
309 | 2) Install Kimurai gem: `$ gem install kimurai`
310 |
311 | 3) Install browsers with webdrivers:
312 |
313 |
314 | Ubuntu 18.04
315 |
316 | Note: for Ubuntu 16.04-18.04, automatic installation is available using the `setup` command:
317 | ```bash
318 | $ kimurai setup localhost --local --ask-sudo
319 | ```
320 | It works using [Ansible](https://github.com/ansible/ansible), so you need to install it first: `$ sudo apt install ansible`. You can check the playbooks it uses [here](lib/kimurai/automation).
321 |
322 | If you chose automatic installation, you can skip the following and go to the "Getting to Know" part. In case you want to install everything manually:
323 |
324 | ```bash
325 | # Install basic tools
326 | sudo apt install -q -y unzip wget tar openssl
327 |
328 | # Install xvfb (for virtual_display headless mode, in additional to native)
329 | sudo apt install -q -y xvfb
330 |
331 | # Install chromium-browser and firefox
332 | sudo apt install -q -y chromium-browser firefox
333 |
334 | # Install chromedriver (version 2.44)
335 | # All versions located here https://sites.google.com/a/chromium.org/chromedriver/downloads
336 | cd /tmp && wget https://chromedriver.storage.googleapis.com/2.44/chromedriver_linux64.zip
337 | sudo unzip chromedriver_linux64.zip -d /usr/local/bin
338 | rm -f chromedriver_linux64.zip
339 |
340 | # Install geckodriver (version 0.23.0)
341 | # All versions located here https://github.com/mozilla/geckodriver/releases/
342 | cd /tmp && wget https://github.com/mozilla/geckodriver/releases/download/v0.23.0/geckodriver-v0.23.0-linux64.tar.gz
343 | sudo tar -xvzf geckodriver-v0.23.0-linux64.tar.gz -C /usr/local/bin
344 | rm -f geckodriver-v0.23.0-linux64.tar.gz
345 |
346 | # Install PhantomJS (2.1.1)
347 | # All versions located here http://phantomjs.org/download.html
348 | sudo apt install -q -y chrpath libxft-dev libfreetype6 libfreetype6-dev libfontconfig1 libfontconfig1-dev
349 | cd /tmp && wget https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2
350 | tar -xvjf phantomjs-2.1.1-linux-x86_64.tar.bz2
351 | sudo mv phantomjs-2.1.1-linux-x86_64 /usr/local/lib
352 | sudo ln -s /usr/local/lib/phantomjs-2.1.1-linux-x86_64/bin/phantomjs /usr/local/bin
353 | rm -f phantomjs-2.1.1-linux-x86_64.tar.bz2
354 | ```
355 |
356 |
357 |
358 |
359 | Mac OS X
360 |
361 | ```bash
362 | # Install chrome and firefox
363 | brew cask install google-chrome firefox
364 |
365 | # Install chromedriver (latest)
366 | brew cask install chromedriver
367 |
368 | # Install geckodriver (latest)
369 | brew install geckodriver
370 |
371 | # Install PhantomJS (latest)
372 | brew install phantomjs
373 | ```
374 |
375 |
376 | Also, if you want to save scraped items to a database (using [ActiveRecord](https://github.com/rails/rails/tree/master/activerecord), [Sequel](https://github.com/jeremyevans/sequel) or [MongoDB Ruby Driver](https://github.com/mongodb/mongo-ruby-driver)/[Mongoid](https://github.com/mongodb/mongoid)), you need to install the database clients/servers:
377 |
378 |
379 | Ubuntu 18.04
380 |
381 | SQLite: `$ sudo apt -q -y install libsqlite3-dev sqlite3`.
382 |
383 | If you want to connect to a remote database, you don't need a database server on the local machine (only the client):
384 | ```bash
385 | # Install MySQL client
386 | sudo apt -q -y install mysql-client libmysqlclient-dev
387 |
388 | # Install Postgres client
389 | sudo apt install -q -y postgresql-client libpq-dev
390 |
391 | # Install MongoDB client
392 | sudo apt install -q -y mongodb-clients
393 | ```
394 |
395 | But if you want to save items to a local database, a database server is required as well:
396 | ```bash
397 | # Install MySQL client and server
398 | sudo apt -q -y install mysql-server mysql-client libmysqlclient-dev
399 |
400 | # Install Postgres client and server
401 | sudo apt install -q -y postgresql postgresql-contrib libpq-dev
402 |
403 | # Install MongoDB client and server
404 | # version 4.0 (check here https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/)
405 | sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 9DA31620334BD75D9DCB49F368818C72E52529D4
406 | # for 16.04:
407 | # echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu xenial/mongodb-org/4.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-4.0.list
408 | # for 18.04:
409 | echo "deb [ arch=amd64 ] https://repo.mongodb.org/apt/ubuntu bionic/mongodb-org/4.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-4.0.list
410 | sudo apt update
411 | sudo apt install -q -y mongodb-org
412 | sudo service mongod start
413 | ```
414 |
415 |
416 |
417 | Mac OS X
418 |
419 | SQLite: `$ brew install sqlite3`
420 |
421 | ```bash
422 | # Install MySQL client and server
423 | brew install mysql
424 | # Start server if you need it: brew services start mysql
425 |
426 | # Install Postgres client and server
427 | brew install postgresql
428 | # Start server if you need it: brew services start postgresql
429 |
430 | # Install MongoDB client and server
431 | brew install mongodb
432 | # Start server if you need it: brew services start mongodb
433 | ```
434 |
435 |
436 |
437 | ## Getting to Know
438 | ### Interactive console
439 | Before you get to know all of Kimurai's features, there is the `$ kimurai console` command: an interactive console where you can try out and debug your scraping code very quickly, without having to run any spider (yes, it's like [Scrapy shell](https://doc.scrapy.org/en/latest/topics/shell.html#topics-shell)).
440 |
441 | ```bash
442 | $ kimurai console --engine selenium_chrome --url https://github.com/vifreefly/kimuraframework
443 | ```
444 |
445 |
446 | Show output
447 |
448 | ```
449 | $ kimurai console --engine selenium_chrome --url https://github.com/vifreefly/kimuraframework
450 |
451 | D, [2018-08-22 13:42:32 +0400#26079] [M: 47461994677760] DEBUG -- : BrowserBuilder (selenium_chrome): created browser instance
452 | D, [2018-08-22 13:42:32 +0400#26079] [M: 47461994677760] DEBUG -- : BrowserBuilder (selenium_chrome): enabled native headless_mode
453 | I, [2018-08-22 13:42:32 +0400#26079] [M: 47461994677760] INFO -- : Browser: started get request to: https://github.com/vifreefly/kimuraframework
454 | I, [2018-08-22 13:42:35 +0400#26079] [M: 47461994677760] INFO -- : Browser: finished get request to: https://github.com/vifreefly/kimuraframework
455 | D, [2018-08-22 13:42:35 +0400#26079] [M: 47461994677760] DEBUG -- : Browser: driver.current_memory: 201701
456 |
457 | From: /home/victor/code/kimurai/lib/kimurai/base.rb @ line 189 Kimurai::Base#console:
458 |
459 | 188: def console(response = nil, url: nil, data: {})
460 | => 189: binding.pry
461 | 190: end
462 |
463 | [1] pry(#)> response.xpath("//title").text
464 | => "GitHub - vifreefly/kimuraframework: Modern web scraping framework written in Ruby which works out of box with Headless Chromium/Firefox, PhantomJS, or simple HTTP requests and allows to scrape and interact with JavaScript rendered websites"
465 |
466 | [2] pry(#)> ls
467 | Kimurai::Base#methods: browser console logger request_to save_to unique?
468 | instance variables: @browser @config @engine @logger @pipelines
469 | locals: _ __ _dir_ _ex_ _file_ _in_ _out_ _pry_ data response url
470 |
471 | [3] pry(#)> ls response
472 | Nokogiri::XML::PP::Node#methods: inspect pretty_print
473 | Nokogiri::XML::Searchable#methods: % / at at_css at_xpath css search xpath
474 | Enumerable#methods:
475 | all? collect drop each_with_index find_all grep_v lazy member? none? reject slice_when take_while without
476 | any? collect_concat drop_while each_with_object find_index group_by many? min one? reverse_each sort to_a zip
477 | as_json count each_cons entries first include? map min_by partition select sort_by to_h
478 | chunk cycle each_entry exclude? flat_map index_by max minmax pluck slice_after sum to_set
479 | chunk_while detect each_slice find grep inject max_by minmax_by reduce slice_before take uniq
480 | Nokogiri::XML::Node#methods:
481 | <=> append_class classes document? has_attribute? matches? node_name= processing_instruction? to_str
482 | == attr comment? each html? name= node_type read_only? to_xhtml
483 | > attribute content elem? inner_html namespace= parent= remove traverse
484 | [] attribute_nodes content= element? inner_html= namespace_scopes parse remove_attribute unlink
485 | []= attribute_with_ns create_external_subset element_children inner_text namespaced_key? path remove_class values
486 | accept before create_internal_subset elements internal_subset native_content= pointer_id replace write_html_to
487 | add_class blank? css_path encode_special_chars key? next prepend_child set_attribute write_to
488 | add_next_sibling cdata? decorate! external_subset keys next= previous text write_xhtml_to
489 | add_previous_sibling child delete first_element_child lang next_element previous= text? write_xml_to
490 | after children description fragment? lang= next_sibling previous_element to_html xml?
491 | ancestors children= do_xinclude get_attribute last_element_child node_name previous_sibling to_s
492 | Nokogiri::XML::Document#methods:
493 | << canonicalize collect_namespaces create_comment create_entity decorate document encoding errors name remove_namespaces! root= to_java url version
494 | add_child clone create_cdata create_element create_text_node decorators dup encoding= errors= namespaces root slop! to_xml validate
495 | Nokogiri::HTML::Document#methods: fragment meta_encoding meta_encoding= serialize title title= type
496 | instance variables: @decorators @errors @node_cache
497 |
498 | [4] pry(#)> exit
499 | I, [2018-08-22 13:43:47 +0400#26079] [M: 47461994677760] INFO -- : Browser: driver selenium_chrome has been destroyed
500 | $
501 | ```
502 |
503 |
504 | CLI options:
505 | * `--engine` (optional) [engine](#available-engines) to use. Default is `mechanize`
506 | * `--url` (optional) url to process. If the url is omitted, the `response` and `url` objects inside the console will be `nil` (use the [browser](#browser-object) object to navigate to any webpage).
507 |
508 | ### Available engines
509 | Kimurai supports the following engines, and in most cases you can switch between them without rewriting any code:
510 |
511 | * `:mechanize` - a [pure Ruby fake http browser](https://github.com/sparklemotion/mechanize). Mechanize can't render JavaScript and has no notion of the DOM; it can only parse the original HTML code of a page. Because of this, mechanize is much faster, takes much less memory, and is in general much more stable than any real browser. Use mechanize when you can, i.e. when the website doesn't use JavaScript to render any meaningful parts of its structure. Still, because mechanize tries to mimic a real browser, it supports almost all of Capybara's [methods to interact with a web page](http://cheatrags.com/capybara) (filling in forms, clicking buttons, checkboxes, etc.).
512 | * `:poltergeist_phantomjs` - the [PhantomJS headless browser](https://github.com/ariya/phantomjs); it can render JavaScript. In general, PhantomJS is still faster than Headless Chrome (and Headless Firefox). PhantomJS has memory leaks, but Kimurai has a [memory control feature](#spider-config), so you shouldn't consider this a problem. Also, some websites can recognize PhantomJS and block access. Like mechanize (and unlike the selenium engines), `:poltergeist_phantomjs` can freely rotate proxies and change headers _on the fly_ (see the [config section](#all-available-config-options)).
513 | * `:selenium_chrome` - Chrome in headless mode driven by selenium. A modern headless browser solution with proper JavaScript rendering.
514 | * `:selenium_firefox` - Firefox in headless mode driven by selenium. It usually takes more memory than the other drivers, but sometimes it can be useful.
515 |
516 | **Tip:** add the `HEADLESS=false` ENV variable before the command (`$ HEADLESS=false ruby spider.rb`) to run the browser in normal (not headless) mode and see its window (only for selenium-like engines). It works for the [console](#interactive-console) command as well.
517 |
518 |
519 | ### Minimum required spider structure
520 | > You can manually create a spider file, or use generator instead: `$ kimurai generate spider simple_spider`
521 |
522 | ```ruby
523 | require 'kimurai'
524 |
525 | class SimpleSpider < Kimurai::Base
526 | @name = "simple_spider"
527 | @engine = :selenium_chrome
528 | @start_urls = ["https://example.com/"]
529 |
530 | def parse(response, url:, data: {})
531 | end
532 | end
533 |
534 | SimpleSpider.crawl!
535 | ```
536 |
537 | Where:
538 | * `@name` name of the spider. You can omit the name if you use a single-file spider
539 | * `@engine` engine for the spider
540 | * `@start_urls` array of start urls to process one by one inside the `parse` method
541 | * The method `parse` is the entry point and should always be present in a spider class
542 |
543 |
544 | ### Method arguments `response`, `url` and `data`
545 |
546 | ```ruby
547 | def parse(response, url:, data: {})
548 | end
549 | ```
550 |
551 | * `response` ([Nokogiri::HTML::Document](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document) object) Contains parsed HTML code of a processed webpage
552 | * `url` (String) url of a processed webpage
553 | * `data` (Hash) used to pass data between requests
554 |
555 |
556 | Example of how to use data
557 |
558 | Imagine that there is a product page which doesn't contain the product category. The category name is present only on the category page (the one with pagination). This is a case where we can use `data` to pass the category name from the `parse` method to the `parse_product` method:
559 |
560 | ```ruby
561 | class ProductsSpider < Kimurai::Base
562 | @engine = :selenium_chrome
563 | @start_urls = ["https://example-shop.com/example-product-category"]
564 |
565 | def parse(response, url:, data: {})
566 | category_name = response.xpath("//path/to/category/name").text
567 | response.xpath("//path/to/products/urls").each do |product_url|
568 | # Merge category_name with current data hash and pass it next to parse_product method
569 | request_to(:parse_product, url: product_url[:href], data: data.merge(category_name: category_name))
570 | end
571 |
572 | # ...
573 | end
574 |
575 | def parse_product(response, url:, data: {})
576 | item = {}
577 | # Assign item's category_name from data[:category_name]
578 | item[:category_name] = data[:category_name]
579 |
580 | # ...
581 | end
582 | end
583 |
584 | ```
585 |
586 |
587 | **You can query `response` using [XPath or CSS selectors](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/Searchable)**. Check Nokogiri tutorials to understand how to work with `response`:
588 | * [Parsing HTML with Nokogiri](http://ruby.bastardsbook.com/chapters/html-parsing/) - ruby.bastardsbook.com
589 | * [HOWTO parse HTML with Ruby & Nokogiri](https://readysteadycode.com/howto-parse-html-with-ruby-and-nokogiri) - readysteadycode.com
590 | * [Class: Nokogiri::HTML::Document](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document) (documentation) - rubydoc.info
591 |
592 |
593 | ### `browser` object
594 |
595 | From any spider instance method, the `browser` object is available. It is a [Capybara::Session](https://www.rubydoc.info/github/jnicklas/capybara/Capybara/Session) object and is used to process requests and get the page response (via the `current_response` method). Usually you don't need to touch it directly, because there is `response` (see above) which contains the page response after it was loaded.
596 |
597 | But if you need to interact with a page (like filling in form fields, clicking elements, checking checkboxes, etc.), `browser` is ready for you:
598 |
599 | ```ruby
600 | class GoogleSpider < Kimurai::Base
601 | @name = "google_spider"
602 | @engine = :selenium_chrome
603 | @start_urls = ["https://www.google.com/"]
604 |
605 | def parse(response, url:, data: {})
606 | browser.fill_in "q", with: "Kimurai web scraping framework"
607 | browser.click_button "Google Search"
608 |
609 | # Update response to current response after interaction with a browser
610 | response = browser.current_response
611 |
612 | # Collect results
613 | results = response.xpath("//div[@class='g']//h3/a").map do |a|
614 | { title: a.text, url: a[:href] }
615 | end
616 |
617 | # ...
618 | end
619 | end
620 | ```
621 |
622 | Check out **Capybara cheat sheets** where you can see all available methods **to interact with browser**:
623 | * [UI Testing with RSpec and Capybara [cheat sheet]](http://cheatrags.com/capybara) - cheatrags.com
624 | * [Capybara Cheatsheet PDF](https://thoughtbot.com/upcase/test-driven-rails-resources/capybara.pdf) - thoughtbot.com
625 | * [Class: Capybara::Session](https://www.rubydoc.info/github/jnicklas/capybara/Capybara/Session) (documentation) - rubydoc.info
626 |
627 | ### `request_to` method
628 |
629 | For making requests that are handled by a particular method, there is `request_to`. It requires a minimum of two arguments: `:method_name` and `url:`. An optional argument is `data:` (see above for what it is used for). Example:
630 |
631 | ```ruby
632 | class Spider < Kimurai::Base
633 | @engine = :selenium_chrome
634 | @start_urls = ["https://example.com/"]
635 |
636 | def parse(response, url:, data: {})
637 | # Process request to `parse_product` method with `https://example.com/some_product` url:
638 | request_to :parse_product, url: "https://example.com/some_product"
639 | end
640 |
641 | def parse_product(response, url:, data: {})
642 | puts "From page https://example.com/some_product !"
643 | end
644 | end
645 | ```
646 |
647 | Under the hood, `request_to` simply calls [#visit](https://www.rubydoc.info/github/jnicklas/capybara/Capybara%2FSession:visit) (`browser.visit(url)`) and then the required method with arguments:
648 |
649 |
650 | request_to
651 |
652 | ```ruby
653 | def request_to(handler, url:, data: {})
654 | request_data = { url: url, data: data }
655 |
656 | browser.visit(url)
657 | public_send(handler, browser.current_response, request_data)
658 | end
659 | ```
660 |
661 |
662 | `request_to` just makes things simpler; without it, we could do something like:
663 |
664 |
665 | Check the code
666 |
667 | ```ruby
668 | class Spider < Kimurai::Base
669 | @engine = :selenium_chrome
670 | @start_urls = ["https://example.com/"]
671 |
672 | def parse(response, url:, data: {})
673 | url_to_process = "https://example.com/some_product"
674 |
675 | browser.visit(url_to_process)
676 | parse_product(browser.current_response, url: url_to_process)
677 | end
678 |
679 | def parse_product(response, url:, data: {})
680 | puts "From page https://example.com/some_product !"
681 | end
682 | end
683 | ```
684 |
685 |
686 | ### `save_to` helper
687 |
688 | Sometimes all you need is to simply save scraped data to a file in a format like JSON or CSV. You can use `save_to` for that:
689 |
690 | ```ruby
691 | class ProductsSpider < Kimurai::Base
692 | @engine = :selenium_chrome
693 | @start_urls = ["https://example-shop.com/"]
694 |
695 | # ...
696 |
697 | def parse_product(response, url:, data: {})
698 | item = {}
699 |
700 | item[:title] = response.xpath("//title/path").text
701 | item[:description] = response.xpath("//desc/path").text.squish
702 | item[:price] = response.xpath("//price/path").text[/\d+/]&.to_f
703 |
704 | # Add each new item to the `scraped_products.json` file:
705 | save_to "scraped_products.json", item, format: :json
706 | end
707 | end
708 | ```
709 |
710 | Supported formats:
711 | * `:json` JSON
712 | * `:pretty_json` "pretty" JSON (`JSON.pretty_generate`)
713 | * `:jsonlines` [JSON Lines](http://jsonlines.org/)
714 | * `:csv` CSV
715 |
716 | Note: `save_to` requires data (item to save) to be a `Hash`.
717 |
718 | By default, `save_to` adds a position key to the item hash. You can disable it with `position: false`: `save_to "scraped_products.json", item, format: :json, position: false`.
719 |
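For instance, a minimal sketch (the item fields here are hypothetical) of how positions are assigned in the order items are saved:

```ruby
item = { title: "Example product" } # hypothetical item
save_to "scraped_products.json", item, format: :json
# The first saved item gets "position": 1, the next one "position": 2,
# and so on (as in the results.json example above).
```
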
720 | **How helper works:**
721 |
722 | While the spider is running, each new item is appended to the file. At the next run, the helper will clear the content of the file first, and then start appending items to it again.
723 |
724 | > If you don't want the file to be cleared before each run, add the option `append: true`: `save_to "scraped_products.json", item, format: :json, append: true`
725 |
726 | ### Skip duplicates
727 |
728 | It's pretty common for websites to have duplicated pages, for example when an e-commerce shop has the same products in different categories. To skip duplicates, there is a simple `unique?` helper:
729 |
730 | ```ruby
731 | class ProductsSpider < Kimurai::Base
732 | @engine = :selenium_chrome
733 | @start_urls = ["https://example-shop.com/"]
734 |
735 | def parse(response, url:, data: {})
736 | response.xpath("//categories/path").each do |category|
737 | request_to :parse_category, url: category[:href]
738 | end
739 | end
740 |
741 | # Check products for uniqueness using product url inside of parse_category:
742 | def parse_category(response, url:, data: {})
743 | response.xpath("//products/path").each do |product|
744 | # Skip url if it's not unique:
745 | next unless unique?(:product_url, product[:href])
746 | # Otherwise process it:
747 | request_to :parse_product, url: product[:href]
748 | end
749 | end
750 |
751 | # Or/and check products for uniqueness using product sku inside of parse_product:
752 | def parse_product(response, url:, data: {})
753 | item = {}
754 | item[:sku] = response.xpath("//product/sku/path").text.strip.upcase
755 | # Don't save product and return from method if there is already saved item with the same sku:
756 | return unless unique?(:sku, item[:sku])
757 |
758 | # ...
759 | save_to "results.json", item, format: :json
760 | end
761 | end
762 | ```
763 |
764 | The `unique?` helper works quite simply:
765 |
766 | ```ruby
767 | # Check the string "http://example.com" in the scope `url` for the first time:
768 | unique?(:url, "http://example.com")
769 | # => true
770 |
771 | # Try again:
772 | unique?(:url, "http://example.com")
773 | # => false
774 | ```
775 |
776 | To check something for uniqueness, you need to provide a scope:
777 |
778 | ```ruby
779 | # `product_url` scope
780 | unique?(:product_url, "http://example.com/product_1")
781 |
782 | # `id` scope
783 | unique?(:id, 324234232)
784 |
785 | # `custom` scope
786 | unique?(:custom, "Lorem Ipsum")
787 | ```
788 |
789 | #### Automatically skip all duplicated requests urls
790 |
791 | It is possible to automatically skip all already visited urls when calling the `request_to` method, using the [@config](#all-available-config-options) option `skip_duplicate_requests: true`. Also check [@config](#all-available-config-options) for additional options of this setting.
792 |
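A minimal sketch of enabling the option (the spider class and url are hypothetical; only the `skip_duplicate_requests` key comes from the config reference):

```ruby
class ProductsSpider < Kimurai::Base
  @engine = :mechanize
  @start_urls = ["https://example-shop.com/"]
  @config = {
    # request_to will skip urls that were already visited:
    skip_duplicate_requests: true
  }
end
```
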
793 | #### `storage` object
794 |
795 | The `unique?` method is just an alias for `storage#unique?`. Storage has several methods (see the usage sketch after the list):
796 |
797 | * `#all` - returns the storage hash, where the keys are the existing scopes.
798 | * `#include?(scope, value)` - returns `true` if the value exists in the scope, and `false` if not
799 | * `#add(scope, value)` - adds the value to the scope
800 | * `#unique?(scope, value)` - the method already described above; returns `false` if the value already exists in the scope, or returns `true` and adds the value to the scope if it doesn't exist yet.
801 | * `#clear!` - resets the whole storage by deleting all values from all scopes.
802 |
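A short usage sketch based on the methods above (the scope name and values are hypothetical):

```ruby
storage.add(:product_url, "https://example.com/product_1")
storage.include?(:product_url, "https://example.com/product_1") # => true
storage.unique?(:product_url, "https://example.com/product_1")  # => false, value already in the scope
storage.unique?(:product_url, "https://example.com/product_2")  # => true, and the value is added
storage.all    # => hash of all scopes with their values
storage.clear! # remove all values from all scopes
```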
803 |
804 | ### Handle request errors
805 | It is quite common that some pages of a website being crawled return a response code other than `200 OK`. In such cases, the method `request_to` (or `browser.visit`) can raise an exception. Kimurai provides the `skip_request_errors` and `retry_request_errors` [config](#spider-config) options to handle such errors:
806 |
807 | #### skip_request_errors
808 | You can automatically skip some errors while requesting a page using the `skip_request_errors` [config](#spider-config) option. If a raised error matches one of the errors in the list, the error will be caught and the request will be skipped. It is a good idea to skip errors like NotFound(404), etc.
809 |
810 | The format for the option: an array whose elements are error classes and/or hashes. You can use the _hash_ format for more flexibility:
811 |
812 | ```ruby
813 | @config = {
814 | skip_request_errors: [{ error: RuntimeError, message: "404 => Net::HTTPNotFound" }]
815 | }
816 | ```
817 | In this case, the provided `message:` will be compared with the full error message using `String#include?`. You can also use a regex instead: `{ error: RuntimeError, message: /404|403/ }`.
818 |
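Both entry styles can be mixed in one array. A sketch of the format (the specific classes are illustrative; `RuntimeError` appears in the README's own examples):

```ruby
@config = {
  skip_request_errors: [
    # A bare error class matches regardless of the error message:
    Net::ReadTimeout,
    # A hash entry also matches the error message (string or regex):
    { error: RuntimeError, message: /404|403/ }
  ]
}
```
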
819 | #### retry_request_errors
820 | You can automatically retry some errors, with a few attempts, while requesting a page using the `retry_request_errors` [config](#spider-config) option. If a raised error matches one of the errors in the list, the error will be caught and the request will be processed again after a delay.
821 |
822 | There are 3 attempts: the first with a delay of _15 sec_, the second with a delay of _30 sec_, and the third with a delay of _45 sec_. If after 3 attempts there is still an exception, then the exception will be raised. It is a good idea to retry errors like `ReadTimeout`, `HTTPBadGateway`, etc.
823 |
824 | The format for the option is the same as for the `skip_request_errors` option.
825 |
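For instance, a minimal sketch (`Net::ReadTimeout` is the same class used in the changelog's config examples):

```ruby
@config = {
  # Retry a request up to 3 times (15/30/45 sec delays) on Net::ReadTimeout:
  retry_request_errors: [Net::ReadTimeout]
}
```
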
826 | If you would like to skip (not raise) the error after all retries are exhausted, you can specify the `skip_on_failure: true` option:
827 |
828 | ```ruby
829 | @config = {
830 | retry_request_errors: [{ error: RuntimeError, skip_on_failure: true }]
831 | }
832 | ```
833 |
834 | ### Logging custom events
835 |
836 | It is possible to save custom messages to the [run_info](#open_spider-and-close_spider-callbacks) hash using the `add_event('Some message')` method. This feature helps you to keep track of important things that happened during crawling without checking the whole spider log (in case you're logging these messages using `logger` as well). Example:
837 |
838 | ```ruby
839 | def parse_product(response, url:, data: {})
840 | unless response.at_xpath("//path/to/add_to_cart_button")
841 | add_event("Product is sold") and return
842 | end
843 |
844 | # ...
845 | end
846 | ```
847 |
848 | ```
849 | ...
850 | I, [2018-11-28 22:20:19 +0400#7402] [M: 47156576560640] INFO -- example_spider: Spider: new event (scope: custom): Product is sold
851 | ...
852 | I, [2018-11-28 22:20:19 +0400#7402] [M: 47156576560640] INFO -- example_spider: Spider: stopped: {:events=>{:custom=>{"Product is sold"=>1}}}
853 | ```
854 |
855 | ### `open_spider` and `close_spider` callbacks
856 |
857 | You can define `.open_spider` and `.close_spider` callbacks (class methods) to perform some action before the spider starts or after the spider has stopped:
858 |
859 | ```ruby
860 | require 'kimurai'
861 |
862 | class ExampleSpider < Kimurai::Base
863 | @name = "example_spider"
864 | @engine = :selenium_chrome
865 | @start_urls = ["https://example.com/"]
866 |
867 | def self.open_spider
868 | logger.info "> Starting..."
869 | end
870 |
871 | def self.close_spider
872 | logger.info "> Stopped!"
873 | end
874 |
875 | def parse(response, url:, data: {})
876 | logger.info "> Scraping..."
877 | end
878 | end
879 |
880 | ExampleSpider.crawl!
881 | ```
882 |
883 |
884 | Output
885 |
886 | ```
887 | I, [2018-08-22 14:26:32 +0400#6001] [M: 46996522083840] INFO -- example_spider: Spider: started: example_spider
888 | I, [2018-08-22 14:26:32 +0400#6001] [M: 46996522083840] INFO -- example_spider: > Starting...
889 | D, [2018-08-22 14:26:32 +0400#6001] [M: 46996522083840] DEBUG -- example_spider: BrowserBuilder (selenium_chrome): created browser instance
890 | D, [2018-08-22 14:26:32 +0400#6001] [M: 46996522083840] DEBUG -- example_spider: BrowserBuilder (selenium_chrome): enabled native headless_mode
891 | I, [2018-08-22 14:26:32 +0400#6001] [M: 46996522083840] INFO -- example_spider: Browser: started get request to: https://example.com/
892 | I, [2018-08-22 14:26:34 +0400#6001] [M: 46996522083840] INFO -- example_spider: Browser: finished get request to: https://example.com/
893 | I, [2018-08-22 14:26:34 +0400#6001] [M: 46996522083840] INFO -- example_spider: Info: visits: requests: 1, responses: 1
894 | D, [2018-08-22 14:26:34 +0400#6001] [M: 46996522083840] DEBUG -- example_spider: Browser: driver.current_memory: 82415
895 | I, [2018-08-22 14:26:34 +0400#6001] [M: 46996522083840] INFO -- example_spider: > Scraping...
896 | I, [2018-08-22 14:26:34 +0400#6001] [M: 46996522083840] INFO -- example_spider: Browser: driver selenium_chrome has been destroyed
897 | I, [2018-08-22 14:26:34 +0400#6001] [M: 46996522083840] INFO -- example_spider: > Stopped!
898 | I, [2018-08-22 14:26:34 +0400#6001] [M: 46996522083840] INFO -- example_spider: Spider: stopped: {:spider_name=>"example_spider", :status=>:completed, :environment=>"development", :start_time=>2018-08-22 14:26:32 +0400, :stop_time=>2018-08-22 14:26:34 +0400, :running_time=>"1s", :visits=>{:requests=>1, :responses=>1}, :error=>nil}
899 | ```
900 |
901 |
902 | Inside the `open_spider` and `close_spider` class methods, the `run_info` method is available and contains useful information about the spider state:
903 |
904 | ```ruby
905 | 11: def self.open_spider
906 | => 12: binding.pry
907 | 13: end
908 |
909 | [1] pry(example_spider)> run_info
910 | => {
911 | :spider_name=>"example_spider",
912 | :status=>:running,
913 | :environment=>"development",
914 | :start_time=>2018-08-05 23:32:00 +0400,
915 | :stop_time=>nil,
916 | :running_time=>nil,
917 | :visits=>{:requests=>0, :responses=>0},
918 | :error=>nil
919 | }
920 | ```
921 |
922 | Inside `close_spider`, `run_info` will be updated:
923 |
924 | ```ruby
925 | 15: def self.close_spider
926 | => 16: binding.pry
927 | 17: end
928 |
929 | [1] pry(example_spider)> run_info
930 | => {
931 | :spider_name=>"example_spider",
932 | :status=>:completed,
933 | :environment=>"development",
934 | :start_time=>2018-08-05 23:32:00 +0400,
935 | :stop_time=>2018-08-05 23:32:06 +0400,
936 | :running_time=>6.214,
937 | :visits=>{:requests=>1, :responses=>1},
938 | :error=>nil
939 | }
940 | ```
941 |
942 | `run_info[:status]` helps to determine whether the spider finished successfully or failed (possible values: `:completed`, `:failed`):
943 |
944 | ```ruby
945 | class ExampleSpider < Kimurai::Base
946 | @name = "example_spider"
947 | @engine = :selenium_chrome
948 | @start_urls = ["https://example.com/"]
949 |
950 | def self.close_spider
951 | puts ">>> run info: #{run_info}"
952 | end
953 |
954 | def parse(response, url:, data: {})
955 | logger.info "> Scraping..."
956 | # Let's try to strip nil:
957 | nil.strip
958 | end
959 | end
960 | ```
961 |
962 |
963 | Output
964 |
965 | ```
966 | I, [2018-08-22 14:34:24 +0400#8459] [M: 47020523644400] INFO -- example_spider: Spider: started: example_spider
967 | D, [2018-08-22 14:34:25 +0400#8459] [M: 47020523644400] DEBUG -- example_spider: BrowserBuilder (selenium_chrome): created browser instance
968 | D, [2018-08-22 14:34:25 +0400#8459] [M: 47020523644400] DEBUG -- example_spider: BrowserBuilder (selenium_chrome): enabled native headless_mode
969 | I, [2018-08-22 14:34:25 +0400#8459] [M: 47020523644400] INFO -- example_spider: Browser: started get request to: https://example.com/
970 | I, [2018-08-22 14:34:26 +0400#8459] [M: 47020523644400] INFO -- example_spider: Browser: finished get request to: https://example.com/
971 | I, [2018-08-22 14:34:26 +0400#8459] [M: 47020523644400] INFO -- example_spider: Info: visits: requests: 1, responses: 1
972 | D, [2018-08-22 14:34:26 +0400#8459] [M: 47020523644400] DEBUG -- example_spider: Browser: driver.current_memory: 83351
973 | I, [2018-08-22 14:34:26 +0400#8459] [M: 47020523644400] INFO -- example_spider: > Scraping...
974 | I, [2018-08-22 14:34:26 +0400#8459] [M: 47020523644400] INFO -- example_spider: Browser: driver selenium_chrome has been destroyed
975 |
976 | >>> run info: {:spider_name=>"example_spider", :status=>:failed, :environment=>"development", :start_time=>2018-08-22 14:34:24 +0400, :stop_time=>2018-08-22 14:34:26 +0400, :running_time=>2.01, :visits=>{:requests=>1, :responses=>1}, :error=>"#<NoMethodError: undefined method `strip' for nil:NilClass>"}
977 |
978 | F, [2018-08-22 14:34:26 +0400#8459] [M: 47020523644400] FATAL -- example_spider: Spider: stopped: {:spider_name=>"example_spider", :status=>:failed, :environment=>"development", :start_time=>2018-08-22 14:34:24 +0400, :stop_time=>2018-08-22 14:34:26 +0400, :running_time=>"2s", :visits=>{:requests=>1, :responses=>1}, :error=>"#<NoMethodError: undefined method `strip' for nil:NilClass>"}
979 | Traceback (most recent call last):
980 | 6: from example_spider.rb:19:in `<main>'
981 | 5: from /home/victor/code/kimurai/lib/kimurai/base.rb:127:in `crawl!'
982 | 4: from /home/victor/code/kimurai/lib/kimurai/base.rb:127:in `each'
983 | 3: from /home/victor/code/kimurai/lib/kimurai/base.rb:128:in `block in crawl!'
984 | 2: from /home/victor/code/kimurai/lib/kimurai/base.rb:185:in `request_to'
985 | 1: from /home/victor/code/kimurai/lib/kimurai/base.rb:185:in `public_send'
986 | example_spider.rb:15:in `parse': undefined method `strip' for nil:NilClass (NoMethodError)
987 | ```
988 |
989 |
990 | **Usage example:** if the spider finished successfully, send the JSON file with scraped items to a remote FTP location; otherwise (if the spider failed), skip the incomplete results and send an email/Slack notification about it:
991 |
992 |
993 | Example
994 |
995 | You can also use the additional helper methods `completed?` and `failed?`:
996 |
997 | ```ruby
998 | class Spider < Kimurai::Base
999 | @engine = :selenium_chrome
1000 | @start_urls = ["https://example.com/"]
1001 |
1002 | def self.close_spider
1003 | if completed?
1004 | send_file_to_ftp("results.json")
1005 | else
1006 | send_error_notification(run_info[:error])
1007 | end
1008 | end
1009 |
1010 | def self.send_file_to_ftp(file_path)
1011 | # ...
1012 | end
1013 |
1014 | def self.send_error_notification(error)
1015 | # ...
1016 | end
1017 |
1018 | # ...
1019 |
1020 | def parse_item(response, url:, data: {})
1021 | item = {}
1022 | # ...
1023 |
1024 | save_to "results.json", item, format: :json
1025 | end
1026 | end
1027 | ```
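
The `send_file_to_ftp` and `send_error_notification` bodies above are left empty; here is a minimal sketch of what they might look like, assuming Ruby's standard `net/ftp` library and a hypothetical `SLACK_WEBHOOK_URL` ENV variable (host, credentials and message format are illustrative):

```ruby
require 'net/ftp'
require 'net/http'
require 'json'

def self.send_file_to_ftp(file_path)
  # Host and credentials are placeholders:
  Net::FTP.open("ftp.example.com", "user", "password") do |ftp|
    ftp.putbinaryfile(file_path)
  end
end

def self.send_error_notification(error)
  # Post a simple message to an assumed Slack incoming webhook:
  uri = URI(ENV["SLACK_WEBHOOK_URL"])
  payload = { text: "Spider failed: #{error}" }.to_json
  Net::HTTP.post(uri, payload, "Content-Type" => "application/json")
end
```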
1028 |
1029 |
1030 |
1031 | ### `KIMURAI_ENV`
1032 | Kimurai has environments; the default is `development`. To provide a custom environment, pass the `KIMURAI_ENV` ENV variable before the command: `$ KIMURAI_ENV=production ruby spider.rb`. To access the current environment, use the `Kimurai.env` method.
1033 |
1034 | Usage example:
1035 | ```ruby
1036 | class Spider < Kimurai::Base
1037 | @engine = :selenium_chrome
1038 | @start_urls = ["https://example.com/"]
1039 |
1040 | def self.close_spider
1041 | if failed? && Kimurai.env == "production"
1042 | send_error_notification(run_info[:error])
1043 | else
1044 | # Do nothing
1045 | end
1046 | end
1047 |
1048 | # ...
1049 | end
1050 | ```
1051 |
1052 | ### Parallel crawling using `in_parallel`
1053 | Kimurai can process web pages concurrently in a single line: `in_parallel(:parse_product, urls, threads: 3)`, where `:parse_product` is the method to process, `urls` is an array of urls to crawl and `threads:` is the number of threads:
1054 |
1055 | ```ruby
1056 | # amazon_spider.rb
1057 | require 'kimurai'
1058 |
1059 | class AmazonSpider < Kimurai::Base
1060 | @name = "amazon_spider"
1061 | @engine = :mechanize
1062 | @start_urls = ["https://www.amazon.com/"]
1063 |
1064 | def parse(response, url:, data: {})
1065 | browser.fill_in "field-keywords", with: "Web Scraping Books"
1066 | browser.click_on "Go"
1067 |
1068 | # Walk through pagination and collect products urls:
1069 | urls = []
1070 | loop do
1071 | response = browser.current_response
1072 | response.xpath("//li//a[contains(@class, 's-access-detail-page')]").each do |a|
1073 | urls << a[:href].sub(/ref=.+/, "")
1074 | end
1075 |
1076 | browser.find(:xpath, "//a[@id='pagnNextLink']", wait: 1).click rescue break
1077 | end
1078 |
1079 | # Process all collected urls concurrently within 3 threads:
1080 | in_parallel(:parse_book_page, urls, threads: 3)
1081 | end
1082 |
1083 | def parse_book_page(response, url:, data: {})
1084 | item = {}
1085 |
1086 | item[:title] = response.xpath("//h1/span[@id]").text.squish
1087 | item[:url] = url
1088 | item[:price] = response.xpath("(//span[contains(@class, 'a-color-price')])[1]").text.squish.presence
1089 | item[:publisher] = response.xpath("//h2[text()='Product details']/following::b[text()='Publisher:']/following-sibling::text()[1]").text.squish.presence
1090 |
1091 | save_to "books.json", item, format: :pretty_json
1092 | end
1093 | end
1094 |
1095 | AmazonSpider.crawl!
1096 | ```
1097 |
1098 |
1099 | Run: `$ ruby amazon_spider.rb`
1100 |
1101 | ```
1102 | I, [2018-08-22 14:48:37 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Spider: started: amazon_spider
1103 | D, [2018-08-22 14:48:37 +0400#13033] [M: 46982297486840] DEBUG -- amazon_spider: BrowserBuilder (mechanize): created browser instance
1104 | I, [2018-08-22 14:48:37 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/
1105 | I, [2018-08-22 14:48:38 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/
1106 | I, [2018-08-22 14:48:38 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Info: visits: requests: 1, responses: 1
1107 |
1108 | I, [2018-08-22 14:48:43 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Spider: in_parallel: starting processing 52 urls within 3 threads
1109 | D, [2018-08-22 14:48:43 +0400#13033] [C: 46982320219020] DEBUG -- amazon_spider: BrowserBuilder (mechanize): created browser instance
1110 | I, [2018-08-22 14:48:43 +0400#13033] [C: 46982320219020] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Practical-Web-Scraping-Data-Science/dp/1484235819/
1111 | D, [2018-08-22 14:48:44 +0400#13033] [C: 46982320189640] DEBUG -- amazon_spider: BrowserBuilder (mechanize): created browser instance
1112 | I, [2018-08-22 14:48:44 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Python-Web-Scraping-Cookbook-scraping/dp/1787285219/
1113 | D, [2018-08-22 14:48:44 +0400#13033] [C: 46982319187320] DEBUG -- amazon_spider: BrowserBuilder (mechanize): created browser instance
1114 | I, [2018-08-22 14:48:44 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Scraping-Python-Community-Experience-Distilled/dp/1782164367/
1115 | I, [2018-08-22 14:48:45 +0400#13033] [C: 46982320219020] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/Practical-Web-Scraping-Data-Science/dp/1484235819/
1116 | I, [2018-08-22 14:48:45 +0400#13033] [C: 46982320219020] INFO -- amazon_spider: Info: visits: requests: 4, responses: 2
1117 | I, [2018-08-22 14:48:45 +0400#13033] [C: 46982320219020] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Web-Scraping-Python-Collecting-Modern/dp/1491910291/
1118 | I, [2018-08-22 14:48:46 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/Python-Web-Scraping-Cookbook-scraping/dp/1787285219/
1119 | I, [2018-08-22 14:48:46 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Info: visits: requests: 5, responses: 3
1120 | I, [2018-08-22 14:48:46 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Web-Scraping-Python-Collecting-Modern/dp/1491985577/
1121 | I, [2018-08-22 14:48:46 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/Scraping-Python-Community-Experience-Distilled/dp/1782164367/
1122 | I, [2018-08-22 14:48:46 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Info: visits: requests: 6, responses: 4
1123 | I, [2018-08-22 14:48:46 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Web-Scraping-Excel-Effective-Scrapes-ebook/dp/B01CMMJGZ8/
1124 |
1125 | ...
1126 |
1127 | I, [2018-08-22 14:49:10 +0400#13033] [C: 46982320219020] INFO -- amazon_spider: Info: visits: requests: 51, responses: 49
1128 | I, [2018-08-22 14:49:10 +0400#13033] [C: 46982320219020] INFO -- amazon_spider: Browser: driver mechanize has been destroyed
1129 | I, [2018-08-22 14:49:11 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/Scraping-Ice-Life-Bill-Rayburn-ebook/dp/B00C0NF1L8/
1130 | I, [2018-08-22 14:49:11 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Info: visits: requests: 51, responses: 50
1131 | I, [2018-08-22 14:49:11 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Instant-Scraping-Jacob-Ward-2013-07-26/dp/B01FJ1G3G4/
1132 | I, [2018-08-22 14:49:11 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/Php-architects-Guide-Scraping-Author/dp/B010DTKYY4/
1133 | I, [2018-08-22 14:49:11 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Info: visits: requests: 52, responses: 51
1134 | I, [2018-08-22 14:49:11 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Browser: started get request to: https://www.amazon.com/Ship-Tracking-Maritime-Domain-Awareness/dp/B001J5MTOK/
1135 | I, [2018-08-22 14:49:12 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/Instant-Scraping-Jacob-Ward-2013-07-26/dp/B01FJ1G3G4/
1136 | I, [2018-08-22 14:49:12 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Info: visits: requests: 53, responses: 52
1137 | I, [2018-08-22 14:49:12 +0400#13033] [C: 46982320189640] INFO -- amazon_spider: Browser: driver mechanize has been destroyed
1138 | I, [2018-08-22 14:49:12 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Browser: finished get request to: https://www.amazon.com/Ship-Tracking-Maritime-Domain-Awareness/dp/B001J5MTOK/
1139 | I, [2018-08-22 14:49:12 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Info: visits: requests: 53, responses: 53
1140 | I, [2018-08-22 14:49:12 +0400#13033] [C: 46982319187320] INFO -- amazon_spider: Browser: driver mechanize has been destroyed
1141 |
1142 | I, [2018-08-22 14:49:12 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Spider: in_parallel: stopped processing 52 urls within 3 threads, total time: 29s
1143 | I, [2018-08-22 14:49:12 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Browser: driver mechanize has been destroyed
1144 |
1145 | I, [2018-08-22 14:49:12 +0400#13033] [M: 46982297486840] INFO -- amazon_spider: Spider: stopped: {:spider_name=>"amazon_spider", :status=>:completed, :environment=>"development", :start_time=>2018-08-22 14:48:37 +0400, :stop_time=>2018-08-22 14:49:12 +0400, :running_time=>"35s", :visits=>{:requests=>53, :responses=>53}, :error=>nil}
1146 |
1147 | ```
1148 |
1149 |
1150 |
1151 | books.json
1152 |
1153 | ```json
1154 | [
1155 | {
1156 | "title": "Web Scraping with Python: Collecting More Data from the Modern Web2nd Edition",
1157 | "url": "https://www.amazon.com/Web-Scraping-Python-Collecting-Modern/dp/1491985577/",
1158 | "price": "$26.94",
1159 | "publisher": "O'Reilly Media; 2 edition (April 14, 2018)",
1160 | "position": 1
1161 | },
1162 | {
1163 | "title": "Python Web Scraping Cookbook: Over 90 proven recipes to get you scraping with Python, micro services, Docker and AWS",
1164 | "url": "https://www.amazon.com/Python-Web-Scraping-Cookbook-scraping/dp/1787285219/",
1165 | "price": "$39.99",
1166 | "publisher": "Packt Publishing - ebooks Account (February 9, 2018)",
1167 | "position": 2
1168 | },
1169 | {
1170 | "title": "Web Scraping with Python: Collecting Data from the Modern Web1st Edition",
1171 | "url": "https://www.amazon.com/Web-Scraping-Python-Collecting-Modern/dp/1491910291/",
1172 | "price": "$15.75",
1173 | "publisher": "O'Reilly Media; 1 edition (July 24, 2015)",
1174 | "position": 3
1175 | },
1176 |
1177 | ...
1178 |
1179 | {
1180 | "title": "Instant Web Scraping with Java by Ryan Mitchell (2013-08-26)",
1181 | "url": "https://www.amazon.com/Instant-Scraping-Java-Mitchell-2013-08-26/dp/B01FEM76X2/",
1182 | "price": "$35.82",
1183 | "publisher": "Packt Publishing (2013-08-26) (1896)",
1184 | "position": 52
1185 | }
1186 | ]
1187 | ```
1188 |
1189 |
1190 | > Note that the [save_to](#save_to-helper) and [unique?](#skip-duplicates-unique-helper) helpers are thread-safe (protected by a [Mutex](https://ruby-doc.org/core-2.5.1/Mutex.html)) and can be freely used inside threads.
1191 |
1192 | `in_parallel` can take additional options (combined in the sketch below):
1193 | * `data:` pass a custom data hash along with the urls: `in_parallel(:method, urls, threads: 3, data: { category: "Scraping" })`
1194 | * `delay:` set a delay between requests: `in_parallel(:method, urls, threads: 3, delay: 2)`. Delay can be `Integer`, `Float` or `Range` (`2..5`). In case of a range, the delay will be chosen randomly for each request: `rand(2..5) # => 3`
1195 | * `engine:` set a custom engine instead of the default one: `in_parallel(:method, urls, threads: 3, engine: :poltergeist_phantomjs)`
1196 | * `config:` pass custom config options (see [config section](#crawler-config))
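
A minimal sketch combining these options (method name, urls and values are illustrative):

```ruby
in_parallel(
  :parse_product,                       # method to process each url
  urls,                                 # array of urls to crawl
  threads: 3,                           # number of concurrent browser instances
  data: { category: "Scraping" },       # custom data passed to the method as `data:`
  delay: 2..5,                          # random delay (in seconds) before each request
  engine: :poltergeist_phantomjs,       # override the spider's default engine
  config: { window_size: [1366, 768] }  # override config options for these requests
)
```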
1197 |
1198 | ### Active Support included
1199 |
1200 | You can use all the power of familiar [Rails core-ext methods](https://guides.rubyonrails.org/active_support_core_extensions.html#loading-all-core-extensions) for scraping inside Kimurai. Especially take a look at [squish](https://apidock.com/rails/String/squish), [truncate_words](https://apidock.com/rails/String/truncate_words), [titleize](https://apidock.com/rails/String/titleize), [remove](https://apidock.com/rails/String/remove), [present?](https://guides.rubyonrails.org/active_support_core_extensions.html#blank-questionmark-and-present-questionmark) and [presence](https://guides.rubyonrails.org/active_support_core_extensions.html#presence).
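
A few illustrative calls (return values shown as comments):

```ruby
"  Web   Scraping \n with  Ruby ".squish   # => "Web Scraping with Ruby"
"Kimurai is a modern scraping framework".truncate_words(4)
# => "Kimurai is a modern..."
" ".presence                               # => nil (handy for optional item fields)
"Price: $26.94".remove("Price: $").to_f    # => 26.94
```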
1201 |
1202 | ### Schedule spiders using Cron
1203 |
1204 | 1) Inside the spider directory, generate a [Whenever](https://github.com/javan/whenever) config: `$ kimurai generate schedule`.
1205 |
1206 |
1207 | schedule.rb
1208 |
1209 | ```ruby
1210 | ### Settings ###
1211 | require 'tzinfo'
1212 |
1213 | # Export current PATH to the cron
1214 | env :PATH, ENV["PATH"]
1215 |
1216 | # Use 24 hour format when using `at:` option
1217 | set :chronic_options, hours24: true
1218 |
1219 | # Use the local_to_utc helper to set up execution time using your local timezone instead
1220 | # of the server's timezone (which probably is, and should be, UTC; to check, run `$ timedatectl`).
1221 | # Also you may want to set the same timezone in Kimurai (use `Kimurai.configuration.time_zone =` for that),
1222 | # to have spider logs in a specific time zone.
1223 | # Example usage of helper:
1224 | # every 1.day, at: local_to_utc("7:00", zone: "Europe/Moscow") do
1225 | # crawl "google_spider.com", output: "log/google_spider.com.log"
1226 | # end
1227 | def local_to_utc(time_string, zone:)
1228 | TZInfo::Timezone.get(zone).local_to_utc(Time.parse(time_string))
1229 | end
1230 |
1231 | # Note: by default Whenever exports cron commands with :environment == "production".
1232 | # Note: Whenever can only append log data to a log file (>>). If you want
1233 | # to overwrite (>) the log file before each run, pass a lambda:
1234 | # crawl "google_spider.com", output: -> { "> log/google_spider.com.log 2>&1" }
1235 |
1236 | # Project job types
1237 | job_type :crawl, "cd :path && KIMURAI_ENV=:environment bundle exec kimurai crawl :task :output"
1238 | job_type :runner, "cd :path && KIMURAI_ENV=:environment bundle exec kimurai runner --jobs :task :output"
1239 |
1240 | # Single file job type
1241 | job_type :single, "cd :path && KIMURAI_ENV=:environment ruby :task :output"
1242 | # Single with bundle exec
1243 | job_type :single_bundle, "cd :path && KIMURAI_ENV=:environment bundle exec ruby :task :output"
1244 |
1245 | ### Schedule ###
1246 | # Usage (check examples here https://github.com/javan/whenever#example-schedulerb-file):
1247 | # every 1.day do
1248 | # Example to schedule a single spider in the project:
1249 | # crawl "google_spider.com", output: "log/google_spider.com.log"
1250 |
1251 | # Example to schedule all spiders in the project using runner. Each spider will write
1252 | # its own output to the `log/spider_name.log` file (handled by the runner itself).
1253 | # Runner output will be written to the log/runner.log file.
1254 | # The number argument is the count of concurrent jobs:
1255 | # runner 3, output: "log/runner.log"
1256 |
1257 | # Example to schedule single spider (without project):
1258 | # single "single_spider.rb", output: "single_spider.log"
1259 | # end
1260 |
1261 | ### How to set a cron schedule ###
1262 | # Run: `$ whenever --update-crontab --load-file config/schedule.rb`.
1263 | # If you don't have whenever command, install the gem: `$ gem install whenever`.
1264 |
1265 | ### How to cancel a schedule ###
1266 | # Run: `$ whenever --clear-crontab --load-file config/schedule.rb`.
1267 | ```
1268 |
1269 |
1270 | 2) Add the following code at the bottom of `schedule.rb`:
1271 |
1272 | ```ruby
1273 | every 1.day, at: "7:00" do
1274 | single "example_spider.rb", output: "example_spider.log"
1275 | end
1276 | ```
1277 |
1278 | 3) Run: `$ whenever --update-crontab --load-file schedule.rb`. Done!
1279 |
1280 | You can check Whenever examples [here](https://github.com/javan/whenever#example-schedulerb-file). To cancel the schedule, run: `$ whenever --clear-crontab --load-file schedule.rb`.
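
For a project spider you can also combine the `crawl` job type with the `local_to_utc` helper defined in the generated config (spider name and timezone are illustrative):

```ruby
every 1.day, at: local_to_utc("7:00", zone: "Europe/Moscow") do
  crawl "example_spider", output: "log/example_spider.log"
end
```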
1281 |
1282 | ### Configuration options
1283 | You can configure several options using the `configure` block:
1284 |
1285 | ```ruby
1286 | Kimurai.configure do |config|
1287 | # Default logger has colored mode in development.
1288 | # If you would like to disable it, set `colorize_logger` to false.
1289 | # config.colorize_logger = false
1290 |
1291 | # Logger level for default logger:
1292 | # config.log_level = :info
1293 |
1294 | # Custom logger:
1295 | # config.logger = Logger.new(STDOUT)
1296 |
1297 | # Custom time zone (for logs):
1298 | # config.time_zone = "UTC"
1299 | # config.time_zone = "Europe/Moscow"
1300 |
1301 | # Provide custom chrome binary path (default is any available chrome/chromium in the PATH):
1302 | # config.selenium_chrome_path = "/usr/bin/chromium-browser"
1303 | # Provide custom selenium chromedriver path (default is "/usr/local/bin/chromedriver"):
1304 | # config.chromedriver_path = "~/.local/bin/chromedriver"
1305 | end
1306 | ```
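
For example, a concrete configuration using the options above (values are illustrative):

```ruby
Kimurai.configure do |config|
  config.log_level = :info           # less verbose default logger
  config.time_zone = "Europe/Moscow" # log timestamps in a specific time zone
  config.colorize_logger = false     # plain log output
end
```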
1307 |
1308 | ### Using Kimurai inside existing Ruby application
1309 |
1310 | You can integrate Kimurai spiders (which are just Ruby classes) into an existing Ruby application like Rails or Sinatra, and run them using background jobs (for example). Check the following info to understand how spiders are run:
1311 |
1312 | #### `.crawl!` method
1313 |
1314 | `.crawl!` (class method) performs a _full run_ of a particular spider. This method returns `run_info` if the run was successful, or raises an exception if something went wrong.
1315 |
1316 | ```ruby
1317 | class ExampleSpider < Kimurai::Base
1318 | @name = "example_spider"
1319 | @engine = :mechanize
1320 | @start_urls = ["https://example.com/"]
1321 |
1322 | def parse(response, url:, data: {})
1323 | title = response.xpath("//title").text.squish
1324 | end
1325 | end
1326 |
1327 | ExampleSpider.crawl!
1328 | # => { :spider_name => "example_spider", :status => :completed, :environment => "development", :start_time => 2018-08-22 18:20:16 +0400, :stop_time => 2018-08-22 18:20:17 +0400, :running_time => 1.216, :visits => { :requests => 1, :responses => 1 }, :items => { :sent => 0, :processed => 0 }, :error => nil }
1329 | ```
1330 |
1331 | You can't `.crawl!` a spider in a different thread while it's still running (the call simply returns `false`), because spider instances store some shared data in the `@run_info` class variable while crawling:
1332 |
1333 | ```ruby
1334 | 2.times do |i|
1335 | Thread.new { p i, ExampleSpider.crawl! }
1336 | end # =>
1337 |
1338 | # 1
1339 | # false
1340 |
1341 | # 0
1342 | # {:spider_name=>"example_spider", :status=>:completed, :environment=>"development", :start_time=>2018-08-22 18:49:22 +0400, :stop_time=>2018-08-22 18:49:23 +0400, :running_time=>0.801, :visits=>{:requests=>1, :responses=>1}, :items=>{:sent=>0, :processed=>0}, :error=>nil}
1343 | ```
1344 |
1345 | So what if you don't care about stats and just want to process a request to a particular spider method and get its return value? Use `.parse!` instead:
1346 |
1347 | #### `.parse!(:method_name, url:)` method
1348 |
1349 | `.parse!` (class method) creates a new spider instance and performs a request to the given method with the given url. The method's return value is passed back:
1350 |
1351 | ```ruby
1352 | class ExampleSpider < Kimurai::Base
1353 | @name = "example_spider"
1354 | @engine = :mechanize
1355 | @start_urls = ["https://example.com/"]
1356 |
1357 | def parse(response, url:, data: {})
1358 | title = response.xpath("//title").text.squish
1359 | end
1360 | end
1361 |
1362 | ExampleSpider.parse!(:parse, url: "https://example.com/")
1363 | # => "Example Domain"
1364 | ```
1365 |
1366 | Like `.crawl!`, the `.parse!` method takes care of the browser instance and kills it (`browser.destroy_driver!`) before returning the value. Unlike `.crawl!`, `.parse!` can be called from different threads at the same time:
1367 |
1368 | ```ruby
1369 | urls = ["https://www.google.com/", "https://www.reddit.com/", "https://en.wikipedia.org/"]
1370 |
1371 | urls.each do |url|
1372 | Thread.new { p ExampleSpider.parse!(:parse, url: url) }
1373 | end # =>
1374 |
1375 | # "Google"
1376 | # "Wikipedia, the free encyclopedia"
1377 | # "reddit: the front page of the internetHotHot"
1378 | ```
1379 |
1380 | Keep in mind that the [save_to](#save_to-helper) and [unique?](#skip-duplicates) helpers are not thread-safe when using the `.parse!` method.
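
If you need to collect results from several `.parse!` calls running in threads, one option is to skip those helpers entirely and gather return values through a thread-safe structure instead (a minimal sketch, assuming the `ExampleSpider` class from above):

```ruby
require 'json'

results = Queue.new  # Queue is thread-safe out of the box

threads = urls.map do |url|
  Thread.new { results << { url: url, title: ExampleSpider.parse!(:parse, url: url) } }
end
threads.each(&:join)

items = Array.new(results.size) { results.pop }
File.write("titles.json", JSON.pretty_generate(items))
```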
1381 |
1382 | #### `Kimurai.list` and `Kimurai.find_by_name()`
1383 |
1384 | ```ruby
1385 | class GoogleSpider < Kimurai::Base
1386 | @name = "google_spider"
1387 | end
1388 |
1389 | class RedditSpider < Kimurai::Base
1390 | @name = "reddit_spider"
1391 | end
1392 |
1393 | class WikipediaSpider < Kimurai::Base
1394 | @name = "wikipedia_spider"
1395 | end
1396 |
1397 | # To get the list of all available spider classes:
1398 | Kimurai.list
1399 | # => {"google_spider"=>GoogleSpider, "reddit_spider"=>RedditSpider, "wikipedia_spider"=>WikipediaSpider}
1400 |
1401 | # To find a particular spider class by its name:
1402 | Kimurai.find_by_name("reddit_spider")
1403 | # => RedditSpider
1404 | ```
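
These come in handy when a spider name arrives as a string, e.g. from a job queue (a minimal sketch):

```ruby
spider_class = Kimurai.find_by_name("reddit_spider") # => RedditSpider
spider_class.crawl! if spider_class
```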
1405 |
1406 |
1407 | ### Automated server setup and deployment
1408 | > **EXPERIMENTAL**
1409 |
1410 | #### Setup
1411 | You can automatically set up the [required environment](#installation) for Kimurai on a remote server (currently only Ubuntu Server 18.04 is supported) using the `$ kimurai setup` command. `setup` will install the latest Ruby with Rbenv, browsers with webdrivers and, in addition, database clients (clients only) for MySQL, Postgres and MongoDB (so you can connect to a remote database from Ruby).
1412 |
1413 | > To perform remote server setup, [Ansible](https://github.com/ansible/ansible) is required **on the desktop** machine (to install: Ubuntu: `$ sudo apt install ansible`, Mac OS X: `$ brew install ansible`)
1414 |
1415 | > It's recommended to use a regular user to set up the server, not `root`. To create a new user, log in to the server (`$ ssh root@your_server_ip`), type `$ adduser username` to create a user, and `$ gpasswd -a username sudo` to add the new user to the sudo group.
1416 |
1417 | Example:
1418 |
1419 | ```bash
1420 | $ kimurai setup deploy@123.123.123.123 --ask-sudo --ssh-key-path path/to/private_key
1421 | ```
1422 |
1423 | CLI options:
1424 | * `--ask-sudo` pass this option to ask for the sudo (user) password, needed for system-wide installation of packages (`apt install`)
1425 | * `--ssh-key-path path/to/private_key` authorize on the server using a private ssh key. You can omit it if the required key has already been [added to the keychain](https://help.github.com/articles/generating-a-new-ssh-key-and-adding-it-to-the-ssh-agent/#adding-your-ssh-key-to-the-ssh-agent) on your desktop (Ansible uses [SSH agent forwarding](https://developer.github.com/v3/guides/using-ssh-agent-forwarding/))
1426 | * `--ask-auth-pass` authorize on the server using the user password; an alternative to `--ssh-key-path`.
1427 | * `-p port_number` custom port for ssh connection (`-p 2222`)
1428 |
1429 | > You can check the setup playbook [here](lib/kimurai/automation/setup.yml)
1430 |
1431 | #### Deploy
1432 |
1433 | After a successful `setup` you can deploy a spider to the remote server using the `$ kimurai deploy` command. On each deploy several tasks are performed: 1) pull the repo from the remote origin to the `~/repo_name` user directory, 2) run `bundle install`, 3) update the crontab with `whenever --update-crontab` (to update the spider schedule from the schedule.rb file).
1434 |
1435 | Before `deploy`, make sure that inside the spider directory you have: 1) a git repository with a remote origin (Bitbucket, GitHub, etc.), 2) a `Gemfile`, 3) schedule.rb inside the `config` subfolder (`config/schedule.rb`).
1436 |
1437 | Example:
1438 |
1439 | ```bash
1440 | $ kimurai deploy deploy@123.123.123.123 --ssh-key-path path/to/private_key --repo-key-path path/to/repo_private_key
1441 | ```
1442 |
1443 | CLI options: _same as for the [setup](#setup) command_ (except `--ask-sudo`), plus
1444 | * `--repo-url` provide a custom repo url (`--repo-url git@bitbucket.org:username/repo_name.git`), otherwise the current `origin/master` will be used (the output of `$ git remote get-url origin`)
1445 | * `--repo-key-path` if the git repository is private, authorization is required to pull the code on the remote server. Use this option to provide a private repository SSH key. You can omit it if the required key has already been added to the keychain on your desktop (same as with the `--ssh-key-path` option)
1446 |
1447 | > You can check the deploy playbook [here](lib/kimurai/automation/deploy.yml)
1448 |
1449 | ## Spider `@config`
1450 |
1451 | Using `@config` you can set several options for a spider, like proxy, user-agent, default cookies/headers, delay between requests, browser **memory control** and so on:
1452 |
1453 | ```ruby
1454 | class Spider < Kimurai::Base
1455 | USER_AGENTS = ["Chrome", "Firefox", "Safari", "Opera"]
1456 | PROXIES = ["2.3.4.5:8080:http:username:password", "3.4.5.6:3128:http", "1.2.3.4:3000:socks5"]
1457 |
1458 | @engine = :poltergeist_phantomjs
1459 | @start_urls = ["https://example.com/"]
1460 | @config = {
1461 | headers: { "custom_header" => "custom_value" },
1462 | cookies: [{ name: "cookie_name", value: "cookie_value", domain: ".example.com" }],
1463 | user_agent: -> { USER_AGENTS.sample },
1464 | proxy: -> { PROXIES.sample },
1465 | window_size: [1366, 768],
1466 | disable_images: true,
1467 | restart_if: {
1468 | # Restart browser if provided memory limit (in kilobytes) is exceeded:
1469 | memory_limit: 350_000
1470 | },
1471 | before_request: {
1472 | # Change user agent before each request:
1473 | change_user_agent: true,
1474 | # Change proxy before each request:
1475 | change_proxy: true,
1476 | # Clear all cookies and set default cookies (if provided) before each request:
1477 | clear_and_set_cookies: true,
1478 | # Process delay before each request:
1479 | delay: 1..3
1480 | }
1481 | }
1482 |
1483 | def parse(response, url:, data: {})
1484 | # ...
1485 | end
1486 | end
1487 | ```
1488 |
1489 | ### All available `@config` options
1490 |
1491 | ```ruby
1492 | @config = {
1493 | # Custom headers, format: hash. Example: { "some header" => "some value", "another header" => "another value" }
1494 | # Works only for :mechanize and :poltergeist_phantomjs engines (Selenium doesn't allow setting/getting headers)
1495 | headers: {},
1496 |
1497 | # Custom User Agent, format: string or lambda.
1498 | # Use lambda if you want to rotate user agents before each run:
1499 | # user_agent: -> { ARRAY_OF_USER_AGENTS.sample }
1500 | # Works for all engines
1501 | user_agent: "Mozilla/5.0 Firefox/61.0",
1502 |
1503 | # Custom cookies, format: array of hashes.
1504 | # Format for a single cookie: { name: "cookie name", value: "cookie value", domain: ".example.com" }
1505 | # Works for all engines
1506 | cookies: [],
1507 |
1508 | # Proxy, format: string or lambda. Format of a proxy string: "ip:port:protocol:user:password"
1509 | # `protocol` can be http or socks5. User and password are optional.
1510 | # Use lambda if you want to rotate proxies before each run:
1511 | # proxy: -> { ARRAY_OF_PROXIES.sample }
1512 | # Works for all engines, but keep in mind that Selenium drivers don't support proxies
1513 | # with authorization. Also, Mechanize doesn't support the socks5 proxy format (only http)
1514 | proxy: "3.4.5.6:3128:http:user:pass",
1515 |
1516 | # If enabled, the browser will ignore any https errors. It's handy while using a proxy
1517 | # with a self-signed SSL cert (for example Crawlera or Mitmproxy).
1518 | # Also, it allows visiting webpages with an expired SSL certificate.
1519 | # Works for all engines
1520 | ignore_ssl_errors: true,
1521 |
1522 | # Custom window size, works for all engines
1523 | window_size: [1366, 768],
1524 |
1525 | # Skip images downloading if true, works for all engines
1526 | disable_images: true,
1527 |
1528 | # Selenium engines only: headless mode, `:native` or `:virtual_display` (default is :native).
1529 | # Although native mode has better performance, virtual display mode
1530 | # sometimes can be useful. For example, some websites can detect (and block)
1531 | # headless chrome, so you can use virtual_display mode instead
1532 | headless_mode: :native,
1533 |
1534 | # This option tells the browser not to use a proxy for the provided list of domains or IP addresses.
1535 | # Format: array of strings. Works only for :selenium_firefox and :selenium_chrome
1536 | proxy_bypass_list: [],
1537 |
1538 | # Option to provide custom SSL certificate. Works only for :poltergeist_phantomjs and :mechanize
1539 | ssl_cert_path: "path/to/ssl_cert",
1540 |
1541 | # Inject some JavaScript code into the browser.
1542 | # Format: array of strings, where each string is a path to a JS file.
1543 | # Works only for the poltergeist_phantomjs engine (Selenium doesn't support JS code injection)
1544 | extensions: ["lib/code_to_inject.js"],
1545 |
1546 | # Automatically skip duplicate (already visited) urls when using the `request_to` method.
1547 | # Possible values: `true` or a hash with options.
1548 | # In case of `true`, all visited urls will be added to the storage's scope `:requests_urls`,
1549 | # and if a url is already contained in this scope, the request will be skipped.
1550 | # You can configure this setting by providing additional options as a hash:
1551 | # `skip_duplicate_requests: { scope: :custom_scope, check_only: true }`, where:
1552 | # `scope:` - use a custom scope instead of `:requests_urls`
1553 | # `check_only:` - if true, the scope will only be checked for the url; the url will not
1554 | # be added to the scope if the scope doesn't contain it.
1555 | # Works for all drivers
1556 | skip_duplicate_requests: true,
1557 |
1558 | # Automatically skip provided errors while requesting a page.
1559 | # If a raised error matches one of the errors in the list, it will be caught,
1560 | # and the request will be skipped.
1561 | # It is a good idea to skip errors like NotFound(404), etc.
1562 | # Format: an array whose elements are error classes and/or hashes. You can use the hash format
1563 | # for more flexibility: `{ error: "RuntimeError", message: "404 => Net::HTTPNotFound" }`.
1564 | # The provided `message:` will be compared with the full error message using `String#include?`. Also,
1565 | # you can use a regex instead: `{ error: "RuntimeError", message: /404|403/ }`.
1566 | skip_request_errors: [{ error: RuntimeError, message: "404 => Net::HTTPNotFound" }],
1567 |
1568 | # Automatically retry provided errors with a few attempts while requesting a page.
1569 | # If a raised error matches one of the errors in the list, it will be caught
1570 | # and the request will be processed again after a delay. There are 3 attempts:
1571 | # first: delay 15 sec, second: delay 30 sec, third: delay 45 sec.
1572 | # If after 3 attempts there is still an exception, the exception will be raised.
1573 | # It is a good idea to retry errors like `ReadTimeout`, `HTTPBadGateway`, etc.
1574 | # Format: same as for the `skip_request_errors` option.
1575 | retry_request_errors: [Net::ReadTimeout],
1576 |
1577 | # Handle page encoding while parsing the html response using Nokogiri. There are two modes:
1578 | # Auto (`:auto`): try to fetch the correct encoding from `<meta>` tags
1579 | # Manual: set the required encoding explicitly, example: `encoding: "GB2312"`
1580 | # By default this option is unset.
1581 | encoding: nil,
1582 |
1583 | # Restart browser if one of the options is true:
1584 | restart_if: {
1585 | # Restart browser if provided memory limit (in kilobytes) is exceeded (works for all engines)
1586 | memory_limit: 350_000,
1587 |
1588 | # Restart browser if provided requests limit is exceeded (works for all engines)
1589 | requests_limit: 100
1590 | },
1591 |
1592 | # Perform several actions before each request:
1593 | before_request: {
1594 | # Change proxy before each request. The `proxy:` option above should be present
1595 | # and have a lambda format. Works only for the poltergeist and mechanize engines
1596 | # (Selenium doesn't support proxy rotation).
1597 | change_proxy: true,
1598 | 
1599 | # Change user agent before each request. The `user_agent:` option above should be present
1600 | # and have a lambda format. Works only for the poltergeist and mechanize engines
1601 | # (Selenium doesn't support getting/setting headers).
1602 | change_user_agent: true,
1603 |
1604 | # Clear all cookies before each request, works for all engines
1605 | clear_cookies: true,
1606 |
1607 | # If you want to clear all cookies and also set custom cookies (the `cookies:` option above should be present),
1608 | # use this option instead (works for all engines)
1609 | clear_and_set_cookies: true,
1610 |
1611 | # Global option to set delay between requests.
1612 | # Delay can be `Integer`, `Float` or `Range` (`2..5`). In case of a range,
1613 | # the delay will be chosen randomly for each request: `rand(2..5) # => 3`
1614 | delay: 1..3
1615 | }
1616 | }
1617 | ```
1618 |
1619 | As you can see, most of the options are universal for any engine.
1620 |
1621 | ### `@config` settings inheritance
1622 | Settings can be inherited:
1623 |
1624 | ```ruby
1625 | class ApplicationSpider < Kimurai::Base
1626 | @engine = :poltergeist_phantomjs
1627 | @config = {
1628 | user_agent: "Firefox",
1629 | disable_images: true,
1630 | restart_if: { memory_limit: 350_000 },
1631 | before_request: { delay: 1..2 }
1632 | }
1633 | end
1634 |
1635 | class CustomSpider < ApplicationSpider
1636 | @name = "custom_spider"
1637 | @start_urls = ["https://example.com/"]
1638 | @config = {
1639 | before_request: { delay: 4..6 }
1640 | }
1641 |
1642 | def parse(response, url:, data: {})
1643 | # ...
1644 | end
1645 | end
1646 | ```
1647 |
1648 | Here, the `@config` of `CustomSpider` will be _[deep merged](https://apidock.com/rails/Hash/deep_merge)_ with the `ApplicationSpider` config, so `CustomSpider` keeps all inherited options with only `delay` updated.
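
The effective config of `CustomSpider` would then look like this (an illustration of the merge result, not framework output):

```ruby
{
  user_agent: "Firefox",                 # inherited from ApplicationSpider
  disable_images: true,                  # inherited
  restart_if: { memory_limit: 350_000 }, # inherited
  before_request: { delay: 4..6 }        # overridden by CustomSpider
}
```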
1649 |
1650 | ## Project mode
1651 |
1652 | Kimurai can work in project mode ([like Scrapy](https://doc.scrapy.org/en/latest/intro/tutorial.html#creating-a-project)). To generate a new project, run: `$ kimurai generate project web_spiders` (where `web_spiders` is the name of the project).
1653 |
1654 | Structure of the project:
1655 |
1656 | ```bash
1657 | .
1658 | ├── config/
1659 | │ ├── initializers/
1660 | │ ├── application.rb
1661 | │ ├── automation.yml
1662 | │ ├── boot.rb
1663 | │ └── schedule.rb
1664 | ├── spiders/
1665 | │ └── application_spider.rb
1666 | ├── db/
1667 | ├── helpers/
1668 | │ └── application_helper.rb
1669 | ├── lib/
1670 | ├── log/
1671 | ├── pipelines/
1672 | │ ├── validator.rb
1673 | │ └── saver.rb
1674 | ├── tmp/
1675 | ├── .env
1676 | ├── Gemfile
1677 | ├── Gemfile.lock
1678 | └── README.md
1679 | ```
1680 |
1681 |
1682 | Description
1683 |
1684 | * `config/` folder for configuration files
1685 | * `config/initializers` [Rails-like initializers](https://guides.rubyonrails.org/configuring.html#using-initializer-files) to load custom code at framework start
1686 | * `config/application.rb` configuration settings for Kimurai (`Kimurai.configure do` block)
1687 | * `config/automation.yml` specify some settings for [setup and deploy](#automated-server-setup-and-deployment)
1688 | * `config/boot.rb` loads framework and project
1689 | * `config/schedule.rb` Cron [schedule for spiders](#schedule-spiders-using-cron)
1690 | * `spiders/` folder for spiders
1691 | * `spiders/application_spider.rb` Base parent class for all spiders
1692 | * `db/` store all database files here (`sqlite`, `json`, `csv`, etc.)
1693 | * `helpers/` Rails-like helpers for spiders
1694 | * `helpers/application_helper.rb` all methods inside ApplicationHelper module will be available for all spiders
1695 | * `lib/` put custom Ruby code here
1696 | * `log/` folder for logs
1697 | * `pipelines/` folder for [Scrapy-like](https://doc.scrapy.org/en/latest/topics/item-pipeline.html) pipelines. One file = one pipeline
1698 | * `pipelines/validator.rb` example pipeline to validate item
1699 | * `pipelines/saver.rb` example pipeline to save item
1700 | * `tmp/` folder for temporary files
1701 | * `.env` file to store ENV variables for the project, loaded using [Dotenv](https://github.com/bkeepers/dotenv)
1702 | * `Gemfile` dependency file
1703 | * `README.md` example project readme
1704 |
1705 |
1706 |
1707 | ### Generate new spider
1708 | To generate a new spider in the project, run:
1709 |
1710 | ```bash
1711 | $ kimurai generate spider example_spider
1712 | create spiders/example_spider.rb
1713 | ```
1714 |
1715 | The command will generate a new spider class inherited from `ApplicationSpider`:
1716 |
1717 | ```ruby
1718 | class ExampleSpider < ApplicationSpider
1719 | @name = "example_spider"
1720 | @start_urls = []
1721 | @config = {}
1722 |
1723 | def parse(response, url:, data: {})
1724 | end
1725 | end
1726 | ```
1727 |
1728 | ### Crawl
1729 | To run a particular spider in the project, run: `$ bundle exec kimurai crawl example_spider`. Don't forget to add `bundle exec` before the command to load the required environment.
1730 |
1731 | ### List
1732 | To list all project spiders, run: `$ bundle exec kimurai list`
1733 |
1734 | ### Parse
1735 | For project spiders you can use the `$ kimurai parse` command, which helps to debug spiders:
1736 |
1737 | ```bash
1738 | $ bundle exec kimurai parse example_spider parse_product --url https://example-shop.com/product-1
1739 | ```
1740 |
1741 | where `example_spider` is the spider to run, `parse_product` is the spider method to process and `--url` is the url to open inside the processing method.
1742 |
1743 | ### Pipelines, `send_item` method
1744 | You can use item pipelines to organize item processing logic for all project spiders in one place (also check Scrapy's [description of pipelines](https://doc.scrapy.org/en/latest/topics/item-pipeline.html#item-pipeline)).
1745 |
1746 | Imagine you have three spiders, each crawling a different e-commerce shop and saving only shoe listings. For each spider, you want to save only items with a "shoe" category, a unique sku, a valid title/price and existing images. To avoid code duplication between spiders, use pipelines:
1747 |
1748 |
1749 | Example
1750 |
1751 | pipelines/validator.rb
1752 | ```ruby
1753 | class Validator < Kimurai::Pipeline
1754 | def process_item(item, options: {})
1755 | # Here you can validate item and raise `DropItemError`
1756 | # if one of the validations failed. Examples:
1757 |
1758 | # Drop the item if its category is not "shoe":
1759 | if item[:category] != "shoe"
1760 | raise DropItemError, "Wrong item category"
1761 | end
1762 |
1763 | # Check the item sku for uniqueness using the built-in unique? helper:
1764 | unless unique?(:sku, item[:sku])
1765 | raise DropItemError, "Item sku is not unique"
1766 | end
1767 |
1768 | # Drop the item if the title is shorter than 5 characters:
1769 | if item[:title].size < 5
1770 | raise DropItemError, "Item title is short"
1771 | end
1772 |
1773 | # Drop the item if the price is not present:
1774 | unless item[:price].present?
1775 | raise DropItemError, "item price is not present"
1776 | end
1777 |
1778 | # Drop the item if it doesn't contain any images:
1779 | unless item[:images].present?
1780 | raise DropItemError, "Item images are not present"
1781 | end
1782 |
1783 | # Pass item to the next pipeline (if it wasn't dropped):
1784 | item
1785 | end
1786 | end
1787 |
1788 | ```
1789 |
1790 | pipelines/saver.rb
1791 | ```ruby
1792 | class Saver < Kimurai::Pipeline
1793 | def process_item(item, options: {})
1794 | # Here you can save item to the database, send it to a remote API or
1795 | # simply save item to a file format using `save_to` helper:
1796 |
1797 | # To get the name of current spider: `spider.class.name`
1798 | save_to "db/#{spider.class.name}.json", item, format: :json
1799 |
1800 | item
1801 | end
1802 | end
1803 | ```
1804 |
1805 | spiders/application_spider.rb
1806 | ```ruby
1807 | class ApplicationSpider < Kimurai::Base
1808 | @engine = :selenium_chrome
1809 | # Define pipelines (by order) for all spiders:
1810 | @pipelines = [:validator, :saver]
1811 | end
1812 | ```
1813 |
1814 | spiders/shop_spider_1.rb
1815 | ```ruby
1816 | class ShopSpiderOne < ApplicationSpider
1817 | @name = "shop_spider_1"
1818 | @start_urls = ["https://shop-1.com"]
1819 |
1820 | # ...
1821 |
1822 | def parse_product(response, url:, data: {})
1823 | # ...
1824 |
1825 | # Send item to pipelines:
1826 | send_item item
1827 | end
1828 | end
1829 | ```
1830 |
1831 | spiders/shop_spider_2.rb
1832 | ```ruby
1833 | class ShopSpiderTwo < ApplicationSpider
1834 | @name = "shop_spider_2"
1835 | @start_urls = ["https://shop-2.com"]
1836 |
1837 | def parse_product(response, url:, data: {})
1838 | # ...
1839 |
1840 | # Send item to pipelines:
1841 | send_item item
1842 | end
1843 | end
1844 | ```
1845 |
1846 | spiders/shop_spider_3.rb
1847 | ```ruby
1848 | class ShopSpiderThree < ApplicationSpider
1849 | @name = "shop_spider_3"
1850 | @start_urls = ["https://shop-3.com"]
1851 |
1852 | def parse_product(response, url:, data: {})
1853 | # ...
1854 |
1855 | # Send item to pipelines:
1856 | send_item item
1857 | end
1858 | end
1859 | ```
1860 |
1861 |
1862 | When you start using pipelines, item stats appear in the logs:
1863 |
1864 |
1865 | Example
1866 |
1867 | pipelines/validator.rb
1868 | ```ruby
1869 | class Validator < Kimurai::Pipeline
1870 | def process_item(item, options: {})
1871 | if item[:star_count] < 10
1872 | raise DropItemError, "Repository doesn't have enough stars"
1873 | end
1874 |
1875 | item
1876 | end
1877 | end
1878 | ```
1879 |
1880 | spiders/github_spider.rb
1881 | ```ruby
1882 | class GithubSpider < ApplicationSpider
1883 | @name = "github_spider"
1884 | @engine = :selenium_chrome
1885 | @pipelines = [:validator]
1886 | @start_urls = ["https://github.com/search?q=Ruby%20Web%20Scraping"]
1887 | @config = {
1888 | user_agent: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36",
1889 | before_request: { delay: 4..7 }
1890 | }
1891 |
1892 | def parse(response, url:, data: {})
1893 | response.xpath("//ul[@class='repo-list']/div//h3/a").each do |a|
1894 | request_to :parse_repo_page, url: absolute_url(a[:href], base: url)
1895 | end
1896 |
1897 | if next_page = response.at_xpath("//a[@class='next_page']")
1898 | request_to :parse, url: absolute_url(next_page[:href], base: url)
1899 | end
1900 | end
1901 |
1902 | def parse_repo_page(response, url:, data: {})
1903 | item = {}
1904 |
1905 | item[:owner] = response.xpath("//h1//a[@rel='author']").text
1906 | item[:repo_name] = response.xpath("//h1/strong[@itemprop='name']/a").text
1907 | item[:repo_url] = url
1908 | item[:description] = response.xpath("//span[@itemprop='about']").text.squish
1909 | item[:tags] = response.xpath("//div[@id='topics-list-container']/div/a").map { |a| a.text.squish }
1910 | item[:watch_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Watch')]/a[2]").text.squish.delete(",").to_i
1911 | item[:star_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Star')]/a[2]").text.squish.delete(",").to_i
1912 | item[:fork_count] = response.xpath("//ul[@class='pagehead-actions']/li[contains(., 'Fork')]/a[2]").text.squish.delete(",").to_i
1913 | item[:last_commit] = response.xpath("//span[@itemprop='dateModified']/*").text
1914 |
1915 | send_item item
1916 | end
1917 | end
1918 | ```
1919 |
1920 | ```
1921 | $ bundle exec kimurai crawl github_spider
1922 |
1923 | I, [2018-08-22 15:56:35 +0400#1358] [M: 47347279209980] INFO -- github_spider: Spider: started: github_spider
1924 | D, [2018-08-22 15:56:35 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: BrowserBuilder (selenium_chrome): created browser instance
1925 | I, [2018-08-22 15:56:40 +0400#1358] [M: 47347279209980] INFO -- github_spider: Browser: started get request to: https://github.com/search?q=Ruby%20Web%20Scraping
1926 | I, [2018-08-22 15:56:44 +0400#1358] [M: 47347279209980] INFO -- github_spider: Browser: finished get request to: https://github.com/search?q=Ruby%20Web%20Scraping
1927 | I, [2018-08-22 15:56:44 +0400#1358] [M: 47347279209980] INFO -- github_spider: Info: visits: requests: 1, responses: 1
1928 | D, [2018-08-22 15:56:44 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: Browser: driver.current_memory: 116182
1929 | D, [2018-08-22 15:56:44 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: Browser: sleep 5 seconds before request...
1930 |
1931 | I, [2018-08-22 15:56:49 +0400#1358] [M: 47347279209980] INFO -- github_spider: Browser: started get request to: https://github.com/lorien/awesome-web-scraping
1932 | I, [2018-08-22 15:56:50 +0400#1358] [M: 47347279209980] INFO -- github_spider: Browser: finished get request to: https://github.com/lorien/awesome-web-scraping
1933 | I, [2018-08-22 15:56:50 +0400#1358] [M: 47347279209980] INFO -- github_spider: Info: visits: requests: 2, responses: 2
1934 | D, [2018-08-22 15:56:50 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: Browser: driver.current_memory: 217432
1935 | D, [2018-08-22 15:56:50 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: Pipeline: starting processing item through 1 pipeline...
1936 | I, [2018-08-22 15:56:50 +0400#1358] [M: 47347279209980] INFO -- github_spider: Pipeline: processed: {"owner":"lorien","repo_name":"awesome-web-scraping","repo_url":"https://github.com/lorien/awesome-web-scraping","description":"List of libraries, tools and APIs for web scraping and data processing.","tags":["awesome","awesome-list","web-scraping","data-processing","python","javascript","php","ruby"],"watch_count":159,"star_count":2423,"fork_count":358,"last_commit":"4 days ago"}
1937 | I, [2018-08-22 15:56:50 +0400#1358] [M: 47347279209980] INFO -- github_spider: Info: items: sent: 1, processed: 1
1938 | D, [2018-08-22 15:56:50 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: Browser: sleep 6 seconds before request...
1939 |
1940 | ...
1941 |
1942 | I, [2018-08-22 16:11:50 +0400#1358] [M: 47347279209980] INFO -- github_spider: Browser: started get request to: https://github.com/preston/idclight
1943 | I, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980] INFO -- github_spider: Browser: finished get request to: https://github.com/preston/idclight
1944 | I, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980] INFO -- github_spider: Info: visits: requests: 140, responses: 140
1945 | D, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: Browser: driver.current_memory: 211713
1946 |
1947 | D, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980] DEBUG -- github_spider: Pipeline: starting processing item through 1 pipeline...
1948 | E, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980] ERROR -- github_spider: Pipeline: dropped: #<Kimurai::Pipeline::DropItemError: Repository doesn't have enough stars>, item: {:owner=>"preston", :repo_name=>"idclight", :repo_url=>"https://github.com/preston/idclight", :description=>"A Ruby gem for accessing the freely available IDClight (IDConverter Light) web service, which convert between different types of gene IDs such as Hugo and Entrez. Queries are screen scraped from http://idclight.bioinfo.cnio.es.", :tags=>[], :watch_count=>6, :star_count=>1, :fork_count=>0, :last_commit=>"on Apr 12, 2012"}
1949 |
1950 | I, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980] INFO -- github_spider: Info: items: sent: 127, processed: 12
1951 |
1952 | I, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980] INFO -- github_spider: Browser: driver selenium_chrome has been destroyed
1953 | I, [2018-08-22 16:11:51 +0400#1358] [M: 47347279209980] INFO -- github_spider: Spider: stopped: {:spider_name=>"github_spider", :status=>:completed, :environment=>"development", :start_time=>2018-08-22 15:56:35 +0400, :stop_time=>2018-08-22 16:11:51 +0400, :running_time=>"15m, 16s", :visits=>{:requests=>140, :responses=>140}, :items=>{:sent=>127, :processed=>12}, :error=>nil}
1954 | ```
1955 |
1956 |
1957 | Also, you can pass custom options to a pipeline from a particular spider if you want to change the pipeline's behavior for that spider:
1958 |
1959 |
1960 | Example
1961 |
1962 | spiders/custom_spider.rb
1963 | ```ruby
1964 | class CustomSpider < ApplicationSpider
1965 | @name = "custom_spider"
1966 | @start_urls = ["https://example.com"]
1967 | @pipelines = [:validator]
1968 |
1969 | # ...
1970 |
1971 | def parse_item(response, url:, data: {})
1972 | # ...
1973 |
1974 | # Pass custom option `skip_uniq_checking` for Validator pipeline:
1975 | send_item item, validator: { skip_uniq_checking: true }
1976 | end
1977 | end
1978 |
1979 | ```
1980 |
1981 | pipelines/validator.rb
1982 | ```ruby
1983 | class Validator < Kimurai::Pipeline
1984 | def process_item(item, options: {})
1985 |
1986 | # Do not check item sku for uniqueness if options[:skip_uniq_checking] is true
1987 | if options[:skip_uniq_checking] != true
1988 | raise DropItemError, "Item sku is not unique" unless unique?(:sku, item[:sku])
1989 | end
1990 | item # return the item so it gets passed on (assumed fix: a pipeline should return the item)
1991 | end
1992 | end
1993 | ```
1993 |
1994 |
1995 |
1996 | ### Runner
1997 |
1998 | You can run project spiders one by one or in parallel using the `$ kimurai runner` command:
1999 |
2000 | ```
2001 | $ bundle exec kimurai list
2002 | custom_spider
2003 | example_spider
2004 | github_spider
2005 |
2006 | $ bundle exec kimurai runner -j 3
2007 | >>> Runner: started: {:id=>1533727423, :status=>:processing, :start_time=>2018-08-08 15:23:43 +0400, :stop_time=>nil, :environment=>"development", :concurrent_jobs=>3, :spiders=>["custom_spider", "github_spider", "example_spider"]}
2008 | > Runner: started spider: custom_spider, index: 0
2009 | > Runner: started spider: github_spider, index: 1
2010 | > Runner: started spider: example_spider, index: 2
2011 | < Runner: stopped spider: custom_spider, index: 0
2012 | < Runner: stopped spider: example_spider, index: 2
2013 | < Runner: stopped spider: github_spider, index: 1
2014 | <<< Runner: stopped: {:id=>1533727423, :status=>:completed, :start_time=>2018-08-08 15:23:43 +0400, :stop_time=>2018-08-08 15:25:11 +0400, :environment=>"development", :concurrent_jobs=>3, :spiders=>["custom_spider", "github_spider", "example_spider"]}
2015 | ```
2016 |
2017 | Each spider runs in a separate process. Spider logs are available in the `log/` folder. Pass the `-j` option to specify how many spiders should be processed at the same time (default is 1).
2018 |
2019 | You can provide additional arguments like `--include` or `--exclude` to specify which spiders to run:
2020 |
2021 | ```bash
2022 | # Run only custom_spider and example_spider:
2023 | $ bundle exec kimurai runner --include custom_spider example_spider
2024 |
2025 | # Run all except github_spider:
2026 | $ bundle exec kimurai runner --exclude github_spider
2027 | ```
2028 |
2029 | #### Runner callbacks
2030 |
2031 | You can perform custom actions before the runner starts and after it stops using `config.runner_at_start_callback` and `config.runner_at_stop_callback`. Check [config/application.rb](lib/kimurai/template/config/application.rb) to see an example.
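
A minimal sketch, assuming (as in the template) that each callback is a lambda receiving the runner's session info hash; the logging here is illustrative:

```ruby
Kimurai.configure do |config|
  config.runner_at_start_callback = lambda do |session_info|
    puts "Runner started: #{JSON.pretty_generate(session_info)}"
  end

  config.runner_at_stop_callback = lambda do |session_info|
    puts "Runner stopped: #{JSON.pretty_generate(session_info)}"
  end
end
```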
2032 |
2033 |
2034 | ## Chat Support and Feedback
2035 | Will be updated
2036 |
2037 | ## License
2038 | The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
2039 |
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | require "bundler/gem_tasks"
2 | require "rake/testtask"
3 |
4 | Rake::TestTask.new(:test) do |t|
5 | t.libs << "test"
6 | t.libs << "lib"
7 | t.test_files = FileList["test/**/*_test.rb"]
8 | end
9 |
10 | task :default => :test
11 |
--------------------------------------------------------------------------------
/bin/console:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | require "bundler/setup"
4 | require "kimurai"
5 |
6 | # You can add fixtures and/or initialization code here to make experimenting
7 | # with your gem easier. You can also use a different console, if you like.
8 |
9 | # (If you use this, don't forget to add pry to your Gemfile!)
10 | # require "pry"
11 | # Pry.start
12 |
13 | require "irb"
14 | IRB.start(__FILE__)
15 |
--------------------------------------------------------------------------------
/bin/setup:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -euo pipefail
3 | IFS=$'\n\t'
4 | set -vx
5 |
6 | bundle install
7 |
8 | # Do any other automated setup that you need to do here
9 |
--------------------------------------------------------------------------------
/exe/kimurai:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | require 'kimurai'
4 | require 'kimurai/cli'
5 |
6 | Kimurai::CLI.start(ARGV)
7 |
--------------------------------------------------------------------------------
/kimurai.gemspec:
--------------------------------------------------------------------------------
1 |
2 | lib = File.expand_path("../lib", __FILE__)
3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4 | require "kimurai/version"
5 |
6 | Gem::Specification.new do |spec|
7 | spec.name = "kimurai"
8 | spec.version = Kimurai::VERSION
9 | spec.authors = ["Victor Afanasev"]
10 | spec.email = ["vicfreefly@gmail.com"]
11 |
12 | spec.summary = "Modern web scraping framework written in Ruby and based on Capybara/Nokogiri"
13 | spec.homepage = "https://github.com/vifreefly/kimuraframework"
14 | spec.license = "MIT"
15 |
16 | # Specify which files should be added to the gem when it is released.
17 | # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
18 | spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
19 | `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
20 | end
21 | spec.bindir = "exe"
22 | spec.executables = "kimurai"
23 | spec.require_paths = ["lib"]
24 | spec.required_ruby_version = ">= 2.5.0"
25 |
26 | spec.add_dependency "thor"
27 | spec.add_dependency "cliver"
28 | spec.add_dependency "activesupport"
29 | spec.add_dependency "murmurhash3"
30 | spec.add_dependency "nokogiri"
31 |
32 | spec.add_dependency "capybara", ">= 2.15", "< 4.0"
33 | spec.add_dependency "capybara-mechanize"
34 | spec.add_dependency "poltergeist"
35 | spec.add_dependency "selenium-webdriver"
36 |
37 | spec.add_dependency "headless"
38 | spec.add_dependency "pmap"
39 |
40 | spec.add_dependency "whenever"
41 |
42 | spec.add_dependency "rbcat", "~> 0.2"
43 | spec.add_dependency "pry"
44 |
45 | spec.add_development_dependency "bundler", "~> 1.16"
46 | spec.add_development_dependency "rake", "~> 10.0"
47 | spec.add_development_dependency "minitest", "~> 5.0"
48 | end
49 |
--------------------------------------------------------------------------------
/lib/kimurai.rb:
--------------------------------------------------------------------------------
1 | require 'ostruct'
2 | require 'logger'
3 | require 'json'
4 | require 'active_support'
5 | require 'active_support/core_ext'
6 | require 'rbcat'
7 |
8 | require_relative 'kimurai/version'
9 |
10 | require_relative 'kimurai/core_ext/numeric'
11 | require_relative 'kimurai/core_ext/string'
12 | require_relative 'kimurai/core_ext/array'
13 | require_relative 'kimurai/core_ext/hash'
14 |
15 | require_relative 'kimurai/browser_builder'
16 | require_relative 'kimurai/base_helper'
17 | require_relative 'kimurai/pipeline'
18 | require_relative 'kimurai/base'
19 |
20 | module Kimurai
21 | class << self
22 | def configuration
23 | @configuration ||= OpenStruct.new
24 | end
25 |
26 | def configure
27 | yield(configuration)
28 | end
29 |
30 | def env
31 | ENV.fetch("KIMURAI_ENV") { "development" }
32 | end
33 |
34 | def time_zone
35 | ENV["TZ"]
36 | end
37 |
38 | def time_zone=(value)
39 | ENV.store("TZ", value)
40 | end
41 |
42 | def list
43 | Base.descendants.map do |klass|
44 | next unless klass.name
45 | [klass.name, klass]
46 | end.compact.to_h
47 | end
48 |
49 | def find_by_name(name)
50 | return unless name
51 | Base.descendants.find { |klass| klass.name == name }
52 | end
53 | end
54 | end
55 |
--------------------------------------------------------------------------------
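The module-level API above is small: `configure` yields the `OpenStruct` returned by `configuration`, `env` falls back to `"development"`, and `list`/`find_by_name` resolve spider classes by their `@name`. A minimal usage sketch (the spider name is hypothetical):

```ruby
require 'kimurai'

# Any key works here, since `configuration` is an OpenStruct:
Kimurai.configure do |config|
  config.logger = Logger.new(STDOUT)  # picked up by Base.logger when present
  config.colorize_logger = false      # skip Rbcat colorizing even in development
end

Kimurai.env                # => "development" unless ENV["KIMURAI_ENV"] is set
Kimurai.time_zone = "UTC"  # simply writes ENV["TZ"]

# Named subclasses of Kimurai::Base are discoverable:
Kimurai.list                            # => { "example_spider" => ExampleSpider, ... }
Kimurai.find_by_name("example_spider")  # => ExampleSpider, or nil
```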
/lib/kimurai/automation/deploy.yml:
--------------------------------------------------------------------------------
1 | ---
2 | - hosts: all
3 | vars:
4 | rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv
5 | rbenv_shims_path: "{{ rbenv_root_path }}/shims"
6 | repo_url:
7 | repo_name:
8 | repo_key_path:
9 |
10 | tasks:
11 | - name: Copy custom git ssh key to /tmp/private_key (if provided)
12 | when: repo_key_path is not none
13 | copy:
14 | src: "{{ repo_key_path }}"
15 | dest: /tmp/private_key
16 | mode: 0600
17 |
18 | - name: Clone/pull project repo to ~/{{ repo_name }} user directory (using ssh-agent forwarding or https)
19 | when: repo_key_path is none
20 | git:
21 | repo: "{{ repo_url }}"
22 | dest: "~/{{ repo_name }}"
23 | force: true
24 | accept_hostkey: true
25 |
26 | - name: Clone/pull project repo to ~/{{ repo_name }} user directory (using custom git ssh key)
27 | when: repo_key_path is not none
28 | git:
29 | repo: "{{ repo_url }}"
30 | dest: "~/{{ repo_name }}"
31 | force: true
32 | accept_hostkey: true
33 | key_file: /tmp/private_key
34 |
35 | - name: Delete custom git ssh key from /tmp/private_key (if provided)
36 | when: repo_key_path is not none
37 | file:
38 | state: absent
39 | path: /tmp/private_key
40 |
41 | - name: Run bundle install
42 | command: bundle install
43 | args:
44 | chdir: ~/{{ repo_name }}
45 | environment:
46 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
47 |
48 | - name: Run whenever to update crontab
49 | command: whenever --update-crontab
50 | args:
51 | chdir: ~/{{ repo_name }}
52 | environment:
53 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
54 |
55 |
--------------------------------------------------------------------------------
/lib/kimurai/automation/setup.yml:
--------------------------------------------------------------------------------
1 | ---
2 | - hosts: all
3 | vars:
4 | ruby: 2.5.3
5 | rbenv_root_path: /home/{{ ansible_user_id }}/.rbenv
6 | rbenv_shims_path: "{{ rbenv_root_path }}/shims"
7 | ruby_versions_path: "{{ rbenv_root_path }}/versions"
8 | # check latest here http://phantomjs.org/download.html
9 | phantomjs: 2.1.1
10 | # check latest here https://github.com/mozilla/geckodriver/releases/
11 | geckodriver: 0.23.0
12 | # check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
13 | chromedriver: 2.44
14 |
15 | tasks:
16 | - name: Update apt cache
17 | become: true
18 | apt: update_cache=yes cache_valid_time=86400
19 |
20 | - name: Install base packages
21 | become: true
22 | apt:
23 | pkg: "{{ item }}"
24 | state: present
25 | with_items:
26 | - git
27 | - xvfb
28 | - libsqlite3-dev
29 | - sqlite3
30 | - mongodb-clients
31 | - mysql-client
32 | - libmysqlclient-dev
33 | - postgresql-client
34 | - libpq-dev
35 |
36 | - import_tasks: setup/ruby_environment.yml
37 |
38 | - import_tasks: setup/phantomjs.yml
39 | become: true
40 |
41 | - import_tasks: setup/firefox_geckodriver.yml
42 | become: true
43 |
44 | - import_tasks: setup/chromium_chromedriver.yml
45 | become: true
46 |
--------------------------------------------------------------------------------
/lib/kimurai/automation/setup/chromium_chromedriver.yml:
--------------------------------------------------------------------------------
1 | ---
2 | - name: Install chromium browser
3 | apt:
4 | pkg: chromium-browser
5 | state: present
6 |
7 | - name: Get current chromedriver version
8 | shell: chromedriver --version
9 | args:
10 | executable: /bin/bash
11 | register: current_chromedriver_version
12 | changed_when: false
13 | ignore_errors: true
14 |
15 | - name: Install unzip tool to unarchive chromedriver archive
16 | apt:
17 | pkg: unzip
18 | state: present
19 |
20 | - name: Download chromedriver binary archive and unarchive it to /usr/local/bin
21 | unarchive:
22 | src: https://chromedriver.storage.googleapis.com/{{ chromedriver }}/chromedriver_linux64.zip
23 | dest: /usr/local/bin
24 | remote_src: true
25 | mode: a+x
26 |   when: chromedriver not in current_chromedriver_version.stdout
27 |
--------------------------------------------------------------------------------
/lib/kimurai/automation/setup/firefox_geckodriver.yml:
--------------------------------------------------------------------------------
1 | ---
2 | - name: Install firefox
3 | apt:
4 | pkg: firefox
5 | state: present
6 |
7 | - name: Get current geckodriver version
8 | shell: geckodriver --version
9 | args:
10 | executable: /bin/bash
11 | register: current_geckodriver_version
12 | changed_when: false
13 | ignore_errors: true
14 |
15 | - name: Download geckodriver binary archive and unarchive it to /usr/local/bin
16 | unarchive:
17 | src: https://github.com/mozilla/geckodriver/releases/download/v{{ geckodriver }}/geckodriver-v{{ geckodriver }}-linux64.tar.gz
18 | dest: /usr/local/bin
19 | remote_src: true
20 | when: geckodriver not in current_geckodriver_version.stdout
21 |
--------------------------------------------------------------------------------
/lib/kimurai/automation/setup/phantomjs.yml:
--------------------------------------------------------------------------------
1 | ---
2 | - name: Install dependencies for PhantomJS
3 | apt:
4 | pkg: "{{ item }}"
5 | state: present
6 | with_items:
7 | - chrpath
8 | - libxft-dev
9 | - libfreetype6
10 | - libfreetype6-dev
11 | - libfontconfig1
12 | - libfontconfig1-dev
13 |
14 | - name: Get current phantomjs version
15 | shell: phantomjs -v
16 | args:
17 | executable: /bin/bash
18 | register: current_phantomjs_version
19 | changed_when: false
20 | ignore_errors: true
21 |
22 | - name: Download phantomJS archive and unarchive it to /usr/local/lib
23 | unarchive:
24 | src: https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-{{ phantomjs }}-linux-x86_64.tar.bz2
25 | dest: /usr/local/lib
26 | remote_src: true
27 | when: phantomjs not in current_phantomjs_version.stdout
28 |
29 | - name: Link PhantomJS binary to /usr/local/bin/phantomjs
30 | file:
31 | src: /usr/local/lib/phantomjs-{{ phantomjs }}-linux-x86_64/bin/phantomjs
32 | dest: /usr/local/bin/phantomjs
33 | state: link
34 |
--------------------------------------------------------------------------------
/lib/kimurai/automation/setup/ruby_environment.yml:
--------------------------------------------------------------------------------
1 | ---
2 | - name: Install dependencies for ruby-build
3 | become: true
4 | apt:
5 | pkg: "{{ item }}"
6 | state: present
7 | with_items:
8 | - zlib1g-dev
9 | - build-essential
10 | - libssl-dev
11 | - libreadline-dev
12 | - libreadline6-dev
13 | - libyaml-dev
14 | - libxml2-dev
15 | - libxslt1-dev
16 | - libcurl4-openssl-dev
17 | - libffi-dev
18 |
19 | - name: Clone Rbenv repository to the {{ ansible_user_id }} user directory
20 | git:
21 | repo: https://github.com/sstephenson/rbenv.git
22 | dest: "{{ rbenv_root_path }}"
23 |
24 | - name: Clone ruby-build repo to the {{ ansible_user_id }} user directory
25 | git:
26 | repo: https://github.com/sstephenson/ruby-build.git
27 | dest: "{{ rbenv_root_path }}/plugins/ruby-build"
28 |
29 | - name: Add Rbenv path to the .bashrc
30 | lineinfile:
31 | dest: ~/.bashrc
32 | regexp: '^export PATH="\$HOME\/\.rbenv'
33 | line: export PATH="$HOME/.rbenv/bin:$PATH"
34 | state: present
35 |
36 | - name: Add Rbenv init to the .bashrc
37 | lineinfile:
38 | dest: ~/.bashrc
39 | regexp: '^eval "\$\(rbenv'
40 | line: eval "$(rbenv init -)"
41 | state: present
42 |
43 | - name: Check if desired Ruby version already installed
44 | stat:
45 | path: "{{ ruby_versions_path }}/{{ ruby }}"
46 | register: ruby_present
47 |
48 | - name: Install desired Ruby version using ruby-build (this can take a while)
49 | command: rbenv install {{ ruby }}
50 | when: not ruby_present.stat.exists
51 | environment:
52 | CONFIGURE_OPTS: "--disable-install-doc"
53 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
54 |
55 | - name: Get current Ruby version
56 | command: "ruby -v"
57 | register: current_ruby_version
58 | changed_when: false
59 | ignore_errors: true
60 | environment:
61 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
62 |
63 | - name: Set desired Ruby version as a global version
64 | command: "rbenv global {{ ruby }}"
65 | when: ruby not in current_ruby_version.stdout
66 | environment:
67 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
68 | register: set_ruby
69 |
70 | - name: Execute `rbenv rehash` command
71 | command: rbenv rehash
72 | when: set_ruby.changed
73 | environment:
74 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
75 |
76 | - name: Create ~/.gemrc file to skip docs
77 | copy:
78 | dest: ~/.gemrc
79 | content: "gem: --no-ri --no-rdoc"
80 |
81 | - name: Create ~/.bundle directory
82 | file:
83 | dest: ~/.bundle
84 | state: directory
85 |
86 | - name: Create ~/.bundle/config file with default settings `BUNDLE_GIT__ALLOW_INSECURE true` and `BUNDLE_JOBS 4`
87 | copy:
88 | dest: ~/.bundle/config
89 | content: |
90 | BUNDLE_GIT__ALLOW_INSECURE: "true"
91 | BUNDLE_JOBS: "4"
92 |
93 | - name: Check if Bundler gem installed
94 | stat:
95 | path: "{{ ruby_versions_path }}/{{ ruby }}/bin/bundler"
96 | register: bundler_gem_present
97 |
98 | - name: Install Bundler gem
99 | command: gem install bundler
100 | when: not bundler_gem_present.stat.exists
101 | environment:
102 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
103 |
104 | - name: Check if Whenever gem installed
105 | stat:
106 | path: "{{ ruby_versions_path }}/{{ ruby }}/bin/whenever"
107 | register: whenever_gem_present
108 |
109 | - name: Install Whenever gem
110 | command: gem install whenever
111 | when: not whenever_gem_present.stat.exists
112 | environment:
113 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
114 |
115 | - name: Check if Kimurai gem installed
116 | stat:
117 | path: "{{ ruby_versions_path }}/{{ ruby }}/bin/kimurai"
118 | register: kimurai_gem_present
119 |
120 | - name: Install Kimurai gem
121 | command: gem install kimurai
122 | when: not kimurai_gem_present.stat.exists
123 | environment:
124 | PATH: "{{ rbenv_root_path }}/bin:{{ rbenv_root_path }}/shims:{{ ansible_env.PATH }}"
125 |
--------------------------------------------------------------------------------
/lib/kimurai/base.rb:
--------------------------------------------------------------------------------
1 | require_relative 'base/saver'
2 | require_relative 'base/storage'
3 |
4 | module Kimurai
5 | class Base
6 | class InvalidUrlError < StandardError; end
7 |
8 | # don't deep merge config's headers hash option
9 | DMERGE_EXCLUDE = [:headers]
10 |
11 | LoggerFormatter = proc do |severity, datetime, progname, msg|
12 | current_thread_id = Thread.current.object_id
13 | thread_type = Thread.main == Thread.current ? "M" : "C"
14 | output = "%s, [%s#%d] [%s: %s] %5s -- %s: %s\n"
15 | .freeze % [severity[0..0], datetime, $$, thread_type, current_thread_id, severity, progname, msg]
16 |
17 | if Kimurai.configuration.colorize_logger != false && Kimurai.env == "development"
18 | Rbcat.colorize(output, predefined: [:jsonhash, :logger])
19 | else
20 | output
21 | end
22 | end
23 |
24 | include BaseHelper
25 |
26 | ###
27 |
28 | class << self
29 | attr_reader :run_info, :savers, :storage
30 | end
31 |
32 | def self.running?
33 | @run_info && @run_info[:status] == :running
34 | end
35 |
36 | def self.completed?
37 | @run_info && @run_info[:status] == :completed
38 | end
39 |
40 | def self.failed?
41 | @run_info && @run_info[:status] == :failed
42 | end
43 |
44 | def self.visits
45 | @run_info && @run_info[:visits]
46 | end
47 |
48 | def self.items
49 | @run_info && @run_info[:items]
50 | end
51 |
52 | def self.update(type, subtype)
53 | return unless @run_info
54 | @update_mutex.synchronize { @run_info[type][subtype] += 1 }
55 | end
56 |
57 | def self.add_event(scope, event)
58 | return unless @run_info
59 | @update_mutex.synchronize { @run_info[:events][scope][event] += 1 }
60 | end
61 |
62 | ###
63 |
64 | @engine = :mechanize
65 | @pipelines = []
66 | @config = {}
67 |
68 | def self.name
69 | @name
70 | end
71 |
72 | def self.engine
73 | @engine ||= superclass.engine
74 | end
75 |
76 | def self.pipelines
77 | @pipelines ||= superclass.pipelines
78 | end
79 |
80 | def self.start_urls
81 | @start_urls
82 | end
83 |
84 | def self.config
85 | if superclass.equal?(::Object)
86 | @config
87 | else
88 | superclass.config.deep_merge_excl(@config || {}, DMERGE_EXCLUDE)
89 | end
90 | end
91 |
92 | ###
93 |
94 | def self.logger
95 | @logger ||= Kimurai.configuration.logger || begin
96 | log_level = (ENV["LOG_LEVEL"] || Kimurai.configuration.log_level || "DEBUG").to_s.upcase
97 | log_level = "Logger::#{log_level}".constantize
98 | Logger.new(STDOUT, formatter: LoggerFormatter, level: log_level, progname: name)
99 | end
100 | end
101 |
102 | def self.crawl!(exception_on_fail: true)
103 | logger.error "Spider: already running: #{name}" and return false if running?
104 |
105 | @storage = Storage.new
106 | @savers = {}
107 | @update_mutex = Mutex.new
108 |
109 | @run_info = {
110 | spider_name: name, status: :running, error: nil, environment: Kimurai.env,
111 | start_time: Time.new, stop_time: nil, running_time: nil,
112 | visits: { requests: 0, responses: 0 }, items: { sent: 0, processed: 0 },
113 | events: { requests_errors: Hash.new(0), drop_items_errors: Hash.new(0), custom: Hash.new(0) }
114 | }
115 |
116 | ###
117 |
118 | logger.info "Spider: started: #{name}"
119 | open_spider if self.respond_to? :open_spider
120 |
121 | spider = self.new
122 | spider.with_info = true
123 | if start_urls
124 | start_urls.each do |start_url|
125 | if start_url.class == Hash
126 | spider.request_to(:parse, start_url)
127 | else
128 | spider.request_to(:parse, url: start_url)
129 | end
130 | end
131 | else
132 | spider.parse
133 | end
134 | rescue StandardError, SignalException, SystemExit => e
135 | @run_info.merge!(status: :failed, error: e.inspect)
136 | exception_on_fail ? raise(e) : [@run_info, e]
137 | else
138 | @run_info.merge!(status: :completed)
139 | ensure
140 | if spider
141 | spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
142 |
143 | stop_time = Time.now
144 | total_time = (stop_time - @run_info[:start_time]).round(3)
145 | @run_info.merge!(stop_time: stop_time, running_time: total_time)
146 |
147 | close_spider if self.respond_to? :close_spider
148 |
149 | message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
150 | failed? ? logger.fatal(message) : logger.info(message)
151 |
152 | @run_info, @storage, @savers, @update_mutex = nil
153 | end
154 | end
155 |
156 | def self.parse!(handler, *args, **request)
157 | spider = self.new
158 |
159 | if args.present?
160 | spider.public_send(handler, *args)
161 | elsif request.present?
162 | spider.request_to(handler, request)
163 | else
164 | spider.public_send(handler)
165 | end
166 | ensure
167 | spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
168 | end
169 |
170 | ###
171 |
172 | attr_reader :logger
173 | attr_accessor :with_info
174 |
175 | def initialize(engine = self.class.engine, config: {})
176 | @engine = engine || self.class.engine
177 | @config = self.class.config.deep_merge_excl(config, DMERGE_EXCLUDE)
178 | @pipelines = self.class.pipelines.map do |pipeline_name|
179 | klass = Pipeline.descendants.find { |kl| kl.name == pipeline_name }
180 | instance = klass.new
181 | instance.spider = self
182 | [pipeline_name, instance]
183 | end.to_h
184 |
185 | @logger = self.class.logger
186 | @savers = {}
187 | end
188 |
189 | def browser
190 | @browser ||= BrowserBuilder.build(@engine, @config, spider: self)
191 | end
192 |
193 | def request_to(handler, delay = nil, url:, data: {}, response_type: :html)
194 | raise InvalidUrlError, "Requested url is invalid: #{url}" unless URI.parse(url).kind_of?(URI::HTTP)
195 |
196 | if @config[:skip_duplicate_requests] && !unique_request?(url)
197 | add_event(:duplicate_requests) if self.with_info
198 | logger.warn "Spider: request_to: not unique url: #{url}, skipped" and return
199 | end
200 |
201 | visited = delay ? browser.visit(url, delay: delay) : browser.visit(url)
202 | return unless visited
203 |
204 | public_send(handler, browser.current_response(response_type), { url: url, data: data })
205 | end
206 |
207 | def console(response = nil, url: nil, data: {})
208 | binding.pry
209 | end
210 |
211 | ###
212 |
213 | def storage
214 |       # Note: `.crawl!` uses a shared thread-safe Storage instance;
215 |       # otherwise, each spider instance has its own Storage
216 | @storage ||= self.with_info ? self.class.storage : Storage.new
217 | end
218 |
219 | def unique?(scope, value)
220 | storage.unique?(scope, value)
221 | end
222 |
223 | def save_to(path, item, format:, position: true, append: false)
224 | @savers[path] ||= begin
225 | options = { format: format, position: position, append: append }
226 | if self.with_info
227 | self.class.savers[path] ||= Saver.new(path, options)
228 | else
229 | Saver.new(path, options)
230 | end
231 | end
232 |
233 | @savers[path].save(item)
234 | end
235 |
236 | ###
237 |
238 | def add_event(scope = :custom, event)
239 | if self.with_info
240 | self.class.add_event(scope, event)
241 | end
242 |
243 | logger.info "Spider: new event (scope: #{scope}): #{event}" if scope == :custom
244 | end
245 |
246 | ###
247 |
248 | private
249 |
250 | def create_browser(engine, config = {})
251 | Kimurai::BrowserBuilder.build(engine, config, spider: self)
252 | end
253 |
254 | def unique_request?(url)
255 | options = @config[:skip_duplicate_requests]
256 | if options.class == Hash
257 | scope = options[:scope] || :requests_urls
258 | if options[:check_only]
259 | storage.include?(scope, url) ? false : true
260 | else
261 | storage.unique?(scope, url) ? true : false
262 | end
263 | else
264 | storage.unique?(:requests_urls, url) ? true : false
265 | end
266 | end
267 |
268 | def send_item(item, options = {})
269 | logger.debug "Pipeline: starting processing item through #{@pipelines.size} #{'pipeline'.pluralize(@pipelines.size)}..."
270 | self.class.update(:items, :sent) if self.with_info
271 |
272 | @pipelines.each do |name, instance|
273 | item = options[name] ? instance.process_item(item, options: options[name]) : instance.process_item(item)
274 | end
275 | rescue => e
276 | logger.error "Pipeline: dropped: #{e.inspect} (#{e.backtrace.first}), item: #{item}"
277 | add_event(:drop_items_errors, e.inspect) if self.with_info
278 | false
279 | else
280 | self.class.update(:items, :processed) if self.with_info
281 | logger.info "Pipeline: processed: #{JSON.generate(item)}"
282 | true
283 | ensure
284 | if self.with_info
285 | logger.info "Info: items: sent: #{self.class.items[:sent]}, processed: #{self.class.items[:processed]}"
286 | end
287 | end
288 |
289 | def in_parallel(handler, urls, threads:, data: {}, delay: nil, engine: @engine, config: {})
290 | parts = urls.in_sorted_groups(threads, false)
291 | urls_count = urls.size
292 |
293 | all = []
294 | start_time = Time.now
295 | logger.info "Spider: in_parallel: starting processing #{urls_count} urls within #{threads} threads"
296 |
297 | parts.each do |part|
298 | all << Thread.new(part) do |part|
299 | Thread.current.abort_on_exception = true
300 |
301 | spider = self.class.new(engine, config: @config.deep_merge_excl(config, DMERGE_EXCLUDE))
302 | spider.with_info = true if self.with_info
303 |
304 | part.each do |url_data|
305 | if url_data.class == Hash
306 | if url_data[:url].present? && url_data[:data].present?
307 | spider.request_to(handler, delay, url_data)
308 | else
309 | spider.public_send(handler, url_data)
310 | end
311 | else
312 | spider.request_to(handler, delay, url: url_data, data: data)
313 | end
314 | end
315 | ensure
316 | spider.browser.destroy_driver! if spider.instance_variable_get("@browser")
317 | end
318 |
319 | sleep 0.5
320 | end
321 |
322 | all.each(&:join)
323 | logger.info "Spider: in_parallel: stopped processing #{urls_count} urls within #{threads} threads, total time: #{(Time.now - start_time).duration}"
324 | end
325 | end
326 | end
327 |
--------------------------------------------------------------------------------
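To make the `Base` lifecycle concrete, here is a minimal spider sketch against the API above (`crawl!` drives each entry of `@start_urls` through `request_to(:parse, ...)`); the class name, site URL, and second handler are illustrative:

```ruby
class ExampleSpider < Kimurai::Base
  @name = "example_spider"
  @engine = :mechanize
  @start_urls = ["https://example.com/"]
  @config = { skip_duplicate_requests: true }

  # Handlers receive the parsed response plus url:/data: keywords,
  # matching the public_send call in #request_to above.
  def parse(response, url:, data: {})
    response.css("a").each do |a|
      link = absolute_url(a[:href], base: url)
      request_to(:parse_page, url: link) if link
    end
  end

  def parse_page(response, url:, data: {})
    send_item(url: url, title: response.css("title").text)  # runs through @pipelines
    save_to("results.jsonl", { url: url }, format: :jsonlines)
  end
end

ExampleSpider.crawl!  # or: ExampleSpider.parse!(:parse, url: "https://example.com/")
```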
/lib/kimurai/base/saver.rb:
--------------------------------------------------------------------------------
1 | require 'json'
2 | require 'csv'
3 |
4 | module Kimurai
5 | class Base
6 | class Saver
7 | attr_reader :format, :path, :position, :append
8 |
9 | def initialize(path, format:, position: true, append: false)
10 | unless %i(json pretty_json jsonlines csv).include?(format)
11 | raise "SimpleSaver: wrong type of format: #{format}"
12 | end
13 |
14 | @path = path
15 | @format = format
16 | @position = position
17 | @index = 0
18 | @append = append
19 | @mutex = Mutex.new
20 | end
21 |
22 | def save(item)
23 | @mutex.synchronize do
24 | @index += 1
25 | item[:position] = @index if position
26 |
27 | case format
28 | when :json
29 | save_to_json(item)
30 | when :pretty_json
31 | save_to_pretty_json(item)
32 | when :jsonlines
33 | save_to_jsonlines(item)
34 | when :csv
35 | save_to_csv(item)
36 | end
37 | end
38 | end
39 |
40 | private
41 |
42 | def save_to_json(item)
43 | data = JSON.generate([item])
44 |
45 | if @index > 1 || append && File.exists?(path)
46 | file_content = File.read(path).sub(/\}\]\Z/, "\}\,")
47 | File.open(path, "w") do |f|
48 | f.write(file_content + data.sub(/\A\[/, ""))
49 | end
50 | else
51 | File.open(path, "w") { |f| f.write(data) }
52 | end
53 | end
54 |
55 | def save_to_pretty_json(item)
56 | data = JSON.pretty_generate([item])
57 |
58 | if @index > 1 || append && File.exists?(path)
59 | file_content = File.read(path).sub(/\}\n\]\Z/, "\}\,\n")
60 | File.open(path, "w") do |f|
61 | f.write(file_content + data.sub(/\A\[\n/, ""))
62 | end
63 | else
64 | File.open(path, "w") { |f| f.write(data) }
65 | end
66 | end
67 |
68 | def save_to_jsonlines(item)
69 | data = JSON.generate(item)
70 |
71 | if @index > 1 || append && File.exists?(path)
72 | File.open(path, "a") { |file| file.write("\n" + data) }
73 | else
74 | File.open(path, "w") { |file| file.write(data) }
75 | end
76 | end
77 |
78 | def save_to_csv(item)
79 | data = flatten_hash(item)
80 |
81 | if @index > 1 || append && File.exists?(path)
82 | CSV.open(path, "a+", force_quotes: true) do |csv|
83 | csv << data.values
84 | end
85 | else
86 | CSV.open(path, "w", force_quotes: true) do |csv|
87 | csv << data.keys
88 | csv << data.values
89 | end
90 | end
91 | end
92 |
93 | def flatten_hash(hash)
94 | hash.each_with_object({}) do |(k, v), h|
95 | if v.is_a? Hash
96 | flatten_hash(v).map { |h_k, h_v| h["#{k}.#{h_k}"] = h_v }
97 | else
98 | h[k&.to_s] = v
99 | end
100 | end
101 | end
102 | end
103 | end
104 | end
105 |
106 |
107 |
--------------------------------------------------------------------------------
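A quick sketch of how `Saver` behaves for two of the formats (file names are arbitrary, and kimurai is assumed loaded); note how `flatten_hash` turns nested keys into dotted CSV headers:

```ruby
saver = Kimurai::Base::Saver.new("items.json", format: :json)
saver.save(title: "First")   # writes [{"title":"First","position":1}]
saver.save(title: "Second")  # re-splices the array: [{...,"position":1},{...,"position":2}]

csv = Kimurai::Base::Saver.new("items.csv", format: :csv, position: false)
csv.save(product: { name: "Widget", price: 10 })
# header row: "product.name","product.price" (nested hash flattened with dots)
```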
/lib/kimurai/base/storage.rb:
--------------------------------------------------------------------------------
1 | module Kimurai
2 | class Base
3 | class Storage
4 | attr_reader :database
5 |
6 | def initialize
7 | @mutex = Mutex.new
8 | @database = {}
9 | end
10 |
11 | def all(scope = nil)
12 | @mutex.synchronize do
13 | scope ? database.fetch(scope, []) : database
14 | end
15 | end
16 |
17 | def include?(scope, value)
18 | @mutex.synchronize do
19 | database[scope] ||= []
20 | database[scope].include?(value)
21 | end
22 | end
23 |
24 | def add(scope, value)
25 | @mutex.synchronize do
26 | database[scope] ||= []
27 | if value.kind_of?(Array)
28 | database[scope] += value
29 | database[scope].uniq!
30 | else
31 | database[scope].push(value) unless database[scope].include?(value)
32 | end
33 | end
34 | end
35 |
36 | ###
37 |
38 | def unique?(scope, value)
39 | @mutex.synchronize do
40 | database[scope] ||= []
41 | database[scope].include?(value) ? false : database[scope].push(value) and true
42 | end
43 | end
44 |
45 | ###
46 |
47 | def clear!
48 | @mutex.synchronize do
49 | @database = {}
50 | end
51 | end
52 | end
53 | end
54 | end
55 |
--------------------------------------------------------------------------------
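`Storage` is just a mutex-guarded hash of arrays; `unique?` both checks and records in one synchronized step, which is what makes `skip_duplicate_requests` safe across `in_parallel` threads:

```ruby
storage = Kimurai::Base::Storage.new

storage.unique?(:requests_urls, "https://example.com/")  # => true (and records it)
storage.unique?(:requests_urls, "https://example.com/")  # => false on repeat

storage.add(:product_ids, [1, 2, 2, 3])  # bulk add, de-duplicated
storage.include?(:product_ids, 2)        # => true
storage.all(:product_ids)                # => [1, 2, 3]
storage.clear!                           # wipe everything
```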
/lib/kimurai/base_helper.rb:
--------------------------------------------------------------------------------
1 | module Kimurai
2 | module BaseHelper
3 | private
4 |
5 | def absolute_url(url, base:)
6 | return unless url
7 | URI.join(base, URI.escape(url)).to_s
8 | end
9 |
10 | def escape_url(url)
11 |       URI.parse(url)
12 | rescue URI::InvalidURIError => e
13 | URI.parse(URI.escape url).to_s rescue url
14 | else
15 | url
16 | end
17 |
18 | def normalize_url(url, base:)
19 | escape_url(absolute_url(url, base: base))
20 | end
21 | end
22 | end
23 |
--------------------------------------------------------------------------------
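These helpers are private, so they are normally called inside spider handlers; for experimenting, `send` works (the spider class is the hypothetical one sketched earlier):

```ruby
spider = ExampleSpider.new

spider.send(:absolute_url, "/path?q=1", base: "https://example.com")
# => "https://example.com/path?q=1"

spider.send(:absolute_url, "/straße", base: "https://example.com")
# => "https://example.com/stra%C3%9Fe" (URI.escape runs before URI.join)

spider.send(:normalize_url, "/path two", base: "https://example.com")
# absolute_url plus escape_url in one call
```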
/lib/kimurai/browser_builder.rb:
--------------------------------------------------------------------------------
1 | module Kimurai
2 | module BrowserBuilder
3 | def self.build(engine, config = {}, spider:)
4 | if config[:browser].present?
5 | raise "++++++ BrowserBuilder: browser option is depricated. Now all sub-options inside " \
6 | "`browser` should be placed right into `@config` hash, without `browser` parent key.\n" \
7 | "See more here: https://github.com/vifreefly/kimuraframework/blob/master/CHANGELOG.md#breaking-changes-110 ++++++"
8 | end
9 |
10 | begin
11 | require "kimurai/browser_builder/#{engine}_builder"
12 | rescue LoadError => e
13 | end
14 |
15 | builder_class_name = "#{engine}_builder".classify
16 | builder = "Kimurai::BrowserBuilder::#{builder_class_name}".constantize
17 | builder.new(config, spider: spider).build
18 | end
19 | end
20 | end
21 |
--------------------------------------------------------------------------------
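`build` maps the engine symbol to a builder class by string inflection: `:selenium_chrome` requires `kimurai/browser_builder/selenium_chrome_builder`, then `"selenium_chrome_builder".classify` yields `SeleniumChromeBuilder`. A sketch of calling it directly (normally `Base#browser` does this for you; `spider` is any `Kimurai::Base` instance, here the earlier hypothetical one):

```ruby
spider = ExampleSpider.new

browser = Kimurai::BrowserBuilder.build(:mechanize, { user_agent: "ExampleBot/1.0" }, spider: spider)
browser.visit("https://example.com/")
browser.current_response(:html)  # Nokogiri document, as used by Base#request_to
```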
/lib/kimurai/browser_builder/mechanize_builder.rb:
--------------------------------------------------------------------------------
1 | require 'capybara'
2 | require 'capybara/mechanize'
3 | require_relative '../capybara_configuration'
4 | require_relative '../capybara_ext/mechanize/driver'
5 | require_relative '../capybara_ext/session'
6 |
7 | module Kimurai::BrowserBuilder
8 | class MechanizeBuilder
9 | attr_reader :logger, :spider
10 |
11 | def initialize(config, spider:)
12 | @config = config
13 | @spider = spider
14 | @logger = spider.logger
15 | end
16 |
17 | def build
18 | # Register driver
19 | Capybara.register_driver :mechanize do |app|
20 | driver = Capybara::Mechanize::Driver.new("app")
21 | # keep the history as small as possible (by default it's unlimited)
22 | driver.configure { |a| a.history.max_size = 2 }
23 | driver
24 | end
25 |
26 | # Create browser instance (Capybara session)
27 | @browser = Capybara::Session.new(:mechanize)
28 | @browser.spider = spider
29 | logger.debug "BrowserBuilder (mechanize): created browser instance"
30 |
31 | if @config[:extensions].present?
32 | logger.error "BrowserBuilder (mechanize): `extensions` option not supported, skipped"
33 | end
34 |
35 | # Proxy
36 | if proxy = @config[:proxy].presence
37 | proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
38 | ip, port, type = proxy_string.split(":")
39 |
40 | if type == "http"
41 | @browser.driver.set_proxy(*proxy_string.split(":"))
42 | logger.debug "BrowserBuilder (mechanize): enabled http proxy, ip: #{ip}, port: #{port}"
43 | else
44 | logger.error "BrowserBuilder (mechanize): can't set #{type} proxy (not supported), skipped"
45 | end
46 | end
47 |
48 | # SSL
49 | if ssl_cert_path = @config[:ssl_cert_path].presence
50 | @browser.driver.browser.agent.http.ca_file = ssl_cert_path
51 | logger.debug "BrowserBuilder (mechanize): enabled custom ssl_cert"
52 | end
53 |
54 | if @config[:ignore_ssl_errors].present?
55 | @browser.driver.browser.agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
56 | logger.debug "BrowserBuilder (mechanize): enabled ignore_ssl_errors"
57 | end
58 |
59 | # Headers
60 | if headers = @config[:headers].presence
61 | @browser.driver.headers = headers
62 | logger.debug "BrowserBuilder (mechanize): enabled custom headers"
63 | end
64 |
65 | if user_agent = @config[:user_agent].presence
66 | user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
67 |
68 | @browser.driver.add_header("User-Agent", user_agent_string)
69 | logger.debug "BrowserBuilder (mechanize): enabled custom user_agent"
70 | end
71 |
72 | # Cookies
73 | if cookies = @config[:cookies].presence
74 | cookies.each do |cookie|
75 | @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie)
76 | end
77 |
78 | logger.debug "BrowserBuilder (mechanize): enabled custom cookies"
79 | end
80 |
81 | # Browser instance options
82 | # skip_request_errors
83 | if skip_errors = @config[:skip_request_errors].presence
84 | @browser.config.skip_request_errors = skip_errors
85 | logger.debug "BrowserBuilder (mechanize): enabled skip_request_errors"
86 | end
87 |
88 | # retry_request_errors
89 | if retry_errors = @config[:retry_request_errors].presence
90 | @browser.config.retry_request_errors = retry_errors
91 | logger.debug "BrowserBuilder (mechanize): enabled retry_request_errors"
92 | end
93 |
94 | # restart_if
95 | if @config[:restart_if].present?
96 |         logger.warn "BrowserBuilder (mechanize): restart_if option is not supported by Mechanize, skipped"
97 | end
98 |
99 | # before_request clear_cookies
100 | if @config.dig(:before_request, :clear_cookies)
101 | @browser.config.before_request[:clear_cookies] = true
102 | logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_cookies"
103 | end
104 |
105 | # before_request clear_and_set_cookies
106 | if @config.dig(:before_request, :clear_and_set_cookies)
107 | if cookies = @config[:cookies].presence
108 | @browser.config.cookies = cookies
109 | @browser.config.before_request[:clear_and_set_cookies] = true
110 | logger.debug "BrowserBuilder (mechanize): enabled before_request.clear_and_set_cookies"
111 | else
112 | logger.error "BrowserBuilder (mechanize): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
113 | end
114 | end
115 |
116 | # before_request change_user_agent
117 | if @config.dig(:before_request, :change_user_agent)
118 | if @config[:user_agent].present? && @config[:user_agent].class == Proc
119 | @browser.config.user_agent = @config[:user_agent]
120 | @browser.config.before_request[:change_user_agent] = true
121 | logger.debug "BrowserBuilder (mechanize): enabled before_request.change_user_agent"
122 | else
123 |           logger.error "BrowserBuilder (mechanize): user_agent should be present and be a lambda to enable before_request.change_user_agent, skipped"
124 | end
125 | end
126 |
127 | # before_request change_proxy
128 | if @config.dig(:before_request, :change_proxy)
129 | if @config[:proxy].present? && @config[:proxy].class == Proc
130 | @browser.config.proxy = @config[:proxy]
131 | @browser.config.before_request[:change_proxy] = true
132 | logger.debug "BrowserBuilder (mechanize): enabled before_request.change_proxy"
133 | else
134 |           logger.error "BrowserBuilder (mechanize): proxy should be present and be a lambda to enable before_request.change_proxy, skipped"
135 | end
136 | end
137 |
138 | # before_request delay
139 | if delay = @config.dig(:before_request, :delay).presence
140 | @browser.config.before_request[:delay] = delay
141 | logger.debug "BrowserBuilder (mechanize): enabled before_request.delay"
142 | end
143 |
144 | # encoding
145 | if encoding = @config[:encoding]
146 | @browser.config.encoding = encoding
147 | logger.debug "BrowserBuilder (mechanize): enabled encoding: #{encoding}"
148 | end
149 |
150 | # return Capybara session instance
151 | @browser
152 | end
153 | end
154 | end
155 |
--------------------------------------------------------------------------------
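A config sketch exercising the mechanize-specific options handled above (all values are placeholders; the cookie hash shape follows the `set_cookie(cookie[:name], cookie[:value], cookie)` call in the builder):

```ruby
class MechanizeSpider < Kimurai::Base  # hypothetical spider
  @engine = :mechanize
  @config = {
    headers: { "Referer" => "https://example.com" },
    user_agent: "Mozilla/5.0 (compatible; ExampleBot/1.0)",
    cookies: [{ name: "session", value: "abc123" }],
    # Only http proxies work with Mechanize (see the type check above):
    proxy: "1.2.3.4:8080:http",
    before_request: { delay: 2 }
  }
end
```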
/lib/kimurai/browser_builder/poltergeist_phantomjs_builder.rb:
--------------------------------------------------------------------------------
1 | require 'capybara'
2 | require 'capybara/poltergeist'
3 | require_relative '../capybara_configuration'
4 | require_relative '../capybara_ext/poltergeist/driver'
5 | require_relative '../capybara_ext/session'
6 |
7 | module Kimurai::BrowserBuilder
8 | class PoltergeistPhantomjsBuilder
9 | attr_reader :logger, :spider
10 |
11 | def initialize(config, spider:)
12 | @config = config
13 | @spider = spider
14 | @logger = spider.logger
15 | end
16 |
17 | def build
18 | # Register driver
19 | Capybara.register_driver :poltergeist_phantomjs do |app|
20 | # Create driver options
21 | driver_options = {
22 | js_errors: false, debug: false, inspector: false, phantomjs_options: []
23 | }
24 |
25 | if extensions = @config[:extensions].presence
26 | driver_options[:extensions] = extensions
27 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled extensions"
28 | end
29 |
30 | # Window size
31 | if size = @config[:window_size].presence
32 | driver_options[:window_size] = size
33 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled window_size"
34 | end
35 |
36 | # SSL
37 | if ssl_cert_path = @config[:ssl_cert_path].presence
38 | driver_options[:phantomjs_options] << "--ssl-certificates-path=#{ssl_cert_path}"
39 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom ssl_cert"
40 | end
41 |
42 | if @config[:ignore_ssl_errors].present?
43 | driver_options[:phantomjs_options].push("--ignore-ssl-errors=yes", "--ssl-protocol=any")
44 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled ignore_ssl_errors"
45 | end
46 |
47 | # Disable images
48 | if @config[:disable_images].present?
49 | driver_options[:phantomjs_options] << "--load-images=no"
50 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled disable_images"
51 | end
52 |
53 | Capybara::Poltergeist::Driver.new(app, driver_options)
54 | end
55 |
56 | # Create browser instance (Capybara session)
57 | @browser = Capybara::Session.new(:poltergeist_phantomjs)
58 | @browser.spider = spider
59 | logger.debug "BrowserBuilder (poltergeist_phantomjs): created browser instance"
60 |
61 | # Proxy
62 | if proxy = @config[:proxy].presence
63 | proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
64 | ip, port, type = proxy_string.split(":")
65 |
66 | if %w(http socks5).include?(type)
67 | @browser.driver.set_proxy(*proxy_string.split(":"))
68 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled #{type} proxy, ip: #{ip}, port: #{port}"
69 | else
70 | logger.error "BrowserBuilder (poltergeist_phantomjs): wrong type of proxy: #{type}, skipped"
71 | end
72 | end
73 |
74 | # Headers
75 | if headers = @config[:headers].presence
76 | @browser.driver.headers = headers
77 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom headers"
78 | end
79 |
80 | if user_agent = @config[:user_agent].presence
81 | user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
82 |
83 | @browser.driver.add_header("User-Agent", user_agent_string)
84 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom user_agent"
85 | end
86 |
87 | # Cookies
88 | if cookies = @config[:cookies].presence
89 | cookies.each do |cookie|
90 | @browser.driver.set_cookie(cookie[:name], cookie[:value], cookie)
91 | end
92 |
93 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled custom cookies"
94 | end
95 |
96 | # Browser instance options
97 | # skip_request_errors
98 | if skip_errors = @config[:skip_request_errors].presence
99 | @browser.config.skip_request_errors = skip_errors
100 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled skip_request_errors"
101 | end
102 |
103 | # retry_request_errors
104 | if retry_errors = @config[:retry_request_errors].presence
105 | @browser.config.retry_request_errors = retry_errors
106 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled retry_request_errors"
107 | end
108 |
109 | # restart_if
110 | if requests_limit = @config.dig(:restart_if, :requests_limit).presence
111 | @browser.config.restart_if[:requests_limit] = requests_limit
112 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.requests_limit >= #{requests_limit}"
113 | end
114 |
115 | if memory_limit = @config.dig(:restart_if, :memory_limit).presence
116 | @browser.config.restart_if[:memory_limit] = memory_limit
117 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled restart_if.memory_limit >= #{memory_limit}"
118 | end
119 |
120 | # before_request clear_cookies
121 | if @config.dig(:before_request, :clear_cookies)
122 | @browser.config.before_request[:clear_cookies] = true
123 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_cookies"
124 | end
125 |
126 | # before_request clear_and_set_cookies
127 | if @config.dig(:before_request, :clear_and_set_cookies)
128 | if cookies = @config[:cookies].presence
129 | @browser.config.cookies = cookies
130 | @browser.config.before_request[:clear_and_set_cookies] = true
131 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.clear_and_set_cookies"
132 | else
133 | logger.error "BrowserBuilder (poltergeist_phantomjs): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
134 | end
135 | end
136 |
137 | # before_request change_user_agent
138 | if @config.dig(:before_request, :change_user_agent)
139 | if @config[:user_agent].present? && @config[:user_agent].class == Proc
140 | @browser.config.user_agent = @config[:user_agent]
141 | @browser.config.before_request[:change_user_agent] = true
142 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_user_agent"
143 | else
144 |           logger.error "BrowserBuilder (poltergeist_phantomjs): user_agent should be present and be a lambda to enable before_request.change_user_agent, skipped"
145 | end
146 | end
147 |
148 | # before_request change_proxy
149 | if @config.dig(:before_request, :change_proxy)
150 | if @config[:proxy].present? && @config[:proxy].class == Proc
151 | @browser.config.proxy = @config[:proxy]
152 | @browser.config.before_request[:change_proxy] = true
153 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.change_proxy"
154 | else
155 |           logger.error "BrowserBuilder (poltergeist_phantomjs): proxy should be present and be a lambda to enable before_request.change_proxy, skipped"
156 | end
157 | end
158 |
159 | # before_request delay
160 | if delay = @config.dig(:before_request, :delay).presence
161 | @browser.config.before_request[:delay] = delay
162 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled before_request.delay"
163 | end
164 |
165 | # encoding
166 | if encoding = @config[:encoding]
167 | @browser.config.encoding = encoding
168 | logger.debug "BrowserBuilder (poltergeist_phantomjs): enabled encoding: #{encoding}"
169 | end
170 |
171 | # return Capybara session instance
172 | @browser
173 | end
174 | end
175 | end
176 |
--------------------------------------------------------------------------------
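The proxy option is a single `"ip:port:type"` string (http or socks5 for PhantomJS); giving a `Proc` plus `before_request.change_proxy` makes the session re-call it before every request. A sketch, with placeholder addresses:

```ruby
PROXIES = ["1.1.1.1:8080:http", "2.2.2.2:1080:socks5"]  # placeholder list

class RotatingSpider < Kimurai::Base  # hypothetical spider
  @engine = :poltergeist_phantomjs
  @config = {
    proxy: -> { PROXIES.sample },            # must be a Proc to enable change_proxy
    before_request: { change_proxy: true },
    restart_if: { requests_limit: 100, memory_limit: 350_000 }
  }
end
```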
/lib/kimurai/browser_builder/selenium_chrome_builder.rb:
--------------------------------------------------------------------------------
1 | require 'capybara'
2 | require 'selenium-webdriver'
3 | require_relative '../capybara_configuration'
4 | require_relative '../capybara_ext/selenium/driver'
5 | require_relative '../capybara_ext/session'
6 |
7 | module Kimurai::BrowserBuilder
8 | class SeleniumChromeBuilder
9 | class << self
10 | attr_accessor :virtual_display
11 | end
12 |
13 | attr_reader :logger, :spider
14 |
15 | def initialize(config, spider:)
16 | @config = config
17 | @spider = spider
18 | @logger = spider.logger
19 | end
20 |
21 | def build
22 | # Register driver
23 | Capybara.register_driver :selenium_chrome do |app|
24 | # Create driver options
25 | opts = { args: %w[--disable-gpu --no-sandbox --disable-translate] }
26 |
27 | # Provide custom chrome browser path:
28 | if chrome_path = Kimurai.configuration.selenium_chrome_path
29 | opts.merge!(binary: chrome_path)
30 | end
31 |
32 | # See all options here: https://seleniumhq.github.io/selenium/docs/api/rb/Selenium/WebDriver/Chrome/Options.html
33 | driver_options = Selenium::WebDriver::Chrome::Options.new(opts)
34 |
35 | # Window size
36 | if size = @config[:window_size].presence
37 | driver_options.args << "--window-size=#{size.join(',')}"
38 | logger.debug "BrowserBuilder (selenium_chrome): enabled window_size"
39 | end
40 |
41 | # Proxy
42 | if proxy = @config[:proxy].presence
43 | proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
44 | ip, port, type, user, password = proxy_string.split(":")
45 |
46 | if %w(http socks5).include?(type)
47 | if user.nil? && password.nil?
48 | driver_options.args << "--proxy-server=#{type}://#{ip}:#{port}"
49 | logger.debug "BrowserBuilder (selenium_chrome): enabled #{type} proxy, ip: #{ip}, port: #{port}"
50 | else
51 |               logger.error "BrowserBuilder (selenium_chrome): proxy with authentication isn't supported by selenium, skipped"
52 | end
53 | else
54 | logger.error "BrowserBuilder (selenium_chrome): wrong type of proxy: #{type}, skipped"
55 | end
56 | end
57 |
58 | if proxy_bypass_list = @config[:proxy_bypass_list].presence
59 | if proxy
60 | driver_options.args << "--proxy-bypass-list=#{proxy_bypass_list.join(';')}"
61 | logger.debug "BrowserBuilder (selenium_chrome): enabled proxy_bypass_list"
62 | else
63 | logger.error "BrowserBuilder (selenium_chrome): provide `proxy` to set proxy_bypass_list, skipped"
64 | end
65 | end
66 |
67 | # SSL
68 | if @config[:ignore_ssl_errors].present?
69 | driver_options.args << "--ignore-certificate-errors"
70 | driver_options.args << "--allow-insecure-localhost"
71 | logger.debug "BrowserBuilder (selenium_chrome): enabled ignore_ssl_errors"
72 | end
73 |
74 | # Disable images
75 | if @config[:disable_images].present?
76 | driver_options.prefs["profile.managed_default_content_settings.images"] = 2
77 | logger.debug "BrowserBuilder (selenium_chrome): enabled disable_images"
78 | end
79 |
80 | # Headers
81 | if @config[:headers].present?
82 |           logger.warn "BrowserBuilder (selenium_chrome): custom headers aren't supported by selenium, skipped"
83 | end
84 |
85 | if user_agent = @config[:user_agent].presence
86 | user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
87 | driver_options.args << "--user-agent='#{user_agent_string}'"
88 | logger.debug "BrowserBuilder (selenium_chrome): enabled custom user_agent"
89 | end
90 |
91 | # Headless mode
92 | if ENV["HEADLESS"] != "false"
93 | if @config[:headless_mode] == :virtual_display
94 | if Gem::Platform.local.os == "linux"
95 | unless self.class.virtual_display
96 | require 'headless'
97 | self.class.virtual_display = Headless.new(reuse: true, destroy_at_exit: false)
98 | self.class.virtual_display.start
99 | end
100 |
101 | logger.debug "BrowserBuilder (selenium_chrome): enabled virtual_display headless_mode"
102 | else
103 |             logger.error "BrowserBuilder (selenium_chrome): virtual_display headless_mode works only " \
104 |               "on the Linux platform. The browser will run in normal mode. Use `native` mode instead."
105 | end
106 | else
107 | driver_options.args << "--headless"
108 | logger.debug "BrowserBuilder (selenium_chrome): enabled native headless_mode"
109 | end
110 | end
111 |
112 | chromedriver_path = Kimurai.configuration.chromedriver_path || "/usr/local/bin/chromedriver"
113 | service = Selenium::WebDriver::Service.chrome(path: chromedriver_path)
114 | Capybara::Selenium::Driver.new(app, browser: :chrome, options: driver_options, service: service)
115 | end
116 |
117 | # Create browser instance (Capybara session)
118 | @browser = Capybara::Session.new(:selenium_chrome)
119 | @browser.spider = spider
120 | logger.debug "BrowserBuilder (selenium_chrome): created browser instance"
121 |
122 | if @config[:extensions].present?
123 | logger.error "BrowserBuilder (selenium_chrome): `extensions` option not supported by Selenium, skipped"
124 | end
125 |
126 | # Cookies
127 | if cookies = @config[:cookies].presence
128 | @browser.config.cookies = cookies
129 | logger.debug "BrowserBuilder (selenium_chrome): enabled custom cookies"
130 | end
131 |
132 | # Browser instance options
133 | # skip_request_errors
134 | if skip_errors = @config[:skip_request_errors].presence
135 | @browser.config.skip_request_errors = skip_errors
136 | logger.debug "BrowserBuilder (selenium_chrome): enabled skip_request_errors"
137 | end
138 |
139 | # retry_request_errors
140 | if retry_errors = @config[:retry_request_errors].presence
141 | @browser.config.retry_request_errors = retry_errors
142 | logger.debug "BrowserBuilder (selenium_chrome): enabled retry_request_errors"
143 | end
144 |
145 | # restart_if
146 | if requests_limit = @config.dig(:restart_if, :requests_limit).presence
147 | @browser.config.restart_if[:requests_limit] = requests_limit
148 | logger.debug "BrowserBuilder (selenium_chrome): enabled restart_if.requests_limit >= #{requests_limit}"
149 | end
150 |
151 | if memory_limit = @config.dig(:restart_if, :memory_limit).presence
152 | @browser.config.restart_if[:memory_limit] = memory_limit
153 | logger.debug "BrowserBuilder (selenium_chrome): enabled restart_if.memory_limit >= #{memory_limit}"
154 | end
155 |
156 | # before_request clear_cookies
157 | if @config.dig(:before_request, :clear_cookies)
158 | @browser.config.before_request[:clear_cookies] = true
159 | logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.clear_cookies"
160 | end
161 |
162 | # before_request clear_and_set_cookies
163 | if @config.dig(:before_request, :clear_and_set_cookies)
164 | if cookies = @config[:cookies].presence
165 | @browser.config.cookies = cookies
166 | @browser.config.before_request[:clear_and_set_cookies] = true
167 | logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.clear_and_set_cookies"
168 | else
169 | logger.error "BrowserBuilder (selenium_chrome): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
170 | end
171 | end
172 |
173 | # before_request change_user_agent
174 | if @config.dig(:before_request, :change_user_agent)
175 | logger.error "BrowserBuilder (selenium_chrome): before_request.change_user_agent option not supported by Selenium, skipped"
176 | end
177 |
178 | # before_request change_proxy
179 | if @config.dig(:before_request, :change_proxy)
180 | logger.error "BrowserBuilder (selenium_chrome): before_request.change_proxy option not supported by Selenium, skipped"
181 | end
182 |
183 | # before_request delay
184 | if delay = @config.dig(:before_request, :delay).presence
185 | @browser.config.before_request[:delay] = delay
186 | logger.debug "BrowserBuilder (selenium_chrome): enabled before_request.delay"
187 | end
188 |
189 | # encoding
190 | if encoding = @config[:encoding]
191 | @browser.config.encoding = encoding
192 | logger.debug "BrowserBuilder (selenium_chrome): enabled encoding: #{encoding}"
193 | end
194 |
195 | # return Capybara session instance
196 | @browser
197 | end
198 | end
199 | end
200 |
--------------------------------------------------------------------------------
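And the selenium_chrome counterpart; note from the code above that the Selenium builders reject per-request user agent/proxy rotation and authenticated proxies, and that `HEADLESS=false` in the environment disables headless mode entirely. Values below are placeholders:

```ruby
class ChromeSpider < Kimurai::Base  # hypothetical spider
  @engine = :selenium_chrome
  @config = {
    window_size: [1366, 768],
    disable_images: true,             # sets profile.managed_default_content_settings.images
    user_agent: "Mozilla/5.0 (compatible; ExampleBot/1.0)",
    headless_mode: :virtual_display,  # Linux-only; default is native --headless
    restart_if: { memory_limit: 350_000 }
  }
end
```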
/lib/kimurai/browser_builder/selenium_firefox_builder.rb:
--------------------------------------------------------------------------------
1 | require 'capybara'
2 | require 'selenium-webdriver'
3 | require_relative '../capybara_configuration'
4 | require_relative '../capybara_ext/selenium/driver'
5 | require_relative '../capybara_ext/session'
6 |
7 | module Kimurai::BrowserBuilder
8 | class SeleniumFirefoxBuilder
9 | class << self
10 | attr_accessor :virtual_display
11 | end
12 |
13 | attr_reader :logger, :spider
14 |
15 | def initialize(config, spider:)
16 | @config = config
17 | @spider = spider
18 | @logger = spider.logger
19 | end
20 |
21 | def build
22 | # Register driver
23 | Capybara.register_driver :selenium_firefox do |app|
24 | # Create driver options
25 | driver_options = Selenium::WebDriver::Firefox::Options.new
26 | driver_options.profile = Selenium::WebDriver::Firefox::Profile.new
27 | driver_options.profile["browser.link.open_newwindow"] = 3 # open windows in tabs
28 | driver_options.profile["media.peerconnection.enabled"] = false # disable web rtc
29 |
30 | # Proxy
31 | if proxy = @config[:proxy].presence
32 | proxy_string = (proxy.class == Proc ? proxy.call : proxy).strip
33 | ip, port, type, user, password = proxy_string.split(":")
34 |
35 | if user.nil? && password.nil?
36 | driver_options.profile["network.proxy.type"] = 1
37 | if type == "http"
38 | driver_options.profile["network.proxy.http"] = ip
39 | driver_options.profile["network.proxy.http_port"] = port.to_i
40 | driver_options.profile["network.proxy.ssl"] = ip
41 | driver_options.profile["network.proxy.ssl_port"] = port.to_i
42 |
43 | logger.debug "BrowserBuilder (selenium_firefox): enabled http proxy, ip: #{ip}, port: #{port}"
44 | elsif type == "socks5"
45 | driver_options.profile["network.proxy.socks"] = ip
46 | driver_options.profile["network.proxy.socks_port"] = port.to_i
47 | driver_options.profile["network.proxy.socks_version"] = 5
48 | driver_options.profile["network.proxy.socks_remote_dns"] = true
49 |
50 | logger.debug "BrowserBuilder (selenium_firefox): enabled socks5 proxy, ip: #{ip}, port: #{port}"
51 | else
52 | logger.error "BrowserBuilder (selenium_firefox): wrong type of proxy: #{type}, skipped"
53 | end
54 | else
55 |             logger.error "BrowserBuilder (selenium_firefox): proxy with authentication isn't supported by selenium, skipped"
56 | end
57 | end
58 |
59 | if proxy_bypass_list = @config[:proxy_bypass_list].presence
60 | if proxy
61 | driver_options.profile["network.proxy.no_proxies_on"] = proxy_bypass_list.join(", ")
62 | logger.debug "BrowserBuilder (selenium_firefox): enabled proxy_bypass_list"
63 | else
64 | logger.error "BrowserBuilder (selenium_firefox): provide `proxy` to set proxy_bypass_list, skipped"
65 | end
66 | end
67 |
68 | # SSL
69 | if @config[:ignore_ssl_errors].present?
70 | driver_options.profile.secure_ssl = false
71 | driver_options.profile.assume_untrusted_certificate_issuer = true
72 | logger.debug "BrowserBuilder (selenium_firefox): enabled ignore_ssl_errors"
73 | end
74 |
75 | # Disable images
76 | if @config[:disable_images].present?
77 | driver_options.profile["permissions.default.image"] = 2
78 | logger.debug "BrowserBuilder (selenium_firefox): enabled disable_images"
79 | end
80 |
81 | # Headers
82 | if @config[:headers].present?
83 |           logger.warn "BrowserBuilder (selenium_firefox): custom headers aren't supported by selenium, skipped"
84 | end
85 |
86 | if user_agent = @config[:user_agent].presence
87 | user_agent_string = (user_agent.class == Proc ? user_agent.call : user_agent).strip
88 | driver_options.profile["general.useragent.override"] = user_agent_string
89 | logger.debug "BrowserBuilder (selenium_firefox): enabled custom user_agent"
90 | end
91 |
92 | # Headless mode
93 | if ENV["HEADLESS"] != "false"
94 | if @config[:headless_mode] == :virtual_display
95 | if Gem::Platform.local.os == "linux"
96 | unless self.class.virtual_display
97 | require 'headless'
98 | self.class.virtual_display = Headless.new(reuse: true, destroy_at_exit: false)
99 | self.class.virtual_display.start
100 | end
101 |
102 | logger.debug "BrowserBuilder (selenium_firefox): enabled virtual_display headless_mode"
103 | else
104 |             logger.error "BrowserBuilder (selenium_firefox): virtual_display headless_mode works only " \
105 |               "on the Linux platform. The browser will run in normal mode. Use `native` mode instead."
106 | end
107 | else
108 | driver_options.args << "--headless"
109 | logger.debug "BrowserBuilder (selenium_firefox): enabled native headless_mode"
110 | end
111 | end
112 |
113 | Capybara::Selenium::Driver.new(app, browser: :firefox, options: driver_options)
114 | end
115 |
116 | # Create browser instance (Capybara session)
117 | @browser = Capybara::Session.new(:selenium_firefox)
118 | @browser.spider = spider
119 | logger.debug "BrowserBuilder (selenium_firefox): created browser instance"
120 |
121 | if @config[:extensions].present?
122 | logger.error "BrowserBuilder (selenium_firefox): `extensions` option not supported by Selenium, skipped"
123 | end
124 |
125 | # Window size
126 | if size = @config[:window_size].presence
127 | @browser.current_window.resize_to(*size)
128 | logger.debug "BrowserBuilder (selenium_firefox): enabled window_size"
129 | end
130 |
131 | # Cookies
132 | if cookies = @config[:cookies].presence
133 | @browser.config.cookies = cookies
134 | logger.debug "BrowserBuilder (selenium_firefox): enabled custom cookies"
135 | end
136 |
137 | # Browser instance options
138 | # skip_request_errors
139 | if skip_errors = @config[:skip_request_errors].presence
140 | @browser.config.skip_request_errors = skip_errors
141 | logger.debug "BrowserBuilder (selenium_firefox): enabled skip_request_errors"
142 | end
143 |
144 | # retry_request_errors
145 | if retry_errors = @config[:retry_request_errors].presence
146 | @browser.config.retry_request_errors = retry_errors
147 | logger.debug "BrowserBuilder (selenium_firefox): enabled retry_request_errors"
148 | end
149 |
150 | # restart_if
151 | if requests_limit = @config.dig(:restart_if, :requests_limit).presence
152 | @browser.config.restart_if[:requests_limit] = requests_limit
153 | logger.debug "BrowserBuilder (selenium_firefox): enabled restart_if.requests_limit >= #{requests_limit}"
154 | end
155 |
156 | if memory_limit = @config.dig(:restart_if, :memory_limit).presence
157 | @browser.config.restart_if[:memory_limit] = memory_limit
158 | logger.debug "BrowserBuilder (selenium_firefox): enabled restart_if.memory_limit >= #{memory_limit}"
159 | end
160 |
161 | # before_request clear_cookies
162 | if @config.dig(:before_request, :clear_cookies)
163 | @browser.config.before_request[:clear_cookies] = true
164 | logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.clear_cookies"
165 | end
166 |
167 | # before_request clear_and_set_cookies
168 | if @config.dig(:before_request, :clear_and_set_cookies)
169 | if cookies = @config[:cookies].presence
170 | @browser.config.cookies = cookies
171 | @browser.config.before_request[:clear_and_set_cookies] = true
172 | logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.clear_and_set_cookies"
173 | else
174 | logger.error "BrowserBuilder (selenium_firefox): cookies should be present to enable before_request.clear_and_set_cookies, skipped"
175 | end
176 | end
177 |
178 | # before_request change_user_agent
179 | if @config.dig(:before_request, :change_user_agent)
180 | logger.error "BrowserBuilder (selenium_firefox): before_request.change_user_agent option not supported by Selenium, skipped"
181 | end
182 |
183 | # before_request change_proxy
184 | if @config.dig(:before_request, :change_proxy)
185 | logger.error "BrowserBuilder (selenium_firefox): before_request.change_proxy option not supported by Selenium, skipped"
186 | end
187 |
188 | # before_request delay
189 | if delay = @config.dig(:before_request, :delay).presence
190 | @browser.config.before_request[:delay] = delay
191 | logger.debug "BrowserBuilder (selenium_firefox): enabled before_request.delay"
192 | end
193 |
194 | # encoding
195 | if encoding = @config[:encoding]
196 | @browser.config.encoding = encoding
197 | logger.debug "BrowserBuilder (selenium_firefox): enabled encoding: #{encoding}"
198 | end
199 |
200 | # return Capybara session instance
201 | @browser
202 | end
203 | end
204 | end
205 |
--------------------------------------------------------------------------------
/lib/kimurai/capybara_configuration.rb:
--------------------------------------------------------------------------------
1 | require 'capybara'
2 |
3 | Capybara.configure do |config|
4 | config.run_server = false
5 | config.default_selector = :xpath
6 | config.save_path = "tmp"
7 | config.default_max_wait_time = 10
8 | config.ignore_hidden_elements = false
9 | config.threadsafe = true
10 | end
11 |
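Since `default_selector` is set to `:xpath` above, bare locators passed to Capybara finders are treated as XPath rather than CSS. A quick illustration, assuming a Capybara session named `browser` (the locators are hypothetical):

  # With default_selector = :xpath, these two calls are equivalent:
  browser.find("//a[@class='nav']")
  browser.find(:xpath, "//a[@class='nav']")
  # CSS selectors must then be requested explicitly:
  browser.find(:css, "a.nav")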
--------------------------------------------------------------------------------
/lib/kimurai/capybara_ext/driver/base.rb:
--------------------------------------------------------------------------------
1 | require 'pathname'
2 |
3 | class Capybara::Driver::Base
4 | attr_accessor :visited
5 | attr_writer :requests, :responses
6 |
7 | def requests
8 | @requests ||= 0
9 | end
10 |
11 | def responses
12 | @responses ||= 0
13 | end
14 |
15 | def current_memory
16 | driver_pid = pid
17 |
18 | all = (get_descendant_processes(driver_pid) << driver_pid).uniq
19 | all.map { |pid| get_process_memory(pid) }.sum
20 | end
21 |
22 | private
23 |
24 | def get_descendant_processes(base)
25 | descendants = Hash.new { |ht, k| ht[k] = [k] }
26 | Hash[*`ps -eo pid,ppid`.scan(/\d+/).map(&:to_i)].each do |pid, ppid|
27 | descendants[ppid] << descendants[pid]
28 | end
29 |
30 | descendants[base].flatten - [base]
31 | end
32 |
33 | # https://github.com/schneems/get_process_mem
34 | # Note: on Linux this takes PSS (not RSS) memory (PSS fits this case better)
35 | def get_process_memory(pid)
36 | case @platform ||= Gem::Platform.local.os
37 | when "linux"
38 | begin
39 | file = Pathname.new "/proc/#{pid}/smaps"
40 | return 0 unless file.exist?
41 |
42 | lines = file.each_line.select { |line| line.match(/^Pss/) }
43 | return 0 if lines.empty?
44 |
45 | lines.reduce(0) do |sum, line|
46 | line.match(/(?<value>(\d*\.{0,1}\d+))\s+(?<units>\w\w)/) do |m|
47 | sum += m[:value].to_i
48 | end
49 |
50 | sum
51 | end
52 | rescue Errno::EACCES
53 | 0
54 | end
55 | when "darwin"
56 | mem = `ps -o rss= -p #{pid}`.strip
57 | mem.empty? ? 0 : mem.to_i
58 | else
59 | raise "Can't check process memory, wrong type of platform: #{@platform}"
60 | end
61 | end
62 | end
63 |
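The default-proc trick in `get_descendant_processes` is compact but non-obvious: each hash value starts as `[pid]`, and appending a child's array (rather than its contents) means grandchildren registered later still appear when the ancestor's array is flattened. A self-contained sketch with a hypothetical process tree:

  # pid => ppid pairs for a made-up tree: 100 -> 200 -> 300, and 100 -> 400
  pairs = { 200 => 100, 300 => 200, 400 => 100 }

  descendants = Hash.new { |ht, k| ht[k] = [k] }
  pairs.each { |pid, ppid| descendants[ppid] << descendants[pid] }

  # descendants[100] holds references to the child arrays, so flatten
  # picks up the whole subtree:
  p descendants[100].flatten - [100] # => [200, 300, 400]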
--------------------------------------------------------------------------------
/lib/kimurai/capybara_ext/mechanize/driver.rb:
--------------------------------------------------------------------------------
1 | require 'mechanize'
2 | require_relative '../driver/base'
3 |
4 | class Capybara::Mechanize::Driver
5 | # Extend capybara-mechanize to support Poltergeist-like methods
6 | # https://www.rubydoc.info/gems/poltergeist/Capybara/Poltergeist/Driver
7 |
8 | def set_proxy(ip, port, type, user = nil, password = nil)
9 | # type is always "http", "socks" is not supported (yet)
10 | browser.agent.set_proxy(ip, port, user, password)
11 | end
12 |
13 | ###
14 |
15 | def headers
16 | browser.agent.request_headers
17 | end
18 |
19 | def headers=(headers)
20 | browser.agent.request_headers = headers
21 | end
22 |
23 | def add_header(name, value)
24 | browser.agent.request_headers[name] = value
25 | end
26 |
27 | ###
28 |
29 | def get_cookies
30 | browser.agent.cookies
31 | end
32 |
33 | def set_cookie(name, value, options = {})
34 | options[:name] ||= name
35 | options[:value] ||= value
36 |
37 | cookie = Mechanize::Cookie.new(options.merge path: "/")
38 | browser.agent.cookie_jar << cookie
39 | end
40 |
41 | def set_cookies(cookies)
42 | cookies.each do |cookie|
43 | set_cookie(cookie[:name], cookie[:value], cookie)
44 | end
45 | end
46 |
47 | def clear_cookies
48 | browser.agent.cookie_jar.clear!
49 | end
50 |
51 | ###
52 |
53 | def quit
54 | browser.agent.shutdown
55 | end
56 |
57 | ###
58 |
59 | # Override parent method `current_memory` for Mechanize (we can't measure the Mechanize driver's memory)
60 | def current_memory
61 | nil
62 | end
63 |
64 | def pid
65 | nil
66 | end
67 |
68 | def port
69 | nil
70 | end
71 | end
72 |
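These Poltergeist-style helpers keep spider code engine-agnostic. A hedged usage sketch, assuming a registered `:mechanize` Capybara driver (all values are illustrative):

  session = Capybara::Session.new(:mechanize)

  session.driver.headers = { "User-Agent" => "Mozilla/5.0 Firefox/61.0" }
  session.driver.add_header("Referer", "https://example.com")

  # set_cookie merges name/value into the options hash and forces path: "/"
  session.driver.set_cookie("session_id", "abc123", domain: "example.com")

  # the type argument is effectively ignored: only "http" proxies are supported
  session.driver.set_proxy("3.4.5.6", 3128, "http", "user", "pass")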
--------------------------------------------------------------------------------
/lib/kimurai/capybara_ext/poltergeist/driver.rb:
--------------------------------------------------------------------------------
1 | require_relative '../driver/base'
2 |
3 | module Capybara::Poltergeist
4 | class Driver
5 | def pid
6 | client_pid
7 | end
8 |
9 | def port
10 | server.port
11 | end
12 | end
13 | end
14 |
--------------------------------------------------------------------------------
/lib/kimurai/capybara_ext/selenium/driver.rb:
--------------------------------------------------------------------------------
1 | require_relative '../driver/base'
2 |
3 | class Capybara::Selenium::Driver
4 | def get_cookies
5 | browser.manage.all_cookies
6 | end
7 |
8 | def set_cookie(name, value, options = {})
9 | options[:name] ||= name
10 | options[:value] ||= value
11 |
12 | browser.manage.add_cookie(options)
13 | end
14 |
15 | def set_cookies(cookies)
16 | cookies.each do |cookie|
17 | set_cookie(cookie[:name], cookie[:value], cookie)
18 | end
19 | end
20 |
21 | def clear_cookies
22 | browser.manage.delete_all_cookies
23 | end
24 |
25 | ###
26 |
27 | def pid
28 | @pid ||= `lsof -i tcp:#{port} -t`.strip.to_i
29 | end
30 |
31 | def port
32 | @port ||= browser.send(:bridge).instance_variable_get("@http").instance_variable_get("@server_url").port
33 | end
34 | end
35 |
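`pid` is discovered by asking `lsof` which local process listens on the driver's port (chromedriver/geckodriver). The same lookup outside the class, with a hypothetical port:

  port = 9515 # default chromedriver port, for illustration
  pid = `lsof -i tcp:#{port} -t`.strip.to_i
  # lsof prints nothing when no process listens, so pid becomes 0 in that case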
--------------------------------------------------------------------------------
/lib/kimurai/capybara_ext/session.rb:
--------------------------------------------------------------------------------
1 | require 'capybara'
2 | require 'nokogiri'
3 | require 'json'
4 | require_relative 'session/config'
5 |
6 | module Capybara
7 | class Session
8 | attr_accessor :spider
9 |
10 | alias_method :original_visit, :visit
11 | def visit(visit_uri, delay: config.before_request[:delay], skip_request_options: false, max_retries: 3)
12 | if spider
13 | process_delay(delay) if delay
14 | retries, sleep_interval = 0, 0
15 |
16 | begin
17 | check_request_options(visit_uri) unless skip_request_options
18 | driver.requests += 1 and logger.info "Browser: started get request to: #{visit_uri}"
19 | spider.class.update(:visits, :requests) if spider.with_info
20 |
21 | original_visit(visit_uri)
22 | rescue => e
23 | if match_error?(e, type: :to_skip)
24 | logger.error "Browser: skip request error: #{e.inspect}, url: #{visit_uri}"
25 | spider.add_event(:requests_errors, e.inspect) if spider.with_info
26 | false
27 | elsif match_error?(e, type: :to_retry)
28 | logger.error "Browser: retry request error: #{e.inspect}, url: #{visit_uri}"
29 | spider.add_event(:requests_errors, e.inspect) if spider.with_info
30 |
31 | if (retries += 1) <= max_retries
32 | logger.info "Browser: sleep #{(sleep_interval += 15)} seconds and process retry № #{retries} to the url: #{visit_uri}"
33 | sleep sleep_interval and retry
34 | else
35 | logger.error "Browser: all retries (#{retries - 1}) to the url #{visit_uri} are gone"
36 | raise e unless skip_error_on_failure?(e)
37 | end
38 | else
39 | raise e
40 | end
41 | else
42 | driver.responses += 1 and logger.info "Browser: finished get request to: #{visit_uri}"
43 | spider.class.update(:visits, :responses) if spider.with_info
44 | driver.visited = true unless driver.visited
45 | true
46 | ensure
47 | if spider.with_info
48 | logger.info "Info: visits: requests: #{spider.class.visits[:requests]}, responses: #{spider.class.visits[:responses]}"
49 | end
50 |
51 | if memory = driver.current_memory
52 | logger.debug "Browser: driver.current_memory: #{memory}"
53 | end
54 | end
55 | else
56 | original_visit(visit_uri)
57 | end
58 | end
59 |
60 | def destroy_driver!
61 | if @driver
62 | begin
63 | @driver.quit
64 | # handle Net::ReadTimeout error for Selenium-like drivers
65 | rescue Net::ReadTimeout => e
66 | @driver.quit
67 | end
68 |
69 | @driver = nil
70 | logger.info "Browser: driver #{mode} has been destroyed"
71 | else
72 | logger.warn "Browser: driver #{mode} is not present"
73 | end
74 | end
75 |
76 | def restart!
77 | if mode.match?(/poltergeist/)
78 | @driver.browser.restart
79 | @driver.requests, @driver.responses = 0, 0
80 | else
81 | destroy_driver!
82 | driver
83 | end
84 |
85 | logger.info "Browser: driver has been restarted: name: #{mode}, pid: #{driver.pid}, port: #{driver.port}"
86 | end
87 |
88 | def current_response(response_type = :html)
89 | case response_type
90 | when :html
91 | if config.encoding
92 | if config.encoding == :auto
93 | charset = body.force_encoding("ISO-8859-1").encode("UTF-8")[/<meta.*?charset=["']?([\w-]+)/i, 1]
94 | Nokogiri::HTML(body, nil, charset)
95 | else
96 | Nokogiri::HTML(body, nil, config.encoding)
97 | end
98 | else
99 | Nokogiri::HTML(body)
100 | end
101 | when :json
102 | JSON.parse(body)
103 | end
104 | end
105 |
106 | ###
107 |
108 | # Handy method to perform some actions within a new browser window
109 | # (opened either by a given action, or by visiting a given url in a
110 | # new window) and then automatically close this window and return
111 | # back to the first one.
112 | #
113 | # Usage example:
114 | #
115 | # action = -> { browser.find("//some/element/path").click }
116 | # browser.within_new_window_by(action: action) do
117 | # do some stuff and then automatically close this tab and return back to the first tab
118 | # end
119 | def within_new_window_by(action: nil, url: nil)
120 | case
121 | when action
122 | opened_window = window_opened_by { action.call }
123 | within_window(opened_window) do
124 | yield
125 | current_window.close
126 | end
127 | when url
128 | within_window(open_new_window) do
129 | visit(url)
130 |
131 | yield
132 | current_window.close
133 | end
134 | end
135 | end
136 |
137 | ###
138 |
139 | def scroll_to_bottom
140 | execute_script("window.scrollBy(0,10000)")
141 | end
142 |
143 | private
144 |
145 | def skip_error_on_failure?(e)
146 | config.retry_request_errors.any? do |error|
147 | error[:skip_on_failure] && e.class.ancestors.include?(error[:error]) if error.kind_of?(Hash)
148 | end
149 | end
150 |
151 | def match_error?(e, type:)
152 | errors =
153 | case type
154 | when :to_retry then config.retry_request_errors
155 | when :to_skip then config.skip_request_errors
156 | end
157 |
158 | errors.any? do |error|
159 | if error.kind_of?(Hash)
160 | match_class = e.class.ancestors.include?(error[:error])
161 | if error[:message].present?
162 | if error[:message].kind_of?(Regexp)
163 | e.message&.match?(error[:message])
164 | else
165 | e.message&.include?(error[:message])
166 | end && match_class
167 | else
168 | match_class
169 | end
170 | else
171 | e.class.ancestors.include?(error)
172 | end
173 | end
174 | end
175 |
176 | def process_delay(delay)
177 | interval = (delay.class == Range ? rand(delay) : delay)
178 | logger.debug "Browser: sleep #{interval.round(2)} #{'second'.pluralize(interval)} before request..."
179 | sleep interval
180 | end
181 |
182 | def check_request_options(url_to_visit)
183 | # restart_if
184 | if memory_limit = config.restart_if[:memory_limit]
185 | memory = driver.current_memory
186 | if memory && memory >= memory_limit
187 | logger.warn "Browser: memory_limit #{memory_limit} of driver.current_memory (#{memory}) is exceeded (engine: #{mode})"
188 | restart!
189 | end
190 | end
191 |
192 | if requests_limit = config.restart_if[:requests_limit]
193 | requests = driver.requests
194 | if requests >= requests_limit
195 | logger.warn "Browser: requests_limit #{requests_limit} of driver.requests (#{requests}) is exceeded (engine: #{mode})"
196 | restart!
197 | end
198 | end
199 |
200 | # cookies
201 | # (Selenium only) if config.cookies present and browser was just created,
202 | # visit url_to_visit first and only then set cookies:
203 | if driver.visited.nil? && config.cookies && mode.match?(/selenium/)
204 | visit(url_to_visit, skip_request_options: true)
205 | config.cookies.each do |cookie|
206 | driver.set_cookie(cookie[:name], cookie[:value], cookie)
207 | end
208 | end
209 |
210 | if config.before_request[:clear_cookies]
211 | driver.clear_cookies
212 | logger.debug "Browser: cleared cookies before request"
213 | end
214 |
215 | if config.before_request[:clear_and_set_cookies]
216 | driver.clear_cookies
217 |
218 | # (Selenium only) if the browser hasn't visited any page yet, visit url_to_visit
219 | # first and only then set cookies (needed after a browser restart):
220 | if driver.visited.nil? && mode.match?(/selenium/)
221 | visit(url_to_visit, skip_request_options: true)
222 | end
223 |
224 | config.cookies.each do |cookie|
225 | driver.set_cookie(cookie[:name], cookie[:value], cookie)
226 | end
227 |
228 | logger.debug "Browser: cleared and set cookies before request"
229 | end
230 |
231 | # user_agent
232 | if config.before_request[:change_user_agent]
233 | driver.add_header("User-Agent", config.user_agent.call)
234 | logger.debug "Browser: changed user_agent before request"
235 | end
236 |
237 | # proxy
238 | if config.before_request[:change_proxy]
239 | proxy_string = config.proxy.call
240 | driver.set_proxy(*proxy_string.split(":"))
241 | logger.debug "Browser: changed proxy before request"
242 | end
243 | end
244 |
245 | def logger
246 | spider.logger
247 | end
248 | end
249 | end
250 |
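The retry branch in `#visit` sleeps 15, 30, then 45 seconds across up to `max_retries` attempts, and re-raises only when the matched error isn't marked `skip_on_failure`. A minimal spider config sketch exercising both branches (the error choices are illustrative):

  @config = {
    # matched via match_error?(e, type: :to_skip): the request is logged and skipped
    skip_request_errors: [{ error: RuntimeError, message: "404 => Net::HTTPNotFound" }],

    # matched via match_error?(e, type: :to_retry): up to 3 retries with 15/30/45s sleeps;
    # skip_on_failure: true swallows the final failure instead of re-raising it
    retry_request_errors: [
      Net::ReadTimeout,
      { error: RuntimeError, message: /502|503/, skip_on_failure: true }
    ]
  }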
--------------------------------------------------------------------------------
/lib/kimurai/capybara_ext/session/config.rb:
--------------------------------------------------------------------------------
1 | module Capybara
2 | class SessionConfig
3 | attr_accessor :cookies, :proxy, :user_agent, :encoding
4 | attr_writer :retry_request_errors, :skip_request_errors
5 |
6 | def retry_request_errors
7 | @retry_request_errors ||= []
8 | end
9 |
10 | def skip_request_errors
11 | @skip_request_errors ||= []
12 | end
13 |
14 | def restart_if
15 | @restart_if ||= {}
16 | end
17 |
18 | def before_request
19 | @before_request ||= {}
20 | end
21 | end
22 | end
23 |
--------------------------------------------------------------------------------
/lib/kimurai/cli.rb:
--------------------------------------------------------------------------------
1 | require 'thor'
2 |
3 | module Kimurai
4 | class CLI < Thor
5 | map %w[--version -v] => :__print_version
6 |
7 | desc "generate", "Generator, available types: project, spider, schedule"
8 | def generate(generator_type, *args)
9 | case generator_type
10 | when "project"
11 | project_name = args.shift
12 | raise "Provide project name to generate a new project" unless project_name.present?
13 | Generator.new.generate_project(project_name)
14 | when "spider"
15 | spider_name = args.shift
16 | raise "Provide spider name to generate a spider" unless spider_name.present?
17 | Generator.new.generate_spider(spider_name, in_project: inside_project?)
18 | when "schedule"
19 | Generator.new.generate_schedule
20 | else
21 | raise "Don't know this generator type: #{generator_type}"
22 | end
23 | end
24 |
25 | ###
26 |
27 | desc "setup", "Setup server"
28 | option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
29 | option "ask-sudo", type: :boolean, banner: "Provide sudo password for a user to install system-wide packages"
30 | option "ask-auth-pass", type: :boolean, banner: "Auth using password"
31 | option "ssh-key-path", type: :string, banner: "Auth using ssh key"
32 | option :local, type: :boolean, banner: "Run setup on a local machine (Ubuntu only)"
33 | def setup(user_host)
34 | command = AnsibleCommandBuilder.new(user_host, options, playbook: "setup").get
35 |
36 | pid = spawn *command
37 | Process.wait pid
38 | end
39 |
40 | desc "deploy", "Deploy project to the server and update cron schedule"
41 | option :port, aliases: :p, type: :string, banner: "Port for ssh connection"
42 | option "ask-auth-pass", type: :boolean, banner: "Auth using password"
43 | option "ssh-key-path", type: :string, banner: "Auth using ssh key"
44 | option "repo-url", type: :string, banner: "Repo url"
45 | option "repo-key-path", type: :string, banner: "SSH key for a git repo"
46 | option "skip-check", type: :boolean, default: false, banner: "Skip git repository checks"
47 | def deploy(user_host)
48 | unless options["skip-check"]
49 | if !`git status --short`.empty?
50 | raise "Deploy: Please commit your changes first"
51 | elsif `git remote`.empty?
52 | raise "Deploy: Please add remote origin repository to your repo first"
53 | elsif !`git rev-list master...origin/master`.empty?
54 | raise "Deploy: Please push your commits to the remote origin repo first"
55 | end
56 | end
57 |
58 | repo_url = options["repo-url"] ? options["repo-url"] : `git remote get-url origin`.strip
59 | repo_name = repo_url[/\/([^\/]*)\.git/i, 1]
60 |
61 | command = AnsibleCommandBuilder.new(user_host, options, playbook: "deploy",
62 | vars: { repo_url: repo_url, repo_name: repo_name, repo_key_path: options["repo-key-path"] }
63 | ).get
64 |
65 | pid = spawn *command
66 | Process.wait pid
67 | end
68 |
69 | ###
70 |
71 | desc "crawl", "Run a particular spider by it's name"
72 | def crawl(spider_name)
73 | raise "Can't find Kimurai project" unless inside_project?
74 | require './config/boot'
75 |
76 | unless klass = Kimurai.find_by_name(spider_name)
77 | raise "Can't find spider with name `#{spider_name}` in the project. " \
78 | "To list all available spiders, run: `$ bundle exec kimurai list`"
79 | end
80 |
81 | # Set time_zone if exists
82 | if time_zone = Kimurai.configuration.time_zone
83 | Kimurai.time_zone = time_zone
84 | end
85 |
86 | klass.crawl!
87 | end
88 |
89 | desc "parse", "Parse url in the particular spider method"
90 | option :url, type: :string, required: true, banner: "Url to pass to the method"
91 | def parse(spider_name, method_name)
92 | raise "Can't find Kimurai project" unless inside_project?
93 | require './config/boot'
94 |
95 | unless klass = Kimurai.find_by_name(spider_name)
96 | raise "Can't find spider with name `#{spider_name}` in the project. " \
97 | "To list all available spiders, run: `$ bundle exec kimurai list`"
98 | end
99 |
100 | klass.parse!(method_name, url: options["url"])
101 | end
102 |
103 | desc "console", "Start Kimurai console"
104 | option :engine, type: :string, banner: "Engine to use"
105 | option :url, type: :string, banner: "Url to process"
106 | def console(spider_name = nil)
107 | require 'pry'
108 | require './config/boot' if inside_project?
109 |
110 | if spider_name
111 | raise "Can't find Kimurai project" unless inside_project?
112 |
113 | unless klass = Kimurai.find_by_name(spider_name)
114 | raise "Can't find spider with name `#{spider_name}` in the project. " \
115 | "To list all available spiders, run: `$ bundle exec kimurai list`"
116 | end
117 | else
118 | klass = inside_project? ? ApplicationSpider : ::Kimurai::Base
119 | end
120 |
121 | engine = options["engine"]&.delete(":")&.to_sym
122 | if url = options["url"]
123 | klass.new(engine).request_to(:console, url: options["url"])
124 | else
125 | klass.new(engine).public_send(:console)
126 | end
127 | end
128 |
129 | desc "list", "List all available spiders in the current project"
130 | def list
131 | raise "Can't find Kimurai project" unless inside_project?
132 | require './config/boot'
133 |
134 | Kimurai.list.keys.sort.each { |name| puts name }
135 | end
136 |
137 | desc "runner", "Run all spiders in the project in queue"
138 | option :include, type: :array, default: [], banner: "List of spiders to run"
139 | option :exclude, type: :array, default: [], banner: "List of spiders to exclude from run"
140 | option :jobs, aliases: :j, type: :numeric, default: 1, banner: "The number of concurrent jobs"
141 | def runner
142 | raise "Can't find Kimurai project" unless inside_project?
143 |
144 | jobs = options["jobs"]
145 | raise "Jobs count can't be 0" if jobs == 0
146 |
147 | require './config/boot'
148 | require 'kimurai/runner'
149 |
150 | spiders = options["include"].presence || Kimurai.list.keys
151 | spiders -= options["exclude"]
152 |
153 | Runner.new(spiders, jobs).run!
154 | end
155 |
156 | desc "--version, -v", "Print the version"
157 | def __print_version
158 | puts VERSION
159 | end
160 |
161 | desc "dashboard", "Run dashboard"
162 | def dashboard
163 | raise "Can't find Kimurai project" unless inside_project?
164 |
165 | require './config/boot'
166 | if Object.const_defined?("Kimurai::Dashboard")
167 | require 'kimurai/dashboard/app'
168 | Kimurai::Dashboard::App.run!
169 | else
170 | raise "Kimurai::Dashboard is not defined"
171 | end
172 | end
173 |
174 | private
175 |
176 | def inside_project?
177 | Dir.exists?("spiders") && File.exists?("./config/boot.rb")
178 | end
179 | end
180 | end
181 |
182 | require_relative 'cli/generator'
183 | require_relative 'cli/ansible_command_builder'
184 |
--------------------------------------------------------------------------------
/lib/kimurai/cli/ansible_command_builder.rb:
--------------------------------------------------------------------------------
1 | require 'cliver'
2 |
3 | module Kimurai
4 | class CLI
5 | class AnsibleCommandBuilder
6 | def initialize(user_host, options, playbook:, vars: {})
7 | @user_host = user_host
8 | @options = options
9 | @playbook = playbook
10 | @vars = vars
11 | end
12 |
13 | def get
14 | unless Cliver.detect("ansible-playbook")
15 | raise "Can't find `ansible-playbook` executable, to install: " \
16 | "Mac OS X: `$ brew install ansible`, Ubuntu: `$ sudo apt install ansible`"
17 | end
18 |
19 | user = @user_host[/(.*?)\@/, 1]
20 | host = @user_host[/\@(.+)/, 1] || @user_host
21 | inventory = @options["port"] ? "#{host}:#{@options['port']}," : "#{host},"
22 |
23 | gem_dir = Gem::Specification.find_by_name("kimurai").gem_dir
24 | playbook_path = gem_dir + "/lib/kimurai/automation/" + "#{@playbook}.yml"
25 |
26 | command = [
27 | "ansible-playbook", playbook_path,
28 | "--inventory", inventory,
29 | "--ssh-extra-args", "-oForwardAgent=yes",
30 | "--connection", @options["local"] ? "local" : "smart",
31 | "--extra-vars", "ansible_python_interpreter=/usr/bin/python3"
32 | ]
33 |
34 | if File.exists? "config/automation.yml"
35 | require 'yaml'
36 | if config = YAML.load_file("config/automation.yml").dig(@playbook)
37 | config.each { |key, value| @vars[key] = value unless @vars[key] }
38 | end
39 | end
40 |
41 | @vars.each do |key, value|
42 | next unless value.present?
43 | command.push "--extra-vars", "#{key}=#{value}"
44 | end
45 |
46 | if user
47 | command.push "--user", user
48 | end
49 |
50 | if @options["ask-sudo"]
51 | command.push "--ask-become-pass"
52 | end
53 |
54 | if @options["ask-auth-pass"]
55 | unless Cliver.detect("sshpass")
56 | raise "Can't find `sshpass` executable for password authentication, to install: " \
57 | "Mac OS X: `$ brew install http://git.io/sshpass.rb`, Ubuntu: `$ sudo apt install sshpass`"
58 | end
59 |
60 | command.push "--ask-pass"
61 | end
62 |
63 | if ssh_key_path = @options["ssh-key-path"]
64 | command.push "--private-key", ssh_key_path
65 | end
66 |
67 | command
68 | end
69 | end
70 | end
71 | end
72 |
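For reference, a sketch of what `#get` returns for a typical invocation (host, port and key path are hypothetical; the playbook path depends on where the gem is installed):

  builder = Kimurai::CLI::AnsibleCommandBuilder.new(
    "deploy@192.0.2.10",
    { "port" => "2222", "ssh-key-path" => "~/.ssh/id_rsa" },
    playbook: "setup"
  )

  builder.get
  # => ["ansible-playbook", "<gem_dir>/lib/kimurai/automation/setup.yml",
  #     "--inventory", "192.0.2.10:2222,",
  #     "--ssh-extra-args", "-oForwardAgent=yes",
  #     "--connection", "smart",
  #     "--extra-vars", "ansible_python_interpreter=/usr/bin/python3",
  #     "--user", "deploy",
  #     "--private-key", "~/.ssh/id_rsa"]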
--------------------------------------------------------------------------------
/lib/kimurai/cli/generator.rb:
--------------------------------------------------------------------------------
1 | module Kimurai
2 | class CLI
3 | class Generator < Thor::Group
4 | include Thor::Actions
5 |
6 | def self.source_root
7 | File.dirname(File.expand_path('..', __FILE__))
8 | end
9 |
10 | def generate_project(project_name)
11 | directory "template", project_name
12 | inside(project_name) do
13 | run "bundle install"
14 | run "git init"
15 | end
16 | end
17 |
18 | def generate_spider(spider_name, in_project:)
19 | spider_path = in_project ? "spiders/#{spider_name}.rb" : "./#{spider_name}.rb"
20 | raise "Spider #{spider_path} already exists" if File.exists? spider_path
21 |
22 | spider_class = to_spider_class(spider_name)
23 | create_file spider_path do
24 | <<~RUBY
25 | class #{spider_class} < #{in_project ? 'ApplicationSpider' : 'Kimurai::Base'}
26 | @name = "#{spider_name}"
27 | @start_urls = []
28 | @config = {}
29 |
30 | def parse(response, url:, data: {})
31 | end
32 | end
33 | RUBY
34 | end
35 |
36 | unless in_project
37 | insert_into_file spider_path, " @engine = :mechanize\n", after: "@name = \"#{spider_name}\"\n"
38 | prepend_to_file spider_path, "require 'kimurai'\n\n"
39 | append_to_file spider_path, "\n#{spider_class}.crawl!"
40 | end
41 | end
42 |
43 | def generate_schedule
44 | copy_file "template/config/schedule.rb", "./schedule.rb"
45 | end
46 |
47 | private
48 |
49 | def to_spider_class(string)
50 | string.sub(/^./) { $&.capitalize }
51 | .gsub(/(?:_|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" }
52 | .gsub(/(?:-|(\/))([a-z\d]*)/) { "Dash#{$2.capitalize}" }
53 | .gsub(/(?:\.|(\/))([a-z\d]*)/) { "#{$1}#{$2.capitalize}" }
54 | end
55 | end
56 | end
57 | end
58 |
--------------------------------------------------------------------------------
/lib/kimurai/core_ext/array.rb:
--------------------------------------------------------------------------------
1 | class Array
2 | def in_sorted_groups(number, fill_with = nil)
3 | sorted_groups = Array.new(number) { [] }
4 |
5 | self.in_groups_of(number, fill_with).each do |group|
6 | number.times do |i|
7 | group.fetch(i) rescue next
8 | sorted_groups[i] << group[i]
9 | end
10 | end
11 |
12 | sorted_groups
13 | end
14 | end
15 |
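`in_sorted_groups` distributes elements round-robin into `number` buckets; passing `false` as the second argument disables nil padding, so the trailing short group is simply skipped by the fetch/rescue guard. A worked example (requires ActiveSupport's `in_groups_of`):

  [1, 2, 3, 4, 5, 6, 7].in_sorted_groups(3, false)
  # in_groups_of(3, false) yields [[1, 2, 3], [4, 5, 6], [7]], so:
  # => [[1, 4, 7], [2, 5], [3, 6]]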
--------------------------------------------------------------------------------
/lib/kimurai/core_ext/hash.rb:
--------------------------------------------------------------------------------
1 | class Hash
2 | def deep_merge_excl(second, exclude)
3 | self.merge(second.slice(*exclude)).deep_merge(second.except(*exclude))
4 | end
5 | end
6 |
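`deep_merge_excl` deep-merges `second` into `self` except for the listed keys, which are replaced wholesale; this lets a child config override an array-valued option like cookies entirely while nested hashes still merge. A worked example (the keys are illustrative):

  base  = { headers: { "a" => 1 }, cookies: [{ name: "x", value: "1" }] }
  child = { headers: { "b" => 2 }, cookies: [{ name: "y", value: "2" }] }

  base.deep_merge_excl(child, [:cookies])
  # => { headers: { "a" => 1, "b" => 2 },       (deep-merged)
  #      cookies: [{ name: "y", value: "2" }] } (replaced, not merged)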
--------------------------------------------------------------------------------
/lib/kimurai/core_ext/numeric.rb:
--------------------------------------------------------------------------------
1 | class Numeric
2 | # https://stackoverflow.com/a/1679963
3 | def duration
4 | secs = self.to_int
5 | mins = secs / 60
6 | hours = mins / 60
7 | days = hours / 24
8 |
9 | if days > 0
10 | "#{days}d, #{hours % 24}h"
11 | elsif hours > 0
12 | "#{hours}h, #{mins % 60}m"
13 | elsif mins > 0
14 | "#{mins}m, #{secs % 60}s"
15 | elsif secs >= 0
16 | "#{secs}s"
17 | end
18 | end
19 | end
20 |
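`duration` renders a seconds count as its two most significant units:

  61.duration     # => "1m, 1s"
  3661.duration   # => "1h, 1m"
  90_061.duration # => "1d, 1h"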
--------------------------------------------------------------------------------
/lib/kimurai/core_ext/string.rb:
--------------------------------------------------------------------------------
1 | require 'murmurhash3'
2 |
3 | class String
4 | def to_id
5 | MurmurHash3::V32.str_hash(self)
6 | end
7 | end
8 |
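`to_id` hashes a string into a stable 32-bit MurmurHash3 integer, giving a compact deterministic identifier for arbitrary strings such as urls:

  require 'murmurhash3'

  "https://example.com/".to_id # => the same 32-bit Integer on every run
  "https://example.com/".to_id == "https://example.com/".to_id # => true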
--------------------------------------------------------------------------------
/lib/kimurai/pipeline.rb:
--------------------------------------------------------------------------------
1 | module Kimurai
2 | class Pipeline
3 | class DropItemError < StandardError; end
4 | def self.name
5 | self.to_s.sub(/.*?::/, "").underscore.to_sym
6 | end
7 |
8 | include BaseHelper
9 | attr_accessor :spider
10 |
11 | def name
12 | self.class.name
13 | end
14 |
15 | ###
16 |
17 | def storage
18 | spider.storage
19 | end
20 |
21 | def unique?(scope, value)
22 | spider.unique?(scope, value)
23 | end
24 |
25 | def save_to(path, item, format:, position: true, append: false)
26 | spider.save_to(path, item, format: format, position: position, append: append)
27 | end
28 |
29 | def logger
30 | spider.logger
31 | end
32 | end
33 | end
34 |
--------------------------------------------------------------------------------
/lib/kimurai/runner.rb:
--------------------------------------------------------------------------------
1 | require 'pmap'
2 |
3 | module Kimurai
4 | class Runner
5 | attr_reader :jobs, :spiders, :session_info
6 |
7 | def initialize(spiders, parallel_jobs)
8 | @jobs = parallel_jobs
9 | @spiders = spiders
10 | @start_time = Time.now
11 |
12 | @session_info = {
13 | id: @start_time.to_i,
14 | status: :processing,
15 | start_time: @start_time,
16 | stop_time: nil,
17 | environment: Kimurai.env,
18 | concurrent_jobs: @jobs,
19 | spiders: @spiders
20 | }
21 |
22 | if time_zone = Kimurai.configuration.time_zone
23 | Kimurai.time_zone = time_zone
24 | end
25 |
26 | ENV.store("SESSION_ID", @start_time.to_i.to_s)
27 | ENV.store("RBCAT_COLORIZER", "false")
28 | end
29 |
30 | def run!(exception_on_fail: true)
31 | puts ">>> Runner: started: #{session_info}"
32 | if at_start_callback = Kimurai.configuration.runner_at_start_callback
33 | at_start_callback.call(session_info)
34 | end
35 |
36 | running = true
37 | spiders.peach_with_index(jobs) do |spider, i|
38 | next unless running
39 |
40 | puts "> Runner: started spider: #{spider}, index: #{i}"
41 | pid = spawn("bundle", "exec", "kimurai", "crawl", spider, [:out, :err] => "log/#{spider}.log")
42 | Process.wait pid
43 |
44 | puts "< Runner: stopped spider: #{spider}, index: #{i}"
45 | end
46 | rescue StandardError, SignalException, SystemExit => e
47 | running = false
48 |
49 | session_info.merge!(status: :failed, error: e.inspect, stop_time: Time.now)
50 | exception_on_fail ? raise(e) : [session_info, e]
51 | else
52 | session_info.merge!(status: :completed, stop_time: Time.now)
53 | ensure
54 | if at_stop_callback = Kimurai.configuration.runner_at_stop_callback
55 | at_stop_callback.call(session_info)
56 | end
57 | puts "<<< Runner: stopped: #{session_info}"
58 | end
59 | end
60 | end
61 |
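Runner is normally driven by the CLI's `runner` command, but it can also be invoked directly. A hedged sketch (the spider names are hypothetical):

  require 'kimurai/runner'

  # Run two spiders with 2 concurrent jobs; each spider's output
  # goes to log/<spider_name>.log
  Kimurai::Runner.new(["example_spider_one", "example_spider_two"], 2).run!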
--------------------------------------------------------------------------------
/lib/kimurai/template/.gitignore:
--------------------------------------------------------------------------------
1 | /.bundle
2 | /cache
3 | /node_modules
4 |
5 | /log/*
6 | !/log/.keep
7 |
8 | /tmp/*
9 | !/tmp/.keep
10 |
11 | /db/*
12 | !/db/.keep
13 |
14 | .byebug_history
15 | *.swp
16 | .env
17 |
18 | capybara-*.png
19 |
--------------------------------------------------------------------------------
/lib/kimurai/template/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 | git_source(:github) { |repo| "https://github.com/#{repo}.git" }
3 |
4 | ruby '>= 2.5'
5 |
6 | # Framework
7 | gem 'kimurai', '~> 1.4'
8 |
9 | # Require files in directory and child directories recursively
10 | gem 'require_all'
11 |
12 | # Dotenv
13 | gem 'dotenv'
14 |
15 | # To debug spiders:
16 | group :development do
17 | gem 'byebug', platforms: :mri
18 | gem 'pry'
19 | end
20 |
21 | # If you want to save items to the database, require one of these gems:
22 | # gem 'sqlite3'
23 | # gem 'pg'
24 | # gem 'mysql2'
25 |
26 | # And use your preferred ORM/database connector:
27 | # gem 'activerecord', require: 'active_record'
28 | # gem 'sequel'
29 |
--------------------------------------------------------------------------------
/lib/kimurai/template/README.md:
--------------------------------------------------------------------------------
1 | # README
2 |
3 | New Kimurai project readme
4 |
--------------------------------------------------------------------------------
/lib/kimurai/template/config/application.rb:
--------------------------------------------------------------------------------
1 | Kimurai.configure do |config|
2 | # Default logger has colored mode in development.
3 | # If you would like to disable it, set `colorize_logger` to false.
4 | # config.colorize_logger = false
5 |
6 | # Logger level for default logger:
7 | # config.log_level = :info
8 |
9 | # Custom logger:
10 | # config.logger = Logger.new(STDOUT)
11 |
12 | # Custom time zone (for logs):
13 | # config.time_zone = "UTC"
14 | # config.time_zone = "Europe/Moscow"
15 |
16 | # At start callback for a runner. Accepts an `info` hash argument with
17 | # keys: id, status, start_time, environment, concurrent_jobs and spiders list.
18 | # For example, you can use this callback to send a notification when the runner starts:
19 | # config.runner_at_start_callback = lambda do |info|
20 | # json = JSON.pretty_generate(info)
21 | # Sender.send_notification("Started session: #{json}")
22 | # end
23 |
24 | # At stop callback for a runner. Accepts an `info` hash argument with
25 | # all `runner_at_start_callback` keys plus an additional `stop_time` key. Also `status` contains
26 | # the stop status of the runner (completed or failed).
27 | # You can use this callback to send a notification when the runner has stopped:
28 | # config.runner_at_stop_callback = lambda do |info|
29 | # json = JSON.pretty_generate(info)
30 | # Sender.send_notification("Stopped session: #{json}")
31 | # end
32 |
33 | # Provide custom chrome binary path (default is any available chrome/chromium in the PATH):
34 | # config.selenium_chrome_path = "/usr/bin/chromium-browser"
35 | # Provide custom selenium chromedriver path (default is "/usr/local/bin/chromedriver"):
36 | # config.chromedriver_path = "/usr/local/bin/chromedriver"
37 | end
38 |
--------------------------------------------------------------------------------
/lib/kimurai/template/config/automation.yml:
--------------------------------------------------------------------------------
1 | # software versions to install for `setup` command
2 | setup:
3 | ruby: 2.5.1
4 | # check latest here http://phantomjs.org/download.html
5 | phantomjs: 2.1.1
6 | # check latest here https://github.com/mozilla/geckodriver/releases/
7 | geckodriver: 0.21.0
8 | # check latest here https://sites.google.com/a/chromium.org/chromedriver/downloads
9 | chromedriver: 2.39
10 | # settings for deploy command, you can use cli options as well (--repo-url, --repo-key-path)
11 | deploy:
12 | # repo_url: git@bitbucket.org:username/repo_name.git
13 | # repo_key_path: ~/.ssh/id_rsa
14 |
--------------------------------------------------------------------------------
/lib/kimurai/template/config/boot.rb:
--------------------------------------------------------------------------------
1 | # require project gems
2 | require 'bundler/setup'
3 | Bundler.require(:default, Kimurai.env)
4 |
5 | # require custom ENV variables located in .env file
6 | require 'dotenv/load'
7 |
8 | # require initializers
9 | Dir.glob(File.join("./config/initializers", "*.rb"), &method(:require))
10 |
11 | # require helpers
12 | Dir.glob(File.join("./helpers", "*.rb"), &method(:require))
13 |
14 | # require pipelines
15 | Dir.glob(File.join("./pipelines", "*.rb"), &method(:require))
16 |
17 | # require spiders recursively in the `spiders/` folder
18 | require_relative '../spiders/application_spider'
19 | require_all "spiders"
20 |
21 | # require Kimurai configuration
22 | require_relative 'application'
23 |
--------------------------------------------------------------------------------
/lib/kimurai/template/config/initializers/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vifreefly/kimuraframework/a5d47c26fffe2a3c10cc346b7dcf9ac06b4ccd2f/lib/kimurai/template/config/initializers/.keep
--------------------------------------------------------------------------------
/lib/kimurai/template/config/schedule.rb:
--------------------------------------------------------------------------------
1 | ### Settings ###
2 | require 'tzinfo'
3 |
4 | # Export current PATH to the cron
5 | env :PATH, ENV["PATH"]
6 |
7 | # Use 24 hour format when using `at:` option
8 | set :chronic_options, hours24: true
9 |
10 | # Use the local_to_utc helper to set the execution time using your local timezone instead
11 | # of the server's timezone (which probably is, and should be, UTC; to check, run `$ timedatectl`).
12 | # You may also want to set the same timezone in Kimurai (use `Kimurai.configuration.time_zone =` for that)
13 | # to have spider logs in a specific time zone format.
14 | # Example usage of helper:
15 | # every 1.day, at: local_to_utc("7:00", zone: "Europe/Moscow") do
16 | # crawl "google_spider.com", output: "log/google_spider.com.log"
17 | # end
18 | def local_to_utc(time_string, zone:)
19 | TZInfo::Timezone.get(zone).local_to_utc(Time.parse(time_string))
20 | end
21 |
22 | # Note: by default Whenever exports cron commands with :environment == "production".
23 | # Note: Whenever can only append log data to a log file (>>). If you want
24 | # to overwrite (>) the log file before each run, pass a lambda:
25 | # crawl "google_spider.com", output: -> { "> log/google_spider.com.log 2>&1" }
26 |
27 | # Project job types
28 | job_type :crawl, "cd :path && KIMURAI_ENV=:environment bundle exec kimurai crawl :task :output"
29 | job_type :runner, "cd :path && KIMURAI_ENV=:environment bundle exec kimurai runner --jobs :task :output"
30 |
31 | # Single file job type
32 | job_type :single, "cd :path && KIMURAI_ENV=:environment ruby :task :output"
33 | # Single with bundle exec
34 | job_type :single_bundle, "cd :path && KIMURAI_ENV=:environment bundle exec ruby :task :output"
35 |
36 | ### Schedule ###
37 | # Usage (check examples here https://github.com/javan/whenever#example-schedulerb-file):
38 | # every 1.day do
39 | # Example to schedule a single spider in the project:
40 | # crawl "google_spider.com", output: "log/google_spider.com.log"
41 |
42 | # Example to schedule all spiders in the project using runner. Each spider will write
43 | # its own output to the `log/spider_name.log` file (handled by the runner itself).
44 | # Runner output will be written to the log/runner.log file.
45 | # The numeric argument is the count of concurrent jobs:
46 | # runner 3, output:"log/runner.log"
47 |
48 | # Example to schedule single spider (without project):
49 | # single "single_spider.rb", output: "single_spider.log"
50 | # end
51 |
52 | ### How to set a cron schedule ###
53 | # Run: `$ whenever --update-crontab --load-file config/schedule.rb`.
54 | # If you don't have whenever command, install the gem: `$ gem install whenever`.
55 |
56 | ### How to cancel a schedule ###
57 | # Run: `$ whenever --clear-crontab --load-file config/schedule.rb`.
58 |
--------------------------------------------------------------------------------
/lib/kimurai/template/db/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vifreefly/kimuraframework/a5d47c26fffe2a3c10cc346b7dcf9ac06b4ccd2f/lib/kimurai/template/db/.keep
--------------------------------------------------------------------------------
/lib/kimurai/template/helpers/application_helper.rb:
--------------------------------------------------------------------------------
1 | module ApplicationHelper
2 | # Put here custom methods which will be available for any spider
3 | end
4 |
--------------------------------------------------------------------------------
/lib/kimurai/template/lib/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vifreefly/kimuraframework/a5d47c26fffe2a3c10cc346b7dcf9ac06b4ccd2f/lib/kimurai/template/lib/.keep
--------------------------------------------------------------------------------
/lib/kimurai/template/log/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vifreefly/kimuraframework/a5d47c26fffe2a3c10cc346b7dcf9ac06b4ccd2f/lib/kimurai/template/log/.keep
--------------------------------------------------------------------------------
/lib/kimurai/template/pipelines/saver.rb:
--------------------------------------------------------------------------------
1 | class Saver < Kimurai::Pipeline
2 | def process_item(item, options: {})
3 | # Here you can save the item to a database, send it to a remote API, or
4 | # simply save the item to a file using the `save_to` helper:
5 |
6 | # To get the name of the current spider: `spider.class.name`
7 | # save_to "db/#{spider.class.name}.json", item, format: :pretty_json
8 |
9 | item
10 | end
11 | end
12 |
--------------------------------------------------------------------------------
/lib/kimurai/template/pipelines/validator.rb:
--------------------------------------------------------------------------------
1 | class Validator < Kimurai::Pipeline
2 | def process_item(item, options: {})
3 | # Here you can validate the item and raise `DropItemError`
4 | # if one of the validations fails. Examples:
5 |
6 | # Check item sku for uniqueness using built-in `unique?` helper:
7 | # unless unique?(:sku, item[:sku])
8 | # raise DropItemError, "Item sku is not unique"
9 | # end
10 |
11 | # Drop the item if its title is shorter than 5 characters:
12 | # if item[:title].size < 5
13 | # raise DropItemError, "Item title is short"
14 | # end
15 |
16 | # Drop the item if it doesn't contain any images:
17 | # unless item[:images].present?
18 | # raise DropItemError, "Item images are not present"
19 | # end
20 |
21 | # Pass item to the next pipeline (if it wasn't dropped)
22 | item
23 | end
24 | end
25 |
--------------------------------------------------------------------------------
/lib/kimurai/template/spiders/application_spider.rb:
--------------------------------------------------------------------------------
1 | # ApplicationSpider is a default base spider class. You can set here
2 | # default settings for all spiders inherited from ApplicationSpider.
3 | # To generate a new spider, run: `$ kimurai generate spider spider_name`
4 |
5 | class ApplicationSpider < Kimurai::Base
6 | include ApplicationHelper
7 |
8 | # Default engine for spiders (available engines: :mechanize, :poltergeist_phantomjs,
9 | # :selenium_firefox, :selenium_chrome)
10 | @engine = :poltergeist_phantomjs
11 |
12 | # Pipelines list, in order.
13 | # To process an item through pipelines, pass the item to the `send_item` method
14 | @pipelines = [:validator, :saver]
15 |
16 | # Default config. Set here options which are default for all spiders inherited
17 | # from ApplicationSpider. A child class's config will be deep-merged with this one
18 | @config = {
19 | # Custom headers, format: hash. Example: { "some header" => "some value", "another header" => "another value" }
20 | # Works only for :mechanize and :poltergeist_phantomjs engines (Selenium doesn't allow setting/getting headers)
21 | # headers: {},
22 |
23 | # Custom User Agent, format: string or lambda.
24 | # Use lambda if you want to rotate user agents before each run:
25 | # user_agent: -> { ARRAY_OF_USER_AGENTS.sample }
26 | # Works for all engines
27 | # user_agent: "Mozilla/5.0 Firefox/61.0",
28 |
29 | # Custom cookies, format: array of hashes.
30 | # Format for a single cookie: { name: "cookie name", value: "cookie value", domain: ".example.com" }
31 | # Works for all engines
32 | # cookies: [],
33 |
34 | # Proxy, format: string or lambda. Format of a proxy string: "ip:port:protocol:user:password"
35 | # `protocol` can be http or socks5. User and password are optional.
36 | # Use lambda if you want to rotate proxies before each run:
37 | # proxy: -> { ARRAY_OF_PROXIES.sample }
38 | # Works for all engines, but keep in mind that Selenium drivers don't support proxies
39 | # with authorization. Also, Mechanize doesn't support the socks5 proxy format (only http)
40 | # proxy: "3.4.5.6:3128:http:user:pass",
41 |
42 | # If enabled, the browser will ignore any https errors. It's handy while using a proxy
43 | # with a self-signed SSL cert (for example Crawlera or Mitmproxy).
44 | # Also, it allows visiting webpages with an expired SSL certificate.
45 | # Works for all engines
46 | ignore_ssl_errors: true,
47 |
48 | # Custom window size, works for all engines
49 | # window_size: [1366, 768],
50 |
51 | # Skip images downloading if true, works for all engines
52 | disable_images: true,
53 |
54 | # Selenium engines only: headless mode, `:native` or `:virtual_display` (default is :native)
55 | # Although native mode has better performance, virtual display mode
56 | # can sometimes be useful. For example, some websites can detect (and block)
57 | # headless chrome, so you can use virtual_display mode instead
58 | # headless_mode: :native,
59 |
60 | # This option tells the browser not to use a proxy for the provided list of domains or IP addresses.
61 | # Format: array of strings. Works only for :selenium_firefox and selenium_chrome
62 | # proxy_bypass_list: [],
63 |
64 | # Option to provide custom SSL certificate. Works only for :poltergeist_phantomjs and :mechanize
65 | # ssl_cert_path: "path/to/ssl_cert",
66 |
67 | # Inject some JavaScript code into the browser.
68 | # Format: array of strings, where each string is a path to JS file.
69 | # Works only for poltergeist_phantomjs engine (Selenium doesn't support JS code injection)
70 | # extensions: ["lib/code_to_inject.js"],
71 |
72 | # Automatically skip duplicated (already visited) urls when using the `request_to` method.
73 | # Possible values: `true` or a hash with options.
74 | # In case of `true`, all visited urls will be added to the storage's scope `:requests_urls`,
75 | # and if a url is already contained in this scope, the request will be skipped.
76 | # You can configure this setting by providing additional options as a hash:
77 | # `skip_duplicate_requests: { scope: :custom_scope, check_only: true }`, where:
78 | # `scope:` - use a custom scope instead of `:requests_urls`
79 | # `check_only:` - if true, the scope will only be checked for the url; the url will not
80 | # be added to the scope if the scope doesn't contain it.
81 | # Works for all drivers
82 | # skip_duplicate_requests: true,
83 |
84 | # Automatically skip provided errors while requesting a page.
85 | # If raised error matches one of the errors in the list, then this error will be caught,
86 | # and request will be skipped.
87 | # It is a good idea to skip errors like NotFound(404), etc.
88 | # Format: array where elements are error classes and/or hashes. You can use the hash format
89 | # for more flexibility: `{ error: "RuntimeError", message: "404 => Net::HTTPNotFound" }`.
90 | # Provided `message:` will be compared with a full error message using `String#include?`. Also
91 | # you can use regex instead: `{ error: "RuntimeError", message: /404|403/ }`.
92 | # skip_request_errors: [{ error: RuntimeError, message: "404 => Net::HTTPNotFound" }],
93 |
94 | # Automatically retry provided errors with a few attempts while requesting a page.
95 | # If raised error matches one of the errors in the list, then this error will be caught
96 | # and the request will be processed again within a delay. There are 3 attempts:
97 | # first: delay 15 sec, second: delay 30 sec, third: delay 45 sec.
98 | # If after 3 attempts there is still an exception, then the exception will be raised.
99 | # It is a good idea to retry errors like `ReadTimeout`, `HTTPBadGateway`, etc.
100 | # Format: same like for `skip_request_errors` option.
101 | # retry_request_errors: [Net::ReadTimeout],
102 |
103 | # Handle page encoding while parsing an html response using Nokogiri. There are two modes:
104 | # Auto (`:auto`): try to fetch the correct encoding from `<meta http-equiv="Content-Type">` or `<meta charset>` tags
105 | # Manual: set the required encoding explicitly, example: `encoding: "GB2312"`
106 | # By default this option is unset.
107 | # encoding: nil,
108 |
109 | # Restart browser if one of the options is true:
110 | restart_if: {
111 | # Restart browser if provided memory limit (in kilobytes) is exceeded (works for all engines)
112 | # memory_limit: 350_000,
113 |
114 | # Restart browser if provided requests limit is exceeded (works for all engines)
115 | # requests_limit: 100
116 | },
117 |
118 | # Perform several actions before each request:
119 | before_request: {
120 | # Change proxy before each request. The `proxy:` option above should be present
121 | # and have the lambda format. Works only for poltergeist and mechanize engines
122 | # (Selenium doesn't support proxy rotation).
123 | # change_proxy: true,
124 |
125 | # Change user agent before each request. The `user_agent:` option above should be present
126 | # and have the lambda format. Works only for poltergeist and mechanize engines
127 | # (Selenium doesn't support getting/setting headers).
128 | # change_user_agent: true,
129 |
130 | # Clear all cookies before each request, works for all engines
131 | # clear_cookies: true,
132 |
133 | # If you want to clear all cookies and then set custom cookies (the `cookies:` option above should be present),
134 | # use this option instead (works for all engines)
135 | # clear_and_set_cookies: true,
136 |
137 | # Global option to set delay between requests.
138 | # Delay can be `Integer`, `Float` or `Range` (`2..5`). In case of a range,
139 | # the delay will be chosen randomly for each request: `rand(2..5) # => 3`
140 | # delay: 1..3
141 | }
142 | }
143 | end
144 |
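For illustration, a minimal child spider (the name and url are hypothetical); its `@config` is deep-merged with ApplicationSpider's defaults above, and `send_item` pushes items through the `:validator` and `:saver` pipelines:

  class ExampleSpider < ApplicationSpider
    @name = "example_spider"
    @engine = :mechanize
    @start_urls = ["https://example.com/"]
    @config = {
      # deep-merged with ApplicationSpider's @config
      before_request: { delay: 1..3 }
    }

    def parse(response, url:, data: {})
      # response is a Nokogiri document (see Session#current_response)
      response.xpath("//title").each do |title|
        send_item title: title.text.strip
      end
    end
  end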
--------------------------------------------------------------------------------
/lib/kimurai/template/tmp/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vifreefly/kimuraframework/a5d47c26fffe2a3c10cc346b7dcf9ac06b4ccd2f/lib/kimurai/template/tmp/.keep
--------------------------------------------------------------------------------
/lib/kimurai/version.rb:
--------------------------------------------------------------------------------
1 | module Kimurai
2 | VERSION = "1.4.0"
3 | end
4 |
--------------------------------------------------------------------------------
/test/kimurai_test.rb:
--------------------------------------------------------------------------------
1 | require "test_helper"
2 |
3 | class KimuraiTest < Minitest::Test
4 | def test_that_it_has_a_version_number
5 | refute_nil ::Kimurai::VERSION
6 | end
7 |
8 | def test_it_does_something_useful
9 | assert false
10 | end
11 | end
12 |
--------------------------------------------------------------------------------
/test/test_helper.rb:
--------------------------------------------------------------------------------
1 | $LOAD_PATH.unshift File.expand_path("../../lib", __FILE__)
2 | require "kimurai"
3 |
4 | require "minitest/autorun"
5 |
--------------------------------------------------------------------------------