├── .gitignore
├── CODE_OF_CONDUCT.md
├── Gemfile
├── LICENSE.txt
├── README.md
├── Rakefile
├── bin
│   ├── console
│   └── setup
├── lib
│   ├── proxycrawl.rb
│   └── proxycrawl
│       ├── api.rb
│       ├── leads_api.rb
│       ├── scraper_api.rb
│       ├── screenshots_api.rb
│       ├── storage_api.rb
│       └── version.rb
├── proxycrawl.gemspec
└── spec
    ├── api_spec.rb
    ├── screenshots_api_spec.rb
    ├── spec_helper.rb
    └── storage_api_spec.rb

/.gitignore:
--------------------------------------------------------------------------------
1 | /.bundle/
2 | /.yardoc
3 | /Gemfile.lock
4 | /_yardoc/
5 | /coverage/
6 | /doc/
7 | /pkg/
8 | /spec/reports/
9 | /tmp/
10 | 
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 | 
3 | ## Our Pledge
4 | 
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, gender identity and expression, level of experience,
9 | nationality, personal appearance, race, religion, or sexual identity and
10 | orientation.
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 |   advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |   address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |   professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 | 
55 | ## Enforcement
56 | 
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at info@proxycrawl.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 | 
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 | 
68 | ## Attribution
69 | 
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at [http://contributor-covenant.org/version/1/4][version]
72 | 
73 | [homepage]: http://contributor-covenant.org
74 | [version]: http://contributor-covenant.org/version/1/4/
75 | 
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source "https://rubygems.org"
2 | 
3 | gemspec
4 | 
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) 2023 ProxyCrawl
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DEPRECATION NOTICE
2 | 
3 | > :warning: **IMPORTANT:** This gem is no longer maintained or supported. For the latest updates, please use our new gem at [crawlbase-ruby](https://github.com/crawlbase-source/crawlbase-ruby).
4 | 
5 | ---
6 | 
7 | # ProxyCrawl
8 | 
9 | Dependency-free gem for scraping and crawling websites using the ProxyCrawl API.
10 | 
11 | ## Installation
12 | 
13 | Add this line to your application's Gemfile:
14 | 
15 | ```ruby
16 | gem 'proxycrawl'
17 | ```
18 | 
19 | And then execute:
20 | 
21 |     $ bundle
22 | 
23 | Or install it yourself as:
24 | 
25 |     $ gem install proxycrawl
26 | 
27 | ## Crawling API Usage
28 | 
29 | Require the gem in your project:
30 | 
31 | ```ruby
32 | require 'proxycrawl'
33 | ```
34 | 
35 | Initialize the API with one of your account tokens, either the normal or the JavaScript token, then make GET or POST requests accordingly.
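The client also accepts an optional `timeout` option, the HTTP read timeout in seconds, which defaults to 120 (see `lib/proxycrawl/api.rb`). A minimal sketch:

```ruby
require 'proxycrawl'

# Lower the read timeout from the default 120 seconds to 60
api = ProxyCrawl::API.new(token: 'YOUR_TOKEN', timeout: 60)
```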
36 | 
37 | You can get a token for free by [creating a ProxyCrawl account](https://proxycrawl.com/signup), which includes 1,000 free testing requests. You can use them for TCP calls, JavaScript calls, or both.
38 | 
39 | ```ruby
40 | api = ProxyCrawl::API.new(token: 'YOUR_TOKEN')
41 | ```
42 | 
43 | ### GET requests
44 | 
45 | Pass the URL that you want to scrape plus any options from the ones available in the [API documentation](https://proxycrawl.com/dashboard/docs).
46 | 
47 | ```ruby
48 | api.get(url, options)
49 | ```
50 | 
51 | Example:
52 | 
53 | ```ruby
54 | 
55 | begin
56 |   response = api.get('https://www.facebook.com/britneyspears')
57 |   puts response.status_code
58 |   puts response.original_status
59 |   puts response.pc_status
60 |   puts response.body
61 | rescue => exception
62 |   puts exception.backtrace
63 | end
64 | 
65 | ```
66 | 
67 | You can pass any option the ProxyCrawl API supports, in the exact parameter format the API expects.
68 | 
69 | Example:
70 | 
71 | ```ruby
72 | options = {
73 |   user_agent: 'Mozilla/5.0 (Windows NT 6.2; rv:20.0) Gecko/20121202 Firefox/30.0',
74 |   format: 'json'
75 | }
76 | 
77 | response = api.get('https://www.reddit.com/r/pics/comments/5bx4bx/thanks_obama/', options)
78 | 
79 | puts response.status_code
80 | puts response.body # read the API json response
81 | ```
82 | 
83 | ### POST requests
84 | 
85 | Pass the URL that you want to scrape, the data that you want to send (either a JSON hash or a string), plus any options from the ones available in the [API documentation](https://proxycrawl.com/dashboard/docs).
86 | 
87 | ```ruby
88 | api.post(url, data, options)
89 | ```
90 | 
91 | Example:
92 | 
93 | ```ruby
94 | api.post('https://producthunt.com/search', { text: 'example search' })
95 | ```
96 | 
97 | You can send the data as `application/json` instead of `x-www-form-urlencoded` by setting the `post_content_type` option to `json`.
98 | 
99 | ```ruby
100 | response = api.post('https://httpbin.org/post', { some_json: 'with some value' }, { post_content_type: 'json' })
101 | 
102 | puts response.status_code
103 | puts response.body
104 | 
105 | ```
106 | 
107 | ### Javascript requests
108 | 
109 | If you need to scrape websites built with JavaScript (React, Angular, Vue, etc.), you just need to pass your JavaScript token and use the same calls. Note that only `.get` is available for JavaScript requests, not `.post`.
110 | 
111 | ```ruby
112 | api = ProxyCrawl::API.new(token: 'YOUR_JAVASCRIPT_TOKEN')
113 | ```
114 | 
115 | ```ruby
116 | response = api.get('https://www.nfl.com')
117 | puts response.status_code
118 | puts response.body
119 | ```
120 | 
121 | In the same way, you can pass additional JavaScript options.
122 | 
123 | ```ruby
124 | response = api.get('https://www.freelancer.com', page_wait: 5000)
125 | puts response.status_code
126 | ```
127 | 
128 | ## Original status
129 | 
130 | You can always get the original status and the ProxyCrawl status from the response. Read the [ProxyCrawl documentation](https://proxycrawl.com/dashboard/docs) to learn more about these statuses.
131 | 
132 | ```ruby
133 | response = api.get('https://sfbay.craigslist.org/')
134 | 
135 | puts response.original_status
136 | puts response.pc_status
137 | ```
138 | 
139 | ## Scraper API usage
140 | 
141 | Initialize the Scraper API using your normal token and call the `get` method.
142 | 
143 | ```ruby
144 | scraper_api = ProxyCrawl::ScraperAPI.new(token: 'YOUR_TOKEN')
145 | ```
146 | 
147 | Pass the URL that you want to scrape plus any options from the ones available in the [Scraper API documentation](https://proxycrawl.com/docs/scraper-api/parameters).
148 | 
149 | ```ruby
150 | scraper_api.get(url, options)
151 | ```
152 | 
153 | Example:
154 | 
155 | ```ruby
156 | begin
157 |   response = scraper_api.get('https://www.amazon.com/Halo-SleepSack-Swaddle-Triangle-Neutral/dp/B01LAG1TOS')
158 |   puts response.remaining_requests
159 |   puts response.status_code
160 |   puts response.body
161 | rescue => exception
162 |   puts exception.backtrace
163 | end
164 | ```
165 | 
166 | ## Leads API usage
167 | 
168 | Initialize with your Leads API token and call the `get` method with the domain you want leads for.
169 | 
170 | For more details on the implementation, please visit the [Leads API documentation](https://proxycrawl.com/docs/leads-api).
171 | 
172 | ```ruby
173 | leads_api = ProxyCrawl::LeadsAPI.new(token: 'YOUR_TOKEN')
174 | 
175 | begin
176 |   response = leads_api.get('stripe.com')
177 |   puts response.success
178 |   puts response.remaining_requests
179 |   puts response.status_code
180 |   puts response.body
181 | rescue => exception
182 |   puts exception.backtrace
183 | end
184 | ```
185 | 
186 | If you have questions or need help using the library, please open an issue or [contact us](https://proxycrawl.com/contact).
187 | 
188 | 
189 | ## Screenshots API usage
190 | 
191 | Initialize with your Screenshots API token and call the `get` method.
192 | 
193 | ```ruby
194 | screenshots_api = ProxyCrawl::ScreenshotsAPI.new(token: 'YOUR_TOKEN')
195 | 
196 | begin
197 |   response = screenshots_api.get('https://www.apple.com')
198 |   puts response.success
199 |   puts response.remaining_requests
200 |   puts response.status_code
201 |   puts response.screenshot_path # do something with screenshot_path here
202 | rescue => exception
203 |   puts exception.backtrace
204 | end
205 | ```
206 | 
207 | or using a block:
208 | 
209 | ```ruby
210 | screenshots_api = ProxyCrawl::ScreenshotsAPI.new(token: 'YOUR_TOKEN')
211 | 
212 | begin
213 |   response = screenshots_api.get('https://www.apple.com') do |file|
214 |     # do something (reading/writing) with the image file here
215 |   end
216 |   puts response.success
217 |   puts response.remaining_requests
218 |   puts response.status_code
219 | rescue => exception
220 |   puts exception.backtrace
221 | end
222 | ```
223 | 
224 | or specifying a file path:
225 | 
226 | ```ruby
227 | screenshots_api = ProxyCrawl::ScreenshotsAPI.new(token: 'YOUR_TOKEN')
228 | 
229 | begin
230 |   response = screenshots_api.get('https://www.apple.com', save_to_path: '~/screenshot.jpg') do |file|
231 |     # do something (reading/writing) with the image file here
232 |   end
233 |   puts response.success
234 |   puts response.remaining_requests
235 |   puts response.status_code
236 | rescue => exception
237 |   puts exception.backtrace
238 | end
239 | ```
240 | 
241 | Note that the `screenshots_api.get(url, options)` method accepts an [options](https://proxycrawl.com/docs/screenshots-api/parameters) hash.
242 | 
243 | ## Storage API usage
244 | 
245 | Initialize the Storage API using your private token.
246 | 
247 | ```ruby
248 | storage_api = ProxyCrawl::StorageAPI.new(token: 'YOUR_TOKEN')
249 | ```
250 | 
251 | Pass the [URL](https://proxycrawl.com/docs/storage-api/parameters/#url) that you want to retrieve from [ProxyCrawl Storage](https://proxycrawl.com/dashboard/storage).
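Besides the URL or RID, `StorageAPI#get` takes an optional second `format` argument, `'html'` by default; passing `'json'` makes the client parse the stored response as JSON (see `lib/proxycrawl/storage_api.rb`). A minimal sketch:

```ruby
# Retrieve the stored page as JSON instead of raw HTML;
# the second argument is optional and defaults to 'html'
response = storage_api.get('https://www.apple.com', 'json')
puts response.status_code
```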
252 | 
253 | ```ruby
254 | begin
255 |   response = storage_api.get('https://www.apple.com')
256 |   puts response.original_status
257 |   puts response.pc_status
258 |   puts response.url
259 |   puts response.status_code
260 |   puts response.rid
261 |   puts response.body
262 |   puts response.stored_at
263 | rescue => exception
264 |   puts exception.backtrace
265 | end
266 | ```
267 | 
268 | or you can use the [RID](https://proxycrawl.com/docs/storage-api/parameters/#rid):
269 | 
270 | ```ruby
271 | begin
272 |   response = storage_api.get(RID)
273 |   puts response.original_status
274 |   puts response.pc_status
275 |   puts response.url
276 |   puts response.status_code
277 |   puts response.rid
278 |   puts response.body
279 |   puts response.stored_at
280 | rescue => exception
281 |   puts exception.backtrace
282 | end
283 | ```
284 | 
285 | Note: you must send either a URL or a RID; each parameter is optional on its own, but one of the two is required.
286 | 
287 | ### [Delete](https://proxycrawl.com/docs/storage-api/delete/) request
288 | 
289 | To delete an item from your storage area, pass its RID:
290 | 
291 | ```ruby
292 | if storage_api.delete(RID)
293 |   puts 'delete success'
294 | else
295 |   puts "Unable to delete: #{storage_api.body['error']}"
296 | end
297 | ```
298 | 
299 | ### [Bulk](https://proxycrawl.com/docs/storage-api/bulk/) request
300 | 
301 | To make a bulk request, pass the list of RIDs as an array:
302 | 
303 | ```ruby
304 | begin
305 |   response = storage_api.bulk([RID1, RID2, RID3, ...])
306 |   puts response.original_status
307 |   puts response.pc_status
308 |   puts response.url
309 |   puts response.status_code
310 |   puts response.rid
311 |   puts response.body
312 |   puts response.stored_at
313 | rescue => exception
314 |   puts exception.backtrace
315 | end
316 | ```
317 | 
318 | ### [RIDs](https://proxycrawl.com/docs/storage-api/rids) request
319 | 
320 | To request a list of RIDs from your storage area:
321 | 
322 | ```ruby
323 | begin
324 |   response = storage_api.rids
325 |   puts response.status_code
326 |   puts response.rid
327 |   puts response.body
328 | rescue => exception
329 |   puts exception.backtrace
330 | end
331 | ```
332 | 
333 | You can also specify a limit as a parameter:
334 | 
335 | ```ruby
336 | storage_api.rids(100)
337 | ```
338 | 
339 | ### [Total Count](https://proxycrawl.com/docs/storage-api/total_count)
340 | 
341 | To get the total number of documents in your storage area:
342 | 
343 | ```ruby
344 | total_count = storage_api.total_count
345 | puts "total_count: #{total_count}"
346 | ```
347 | 
348 | If you have questions or need help using the library, please open an issue or [contact us](https://proxycrawl.com/contact).
349 | 
350 | ## Development
351 | 
352 | After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
353 | 
354 | To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
355 | 
356 | ## Contributing
357 | 
358 | Bug reports and pull requests are welcome on GitHub at https://github.com/proxycrawl/proxycrawl-ruby. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
359 | 
360 | ## License
361 | 
362 | The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
363 | 
364 | ## Code of Conduct
365 | 
366 | Everyone interacting in the ProxyCrawl project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/proxycrawl/proxycrawl-ruby/blob/master/CODE_OF_CONDUCT.md).
367 | 
368 | ---
369 | 
370 | Copyright 2023 ProxyCrawl
371 | 
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | require "bundler/gem_tasks"
2 | task :default => :spec
3 | 
--------------------------------------------------------------------------------
/bin/console:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | 
3 | require "bundler/setup"
4 | require "proxycrawl"
5 | 
6 | # You can add fixtures and/or initialization code here to make experimenting
7 | # with your gem easier. You can also use a different console, if you like.
8 | 
9 | # (If you use this, don't forget to add pry to your Gemfile!)
10 | # require "pry"
11 | # Pry.start
12 | 
13 | require "irb"
14 | IRB.start(__FILE__)
15 | 
--------------------------------------------------------------------------------
/bin/setup:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -euo pipefail
3 | IFS=$'\n\t'
4 | set -vx
5 | 
6 | bundle install
7 | 
8 | # Do any other automated setup that you need to do here
9 | 
--------------------------------------------------------------------------------
/lib/proxycrawl.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 | 
3 | require 'proxycrawl/version'
4 | require 'proxycrawl/api'
5 | require 'proxycrawl/scraper_api'
6 | require 'proxycrawl/leads_api'
7 | require 'proxycrawl/screenshots_api'
8 | require 'proxycrawl/storage_api'
9 | 
10 | module ProxyCrawl
11 | end
12 | 
--------------------------------------------------------------------------------
/lib/proxycrawl/api.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 | 
3 | require 'net/http'
4 | require 'json'
5 | require 'uri'
6 | 
7 | module ProxyCrawl
8 |   class API
9 |     attr_reader :token, :body, :timeout, :status_code, :original_status, :pc_status, :url, :storage_url
10 | 
11 |     INVALID_TOKEN = 'Token is required'
12 |     INVALID_URL = 'URL is required'
13 | 
14 |     def initialize(options = {})
15 |       raise INVALID_TOKEN if options[:token].nil?
16 | 
17 |       @token = options[:token]
18 |       @timeout = options[:timeout] || 120
19 |     end
20 | 
21 |     def get(url, options = {})
22 |       raise INVALID_URL if url.empty?
23 | 
24 |       uri = prepare_uri(url, options)
25 |       req = Net::HTTP::Get.new(uri)
26 | 
27 |       req_options = {
28 |         read_timeout: timeout,
29 |         use_ssl: uri.scheme == 'https',
30 |         verify_mode: OpenSSL::SSL::VERIFY_NONE
31 |       }
32 | 
33 |       response = Net::HTTP.start(uri.hostname, uri.port, req_options) { |http| http.request(req) }
34 | 
35 |       prepare_response(response, options[:format])
36 | 
37 |       self
38 |     end
39 | 
40 |     def post(url, data, options = {})
41 |       raise INVALID_URL if url.empty?
42 | 
43 |       uri = prepare_uri(url, options)
44 | 
45 |       http = Net::HTTP.new(uri.host, uri.port)
46 | 
47 |       http.use_ssl = true
48 | 
49 |       content_type = options[:post_content_type].to_s.include?('json') ? { 'Content-Type': 'application/json' } : nil
50 | 
51 |       request = Net::HTTP::Post.new(uri.request_uri, content_type)
52 | 
53 |       if options[:post_content_type].to_s.include?('json')
54 |         request.body = data.to_json
55 |       else
56 |         request.set_form_data(data)
57 |       end
58 | 
59 |       response = http.request(request)
60 | 
61 |       prepare_response(response, options[:format])
62 | 
63 |       self
64 |     end
65 | 
66 |     private
67 | 
68 |     def base_url
69 |       'https://api.proxycrawl.com'
70 |     end
71 | 
72 |     def prepare_uri(url, options)
73 |       uri = URI(base_url)
74 |       uri.query = URI.encode_www_form({ token: @token, url: url }.merge(options))
75 | 
76 |       uri
77 |     end
78 | 
79 |     def prepare_response(response, format)
80 |       res = format == 'json' || base_url.include?('/scraper') ? JSON.parse(response.body) : response
81 | 
82 |       @original_status = res['original_status'].to_i
83 |       @pc_status = res['pc_status'].to_i
84 |       @url = res['url']
85 |       @storage_url = res['storage_url']
86 |       @status_code = response.code.to_i
87 |       @body = response.body
88 |     end
89 |   end
90 | end
91 | 
--------------------------------------------------------------------------------
/lib/proxycrawl/leads_api.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 | 
3 | require 'net/http'
4 | require 'json'
5 | require 'uri'
6 | 
7 | module ProxyCrawl
8 |   class LeadsAPI
9 |     attr_reader :token, :timeout, :body, :status_code, :success, :remaining_requests
10 | 
11 |     INVALID_TOKEN = 'Token is required'
12 |     INVALID_DOMAIN = 'Domain is required'
13 | 
14 |     def initialize(options = {})
15 |       raise INVALID_TOKEN if options[:token].nil? || options[:token].empty?
16 | 
17 |       @token = options[:token]
18 |       @timeout = options[:timeout] || 120
19 |     end
20 | 
21 |     def get(domain)
22 |       raise INVALID_DOMAIN if domain.empty?
23 | 
24 |       uri = URI('https://api.proxycrawl.com/leads')
25 |       uri.query = URI.encode_www_form({ token: token, domain: domain })
26 | 
27 |       req = Net::HTTP::Get.new(uri)
28 | 
29 |       req_options = {
30 |         read_timeout: timeout,
31 |         use_ssl: uri.scheme == 'https',
32 |         verify_mode: OpenSSL::SSL::VERIFY_NONE
33 |       }
34 | 
35 |       response = Net::HTTP.start(uri.hostname, uri.port, req_options) { |http| http.request(req) }
36 |       @status_code = response.code.to_i
37 |       @body = response.body
38 | 
39 |       json_body = JSON.parse(response.body)
40 |       @success = json_body['success']
41 |       @remaining_requests = json_body['remaining_requests'].to_i
42 | 
43 |       self
44 |     end
45 | 
46 |     def post
47 |       raise 'Only GET is allowed for the LeadsAPI'
48 |     end
49 |   end
50 | end
51 | 
--------------------------------------------------------------------------------
/lib/proxycrawl/scraper_api.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 | 
3 | module ProxyCrawl
4 |   class ScraperAPI < ProxyCrawl::API
5 |     attr_reader :remaining_requests
6 | 
7 |     def post
8 |       raise 'Only GET is allowed for the ScraperAPI'
9 |     end
10 | 
11 |     private
12 | 
13 |     def prepare_response(response, format)
14 |       super(response, format)
15 |       json_body = JSON.parse(response.body)
16 |       @remaining_requests = json_body['remaining_requests'].to_i
17 |     end
18 | 
19 |     def base_url
20 |       'https://api.proxycrawl.com/scraper'
21 |     end
22 |   end
23 | end
24 | 
--------------------------------------------------------------------------------
/lib/proxycrawl/screenshots_api.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 | 
3 | require 'securerandom'
4 | require 'tmpdir'
5 | 
6 | module ProxyCrawl
7 |   class ScreenshotsAPI < ProxyCrawl::API
8 |     attr_reader :screenshot_path, :success, :remaining_requests, :screenshot_url
9 | 
10 |     INVALID_SAVE_TO_PATH_FILENAME = 'Filename must end with .jpg or .jpeg'
11 |     SAVE_TO_PATH_FILENAME_PATTERN = /.+\.(jpg|JPG|jpeg|JPEG)$/.freeze
12 | 
13 |     def post
14 |       raise 'Only GET is allowed for the ScreenshotsAPI'
15 |     end
16 | 
17 |     def get(url, options = {})
18 |       screenshot_path = options.delete(:save_to_path) || generate_file_path
19 |       raise INVALID_SAVE_TO_PATH_FILENAME unless SAVE_TO_PATH_FILENAME_PATTERN =~ screenshot_path
20 | 
21 |       response = super(url, options)
22 |       file = File.open(screenshot_path, 'w+')
23 |       file.write(response.body&.force_encoding('UTF-8'))
24 |       @screenshot_path = screenshot_path
25 |       yield(file) if block_given?
26 |       response
27 |     ensure
28 |       file&.close
29 |     end
30 | 
31 |     private
32 | 
33 |     def prepare_response(response, format)
34 |       super(response, format)
35 |       @remaining_requests = response['remaining_requests'].to_i
36 |       @success = response['success'] == 'true'
37 |       @screenshot_url = response['screenshot_url']
38 |     end
39 | 
40 |     def base_url
41 |       'https://api.proxycrawl.com/screenshots'
42 |     end
43 | 
44 |     def generate_file_name
45 |       "#{SecureRandom.urlsafe_base64}.jpg"
46 |     end
47 | 
48 |     def generate_file_path
49 |       File.join(Dir.tmpdir, generate_file_name)
50 |     end
51 |   end
52 | end
53 | 
--------------------------------------------------------------------------------
/lib/proxycrawl/storage_api.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 | 
3 | require 'net/http'
4 | require 'json'
5 | require 'uri'
6 | 
7 | module ProxyCrawl
8 |   class StorageAPI
9 |     attr_reader :token, :timeout, :original_status, :pc_status, :url, :status_code, :rid, :body, :stored_at
10 | 
11 |     INVALID_TOKEN = 'Token is required'
12 |     INVALID_RID = 'RID is required'
13 |     INVALID_RID_ARRAY = 'One or more RIDs are required'
14 |     INVALID_URL_OR_RID = 'Either URL or RID is required'
15 |     BASE_URL = 'https://api.proxycrawl.com/storage'
16 | 
17 |     def initialize(options = {})
18 |       raise INVALID_TOKEN if options[:token].nil? || options[:token].empty?
19 | 
20 |       @token = options[:token]
21 |       @timeout = options[:timeout] || 120
22 |     end
23 | 
24 |     def get(url_or_rid, format = 'html')
25 |       raise INVALID_URL_OR_RID if url_or_rid.nil? || url_or_rid.empty?
26 | 
27 |       uri = URI(BASE_URL)
28 |       uri.query = URI.encode_www_form({ token: token, format: format }.merge(decide_url_or_rid(url_or_rid)))
29 | 
30 |       req = Net::HTTP::Get.new(uri)
31 | 
32 |       req_options = {
33 |         read_timeout: timeout,
34 |         use_ssl: uri.scheme == 'https',
35 |         verify_mode: OpenSSL::SSL::VERIFY_NONE
36 |       }
37 | 
38 |       response = Net::HTTP.start(uri.hostname, uri.port, req_options) { |http| http.request(req) }
39 | 
40 |       res = format == 'json' ? JSON.parse(response.body) : response
41 | 
42 |       @original_status = res['original_status'].to_i
43 |       @pc_status = res['pc_status'].to_i
44 |       @url = res['url']
45 |       @rid = res['rid']
46 |       @stored_at = res['stored_at']
47 | 
48 |       @status_code = response.code.to_i
49 |       @body = response.body
50 | 
51 |       self
52 |     end
53 | 
54 |     def delete(rid)
55 |       raise INVALID_RID if rid.nil? || rid.empty?
56 | 
57 |       uri = URI(BASE_URL)
58 |       uri.query = URI.encode_www_form(token: token, rid: rid)
59 |       http = Net::HTTP.new(uri.host, uri.port).tap { |h| h.use_ssl = true } # BASE_URL is HTTPS
60 |       request = Net::HTTP::Delete.new(uri.request_uri)
61 |       response = http.request(request)
62 | 
63 |       @url, @original_status, @pc_status, @stored_at = nil
64 |       @status_code = response.code.to_i
65 |       @rid = rid
66 |       @body = JSON.parse(response.body)
67 | 
68 |       @body.key?('success')
69 |     end
70 | 
71 |     def bulk(rids_array = [])
72 |       raise INVALID_RID_ARRAY if rids_array.empty?
73 | 
74 |       uri = URI("#{BASE_URL}/bulk")
75 |       uri.query = URI.encode_www_form(token: token)
76 |       http = Net::HTTP.new(uri.host, uri.port).tap { |h| h.use_ssl = true } # BASE_URL is HTTPS
77 |       request = Net::HTTP::Post.new(uri.request_uri, { 'Content-Type': 'application/json' })
78 |       request.body = { rids: rids_array }.to_json
79 |       response = http.request(request)
80 | 
81 |       @body = JSON.parse(response.body)
82 |       @original_status = @body.map { |item| item['original_status'].to_i }
83 |       @status_code = response.code.to_i
84 |       @pc_status = @body.map { |item| item['pc_status'].to_i }
85 |       @url = @body.map { |item| item['url'] }
86 |       @rid = @body.map { |item| item['rid'] }
87 |       @stored_at = @body.map { |item| item['stored_at'] }
88 | 
89 |       self
90 |     end
91 | 
92 |     def rids(limit = -1)
93 |       uri = URI("#{BASE_URL}/rids")
94 |       query_hash = { token: token }
95 |       query_hash.merge!({ limit: limit }) if limit >= 0
96 |       uri.query = URI.encode_www_form(query_hash)
97 | 
98 |       response = Net::HTTP.get_response(uri)
99 |       @url, @original_status, @pc_status, @stored_at = nil
100 |       @status_code = response.code.to_i
101 |       @body = JSON.parse(response.body)
102 |       @rid = @body
103 | 
104 |       @body
105 |     end
106 | 
107 |     def total_count
108 |       uri = URI("#{BASE_URL}/total_count")
109 |       uri.query = URI.encode_www_form(token: token)
110 | 
111 |       response = Net::HTTP.get_response(uri)
112 |       @url, @original_status, @pc_status, @stored_at = nil
113 |       @status_code = response.code.to_i
114 |       @rid = rid
115 |       @body = JSON.parse(response.body)
116 | 
117 |       body['totalCount']
118 |     end
119 | 
120 |     private
121 | 
122 |     def decide_url_or_rid(url_or_rid)
123 |       %r{^https?://} =~ url_or_rid ? { url: url_or_rid } : { rid: url_or_rid }
124 |     end
125 |   end
126 | end
127 | 
--------------------------------------------------------------------------------
/lib/proxycrawl/version.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 | 
3 | module ProxyCrawl
4 |   VERSION = '1.0.2'
5 | end
6 | 
--------------------------------------------------------------------------------
/proxycrawl.gemspec:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | lib = File.expand_path("../lib", __FILE__)
3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4 | require "proxycrawl/version"
5 | 
6 | Gem::Specification.new do |spec|
7 |   spec.name          = "proxycrawl"
8 |   spec.version       = ProxyCrawl::VERSION
9 |   spec.platform      = Gem::Platform::RUBY
10 |   spec.authors       = ["proxycrawl"]
11 |   spec.email         = ["info@proxycrawl.com"]
12 |   spec.summary       = %q{ProxyCrawl API client for web scraping and crawling}
13 |   spec.description   = %q{Ruby based client for the ProxyCrawl API that helps developers crawl or scrape thousands of web pages anonymously}
14 |   spec.homepage      = "https://github.com/proxycrawl/proxycrawl-ruby"
15 |   spec.license       = "MIT"
16 | 
17 |   spec.files = `git ls-files -z`.split("\x0").reject do |f|
18 |     f.match(%r{^(test|spec|features)/})
19 |   end
20 | 
21 |   spec.required_ruby_version = '>= 2.0'
22 | 
23 |   spec.bindir        = "exe"
24 |   spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
25 |   spec.require_paths = ["lib"]
26 | 
27 |   spec.add_development_dependency "rspec", "~> 3.2"
28 |   spec.add_development_dependency "webmock", "~> 3.4"
29 |   spec.add_development_dependency "bundler", "~> 2.0"
30 |   spec.add_development_dependency "rake", "~> 12.3.3"
31 | 
32 |   # Deprecation warning
33 |   spec.post_install_message = <<~MESSAGE
34 |     ================================================================================
35 |     DEPRECATION WARNING - 'proxycrawl' gem
36 |     ================================================================================
37 | 
38 |     'proxycrawl' is deprecated due to rebranding. Please switch to the 'crawlbase' gem.
39 | 
40 |     More details and migration guide: https://github.com/crawlbase-source/crawlbase-ruby
41 |     ================================================================================
42 |   MESSAGE
43 | end
44 | 
--------------------------------------------------------------------------------
/spec/api_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 | require 'proxycrawl'
3 | 
4 | describe ProxyCrawl::API do
5 |   it 'raises an error if token is missing' do
6 |     expect { ProxyCrawl::API.new }.to raise_error(RuntimeError, 'Token is required')
7 |   end
8 | 
9 |   it 'sets/reads token' do
10 |     expect(ProxyCrawl::API.new(token: 'test').token).to eql('test')
11 |   end
12 | 
13 |   describe '#get' do
14 |     it 'sends a get request to ProxyCrawl API' do
15 |       stub_request(:get, 'https://api.proxycrawl.com/?token=test&url=http%3A%2F%2Fhttpbin.org%2Fanything%3Fparam1%3Dx%26params2%3Dy').
16 |         to_return(
17 |           body: 'body',
18 |           status: 200,
19 |           headers: { skip_normalize: true, 'original_status' => 200, 'pc_status' => 200, 'url' => 'http://httpbin.org/anything?param1=x&params2=y' })
20 | 
21 |       api = ProxyCrawl::API.new(token: 'test')
22 | 
23 |       response = api.get('http://httpbin.org/anything?param1=x&params2=y')
24 | 
25 |       expect(response.status_code).to eql(200)
26 |       expect(response.original_status).to eql(200)
27 |       expect(response.pc_status).to eql(200)
28 |       expect(response.url).to eql('http://httpbin.org/anything?param1=x&params2=y')
29 |       expect(response.body).to eql('body')
30 |     end
31 |   end
32 | 
33 |   describe '#post' do
34 |     it 'sends a post request to ProxyCrawl API with json data' do
35 |       stub_request(:post, 'https://api.proxycrawl.com/?post_content_type=json&token=test&url=http://httpbin.org/post').
36 |         with(body: "{\"foo\":\"bar\"}").
37 |         to_return(
38 |           body: 'body',
39 |           status: 200,
40 |           headers: { skip_normalize: true, 'original_status' => 200, 'pc_status' => 200, 'url' => 'http://httpbin.org/anything?param1=x&params2=y' })
41 | 
42 |       api = ProxyCrawl::API.new(token: 'test')
43 | 
44 |       response = api.post("http://httpbin.org/post", { foo: 'bar' }, { post_content_type: 'json' })
45 | 
46 |       expect(response.status_code).to eql(200)
47 |       expect(response.original_status).to eql(200)
48 |       expect(response.pc_status).to eql(200)
49 |       expect(response.url).to eql('http://httpbin.org/anything?param1=x&params2=y')
50 |       expect(response.body).to eql('body')
51 |     end
52 | 
53 |     it 'sends a post request to ProxyCrawl API with form data' do
54 |       stub_request(:post, 'https://api.proxycrawl.com/?token=test&url=http://httpbin.org/post').
55 |         with(body: { "foo" => "bar" }).
56 |         to_return(
57 |           body: 'body',
58 |           status: 200,
59 |           headers: { skip_normalize: true, 'original_status' => 200, 'pc_status' => 200, 'url' => 'http://httpbin.org/anything?param1=x&params2=y' })
60 | 
61 |       api = ProxyCrawl::API.new(token: 'test')
62 | 
63 |       response = api.post("http://httpbin.org/post", { foo: 'bar' })
64 | 
65 |       expect(response.status_code).to eql(200)
66 |       expect(response.original_status).to eql(200)
67 |       expect(response.pc_status).to eql(200)
68 |       expect(response.url).to eql('http://httpbin.org/anything?param1=x&params2=y')
69 |       expect(response.body).to eql('body')
70 |     end
71 |   end
72 | 
73 | end
74 | 
--------------------------------------------------------------------------------
/spec/screenshots_api_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 | require 'proxycrawl'
3 | 
4 | describe ProxyCrawl::ScreenshotsAPI do
5 |   it 'raises an error if token is missing' do
6 |     expect { ProxyCrawl::ScreenshotsAPI.new }.to raise_error(RuntimeError, 'Token is required')
7 |   end
8 | 
9 |   it 'sets/reads token' do
10 |     expect(ProxyCrawl::ScreenshotsAPI.new(token: 'test').token).to eql('test')
11 |   end
12 | 
13 |   describe '#get' do
14 |     before(:each) do
15 |       stub_request(:get, 'https://api.proxycrawl.com/screenshots?token=test&url=http%3A%2F%2Fhttpbin.org%2Fanything%3Fparam1%3Dx%26params2%3Dy').
16 |         to_return(
17 |           body: 'body',
18 |           status: 200,
19 |           headers: { skip_normalize: true, 'original_status' => 200, 'pc_status' => 200, 'url' => 'http://httpbin.org/anything?param1=x&params2=y' })
20 |     end
21 | 
22 |     it 'sends a get request to ProxyCrawl Screenshots API' do
23 |       api = ProxyCrawl::ScreenshotsAPI.new(token: 'test')
24 | 
25 |       response = api.get("http://httpbin.org/anything?param1=x&params2=y")
26 | 
27 |       expect(response.status_code).to eql(200)
28 |       expect(response.original_status).to eql(200)
29 |       expect(response.pc_status).to eql(200)
30 |       expect(response.url).to eql('http://httpbin.org/anything?param1=x&params2=y')
31 |       expect(response.body).to eql('body')
32 |       expect(response.screenshot_path).not_to be_empty
33 |     end
34 | 
35 |     it 'accepts a valid save_to_path option' do
36 |       api = ProxyCrawl::ScreenshotsAPI.new(token: 'test')
37 | 
38 |       response = api.get("http://httpbin.org/anything?param1=x&params2=y", save_to_path: save_to_path)
39 | 
40 |       expect(response.status_code).to eql(200)
41 |       expect(response.original_status).to eql(200)
42 |       expect(response.pc_status).to eql(200)
43 |       expect(response.url).to eql('http://httpbin.org/anything?param1=x&params2=y')
44 |       expect(response.body).to eql('body')
45 |       expect(response.screenshot_path).to eql(File.join(Dir.tmpdir, 'test-image.jpg'))
46 |     end
47 | 
48 |     it 'rejects an invalid save_to_path option' do
49 |       api = ProxyCrawl::ScreenshotsAPI.new(token: 'test')
50 | 
51 |       expect { api.get("http://httpbin.org/anything?param1=x&params2=y", save_to_path: '~/images/image_filename.png') }.to raise_error(RuntimeError, 'Filename must end with .jpg or .jpeg')
52 |       expect { api.get("http://httpbin.org/anything?param1=x&params2=y", save_to_path: 'image_filename.png') }.to raise_error(RuntimeError, 'Filename must end with .jpg or .jpeg')
53 |       expect { api.get("http://httpbin.org/anything?param1=x&params2=y", save_to_path: '~/images/image_filename') }.to raise_error(RuntimeError, 'Filename must end with .jpg or .jpeg')
54 |       expect { api.get("http://httpbin.org/anything?param1=x&params2=y", save_to_path: 'image_filename') }.to raise_error(RuntimeError, 'Filename must end with .jpg or .jpeg')
55 |     end
56 | 
57 |     it 'accepts a block' do
58 |       api = ProxyCrawl::ScreenshotsAPI.new(token: 'test')
59 | 
60 |       response = api.get("http://httpbin.org/anything?param1=x&params2=y", save_to_path: save_to_path) do |file|
61 |         expect(file).to be_kind_of(File)
62 |         expect(file.path).to eql(File.join(Dir.tmpdir, 'test-image.jpg'))
63 |       end
64 | 
65 |       expect(response.status_code).to eql(200)
66 |       expect(response.original_status).to eql(200)
67 |       expect(response.pc_status).to eql(200)
68 |       expect(response.url).to eql('http://httpbin.org/anything?param1=x&params2=y')
69 |       expect(response.body).to eql('body')
70 |       expect(response.screenshot_path).to eql(File.join(Dir.tmpdir, 'test-image.jpg'))
71 |     end
72 |   end
73 | 
74 |   private
75 | 
76 |   def save_to_path
77 |     File.join(Dir.tmpdir, 'test-image.jpg')
78 |   end
79 | end
80 | 
--------------------------------------------------------------------------------
/spec/spec_helper.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 | require 'webmock/rspec'
3 | 
4 | WebMock.disable_net_connect!(allow_localhost: true)
5 | 
6 | # patch to optionally skip normalizing webmock response headers
7 | if defined? WebMock::Response
8 |   WebMock::Response.class_eval do
9 |     def headers=(headers)
10 |       @headers = headers
11 |       if @headers && !@headers.is_a?(Proc)
12 |         @headers =
13 |           if @headers.key?(:skip_normalize)
14 |             @headers.delete(:skip_normalize)
15 |             @headers
16 |           else
17 |             WebMock::Util::Headers.normalize_headers(@headers)
18 |           end
19 |       end
20 |     end
21 |   end
22 | end
--------------------------------------------------------------------------------
/spec/storage_api_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 | require 'proxycrawl'
3 | 
4 | describe ProxyCrawl::StorageAPI do
5 |   it 'raises an error if token is missing' do
6 |     expect { ProxyCrawl::StorageAPI.new }.to raise_error(RuntimeError, 'Token is required')
7 |   end
8 | 
9 |   context '#get' do
10 |     before(:each) do
11 |       stub_request(:get, 'https://api.proxycrawl.com/storage?format=html&rid=1&token=test')
12 |         .to_return(
13 |           status: 200,
14 |           body: {
15 |             stored_at: '2021-03-01T14:22:58+02:00',
16 |             original_status: 200,
17 |             pc_status: 200,
18 |             rid: '1',
19 |             url: 'https://www.apple.com',
20 |             body: '