├── Gemfile ├── Rakefile ├── lib ├── proxycrawl │ ├── version.rb │ ├── scraper_api.rb │ ├── leads_api.rb │ ├── screenshots_api.rb │ ├── api.rb │ └── storage_api.rb └── proxycrawl.rb ├── .gitignore ├── bin ├── setup └── console ├── spec ├── spec_helper.rb ├── api_spec.rb ├── screenshots_api_spec.rb └── storage_api_spec.rb ├── LICENSE.txt ├── proxycrawl.gemspec ├── CODE_OF_CONDUCT.md └── README.md /Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | gemspec 4 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | task :default => :spec 3 | -------------------------------------------------------------------------------- /lib/proxycrawl/version.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ProxyCrawl 4 | VERSION = '1.0.2' 5 | end 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /Gemfile.lock 4 | /_yardoc/ 5 | /coverage/ 6 | /doc/ 7 | /pkg/ 8 | /spec/reports/ 9 | /tmp/ 10 | -------------------------------------------------------------------------------- /bin/setup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | set -vx 5 | 6 | bundle install 7 | 8 | # Do any other automated setup that you need to do here 9 | -------------------------------------------------------------------------------- /lib/proxycrawl.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'proxycrawl/version' 4 | require 'proxycrawl/api' 5 | require 
# frozen_string_literal: true

module ProxyCrawl
  # Client for the ProxyCrawl Scraper API. The endpoint is GET-only and
  # always answers with JSON, which additionally reports the remaining
  # request quota for the account.
  class ScraperAPI < ProxyCrawl::API
    attr_reader :remaining_requests

    # The Scraper API endpoint does not accept POST requests.
    def post
      raise 'Only GET is allowed for the ScraperAPI'
    end

    private

    # Endpoint root consumed by API#prepare_uri; the '/scraper' suffix also
    # makes the base class parse responses as JSON.
    def base_url
      'https://api.proxycrawl.com/scraper'
    end

    # Extends the base handling by capturing the remaining request quota
    # from the JSON body. (The base class already parsed the body for this
    # endpoint; the second parse here mirrors the original behavior.)
    def prepare_response(response, format)
      super(response, format)
      @remaining_requests = JSON.parse(response.body)['remaining_requests'].to_i
    end
  end
end
WebMock::Response 8 | WebMock::Response.class_eval do 9 | def headers=(headers) 10 | @headers = headers 11 | if @headers && !@headers.is_a?(Proc) 12 | @headers = 13 | if @headers.key?(:skip_normalize) 14 | @headers.delete(:skip_normalize) 15 | @headers 16 | else 17 | WebMock::Util::Headers.normalize_headers(@headers) 18 | end 19 | end 20 | end 21 | end 22 | end -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2023 ProxyCrawl 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
# frozen_string_literal: true

require 'net/http'
require 'json'
require 'uri'

module ProxyCrawl
  # Client for the ProxyCrawl Leads API: fetches email leads for a domain.
  class LeadsAPI
    attr_reader :token, :timeout, :body, :status_code, :success, :remaining_requests

    INVALID_TOKEN = 'Token is required'
    INVALID_DOMAIN = 'Domain is required'

    # @param options [Hash] :token (required) API token,
    #   :timeout read timeout in seconds (default 120)
    # @raise [RuntimeError] when the token is nil or empty
    def initialize(options = {})
      raise INVALID_TOKEN if options[:token].nil? || options[:token].empty?

      @token = options[:token]
      @timeout = options[:timeout] || 120
    end

    # Fetches leads for +domain+ and populates the reader attributes.
    # @param domain [String] domain to look up
    # @raise [RuntimeError] when domain is nil or empty
    # @return [self]
    def get(domain)
      # Guard against nil as well: previously a nil domain raised
      # NoMethodError instead of the intended error message.
      raise INVALID_DOMAIN if domain.nil? || domain.empty?

      uri = URI('https://api.proxycrawl.com/leads')
      uri.query = URI.encode_www_form({ token: token, domain: domain })

      req = Net::HTTP::Get.new(uri)

      req_options = {
        read_timeout: timeout,
        use_ssl: uri.scheme == 'https',
        # NOTE(review): VERIFY_NONE disables TLS certificate verification,
        # allowing man-in-the-middle attacks; consider VERIFY_PEER.
        verify_mode: OpenSSL::SSL::VERIFY_NONE
      }

      response = Net::HTTP.start(uri.hostname, uri.port, req_options) { |http| http.request(req) }
      @status_code = response.code.to_i
      @body = response.body

      json_body = JSON.parse(response.body)
      @success = json_body['success']
      @remaining_requests = json_body['remaining_requests'].to_i

      self
    end

    # The Leads API endpoint does not accept POST requests.
    def post
      raise 'Only GET is allowed for the LeadsAPI'
    end
  end
end
# frozen_string_literal: true

require 'securerandom'
require 'tmpdir'

module ProxyCrawl
  # Client for the ProxyCrawl Screenshots API: fetches a screenshot of a
  # URL and saves it to disk (a random tmpdir file by default, or the path
  # given via the :save_to_path option).
  class ScreenshotsAPI < ProxyCrawl::API
    attr_reader :screenshot_path, :success, :remaining_requests, :screenshot_url

    INVALID_SAVE_TO_PATH_FILENAME = 'Filename must end with .jpg or .jpeg'
    # \z anchors the end of the whole string; the previous $ anchor also
    # accepted a trailing newline in the path.
    SAVE_TO_PATH_FILENAME_PATTERN = /.+\.(jpg|JPG|jpeg|JPEG)\z/.freeze

    # The Screenshots API endpoint does not accept POST requests.
    def post
      raise 'Only GET is allowed for the ScreenshotsAPI'
    end

    # Fetches a screenshot of +url+ and writes it to disk.
    # @param url [String] target URL
    # @param options [Hash] :save_to_path destination file (.jpg/.jpeg),
    #   remaining keys are forwarded to the API as query parameters
    # @yield [File] the open file handle, after the image has been written
    # @raise [RuntimeError] when the destination filename is not jpg/jpeg
    # @return the underlying response object
    def get(url, options = {})
      screenshot_path = options.delete(:save_to_path) || generate_file_path
      raise INVALID_SAVE_TO_PATH_FILENAME unless SAVE_TO_PATH_FILENAME_PATTERN =~ screenshot_path

      response = super(url, options)
      # Binary mode: the body is JPEG bytes. The previous text-mode 'w+'
      # with force_encoding('UTF-8') risks newline translation corrupting
      # the image on Windows. 'wb+' keeps the handle readable for the block.
      file = File.open(screenshot_path, 'wb+')
      file.write(response.body.to_s)
      @screenshot_path = screenshot_path
      yield(file) if block_given?
      response
    ensure
      file&.close
    end

    private

    # Populates quota/success/url attributes from the response headers.
    def prepare_response(response, format)
      super(response, format)
      @remaining_requests = response['remaining_requests'].to_i
      @success = response['success'] == 'true'
      @screenshot_url = response['screenshot_url']
    end

    # Endpoint root used by API#prepare_uri.
    def base_url
      'https://api.proxycrawl.com/screenshots'
    end

    # Random, collision-safe default filename.
    def generate_file_name
      "#{SecureRandom.urlsafe_base64}.jpg"
    end

    # Default destination inside the system temp directory.
    def generate_file_path
      File.join(Dir.tmpdir, generate_file_name)
    end
  end
end
# frozen_string_literal: true

require 'net/http'
require 'json'
require 'uri'

module ProxyCrawl
  # Base client for the ProxyCrawl Crawling API.
  #
  # Subclasses (ScraperAPI, ScreenshotsAPI) override the private #base_url
  # and #prepare_response hooks to target other endpoints.
  class API
    attr_reader :token, :body, :timeout, :status_code, :original_status, :pc_status, :url, :storage_url

    INVALID_TOKEN = 'Token is required'
    INVALID_URL = 'URL is required'

    # @param options [Hash] :token (required) API token,
    #   :timeout read timeout in seconds (default 120)
    # @raise [RuntimeError] when the token is nil or empty
    def initialize(options = {})
      # Reject empty tokens too, consistent with LeadsAPI and StorageAPI.
      raise INVALID_TOKEN if options[:token].nil? || options[:token].empty?

      @token = options[:token]
      @timeout = options[:timeout] || 120
    end

    # Performs a GET request through the API for the given target URL.
    # Extra options are forwarded as query parameters.
    # @raise [RuntimeError] when url is nil or empty
    # @return [self] with status_code/original_status/pc_status/url/body set
    def get(url, options = {})
      raise INVALID_URL if url.nil? || url.empty?

      uri = prepare_uri(url, options)
      req = Net::HTTP::Get.new(uri)

      req_options = {
        read_timeout: timeout,
        use_ssl: uri.scheme == 'https',
        # NOTE(review): VERIFY_NONE disables TLS certificate verification,
        # allowing man-in-the-middle attacks; consider VERIFY_PEER.
        verify_mode: OpenSSL::SSL::VERIFY_NONE
      }

      response = Net::HTTP.start(uri.hostname, uri.port, req_options) { |http| http.request(req) }

      prepare_response(response, options[:format])

      self
    end

    # Performs a POST request; +data+ is serialized as JSON when
    # options[:post_content_type] contains 'json', as form data otherwise.
    # @raise [RuntimeError] when url is nil or empty
    # @return [self]
    def post(url, data, options = {})
      raise INVALID_URL if url.nil? || url.empty?

      uri = prepare_uri(url, options)

      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = uri.scheme == 'https'
      # Apply the configured timeout, as #get already does.
      http.read_timeout = timeout

      is_json = options[:post_content_type].to_s.include?('json')
      # NOTE(review): 'text/json' is non-standard ('application/json' is the
      # registered media type); kept as-is to avoid changing the wire format.
      content_type = is_json ? { 'Content-Type': 'text/json' } : nil

      request = Net::HTTP::Post.new(uri.request_uri, content_type)

      if is_json
        request.body = data.to_json
      else
        request.set_form_data(data)
      end

      response = http.request(request)

      prepare_response(response, options[:format])

      self
    end

    private

    # Endpoint root; subclasses override.
    def base_url
      'https://api.proxycrawl.com'
    end

    # Builds the request URI with token, target url and any extra options
    # encoded as query parameters.
    def prepare_uri(url, options)
      uri = URI(base_url)
      uri.query = URI.encode_www_form({ token: @token, url: url }.merge(options))

      uri
    end

    # Populates reader attributes. When the response is JSON (explicit
    # format, or the scraper endpoint which always returns JSON) the fields
    # come from the parsed body; otherwise they are read from the response
    # headers via Net::HTTPResponse#[].
    def prepare_response(response, format)
      res = format == 'json' || base_url.include?('/scraper') ? JSON.parse(response.body) : response

      @original_status = res['original_status'].to_i
      @pc_status = res['pc_status'].to_i
      @url = res['url']
      @storage_url = res['storage_url']
      @status_code = response.code.to_i
      @body = response.body
    end
  end
end
37 | to_return( 38 | body: 'body', 39 | status: 200, 40 | headers: { skip_normalize: true, 'original_status' => 200, 'pc_status' => 200, 'url' => 'http://httpbin.org/anything?param1=x¶ms2=y'}) 41 | 42 | api = ProxyCrawl::API.new(token: 'test') 43 | 44 | response = api.post("http://httpbin.org/post", { foo: 'bar' }, { post_content_type: 'json'} ) 45 | 46 | expect(response.status_code).to eql(200) 47 | expect(response.original_status).to eql(200) 48 | expect(response.pc_status).to eql(200) 49 | expect(response.url).to eql('http://httpbin.org/anything?param1=x¶ms2=y') 50 | expect(response.body).to eql('body') 51 | end 52 | 53 | it 'sends a post request to ProxyCrawl API with form data' do 54 | stub_request(:post, 'https://api.proxycrawl.com/?token=test&url=http://httpbin.org/post'). 55 | with(body: { "foo" => "bar" }). 56 | to_return( 57 | body: 'body', 58 | status: 200, 59 | headers: { skip_normalize: true, 'original_status' => 200, 'pc_status' => 200, 'url' => 'http://httpbin.org/anything?param1=x¶ms2=y'}) 60 | 61 | api = ProxyCrawl::API.new(token: 'test') 62 | 63 | response = api.post("http://httpbin.org/post", { foo: 'bar' } ) 64 | 65 | expect(response.status_code).to eql(200) 66 | expect(response.original_status).to eql(200) 67 | expect(response.pc_status).to eql(200) 68 | expect(response.url).to eql('http://httpbin.org/anything?param1=x¶ms2=y') 69 | expect(response.body).to eql('body') 70 | end 71 | end 72 | 73 | end 74 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and 
expression, level of experience, 9 | nationality, personal appearance, race, religion, or sexual identity and 10 | orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. 
Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at info@proxycrawl.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at [http://contributor-covenant.org/version/1/4][version] 72 | 73 | [homepage]: http://contributor-covenant.org 74 | [version]: http://contributor-covenant.org/version/1/4/ 75 | -------------------------------------------------------------------------------- /spec/screenshots_api_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | require 'proxycrawl' 3 | 4 | describe ProxyCrawl::ScreenshotsAPI do 5 | it 'raises an error if token is missing' do 6 | expect { ProxyCrawl::ScreenshotsAPI.new }.to raise_error(RuntimeError, 'Token is required') 7 | end 8 | 9 | it 'sets/reads token' do 10 | expect(ProxyCrawl::ScreenshotsAPI.new(token: 'test').token).to eql('test') 11 | end 12 | 13 | describe '#get' do 14 | before(:each) do 15 | stub_request(:get, 'https://api.proxycrawl.com/screenshots?token=test&url=http%3A%2F%2Fhttpbin.org%2Fanything%3Fparam1%3Dx%26params2%3Dy'). 
16 | to_return( 17 | body: 'body', 18 | status: 200, 19 | headers: { skip_normalize: true, 'original_status' => 200, 'pc_status' => 200, 'url' => 'http://httpbin.org/anything?param1=x¶ms2=y'}) 20 | end 21 | 22 | it 'sends an get request to ProxyCrawl Screenshots API' do 23 | api = ProxyCrawl::ScreenshotsAPI.new(token: 'test') 24 | 25 | response = api.get("http://httpbin.org/anything?param1=x¶ms2=y") 26 | 27 | expect(response.status_code).to eql(200) 28 | expect(response.original_status).to eql(200) 29 | expect(response.pc_status).to eql(200) 30 | expect(response.url).to eql('http://httpbin.org/anything?param1=x¶ms2=y') 31 | expect(response.body).to eql('body') 32 | expect(response.screenshot_path).not_to be_empty 33 | end 34 | 35 | it 'accepts a valid save_to_path option' do 36 | api = ProxyCrawl::ScreenshotsAPI.new(token: 'test') 37 | 38 | response = api.get("http://httpbin.org/anything?param1=x¶ms2=y", save_to_path: save_to_path) 39 | 40 | expect(response.status_code).to eql(200) 41 | expect(response.original_status).to eql(200) 42 | expect(response.pc_status).to eql(200) 43 | expect(response.url).to eql('http://httpbin.org/anything?param1=x¶ms2=y') 44 | expect(response.body).to eql('body') 45 | expect(response.screenshot_path).to eql(File.join(Dir.tmpdir, 'test-image.jpg')) 46 | end 47 | 48 | it 'rejects an invalid save_to_path option' do 49 | api = ProxyCrawl::ScreenshotsAPI.new(token: 'test') 50 | 51 | expect { api.get("http://httpbin.org/anything?param1=x¶ms2=y", save_to_path: '~/images/image_filename.png') }.to raise_error(RuntimeError, 'Filename must end with .jpg or .jpeg') 52 | expect { api.get("http://httpbin.org/anything?param1=x¶ms2=y", save_to_path: 'image_filename.png') }.to raise_error(RuntimeError, 'Filename must end with .jpg or .jpeg') 53 | expect { api.get("http://httpbin.org/anything?param1=x¶ms2=y", save_to_path: '~/images/image_filename') }.to raise_error(RuntimeError, 'Filename must end with .jpg or .jpeg') 54 | expect { 
# frozen_string_literal: true

require 'net/http'
require 'json'
require 'uri'

module ProxyCrawl
  # Client for the ProxyCrawl Cloud Storage API: fetch, delete and
  # bulk-read stored crawl results by RID or original URL.
  class StorageAPI
    attr_reader :token, :timeout, :original_status, :pc_status, :url, :status_code, :rid, :body, :stored_at

    INVALID_TOKEN = 'Token is required'
    INVALID_RID = 'RID is required'
    INVALID_RID_ARRAY = 'One or more RIDs are required'
    INVALID_URL_OR_RID = 'Either URL or RID is required'
    BASE_URL = 'https://api.proxycrawl.com/storage'

    # @param options [Hash] :token (required) API token,
    #   :timeout read timeout in seconds (default 120)
    # @raise [RuntimeError] when the token is nil or empty
    def initialize(options = {})
      raise INVALID_TOKEN if options[:token].nil? || options[:token].empty?

      @token = options[:token]
      @timeout = options[:timeout] || 120
    end

    # Fetches a stored page by original URL or by RID.
    # @param url_or_rid [String] a URL (http/https) or a storage RID
    # @param format [String] 'html' (default) or 'json'
    # @return [self]
    def get(url_or_rid, format = 'html')
      raise INVALID_URL_OR_RID if url_or_rid.nil? || url_or_rid.empty?

      uri = URI(BASE_URL)
      uri.query = URI.encode_www_form({ token: token, format: format }.merge(decide_url_or_rid(url_or_rid)))

      req = Net::HTTP::Get.new(uri)

      req_options = {
        read_timeout: timeout,
        use_ssl: uri.scheme == 'https',
        # NOTE(review): VERIFY_NONE disables TLS certificate verification.
        verify_mode: OpenSSL::SSL::VERIFY_NONE
      }

      response = Net::HTTP.start(uri.hostname, uri.port, req_options) { |http| http.request(req) }

      # JSON responses carry metadata in the body; otherwise read headers.
      res = format == 'json' ? JSON.parse(response.body) : response

      @original_status = res['original_status'].to_i
      @pc_status = res['pc_status'].to_i
      @url = res['url']
      @rid = res['rid']
      @stored_at = res['stored_at']

      @status_code = response.code.to_i
      @body = response.body

      self
    end

    # Deletes the stored result identified by +rid+.
    # @return [Boolean] whether the API response reported a 'success' key
    def delete(rid)
      raise INVALID_RID if rid.nil? || rid.empty?

      uri = URI(BASE_URL)
      uri.query = URI.encode_www_form(token: token, rid: rid)
      response = http_client(uri).request(Net::HTTP::Delete.new(uri.request_uri))

      @url, @original_status, @pc_status, @stored_at = nil
      @status_code = response.code.to_i
      @rid = rid
      @body = JSON.parse(response.body)

      @body.key?('success')
    end

    # Fetches multiple stored results in one request; the reader attributes
    # become arrays (one element per returned item).
    # @param rids_array [Array<String>] RIDs to fetch
    # @return [self]
    def bulk(rids_array = [])
      raise INVALID_RID_ARRAY if rids_array.empty?

      uri = URI("#{BASE_URL}/bulk")
      uri.query = URI.encode_www_form(token: token)
      request = Net::HTTP::Post.new(uri.request_uri, { 'Content-Type': 'application/json' })
      request.body = { rids: rids_array }.to_json
      response = http_client(uri).request(request)

      @body = JSON.parse(response.body)
      @original_status = @body.map { |item| item['original_status'].to_i }
      @status_code = response.code.to_i
      @pc_status = @body.map { |item| item['pc_status'].to_i }
      @url = @body.map { |item| item['url'] }
      @rid = @body.map { |item| item['rid'] }
      @stored_at = @body.map { |item| item['stored_at'] }

      self
    end

    # Lists stored RIDs, optionally capped at +limit+ (negative = no limit).
    # @return [Array<String>] the RIDs
    def rids(limit = -1)
      uri = URI("#{BASE_URL}/rids")
      query_hash = { token: token }
      query_hash.merge!({ limit: limit }) if limit >= 0
      uri.query = URI.encode_www_form(query_hash)

      response = Net::HTTP.get_response(uri)
      @url, @original_status, @pc_status, @stored_at = nil
      @status_code = response.code.to_i
      @body = JSON.parse(response.body)
      @rid = @body

      @body
    end

    # Returns the total number of documents in storage.
    # (@rid is left untouched; the original `@rid = rid` self-assignment
    # here was a no-op.)
    def total_count
      uri = URI("#{BASE_URL}/total_count")
      uri.query = URI.encode_www_form(token: token)

      response = Net::HTTP.get_response(uri)
      @url, @original_status, @pc_status, @stored_at = nil
      @status_code = response.code.to_i
      @body = JSON.parse(response.body)

      body['totalCount']
    end

    private

    # Builds an HTTP client for +uri+ with SSL and the configured timeout.
    # Fixes the previous `Net::HTTP.new(uri.host)` in #delete/#bulk, which
    # omitted the port and SSL, so requests to the https endpoint were sent
    # as plain HTTP on port 80.
    def http_client(uri)
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = uri.scheme == 'https'
      http.read_timeout = timeout
      http
    end

    # Treats values starting with http:// or https:// as URLs, everything
    # else as a RID. \A anchors the start of the string (not of a line).
    def decide_url_or_rid(url_or_rid)
      %r{\Ahttps?://} =~ url_or_rid ? { url: url_or_rid } : { rid: url_or_rid }
    end
  end
end