├── .github ├── dependabot.yml └── workflows │ └── ci.yml ├── .gitignore ├── .rubocop.yml ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── Gemfile ├── LICENSE ├── README.md ├── Rakefile ├── bin ├── console └── proxy_fetcher ├── gemfiles ├── nokogiri.gemfile └── oga.gemfile ├── lib ├── proxy_fetcher.rb └── proxy_fetcher │ ├── client │ ├── client.rb │ ├── proxies_registry.rb │ └── request.rb │ ├── configuration.rb │ ├── configuration │ └── providers_registry.rb │ ├── document.rb │ ├── document │ ├── adapters.rb │ ├── adapters │ │ ├── abstract_adapter.rb │ │ ├── nokogiri_adapter.rb │ │ └── oga_adapter.rb │ └── node.rb │ ├── exceptions.rb │ ├── manager.rb │ ├── null_logger.rb │ ├── providers │ ├── base.rb │ ├── free_proxy_list.rb │ ├── free_proxy_list_socks.rb │ ├── free_proxy_list_ssl.rb │ ├── free_proxy_list_us.rb │ ├── http_tunnel.rb │ ├── mtpro.rb │ ├── proxy_list.rb │ ├── proxypedia.rb │ └── xroxy.rb │ ├── proxy.rb │ ├── utils │ ├── http_client.rb │ ├── proxy_list_validator.rb │ └── proxy_validator.rb │ └── version.rb ├── proxy_fetcher.gemspec └── spec ├── fixtures └── proxies.txt ├── proxy_fetcher ├── client │ └── client_spec.rb ├── configuration_spec.rb ├── document │ ├── adapters_spec.rb │ └── node_spec.rb ├── manager_spec.rb ├── providers │ ├── base_spec.rb │ ├── multiple_providers_spec.rb │ └── proxy_classes_spec.rb ├── proxy_spec.rb └── version_spec.rb ├── spec_helper.rb └── support └── manager_examples.rb /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: bundler 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | time: "03:00" 8 | open-pull-requests-limit: 10 9 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | name: >- 8 | Ruby ${{ 
matrix.ruby }} (${{ matrix.gemfile }}) 9 | env: 10 | CI: true 11 | runs-on: ${{ matrix.os }} 12 | continue-on-error: ${{ endsWith(matrix.ruby, 'head') || matrix.ruby == 'debug' || matrix.experimental }} 13 | if: | 14 | !( contains(github.event.pull_request.title, '[ci skip]') 15 | || contains(github.event.pull_request.title, '[skip ci]')) 16 | strategy: 17 | fail-fast: true 18 | matrix: 19 | experimental: [false] 20 | os: [ ubuntu-latest ] 21 | ruby: 22 | - 2.6 23 | - 2.7 24 | - '3.0' 25 | - '3.1' 26 | gemfile: 27 | - gemfiles/oga.gemfile 28 | - gemfiles/nokogiri.gemfile 29 | include: 30 | - ruby: head 31 | os: ubuntu-latest 32 | gemfile: gemfiles/nokogiri.gemfile 33 | experimental: true 34 | - ruby: head 35 | os: ubuntu-latest 36 | gemfile: gemfiles/oga.gemfile 37 | experimental: true 38 | - ruby: jruby 39 | os: ubuntu-latest 40 | gemfile: gemfiles/nokogiri.gemfile 41 | experimental: true 42 | - ruby: truffleruby 43 | os: ubuntu-latest 44 | gemfile: gemfiles/nokogiri.gemfile 45 | experimental: true 46 | steps: 47 | - name: Repo checkout 48 | uses: actions/checkout@v2 49 | 50 | - name: Setup Ruby 51 | uses: ruby/setup-ruby@v1 52 | with: 53 | ruby-version: ${{ matrix.ruby }} 54 | bundler-cache: true 55 | 56 | - name: Run tests 57 | timeout-minutes: 10 58 | run: bundle exec rake spec 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.rbc 2 | capybara-*.html 3 | .rspec 4 | /log 5 | /tmp 6 | /db/*.sqlite3 7 | /db/*.sqlite3-journal 8 | /public/system 9 | /coverage/ 10 | /spec/tmp 11 | *.orig 12 | rerun.txt 13 | pickle-email-*.html 14 | .idea 15 | Gemfile.lock 16 | *.gem 17 | certs 18 | gemfiles/*.gemfile.lock 19 | 20 | ## Environment normalization: 21 | /.bundle 22 | /vendor/bundle 23 | 24 | # these should all be checked in to normalize the environment: 25 | # Gemfile.lock, .ruby-version, .ruby-gemset 26 | 27 | # unless supporting rvm < 
1.11.0 or doing something fancy, ignore this: 28 | .rvmrc 29 | 30 | # if using bower-rails ignore default bower_components path bower.json files 31 | /vendor/assets/bower_components 32 | *.bowerrc 33 | bower.json 34 | 35 | # Ignore pow environment settings 36 | .powenv 37 | 38 | # Ignore Byebug command history file. 39 | .byebug_history 40 | .yardoc/ 41 | doc/ 42 | -------------------------------------------------------------------------------- /.rubocop.yml: -------------------------------------------------------------------------------- 1 | AllCops: 2 | TargetRubyVersion: 2.4 3 | Exclude: 4 | - 'bin/*' 5 | DisplayCopNames: true 6 | 7 | Style/ClassAndModuleChildren: 8 | Exclude: 9 | - spec/**/* 10 | Style/FrozenStringLiteralComment: 11 | Enabled: true 12 | Style/StringLiterals: 13 | EnforcedStyle: double_quotes 14 | Style/StringLiteralsInInterpolation: 15 | EnforcedStyle: double_quotes 16 | 17 | Layout/MultilineMethodCallIndentation: 18 | EnforcedStyle: indented 19 | Layout/TrailingEmptyLines: 20 | Enabled: true 21 | Layout/DotPosition: 22 | EnforcedStyle: leading 23 | 24 | Lint/ConstantDefinitionInBlock: 25 | Exclude: 26 | - spec/**/* 27 | 28 | Metrics/LineLength: 29 | Exclude: 30 | - spec/**/* 31 | Max: 100 32 | Metrics/BlockLength: 33 | Exclude: 34 | - spec/**/* 35 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Proxy Fetcher Changelog 2 | 3 | Reverse Chronological Order: 4 | 5 | ## `master` 6 | 7 | ... 
8 | 9 | ## `0.17.0` (2023-06-02) 10 | 11 | * Remove dead providers 12 | 13 | ## `0.16.0` (2022-02-04) 14 | 15 | * Fix providers 16 | * Improve HTTP.rb dependency 17 | 18 | ## `0.15.1` (2021-02-17) 19 | 20 | * Support for Ruby 3.0 21 | 22 | ## `0.15.0` (2021-01-26) 23 | 24 | * Removed failing providers 25 | * Added new 26 | * Specs refactoring 27 | 28 | ## `0.14.0` (2020-05-11) 29 | 30 | * Add MTPro provider 31 | * Add Proxypedia provider 32 | 33 | ## `0.13.0` (2020-03-09) 34 | 35 | * Fix GatherProxy provider 36 | * Fix XRoxy provider 37 | * Allow ability to load proxies from files 38 | * Fix Proxy object comparators 39 | 40 | ## `0.12.0` (2020-01-28) 41 | 42 | * Fix XRoxy provider 43 | * Fix multi-threading issues with config and adapter 44 | 45 | ## `0.11.0` (2019-10-24) 46 | 47 | * Big gem refactoring 48 | 49 | ## `0.10.2` (2019-03-15) 50 | 51 | * Remove ProxyDocker provider (no longer workable) 52 | 53 | ## `0.10.1` (2019-03-07) 54 | 55 | * Fix broken ProxyDocker provider. 56 | * Refactor gem internals. 57 | 58 | ## `0.9.0` (2019-01-22) 59 | 60 | * Fix a problem with stuck of proxies list loading. 61 | 62 | * Add a possibility to configure different timeouts for different cases: 63 | - `client_timeout` - timeout for `ProxyFetcher::Client`. 64 | - `provider_proxies_load_timeout` - timeout for loading of proxies list by provider. 65 | - `proxy_validation_timeout` - timeout for proxy validation with `ProxyFetcher::ProxyValidator`. 66 | 67 | (old option `timeout` sets and returns value of `client_timeout`) 68 | 69 | ## `0.8.0` (2018-11-12) 70 | 71 | * Improve speed of proxy list loading. 72 | * Improve speed of proxies cleanup. 
73 | * Fix ProxyDocker provider 74 | 75 | ## `0.7.2` (2018-08-13) 76 | 77 | * Fix XRoxy provider 78 | 79 | ## `0.7.1` (2018-07-13) 80 | 81 | * Fix XRoxy provider 82 | 83 | ## `0.7.0` (2018-06-04) 84 | 85 | * Migrate to `HTTP.rb` instead of `Net::HTTP` 86 | * Fixes 87 | 88 | ## `0.6.5` (2018-04-20) 89 | 90 | * Fix providers 91 | 92 | ## `0.6.4` (2018-03-26) 93 | 94 | * Fix providers 95 | 96 | ## `0.6.3` (2018-01-26) 97 | 98 | * Add ability to use own proxy for `ProxyFetcher::Client` 99 | * Improve specs 100 | 101 | ## `0.6.2` (2017-12-27) 102 | 103 | * Fix ProxyDocker provider. 104 | 105 | ## `0.6.1` (2017-12-11) 106 | 107 | * Fix gem executable to check dependencies for adapters 108 | * Code cleanup 109 | * Some new specs 110 | 111 | ## `0.6.0` (2017-12-08) 112 | 113 | * Make HTML parser configurable (Nokogiri, Oga, custom one) 114 | * Documentation 115 | 116 | ## `0.5.1` (2017-11-13) 117 | 118 | * Fix ProxyFetcher CLI 119 | 120 | ## `0.5.0` (2017-09-06) 121 | 122 | * Remove HideMyName provider (no longer works) 123 | * Fix ProxyDocker provider 124 | * Add `ProxyFetcher::Client` to make interacting with proxies easier 125 | * Add new providers (Gather Proxy & HTTP Tunnel Genius) 126 | * Simplify `connection_timeout` config option to `timeout` 127 | * Make User-Agent configurable 128 | * Move all the gem exceptions under `ProxyFetcher::Error` base class 129 | * Small improvements 130 | 131 | ## `0.4.1` (2017-09-04) 132 | 133 | * Use all registered providers by default 134 | * Disable HideMyName provider (now it uses CloudFlare) 135 | 136 | ## `0.4.0` (2017-08-26) 137 | 138 | * Support operations with multiple providers 139 | * Refactor filtering 140 | * Small bugfixes 141 | * Documentation 142 | 143 | ## `0.3.1` (2017-08-24) 144 | 145 | * Remove speed from proxy (no need to) 146 | * Extract proxy validation from the HTTPClient to separate class 147 | * Make proxy validator configurable 148 | * Refactor proxy validation behavior 149 | * Refactor Proxy object
(OpenStruct => PORO, url / uri methods, etc) 150 | * Optimize proxy list check with threads 151 | * Improve proxy_fetcher bin 152 | 153 | ## `0.3.0` (2017-08-21) 154 | 155 | * Proxy providers refactoring 156 | * Proxy object refactoring 157 | * Specs refactoring 158 | * New providers 159 | * Custom HTTP client 160 | * Configuration improvements 161 | * Proxy filters 162 | 163 | ## `0.2.5` (2017-08-17) 164 | 165 | * Configurable HTTPClient 166 | * Fix errors handling 167 | 168 | ## `0.2.3` (2017-08-10) 169 | 170 | * Fix broken providers 171 | * Add new providers 172 | * Docs 173 | 174 | ## `0.2.2` (2017-07-20) 175 | 176 | * Code & specs refactoring 177 | 178 | ## `0.2.1` (2017-07-19) 179 | 180 | * New proxy providers 181 | * Bugfixes 182 | 183 | ## `0.2.0` (2017-07-17) 184 | 185 | * New proxy providers 186 | * Custom providers 187 | * Network errors handling 188 | * Specs refactoring 189 | 190 | ## `0.1.4` (2017-05-31) 191 | 192 | * Code & specs refactoring 193 | * Add `speed` to `Proxy` object 194 | * Docs 195 | 196 | ## `0.1.3` (2017-05-25) 197 | 198 | * Proxy list management with `ProxyFetcher::Manager` 199 | 200 | ## `0.1.2` (2017-05-23) 201 | 202 | * HTTPS processing 203 | * `Proxy` object sugar 204 | * Specs improvements 205 | * Docs improvements 206 | 207 | ## `0.1.1` (2017-05-22) 208 | 209 | * Configuration (timeouts) 210 | * Documentation 211 | 212 | ## `0.1.0` (2017-05-19) 213 | 214 | * Initial release -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of
experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. 
Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at bulajnikita@gmail.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | source "https://rubygems.org" 4 | 5 | gemspec 6 | 7 | gem "nokogiri", "~> 1.8" 8 | gem "oga", "~> 3.2" 9 | gem "rubocop", "~> 1.0" 10 | 11 | group :test do 12 | gem "coveralls_reborn", require: false 13 | # Until I find a way to introduce other MITM proxy 14 | gem "webrick", "1.4.2" 15 | gem "evil-proxy", "~> 0.2" 16 | end 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017—2018 Nikita Bulai 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation 
files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Ruby / JRuby lib for managing proxies 2 | [![Gem Version](https://badge.fury.io/rb/proxy_fetcher.svg)](http://badge.fury.io/rb/proxy_fetcher) 3 | [![CI](https://github.com/nbulaj/proxy_fetcher/actions/workflows/ci.yml/badge.svg)](https://github.com/nbulaj/proxy_fetcher/actions/workflows/ci.yml) 4 | [![Coverage Status](https://coveralls.io/repos/github/nbulaj/proxy_fetcher/badge.svg)](https://coveralls.io/github/nbulaj/proxy_fetcher) 5 | [![Code Climate](https://codeclimate.com/github/nbulaj/proxy_fetcher/badges/gpa.svg)](https://codeclimate.com/github/nbulaj/proxy_fetcher) 6 | [![Inline docs](http://inch-ci.org/github/nbulaj/proxy_fetcher.png?branch=master)](http://inch-ci.org/github/nbulaj/proxy_fetcher) 7 | [![License](http://img.shields.io/badge/license-MIT-brightgreen.svg)](#license) 8 | 9 | This gem can help your Ruby / JRuby 
application to make HTTP(S) requests using 10 | proxy by fetching and validating actual proxy lists from multiple providers. 11 | 12 | It gives you a special `Manager` class that can load proxy lists, validate them and return random or specific proxies. 13 | It also has a `Client` class that encapsulates all the logic for sending HTTP requests using proxies, automatically 14 | fetched and validated by the gem. Take a look at the documentation below to find all the gem features. 15 | 16 | Also this gem can be used with any other programming language (Go / Python / etc) as a standalone solution for downloading and 17 | validating proxy lists from the different providers. [Check out examples](#standalone) of usage below. 18 | 19 | ## Documentation valid for `master` branch 20 | 21 | Please check the documentation for the version of proxy_fetcher you are using in: 22 | https://github.com/nbulaj/proxy_fetcher/releases 23 | 24 | ## Table of Contents 25 | 26 | - [Dependencies](#dependencies) 27 | - [Installation](#installation) 28 | - [Example of usage](#example-of-usage) 29 | - [In Ruby application](#in-ruby-application) 30 | - [Standalone](#standalone) 31 | - [Client](#client) 32 | - [Configuration](#configuration) 33 | - [Proxy validation speed](#proxy-validation-speed) 34 | - [Proxy object](#proxy-object) 35 | - [Providers](#providers) 36 | - [Contributing](#contributing) 37 | - [License](#license) 38 | 39 | ## Dependencies 40 | 41 | ProxyFetcher gem itself requires Ruby `>= 2.0.0` (or [JRuby](http://jruby.org/) `> 9.0`, but maybe earlier too, 42 | [see GitHub Actions matrix](.github/workflows/ci.yml)) and the great [HTTP.rb gem](https://github.com/httprb/http). 43 | 44 | However, it requires an adapter to parse HTML. If you do not specify any specific adapter, then it will use 45 | the default one - [Nokogiri](https://github.com/sparklemotion/nokogiri). It's OK for any Ruby on Rails project 46 | (because they use it by default).
47 | 48 | But if you want to use some specific adapter (for example, if your application uses [Oga](https://gitlab.com/yorickpeterse/oga)), 49 | then you need to manually add your dependencies to your project and configure ProxyFetcher to use another adapter. Moreover, 50 | you can implement your own adapter if it suits your use-case. Take a look at the [Configuration](#configuration) section for more details. 51 | 52 | ## Installation 53 | 54 | If using bundler, first add 'proxy_fetcher' to your Gemfile: 55 | 56 | ```ruby 57 | gem 'proxy_fetcher', '~> 0.14' 58 | ``` 59 | 60 | or if you want to use the latest version (from `master` branch), then: 61 | 62 | ```ruby 63 | gem 'proxy_fetcher', git: 'https://github.com/nbulaj/proxy_fetcher.git' 64 | ``` 65 | 66 | And run: 67 | 68 | ```sh 69 | bundle install 70 | ``` 71 | 72 | Otherwise simply install the gem: 73 | 74 | ```sh 75 | gem install proxy_fetcher -v '0.14' 76 | ``` 77 | 78 | ## Example of usage 79 | 80 | ### In Ruby application 81 | 82 | By default ProxyFetcher uses all the available proxy providers. To get current proxy list without validation you 83 | need to initialize an instance of `ProxyFetcher::Manager` class. By default ProxyFetcher will automatically load 84 | and parse all the proxies from all available sources: 85 | 86 | ```ruby 87 | manager = ProxyFetcher::Manager.new # will immediately load proxy list from the servers 88 | manager.proxies 89 | 90 | #=> [#, ...
] 92 | ``` 93 | 94 | You can initialize proxy manager without immediate load of the proxy list from the remote server by passing 95 | `refresh: false` on initialization: 96 | 97 | ```ruby 98 | manager = ProxyFetcher::Manager.new(refresh: false) # just initialize class instance 99 | manager.proxies 100 | 101 | #=> [] 102 | ``` 103 | 104 | Also you could use ProxyFetcher to load proxy lists from local files if you have such: 105 | 106 | ```ruby 107 | manager = ProxyFetcher::Manager.new(file: "/home/dev/proxies.txt", refresh: false) 108 | 109 | # or 110 | 111 | manager = ProxyFetcher::Manager.from_file(file: "/home/dev/proxies.txt", refresh: false) 112 | 113 | # or 114 | 115 | manager = ProxyFetcher::Manager.new( 116 | files: Dir.glob("/home/dev/proxies/**/*.txt"), 117 | refresh: false 118 | ) 119 | manager.proxies 120 | 121 | #=> [#, ... ] 123 | ``` 124 | 125 | `ProxyFetcher::Manager` class is very helpful when you need to manipulate and manage proxies. To get the proxy 126 | from the list you can call `.get` or `.pop` method that will return first proxy and move it to the end of the list. 127 | These methods have some equivalents like `get!` or aliased `pop!` that will return first **connectable** proxy and 128 | move it to the end of the list. They are both marked as dangerous methods because all dead proxies will be removed from the list. 129 | 130 | If you need just some random proxy then call `manager.random_proxy` or its alias `manager.random`. 131 | 132 | To clean current proxy list from the dead entries that do not respond to the requests you need to use `cleanup!` 133 | or `validate!` method: 134 | 135 | ```ruby 136 | manager.cleanup! # or manager.validate! 137 | ``` 138 | 139 | This action will enumerate proxy list and remove all the entries that don't respond within the timeout or return errors. 140 | 141 | In order to increase the performance proxy list validation is performed using Ruby threads.
By default the gem creates a 142 | pool with 10 threads, but you can increase this number by changing `pool_size` configuration option: `ProxyFetcher.config.pool_size = 50`. 143 | Read more in [Proxy validation speed](#proxy-validation-speed) section. 144 | 145 | If you need raw proxy URLs (like `host:port`) then you can use `raw_proxies` method that will return an array of strings: 146 | 147 | ```ruby 148 | manager = ProxyFetcher::Manager.new 149 | manager.raw_proxies 150 | 151 | # => ["97.77.104.22:3128", "94.23.205.32:3128", "209.79.65.140:8080", 152 | # "91.217.42.2:8080", "97.77.104.22:80", "165.234.102.177:8080", ...] 153 | ``` 154 | 155 | You don't need to initialize a new manager every time you want to load actual proxy list from the providers. All you 156 | need is to refresh the proxy list by calling `#refresh_list!` (or `#fetch!`) method for your `ProxyFetcher::Manager` instance: 157 | 158 | ```ruby 159 | manager.refresh_list! # or manager.fetch! 160 | 161 | #=> [#, ... ] 163 | ``` 164 | 165 | If you need to filter proxy list, for example, by country or response time and **selected provider supports filtering** 166 | with GET params, then you can just pass your filters like a simple Ruby hash to the Manager instance: 167 | 168 | ```ruby 169 | ProxyFetcher.config.providers = :xroxy 170 | 171 | manager = ProxyFetcher::Manager.new(filters: { country: 'PL', maxtime: '500' }) 172 | manager.proxies 173 | 174 | # => [...] 175 | ``` 176 | 177 | **[IMPORTANT]**: All the providers have their own filtering params! So you can't just use something like `country` to 178 | filter all the proxies by country.
If you are using multiple providers, then you can split your filters by proxy 179 | provider names: 180 | 181 | ```ruby 182 | ProxyFetcher.config.providers = [:proxy_docker, :xroxy] 183 | 184 | manager = ProxyFetcher::Manager.new(filters: { 185 | hide_my_name: { 186 | country: 'PL', 187 | maxtime: '500' 188 | }, 189 | xroxy: { 190 | type: 'All_http' 191 | } 192 | }) 193 | 194 | manager.proxies 195 | 196 | # => [...] 197 | ``` 198 | 199 | You can apply different filters every time you call the `#refresh_list!` (or `#fetch!`) method: 200 | 201 | ```ruby 202 | manager.refresh_list!(country: 'PL', maxtime: '500') 203 | 204 | # => [...] 205 | ``` 206 | 207 | *NOTE*: not all the providers support filtering. Take a look at the provider classes to see if they support custom filters. 208 | 209 | ### Standalone 210 | 211 | All you need to use this gem is Ruby >= 2.0 (2.4 is recommended). You can install it in different ways. If you are using Ubuntu Xenial (16.04LTS) 212 | then you already have Ruby 2.3 installed. In other cases you can install it with [RVM](https://rvm.io/) or [rbenv](https://github.com/rbenv/rbenv).
213 | 214 | After installing Ruby just install the gem by running `gem install proxy_fetcher` in your terminal and now you can run it: 215 | 216 | ```bash 217 | proxy_fetcher >> proxies.txt # Will download proxies from the default provider, validate them and write to file 218 | ``` 219 | 220 | If you need a list of proxies from some specific provider, then you need to pass its name with the `-p` option: 221 | 222 | ```bash 223 | proxy_fetcher -p xroxy >> proxies.txt # Will download proxies from the XRoxy provider, validate them and write to file 224 | ``` 225 | 226 | If you need a list of proxies in JSON format just pass a `--json` option to the command: 227 | 228 | ```bash 229 | proxy_fetcher --json 230 | 231 | # Will print: 232 | # {"proxies":["120.26.206.178:80","119.61.13.242:1080","117.40.213.26:80","92.62.72.242:1080","77.53.105.155:3124", 233 | # "58.20.41.172:35923","204.116.192.151:35923","190.5.96.58:1080","170.250.109.97:35923","121.41.82.99:1080"]} 234 | ``` 235 | 236 | To get all the possible options run: 237 | 238 | ```bash 239 | proxy_fetcher --help 240 | ``` 241 | 242 | ## Client 243 | 244 | ProxyFetcher gem provides you a ready-to-use HTTP client that makes requesting with proxies easy. It does all the work 245 | with the proxy lists for you (load, validate, refresh, find proxy by type, follow redirects, etc). All you need is to
All you need it to 246 | make HTTP(S) requests: 247 | 248 | ```ruby 249 | require 'proxy_fetcher' 250 | 251 | ProxyFetcher::Client.get 'https://example.com/resource' 252 | 253 | ProxyFetcher::Client.post 'https://example.com/resource', { param: 'value' } 254 | 255 | ProxyFetcher::Client.post 'https://example.com/resource', 'Any data' 256 | 257 | ProxyFetcher::Client.post 'https://example.com/resource', { param: 'value'}.to_json , headers: { 'Content-Type': 'application/json' } 258 | 259 | ProxyFetcher::Client.put 'https://example.com/resource', { param: 'value' } 260 | 261 | ProxyFetcher::Client.patch 'https://example.com/resource', { param: 'value' } 262 | 263 | ProxyFetcher::Client.delete 'https://example.com/resource' 264 | ``` 265 | 266 | By default, `ProxyFetcher::Client` makes 1000 attempts to send a HTTP request in case if proxy is out of order or the 267 | remote server returns an error. You can increase or decrease this number for your case or set it to `nil` if you want to 268 | make infinite number of requests (or before your Ruby process will die :skull:): 269 | 270 | ```ruby 271 | require 'proxy_fetcher' 272 | 273 | ProxyFetcher::Client.get 'https://example.com/resource', options: { max_retries: 10_000 } 274 | ``` 275 | 276 | You can also use your own proxy object when using ProxyFetcher client: 277 | 278 | ```ruby 279 | require 'proxy_fetcher' 280 | 281 | manager = ProxyFetcher::Manager.new # will immediately load proxy list from the server 282 | 283 | #random will return random proxy object from the list 284 | ProxyFetcher::Client.get 'https://example.com/resource', options: { proxy: manager.random } 285 | ``` 286 | 287 | Btw, if you need support of JavaScript or some other features, you need to implement your own client using, for example, 288 | `selenium-webdriver`. 289 | 290 | ## Configuration 291 | 292 | ProxyFetcher is very flexible gem. You can configure the most important parts of the library and use your own solutions. 
293 | 294 | Default configuration looks as follows: 295 | 296 | ```ruby 297 | ProxyFetcher.configure do |config| 298 | config.logger = Logger.new($stdout) 299 | config.user_agent = ProxyFetcher::Configuration::DEFAULT_USER_AGENT 300 | config.pool_size = 10 301 | config.client_timeout = 3 302 | config.provider_proxies_load_timeout = 30 303 | config.proxy_validation_timeout = 3 304 | config.http_client = ProxyFetcher::HTTPClient 305 | config.proxy_validator = ProxyFetcher::ProxyValidator 306 | config.providers = ProxyFetcher::Configuration.registered_providers 307 | config.adapter = ProxyFetcher::Configuration::DEFAULT_ADAPTER # :nokogiri by default 308 | end 309 | ``` 310 | 311 | You can change any of the options above. 312 | 313 | For example, you can set your custom User-Agent string: 314 | 315 | ```ruby 316 | ProxyFetcher.configure do |config| 317 | config.user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36' 318 | end 319 | ``` 320 | 321 | ProxyFetcher uses HTTP.rb gem for dealing with HTTP(S) requests. It is fast enough and has a great chainable API. 322 | If you wanna add, for example, your custom provider that was developed as a Single Page Application (SPA) with some JavaScript, 323 | then you will need something like [selenium-webdriver](https://github.com/SeleniumHQ/selenium/tree/master/rb) to properly 324 | load the content of the website. For those and other cases you can write your own class for fetching HTML content by 325 | the URL and setup it in the ProxyFetcher config: 326 | 327 | ```ruby 328 | class MyHTTPClient 329 | # [IMPORTANT]: below methods are required! 330 | def self.fetch(url) 331 | # ... some magic to return proper HTML ... 332 | end 333 | end 334 | 335 | ProxyFetcher.config.http_client = MyHTTPClient 336 | 337 | manager = ProxyFetcher::Manager.new 338 | manager.proxies 339 | 340 | #=> [#, ... 
] 342 | ``` 343 | 344 | You can take a look at the [lib/proxy_fetcher/utils/http_client.rb](lib/proxy_fetcher/utils/http_client.rb) for an example. 345 | 346 | Moreover, you can write your own proxy validator to check if proxy is valid or not: 347 | 348 | ```ruby 349 | class MyProxyValidator 350 | # [IMPORTANT]: below methods are required! 351 | def self.connectable?(proxy_addr, proxy_port) 352 | # ... some magic to check if proxy is valid ... 353 | end 354 | end 355 | 356 | ProxyFetcher.config.proxy_validator = MyProxyValidator 357 | 358 | manager = ProxyFetcher::Manager.new 359 | manager.proxies 360 | 361 | #=> [#, ... ] 363 | 364 | manager.validate! 365 | 366 | #=> [ ... ] 367 | ``` 368 | 369 | By default, ProxyFetcher gem uses [Nokogiri](https://github.com/sparklemotion/nokogiri) for parsing HTML. If you want 370 | to use [Oga](https://gitlab.com/yorickpeterse/oga) instead, then you need to add `gem 'oga'` to your Gemfile and configure 371 | ProxyFetcher as follows: 372 | 373 | ```ruby 374 | ProxyFetcher.config.adapter = :oga 375 | ``` 376 | 377 | Also you can write your own HTML parser implementation and use it, take a look at the [abstract class and implementations](lib/proxy_fetcher/document). 378 | Configure it as: 379 | 380 | ```ruby 381 | ProxyFetcher.config.adapter = MyHTMLParserClass 382 | ``` 383 | 384 | ### Proxy validation speed 385 | 386 | There are some tricks to increase proxy list validation performance. 387 | 388 | In a few words, ProxyFetcher gem uses threads to validate proxies for availability. Every proxy is checked in a 389 | separate thread. By default, ProxyFetcher uses a pool with a maximum of 10 threads. You can increase this number by 390 | setting max number of threads in the config: 391 | 392 | ```ruby 393 | ProxyFetcher.config.pool_size = 50 394 | ``` 395 | 396 | You can experiment with the threads pool size to find an optimal maximum thread count for your PC and OS.
397 | This will definitely give you some performance improvements. 398 | 399 | Moreover, the overall proxy validation speed depends on the `ProxyFetcher.config.proxy_validation_timeout` option, which is equal 400 | to `3` by default. It means that the gem will wait 3 seconds for the server's answer to check if a particular proxy is connectable. 401 | You can decrease this option to `1`, for example, and it will heavily increase proxy validation speed (**but remember** 402 | that some proxies could be connectable, but slow, so with this option you will also remove from the proxy list the proxies that 403 | work, but are very slow). 404 | 405 | ## Proxy object 406 | 407 | Every proxy is a `ProxyFetcher::Proxy` object that has the following readers (instance variables): 408 | 409 | * `addr` (IP address) 410 | * `port` 411 | * `type` (proxy type, can be HTTP, HTTPS, SOCKS4 and/or SOCKS5) 412 | * `country` (USA or Brazil for example) 413 | * `response_time` (5217 for example) 414 | * `anonymity` (`Low`, `Elite proxy` or `High +KA` for example) 415 | 416 | You can also call the following instance methods for every Proxy object: 417 | 418 | * `connectable?` (whether proxy server is available) 419 | * `http?` (whether proxy server has an HTTP protocol) 420 | * `https?` (whether proxy server has an HTTPS protocol) 421 | * `socks4?` 422 | * `socks5?` 423 | * `uri` (returns `URI::Generic` object) 424 | * `url` (returns a formatted URL like "_IP:PORT_" or "_http://IP:PORT_" if `scheme: true` provided) 425 | 426 | ## Providers 427 | 428 | Currently ProxyFetcher can deal with the following proxy providers (services): 429 | 430 | * Free Proxy List 431 | * Free SSL Proxies 432 | * Free Socks Proxies 433 | * Free US Proxies 434 | * HTTP Tunnel Genius 435 | * Proxy List 436 | * XRoxy 437 | * Proxypedia 438 | * Proxyscrape 439 | * MTPro.xyz 440 | 441 | If you want to use one of them, just set it up in the config: 442 | 443 | ```ruby 444 | ProxyFetcher.config.provider = :free_proxy_list 445 | 446 | manager = ProxyFetcher::Manager.new 447 | 
manager.proxies 448 | #=> ... 449 | ``` 450 | 451 | You can use multiple providers at the same time: 452 | 453 | ```ruby 454 | ProxyFetcher.config.providers = :free_proxy_list, :xroxy, :proxy_docker 455 | 456 | manager = ProxyFetcher::Manager.new 457 | manager.proxies 458 | #=> ... 459 | ``` 460 | 461 | If you want to use all the possible proxy providers, then you can configure ProxyFetcher as follows: 462 | 463 | ```ruby 464 | ProxyFetcher.config.providers = ProxyFetcher::Configuration.registered_providers 465 | 466 | manager = ProxyFetcher::Manager.new 467 | manager.proxies 468 | 469 | #=> [#, ... ] 471 | ``` 472 | 473 | Moreover, you can write your own provider! All you need to do is create a class that inherits from the 474 | `ProxyFetcher::Providers::Base` class and register your provider like this: 475 | 476 | ```ruby 477 | ProxyFetcher::Configuration.register_provider(:your_provider, YourProviderClass) 478 | ``` 479 | 480 | A provider class must implement the `self.load_proxy_list` and `#to_proxy(html_element)` methods that will load and parse 481 | the provider's HTML page with the proxy list. Take a look at the existing providers in the [lib/proxy_fetcher/providers](lib/proxy_fetcher/providers) directory. 482 | 483 | ## Contributing 484 | 485 | You are very welcome to help improve ProxyFetcher if you have suggestions for features that other people can use. 486 | 487 | To contribute: 488 | 489 | 1. Fork the project. 490 | 2. Create your feature branch (`git checkout -b my-new-feature`). 491 | 3. Implement your feature or bug fix. 492 | 4. Add documentation for your feature or bug fix. 493 | 5. Run `rake doc:yard`. If your changes are not 100% documented, go back to step 4. 494 | 6. Add tests for your feature or bug fix. 495 | 7. Run `rake spec` to make sure all tests pass. 496 | 8. Commit your changes (`git commit -am 'Add new feature'`). 497 | 9. Push to the branch (`git push origin my-new-feature`). 498 | 10. Create a new pull request. 499 | 500 | Thanks. 
# frozen_string_literal: true

# Rake tasks shipped with Bundler for working with the gem itself.
require "bundler/gem_tasks"
require "rspec/core/rake_task"

# `rake spec` runs the RSpec suite, excluding every example tagged `unreliable`.
RSpec::Core::RakeTask.new(:spec) { |spec_task| spec_task.rspec_opts = '--tag "~unreliable"' }

# A bare `rake` invocation runs the specs.
task default: :spec
#!/usr/bin/env ruby

# Command-line front end for ProxyFetcher: parses the options, applies them to
# the global configuration, loads (and optionally validates) proxies and dumps
# them to STDOUT either as plain lines or as a JSON document.

# Nokogiri is checked first so that a missing gem produces a readable hint
# instead of a raw LoadError.
begin
  require 'nokogiri'
rescue LoadError
  install_hint = "Seems like you haven't installed 'nokogiri' gem that is used by ProxyFetcher for HTML parsing.\n" \
                 "Install it with the command: `gem install nokogiri` or check out it's documentation:\n" \
                 'http://www.nokogiri.org/tutorials/installing_nokogiri.html'
  puts install_hint
  exit(1)
end

require 'optparse'
require 'proxy_fetcher'

# Defaults: no provider filters, validate proxies, plain-text output.
settings = { filters: {}, validate: true, json: false }

cli = OptionParser.new do |parser|
  parser.banner = 'Usage: proxy_fetcher [OPTIONS]'

  parser.on('-h', '--help', '# Show this help message and quit') do
    puts parser
    exit(0)
  end

  parser.on('-p', '--providers=NAME1,NAME2', Array, '# Use specific proxy providers') do |names|
    settings[:providers] = names
  end

  parser.on('-n', '--no-validate', '# Dump all the proxies without validation') do
    settings[:validate] = false
  end

  parser.on('-f', '--filters={}', String, '# Filters for proxy provider in JSON format') do |raw_filters|
    # JSON is loaded lazily: it is only needed when filters or JSON output are requested.
    require 'json'

    settings[:filters] = JSON.parse(raw_filters)
  end

  parser.on('-t', '--timeout=SECONDS', Integer, '# Connection timeout in seconds') do |seconds|
    settings[:timeout] = seconds
  end

  parser.on('-j', '--json', '# Dump proxies to the JSON format') do
    settings[:json] = true
  end

  parser.on('-v', '--version', '# Shows gem version') do
    puts ProxyFetcher.gem_version.to_s
    exit(0)
  end
end
cli.parse!

# Only override the configuration for options that were actually passed.
ProxyFetcher.config.providers = settings[:providers] if settings[:providers]
ProxyFetcher.config.timeout = settings[:timeout] if settings[:timeout]

fetcher = ProxyFetcher::Manager.new(filters: settings[:filters])
fetcher.validate! if settings[:validate]

report =
  if settings[:json]
    require 'json'

    JSON.generate(proxies: fetcher.raw_proxies)
  else
    fetcher.raw_proxies
  end

puts report
"#{File.dirname(__FILE__)}/proxy_fetcher/manager" 14 | require "#{File.dirname(__FILE__)}/proxy_fetcher/null_logger" 15 | 16 | require "#{File.dirname(__FILE__)}/proxy_fetcher/utils/http_client" 17 | require "#{File.dirname(__FILE__)}/proxy_fetcher/utils/proxy_validator" 18 | require "#{File.dirname(__FILE__)}/proxy_fetcher/utils/proxy_list_validator" 19 | require "#{File.dirname(__FILE__)}/proxy_fetcher/client/client" 20 | require "#{File.dirname(__FILE__)}/proxy_fetcher/client/request" 21 | require "#{File.dirname(__FILE__)}/proxy_fetcher/client/proxies_registry" 22 | 23 | require "#{File.dirname(__FILE__)}/proxy_fetcher/document" 24 | require "#{File.dirname(__FILE__)}/proxy_fetcher/document/adapters" 25 | require "#{File.dirname(__FILE__)}/proxy_fetcher/document/node" 26 | require "#{File.dirname(__FILE__)}/proxy_fetcher/document/adapters/abstract_adapter" 27 | require "#{File.dirname(__FILE__)}/proxy_fetcher/document/adapters/nokogiri_adapter" 28 | require "#{File.dirname(__FILE__)}/proxy_fetcher/document/adapters/oga_adapter" 29 | 30 | ## 31 | # Ruby / JRuby lib for managing proxies 32 | module ProxyFetcher 33 | # ProxyFetcher providers namespace 34 | module Providers 35 | require "#{File.dirname(__FILE__)}/proxy_fetcher/providers/base" 36 | require "#{File.dirname(__FILE__)}/proxy_fetcher/providers/free_proxy_list" 37 | require "#{File.dirname(__FILE__)}/proxy_fetcher/providers/free_proxy_list_socks" 38 | require "#{File.dirname(__FILE__)}/proxy_fetcher/providers/free_proxy_list_ssl" 39 | require "#{File.dirname(__FILE__)}/proxy_fetcher/providers/free_proxy_list_us" 40 | require "#{File.dirname(__FILE__)}/proxy_fetcher/providers/http_tunnel" 41 | require "#{File.dirname(__FILE__)}/proxy_fetcher/providers/mtpro" 42 | require "#{File.dirname(__FILE__)}/proxy_fetcher/providers/proxy_list" 43 | require "#{File.dirname(__FILE__)}/proxy_fetcher/providers/proxypedia" 44 | require "#{File.dirname(__FILE__)}/proxy_fetcher/providers/xroxy" 45 | end 46 | 47 | 
# frozen_string_literal: true

module ProxyFetcher
  # HTTP client facade of the gem. Every public method picks a proxy through
  # ProxiesRegistry, hands the request to Client::Request and transparently
  # retries with another proxy when the request fails.
  module Client
    class << self
      # Performs an HTTP GET request through a proxy.
      #
      # @param url [String] requested URL
      # @param headers [Hash] extra HTTP headers (merged over the default ones)
      # @param options [Hash] extra options (e.g. :proxy, :max_retries) passed to the request
      #
      # @return [String] whatever Request.execute returns (the response body)
      #
      def get(url, headers: {}, options: {})
        request_without_payload(:get, url, headers, options)
      end

      # Performs an HTTP HEAD request through a proxy.
      #
      # @param url [String] requested URL
      # @param headers [Hash] extra HTTP headers (merged over the default ones)
      # @param options [Hash] extra options (e.g. :proxy, :max_retries) passed to the request
      #
      # @return [String] whatever Request.execute returns (the response body)
      #
      def head(url, headers: {}, options: {})
        request_without_payload(:head, url, headers, options)
      end

      # Performs an HTTP POST request through a proxy.
      #
      # @param url [String] requested URL
      # @param payload [String, Hash] request payload
      # @param headers [Hash] extra HTTP headers (merged over the default ones)
      # @param options [Hash] extra options (e.g. :proxy, :max_retries) passed to the request
      #
      # @return [String] whatever Request.execute returns (the response body)
      #
      def post(url, payload, headers: {}, options: {})
        request_with_payload(:post, url, payload, headers, options)
      end

      # Performs an HTTP DELETE request through a proxy.
      #
      # @param url [String] requested URL
      # @param headers [Hash] extra HTTP headers (merged over the default ones)
      # @param options [Hash] extra options (e.g. :proxy, :max_retries) passed to the request
      #
      # @return [String] whatever Request.execute returns (the response body)
      #
      def delete(url, headers: {}, options: {})
        request_without_payload(:delete, url, headers, options)
      end

      # Performs an HTTP PUT request through a proxy.
      #
      # @param url [String] requested URL
      # @param payload [String, Hash] request payload
      # @param headers [Hash] extra HTTP headers (merged over the default ones)
      # @param options [Hash] extra options (e.g. :proxy, :max_retries) passed to the request
      #
      # @return [String] whatever Request.execute returns (the response body)
      #
      def put(url, payload, headers: {}, options: {})
        request_with_payload(:put, url, payload, headers, options)
      end

      # Performs an HTTP PATCH request through a proxy.
      #
      # @param url [String] requested URL
      # @param payload [String, Hash] request payload
      # @param headers [Hash] extra HTTP headers (merged over the default ones)
      # @param options [Hash] extra options (e.g. :proxy, :max_retries) passed to the request
      #
      # @return [String] whatever Request.execute returns (the response body)
      #
      def patch(url, payload, headers: {}, options: {})
        request_with_payload(:patch, url, payload, headers, options)
      end

      private

      # Runs a request that carries a body (POST / PUT / PATCH).
      #
      def request_with_payload(method, url, payload, headers, options)
        execute_via_proxy(method, url, headers, options, payload: payload)
      end

      # Runs a request that has no body (GET / HEAD / DELETE).
      #
      def request_without_payload(method, url, headers, options)
        execute_via_proxy(method, url, headers, options)
      end

      # Shared request pipeline: resolves a proxy (with retries) and executes
      # the request with the user options extended by +extra+, the proxy and
      # the final headers. A proxy given in +options+ wins over the found one.
      #
      def execute_via_proxy(method, url, headers, options, extra = {})
        retries_limit = options.fetch(:max_retries, 1000)

        with_proxy_for(url, retries_limit) do |candidate|
          request_options = options.merge(extra)
          request_options[:proxy] = options.fetch(:proxy, candidate)
          request_options[:headers] = default_headers.merge(headers)

          Request.execute(url: url, method: method, **request_options)
        end
      end

      # Headers sent with every request; currently only the User-Agent taken
      # from the gem configuration.
      #
      # @return [Hash] default headers
      #
      def default_headers
        { "User-Agent" => ProxyFetcher.config.user_agent }
      end

      # Asks ProxiesRegistry for a proxy suitable for the URL and yields it.
      # When the block (or the lookup) fails with a non-ProxyFetcher error the
      # proxy is invalidated and the whole attempt is repeated, up to
      # +max_retries+ times (no limit when +max_retries+ is nil).
      #
      # @param url [String] request URL
      # @param max_retries [Integer, nil] maximum number of retries
      #
      # @raise [ProxyFetcher::Error] gem errors are passed through untouched
      # @raise [ProxyFetcher::Exceptions::MaximumRetriesReached] retries exhausted
      #
      def with_proxy_for(url, max_retries = 1000)
        # `retry` restarts the method body, so the counter must survive re-entry.
        attempts ||= 0

        proxy = ProxiesRegistry.find_proxy_for(url)
        yield(proxy)
      rescue ProxyFetcher::Error
        raise
      rescue StandardError
        exhausted = max_retries && attempts >= max_retries

        unless exhausted
          ProxiesRegistry.invalidate_proxy!(proxy)
          attempts += 1

          retry
        end

        ProxyFetcher.logger.warn("reached maximum amount of retries (#{max_retries})")
        raise ProxyFetcher::Exceptions::MaximumRetriesReached
      end
    end
  end
end
# Drops the proxy from the current manager's list and reloads the list
# as soon as it becomes empty.
#
# @param proxy [ProxyFetcher::Proxy]
#   proxy object to remove
#
def invalidate_proxy!(proxy)
  current = manager
  current.proxies.delete(proxy)
  current.refresh_list! if current.proxies.empty?
end

# Picks a proxy for the URL: the first proxy answering +ssl?+ for HTTPS
# URLs, otherwise whatever the manager's +get+ returns. When nothing is
# found the proxy list is refreshed and the lookup is repeated.
#
# @param url [String]
#   URL to process with proxy
#
# @return [ProxyFetcher::Proxy]
#   proxy object
#
def find_proxy_for(url)
  candidate =
    case URI.parse(url)
    when URI::HTTPS then manager.proxies.detect(&:ssl?)
    else manager.get
    end

  if candidate.nil?
    manager.refresh_list!
    return find_proxy_for(url)
  end

  candidate
end

# Returns the ProxyFetcher::Manager stored for the current Thread,
# creating and storing a new one on first use.
#
# @return [ProxyFetcher::Manager]
#   manager instance
#
def manager
  Thread.current[:proxy_fetcher_manager] = ProxyFetcher::Manager.new if Thread.current[:proxy_fetcher_manager].nil?
  Thread.current[:proxy_fetcher_manager]
end
# ProxyFetcher::Client HTTP request abstraction: a single request sent
# through one proxy with the HTTP.rb client.
class Request
  # @!attribute [r] method
  #   @return [String] HTTP request method (stored as a downcased string)
  attr_reader :method

  # @!attribute [r] url
  #   @return [String] Request URL
  attr_reader :url

  # @!attribute [r] headers
  #   @return [Hash] HTTP headers
  attr_reader :headers

  # @!attribute [r] timeout
  #   @return [Integer] Request timeout (used for both connect and read)
  attr_reader :timeout

  # @!attribute [r] payload
  #   @return [String, Hash] Request payload
  attr_reader :payload

  # @!attribute [r] proxy
  #   @return [Proxy] Proxy to process the request
  attr_reader :proxy

  # @!attribute [r] max_redirects
  #   @return [Integer] Maximum number of redirects (hops) to follow
  attr_reader :max_redirects

  # @!attribute [r] ssl_options
  #   @return [OpenSSL::SSL::SSLContext, Object]
  #     value passed to the HTTP client as +ssl_context+
  attr_reader :ssl_options

  # Initializes a new HTTP request and processes it
  #
  # @return [String]
  #   response body (requested resource content)
  #
  def self.execute(**args)
    new(**args).execute
  end

  # Initialize new HTTP request.
  #
  # Required keys are +:url+, +:method+ and +:proxy+; optional ones are
  # +:headers+, +:payload+, +:timeout+, +:ssl_options+ and +:max_redirects+.
  # Unknown keys are ignored.
  #
  # @raise [KeyError] when a required key is missing
  #
  # @return [Request]
  #
  def initialize(**args)
    @url = args.fetch(:url)
    @method = args.fetch(:method).to_s.downcase
    # Copy the headers so the caller's hash is never shared with the request.
    @headers = (args[:headers] || {}).dup
    @payload = args[:payload]
    # Block form: defaults are only computed when the caller did not pass a value.
    @timeout = args.fetch(:timeout) { ProxyFetcher.config.client_timeout }
    @ssl_options = args.fetch(:ssl_options) { default_ssl_options }

    @proxy = args.fetch(:proxy)
    @max_redirects = args.fetch(:max_redirects, 10)

    @http = build_http_client
  end

  # Executes HTTP request with defined options.
  #
  # @raise [ProxyFetcher::Exceptions::MaximumRedirectsReached]
  #   the redirects limit was exceeded
  #
  # @return [String]
  #   response body (requested resource content)
  #
  def execute
    response = send_request
    response.body.to_s
  rescue HTTP::Redirector::TooManyRedirectsError
    raise ProxyFetcher::Exceptions::MaximumRedirectsReached
  end

  private

  # Builds HTTP client routed via the proxy, with the request headers,
  # timeouts and redirects limit applied.
  #
  # @return [HTTP::Client]
  #   HTTP client
  #
  def build_http_client
    HTTP.via(proxy.addr, proxy.port.to_i)
        .headers(headers)
        .timeout(connect: timeout, read: timeout)
        .follow(max_hops: max_redirects)
  end

  # Default SSL options that will be used for connecting to resources
  # that use a secure connection. By default ProxyFetcher doesn't verify
  # SSL certs.
  #
  # @return [OpenSSL::SSL::SSLContext] SSL context
  #
  def default_ssl_options
    ssl_ctx = OpenSSL::SSL::SSLContext.new
    ssl_ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
    ssl_ctx
  end

  # Sends HTTP request to the URL. A String payload is sent as the raw
  # body, any other payload as form data; no payload key is sent at all
  # when the payload is absent.
  #
  # @return [HTTP::Response] request response
  #
  def send_request
    request_options = {}
    request_options[payload.is_a?(String) ? :body : :form] = payload if payload
    request_options[:ssl_context] = ssl_options

    @http.public_send(method, url, **request_options)
  end
end
# ProxyFetcher configuration. Stores all the options for dealing
# with HTTP requests, adapters, custom classes.
#
class Configuration
  # @!attribute client_timeout
  #   @return [Integer]
  #     HTTP request timeout (connect / open) for [ProxyFetcher::Client]
  attr_accessor :client_timeout

  # @!attribute provider_proxies_load_timeout
  #   @return [Integer]
  #     HTTP request timeout (connect / open) for loading
  #     of proxies list by provider
  attr_accessor :provider_proxies_load_timeout

  # @!attribute proxy_validation_timeout
  #   @return [Integer]
  #     HTTP request timeout (connect / open) for proxy
  #     validation with [ProxyFetcher::ProxyValidator]
  attr_accessor :proxy_validation_timeout

  # Kept for backward compatibility: +timeout+ is the old name of +client_timeout+.
  alias timeout client_timeout
  alias timeout= client_timeout=

  # @!attribute pool_size
  #   @return [Integer] proxy validator pool size (max number of threads)
  attr_accessor :pool_size

  # @!attribute user_agent
  #   @return [String] User-Agent string
  attr_accessor :user_agent

  # @!attribute logger
  #   @return [Logger] Logger object
  attr_accessor :logger

  # @!attribute adapter
  #   @return [#to_s] HTML parser adapter (writer defined below)
  attr_reader :adapter

  # @!attribute http_client
  #   @return [Object] HTTP client class (validating writer defined below)
  attr_reader :http_client

  # @!attribute proxy_validator
  #   @return [Object] proxy validator class (validating writer defined below)
  attr_reader :proxy_validator

  # @!attribute providers
  #   @return [Array] proxy providers list to be used (writer defined below)
  attr_reader :providers

  # User-Agent string that will be used by the ProxyFetcher HTTP client (to
  # send requests via proxy) and to fetch proxy lists from the sources.
  #
  # Default is Google Chrome 60, but can be changed in ProxyFetcher.config.
  #
  DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 " \
                       "(KHTML, like Gecko) Chrome/60.0.3112 Safari/537.36"

  # HTML parser adapter name.
  #
  # Default is Nokogiri, but can be changed in ProxyFetcher.config.
  #
  DEFAULT_ADAPTER = :nokogiri

  # Class-level lock guarding the lazy adapter class lookup in #adapter_class.
  @__adapter_lock__ = Mutex.new

  class << self
    # Registry for handling proxy providers.
    #
    # @return [ProxyFetcher::ProvidersRegistry]
    #   providers registry
    #
    def providers_registry
      @providers_registry ||= ProvidersRegistry.new
    end

    # Register new proxy provider. Requires provider name and class
    # that will process proxy list.
    #
    # @param name [String, Symbol]
    #   name of the provider
    #
    # @param klass [Class]
    #   Class that will fetch and process proxy list
    #
    def register_provider(name, klass)
      providers_registry.register(name, klass)
    end

    # Returns registered providers names.
    #
    # @return [Array]
    #   registered providers names
    #
    def registered_providers
      providers_registry.providers.keys
    end
  end

  # Initialize ProxyFetcher configuration with default options.
  #
  # @return [ProxyFetcher::Configuration]
  #   ProxyFetcher gem configuration object
  #
  def initialize
    reset!
  end

  # Sets default configuration options.
  # The adapter is not touched here.
  def reset!
    @logger = Logger.new($stdout)
    @user_agent = DEFAULT_USER_AGENT
    @pool_size = 10
    @client_timeout = 3
    @provider_proxies_load_timeout = 30
    @proxy_validation_timeout = 3

    @http_client = HTTPClient
    @proxy_validator = ProxyValidator

    self.providers = self.class.registered_providers
  end

  # Sets the HTML parser adapter and drops the memoized adapter class,
  # so the next #adapter_class call resolves the new adapter.
  #
  # @param value [String, Symbol, Class]
  #   adapter name or class
  #
  def adapter=(value)
    remove_instance_variable(:@adapter_class) if defined?(@adapter_class)
    @adapter = value
  end

  # Resolves the configured adapter through
  # ProxyFetcher::Document::Adapters.lookup, calls +setup!+ on it once and
  # memoizes the result. The lookup runs under the class-level lock.
  #
  # @return [Class] adapter class
  #
  def adapter_class
    self.class.instance_variable_get(:@__adapter_lock__).synchronize do
      return @adapter_class if defined?(@adapter_class)

      @adapter_class = ProxyFetcher::Document::Adapters.lookup(adapter)
      @adapter_class.setup!
      @adapter_class
    end
  end

  # Setups collection of providers that will be used to fetch proxies.
  #
  # @param value [String, Symbol, Array]
  #   provider names
  #
  def providers=(value)
    @providers = Array(value)
  end

  alias provider providers
  alias provider= providers=

  # Setups HTTP client class that will be used to fetch proxy lists.
  # Validates class for the required methods to be defined.
  #
  # @param klass [Class]
  #   HTTP client class
  #
  # @raise [ProxyFetcher::Exceptions::WrongCustomClass]
  #   class doesn't respond to +fetch+
  #
  def http_client=(klass)
    @http_client = setup_custom_class(klass, required_methods: :fetch)
  end

  # Setups class that will be used to validate proxy lists.
  # Validates class for the required methods to be defined.
  #
  # @param klass [Class]
  #   Proxy validator class
  #
  # @raise [ProxyFetcher::Exceptions::WrongCustomClass]
  #   class doesn't respond to +connectable?+
  #
  def proxy_validator=(klass)
    @proxy_validator = setup_custom_class(klass, required_methods: :connectable?)
  end

  private

  # Checks that the custom class responds to every required class method.
  #
  # Each name is checked on its own: splatting the list into a single
  # +respond_to?+ call would treat the second name as the +include_all+
  # flag and fail with no names at all.
  #
  # @param klass [Class] custom class to verify
  # @param required_methods [Symbol, Array<Symbol>] method name(s) to check
  #
  # @raise [ProxyFetcher::Exceptions::WrongCustomClass]
  #   at least one required method is missing
  #
  # @return [Class] the verified class
  #
  def setup_custom_class(klass, required_methods: [])
    missing = Array(required_methods).reject { |method_name| klass.respond_to?(method_name) }
    raise ProxyFetcher::Exceptions::WrongCustomClass.new(klass, required_methods) unless missing.empty?

    klass
  end
end
# Looks up the class registered for the given provider name.
#
# @param provider_name [String, Symbol]
#   provider name (converted to a Symbol before the lookup)
#
# @return [Class]
#   provider class
#
# @raise [ProxyFetcher::Exceptions::UnknownProvider]
#   no provider is registered under that name
#
def class_for(provider_name)
  key = provider_name.to_sym

  begin
    providers.fetch(key)
  rescue KeyError
    # Translate the low-level lookup failure into the gem's own error.
    raise ProxyFetcher::Exceptions::UnknownProvider, key
  end
end
24 | # 25 | # @param name_or_class [String, Symbol, Class] 26 | # Adapter name or class 27 | # 28 | def lookup(name_or_class) 29 | raise Exceptions::BlankAdapter if name_or_class.nil? || name_or_class.to_s.empty? 30 | 31 | case name_or_class 32 | when Symbol, String 33 | adapter_name = "#{name_or_class.to_s.capitalize}#{ADAPTER}" 34 | ProxyFetcher::Document.const_get(adapter_name) 35 | else 36 | name_or_class 37 | end 38 | rescue NameError 39 | raise Exceptions::UnknownAdapter, name_or_class 40 | end 41 | end 42 | end 43 | end 44 | end 45 | -------------------------------------------------------------------------------- /lib/proxy_fetcher/document/adapters/abstract_adapter.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ProxyFetcher 4 | class Document 5 | # Abstract HTML parser adapter class. 6 | # Handles document manipulations. 7 | class AbstractAdapter 8 | # @!attribute [r] document 9 | # @return [Object] document object wrapped by the adapter 10 | attr_reader :document 11 | 12 | # Initialize adapter 13 | # 14 | # @return [AbstractAdapter] 15 | # 16 | def initialize(document) 17 | @document = document 18 | end 19 | 20 | # You can override this method in your own adapter class 21 | # 22 | # @param selector [String] 23 | # XPath selector 24 | # 25 | def xpath(selector) 26 | document.xpath(selector) 27 | end 28 | 29 | # Returns Node class that will handle HTML 30 | # nodes for particular adapter. 31 | # 32 | # @return [ProxyFetcher::Document::Node] 33 | # node 34 | # 35 | def proxy_node 36 | self.class.const_get("Node") 37 | end 38 | 39 | # Installs adapter requirements.
40 | # 41 | # @raise [Exceptions::AdapterSetupError] 42 | # adapter can't be installed due to some error 43 | # 44 | def self.setup!(*args) 45 | install_requirements!(*args) 46 | self 47 | rescue LoadError, StandardError => e 48 | raise Exceptions::AdapterSetupError.new(name, e.message) 49 | end 50 | end 51 | end 52 | end 53 | -------------------------------------------------------------------------------- /lib/proxy_fetcher/document/adapters/nokogiri_adapter.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ProxyFetcher 4 | class Document 5 | # HTML parser adapter that uses Nokogiri as a backend. 6 | class NokogiriAdapter < AbstractAdapter 7 | # Requires Nokogiri gem to the application. 8 | def self.install_requirements! 9 | require "nokogiri" 10 | end 11 | 12 | # Parses raw HTML content with specific gem. 13 | # 14 | # @param data [String] 15 | # HTML content 16 | # 17 | # @return [ProxyFetcher::Document::NokogiriAdapter] 18 | # Object with parsed document 19 | # 20 | def self.parse(data) 21 | new(::Nokogiri::HTML(data)) 22 | end 23 | 24 | # Nokogiri DOM node 25 | class Node < ProxyFetcher::Document::Node 26 | # Returns HTML node attribute value. 27 | # 28 | # @return [String] attribute value 29 | # 30 | def attr(*args) 31 | clear(node.attr(*args)) 32 | end 33 | 34 | # Returns HTML node inner text value clean from 35 | # whitespaces, tabs, etc. 36 | # 37 | # @return [String] node inner text 38 | # 39 | def content 40 | clear(node.content) 41 | end 42 | 43 | # Returns node inner HTML.
44 | # 45 | # @return [String] inner HTML 46 | # 47 | def html 48 | node.inner_html 49 | end 50 | end 51 | end 52 | end 53 | end 54 | -------------------------------------------------------------------------------- /lib/proxy_fetcher/document/adapters/oga_adapter.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ProxyFetcher 4 | class Document 5 | # HTML parser adapter that uses Oga as a backend. 6 | class OgaAdapter < AbstractAdapter 7 | # Requires Oga gem to the application. 8 | def self.install_requirements! 9 | require "oga" 10 | end 11 | 12 | # Parses raw HTML content with specific gem. 13 | # 14 | # @param data [String] 15 | # HTML content 16 | # 17 | # @return [ProxyFetcher::Document::OgaAdapter] 18 | # Object with parsed document 19 | # 20 | def self.parse(data) 21 | new(::Oga.parse_html(data)) 22 | end 23 | 24 | # Oga DOM node 25 | class Node < ProxyFetcher::Document::Node 26 | # Returns HTML node attribute value. 27 | # 28 | # @return [String] attribute value 29 | # 30 | def attr(*args) 31 | clear(node.attribute(*args).value) 32 | end 33 | 34 | # Returns HTML node inner text value clean from 35 | # whitespaces, tabs, etc. 36 | # 37 | # @return [String] node inner text 38 | # 39 | def content 40 | clear(node.text) 41 | end 42 | 43 | # Returns node inner HTML. 44 | # 45 | # @return [String] inner HTML 46 | # 47 | def html 48 | node.to_xml 49 | end 50 | end 51 | end 52 | end 53 | end 54 | -------------------------------------------------------------------------------- /lib/proxy_fetcher/document/node.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ProxyFetcher 4 | class Document 5 | # Abstract class for storing HTML elements that was parsed by 6 | # one of the ProxyFetcher::Document adapters class. 
7 | class Node 8 | # @!attribute [r] node 9 | # @return [Object] original DOM node, parsed by adapter backend 10 | attr_reader :node 11 | 12 | # Initialize new HTML node 13 | # 14 | # @return [Node] 15 | # 16 | def initialize(node) 17 | @node = node 18 | end 19 | 20 | # Searches for node in children using some selector (CSS or XPath). 21 | # 22 | # @param selector [String] selector (CSS or XPath) 23 | # 24 | # @return [Node] child node 25 | # 26 | def find(selector, method = :at_xpath) 27 | self.class.new(node.public_send(method, selector)) 28 | end 29 | 30 | # Searches exact HTML element by XPath. Returns only one element. 31 | # 32 | # @return [ProxyFetcher::Document::Node] 33 | # node 34 | # 35 | def at_xpath(*args) 36 | self.class.new(node.at_xpath(*args)) 37 | end 38 | 39 | # Searches exact HTML element by CSS. Returns only one element. 40 | # 41 | # @return [ProxyFetcher::Document::Node] 42 | # node 43 | # 44 | def at_css(*args) 45 | self.class.new(node.at_css(*args)) 46 | end 47 | 48 | # Returns clean content (text) for the specific element. 49 | # 50 | # @return [String] 51 | # HTML node content 52 | # 53 | def content_at(*args) 54 | clear(find(*args).content) 55 | end 56 | 57 | # Returns HTML node content. 58 | # 59 | # Abstract method, must be implemented for specific adapter class. 60 | # 61 | def content 62 | raise "`#{__method__}` must be implemented for specific adapter class!" 63 | end 64 | 65 | # Returns HTML node inner HTML. 66 | # 67 | # Abstract method, must be implemented for specific adapter class. 68 | # 69 | def html 70 | raise "`#{__method__}` must be implemented for specific adapter class!" 71 | end 72 | 73 | protected 74 | 75 | # Removes whitespaces, tabulation and other "garbage" for the text. 76 | # 77 | # @param text [String] 78 | # text to clear 79 | # 80 | # @return [String] 81 | # clean text 82 | # 83 | def clear(text) 84 | return "" if text.nil? || text.empty? 
85 | 86 | text.strip.gsub(/\t/i, "") 87 | end 88 | end 89 | end 90 | end 91 | -------------------------------------------------------------------------------- /lib/proxy_fetcher/exceptions.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ProxyFetcher 4 | # Base exception class for all the ProxyFetcher exceptions. 5 | Error = Class.new(StandardError) 6 | 7 | # ProxyFetcher exceptions namespace 8 | module Exceptions 9 | # Exception for wrong custom classes (such as ProxyValidator or HTTP Client). 10 | class WrongCustomClass < Error 11 | # Initialize new exception 12 | # 13 | # @return [WrongCustomClass] 14 | # 15 | def initialize(klass, methods) 16 | required_methods = Array(methods).join(", ") 17 | super("#{klass} must respond to [#{required_methods}] class methods!") 18 | end 19 | end 20 | 21 | # Exception for wrong provider name, that raises when configured provider 22 | # that is not registered via register_provider interface. 23 | class UnknownProvider < Error 24 | # Initialize new exception 25 | # 26 | # @param provider_name [String] provider name 27 | # 28 | # @return [UnknownProvider] 29 | # 30 | def initialize(provider_name) 31 | super("unregistered proxy provider `#{provider_name}`") 32 | end 33 | end 34 | 35 | # Exception for cases when user tries to register already existing provider. 36 | class RegisteredProvider < Error 37 | # Initialize new exception 38 | # 39 | # @param name [String, Symbol] provider name 40 | # 41 | # @return [RegisteredProvider] 42 | # 43 | def initialize(name) 44 | super("`#{name}` provider already registered!") 45 | end 46 | end 47 | 48 | # Exception for cases when HTTP client reached maximum count of redirects 49 | # trying to process HTTP request. 
50 | class MaximumRedirectsReached < Error 51 | # Initialize new exception 52 | # 53 | # @return [MaximumRedirectsReached] 54 | # 55 | def initialize(*) 56 | super("maximum redirects reached") 57 | end 58 | end 59 | 60 | # Exception for cases when HTTP client reached maximum count of retries 61 | # trying to process HTTP request. Can occur when request failed by timeout 62 | # multiple times. 63 | class MaximumRetriesReached < Error 64 | # Initialize new exception 65 | # 66 | # @return [MaximumRetriesReached] 67 | # 68 | def initialize(*) 69 | super("reached the maximum number of retries") 70 | end 71 | end 72 | 73 | # Exception for cases when user tries to set wrong HTML parser adapter 74 | # in the configuration. 75 | class UnknownAdapter < Error 76 | # Initialize new exception 77 | # 78 | # @param name [String] configured adapter name 79 | # 80 | # @return [UnknownAdapter] 81 | # 82 | def initialize(name) 83 | super("unknown adapter '#{name}'") 84 | end 85 | end 86 | 87 | # Exception for cases when user tries to set nil HTML parser adapter 88 | # in the configuration (or just forget to change it). 89 | class BlankAdapter < Error 90 | # Initialize new exception 91 | # 92 | # @return [BlankAdapter] 93 | # 94 | def initialize(*) 95 | super(<<-MSG.strip.squeeze 96 | you need to specify adapter for HTML parsing: ProxyFetcher.config.adapter = :nokogiri. 97 | You can use one of the predefined adapters (:nokogiri or :oga) or your own implementation. 98 | MSG 99 | ) 100 | end 101 | end 102 | 103 | # Exception for cases when HTML parser adapter can't be installed. 104 | # It will print the reason (backtrace) of the exception that caused an error. 
105 | class AdapterSetupError < Error 106 | # Initialize new exception 107 | # 108 | # @param adapter_name [String] configured adapter name 109 | # @param error [String] setup error message 110 | # 111 | # @return [AdapterSetupError] 112 | # 113 | def initialize(adapter_name, error) 114 | adapter = demodulize(adapter_name.gsub("Adapter", "")) 115 | 116 | super("can't setup '#{adapter}' adapter during the following error:\n\t#{error}'") 117 | end 118 | 119 | private 120 | 121 | # Returns just class name removing its namespace. 122 | # 123 | # @param path [String] 124 | # full class name 125 | # 126 | # @return [String] demodulized class name 127 | # 128 | def demodulize(path) 129 | path = path.to_s 130 | index = path.rindex("::") 131 | 132 | index ? path[(index + 2)..-1] : path 133 | end 134 | end 135 | end 136 | end 137 | -------------------------------------------------------------------------------- /lib/proxy_fetcher/manager.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ProxyFetcher 4 | # ProxyFetcher Manager class for interacting with proxy lists from various providers.
5 | class Manager 6 | REFRESHER_LOCK = Mutex.new 7 | 8 | class << self 9 | def from_files(files, **options) 10 | new(**options.merge(files: Array(files))) 11 | end 12 | 13 | alias from_file from_files 14 | end 15 | 16 | # @!attribute [r] proxies 17 | # @return [Array] An array of proxies 18 | attr_reader :proxies 19 | 20 | # Initialize ProxyFetcher Manager instance for managing proxies 21 | # 22 | # refresh: true - load proxy list from the remote server on initialization 23 | # refresh: false - just initialize the class, proxy list will be empty ([]) 24 | # 25 | # @return [Manager] 26 | # 27 | def initialize(**options) 28 | if options.fetch(:refresh, true) 29 | refresh_list!(options.fetch(:filters, {})) 30 | else 31 | @proxies = [] 32 | end 33 | 34 | files = Array(options.fetch(:file, options.fetch(:files, []))) 35 | load_proxies_from_files!(files) if files&.any? 36 | 37 | cleanup! if options.fetch(:validate, false) 38 | end 39 | 40 | # Update current proxy list using configured providers. 41 | # 42 | # @param filters [Hash] providers filters 43 | # 44 | def refresh_list!(filters = nil) 45 | @proxies = [] 46 | threads = [] 47 | 48 | ProxyFetcher.config.providers.each do |provider_name| 49 | threads << Thread.new do 50 | Thread.current.report_on_exception = false 51 | 52 | provider = ProxyFetcher::Configuration.providers_registry.class_for(provider_name) 53 | provider_filters = filters && filters.fetch(provider_name.to_sym, filters) 54 | provider_proxies = provider.fetch_proxies!(provider_filters) 55 | 56 | REFRESHER_LOCK.synchronize do 57 | @proxies.concat(provider_proxies) 58 | end 59 | end 60 | end 61 | 62 | threads.each(&:join) 63 | 64 | @proxies 65 | end 66 | 67 | alias fetch! refresh_list! 68 | 69 | # Pop just first proxy (and back it to the end of the proxy list). 70 | # 71 | # @return [ProxyFetcher::Proxy, NilClass] 72 | # proxy object from the list 73 | # 74 | def get 75 | return if @proxies.empty? 
76 | 77 | first_proxy = @proxies.shift 78 | @proxies << first_proxy 79 | 80 | first_proxy 81 | end 82 | 83 | alias pop get 84 | 85 | # Pop first valid proxy (and back it to the end of the proxy list) 86 | # Invalid proxies will be removed from the list 87 | # 88 | # @return [ProxyFetcher::Proxy, NilClass] 89 | # proxy object from the list 90 | # 91 | def get! 92 | index = proxies.find_index(&:connectable?) 93 | return if index.nil? 94 | 95 | proxy = proxies.delete_at(index) 96 | tail = proxies[index..-1] 97 | 98 | @proxies = tail << proxy 99 | 100 | proxy 101 | end 102 | 103 | alias pop! get! 104 | 105 | # Loads proxies from files. 106 | # 107 | # @param proxy_files [String, Array] 108 | # file path or list of file paths to load 109 | # 110 | def load_proxies_from_files!(proxy_files) 111 | proxy_files = Array(proxy_files) 112 | return if proxy_files.empty? 113 | 114 | proxy_files.each do |proxy_file| 115 | File.foreach(proxy_file, chomp: true) do |proxy_string| 116 | addr, port = proxy_string.split(":", 2) 117 | port = Integer(port) if port 118 | @proxies << Proxy.new(addr: addr, port: port) 119 | end 120 | end 121 | 122 | @proxies.uniq! 123 | end 124 | 125 | # Clean current proxy list from dead proxies (that doesn't respond by timeout) 126 | # 127 | # @return [Array] 128 | # list of valid proxies 129 | def cleanup! 130 | valid_proxies = ProxyListValidator.new(@proxies).validate 131 | @proxies &= valid_proxies 132 | end 133 | 134 | alias validate! cleanup!
135 | 136 | # Returns random proxy 137 | # 138 | # @return [Proxy] 139 | # random proxy from the loaded list 140 | # 141 | def random_proxy 142 | proxies.sample 143 | end 144 | 145 | alias random random_proxy 146 | 147 | # Returns array of proxy URLs (just schema + host + port) 148 | # 149 | # @return [Array] 150 | # collection of proxies 151 | # 152 | def raw_proxies 153 | proxies.map(&:url) 154 | end 155 | 156 | # @private No need to put all the attr_readers to the output 157 | def inspect 158 | to_s 159 | end 160 | end 161 | end 162 | -------------------------------------------------------------------------------- /lib/proxy_fetcher/null_logger.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ProxyFetcher 4 | class NullLogger 5 | # @return [nil] 6 | def unknown(*) 7 | nil 8 | end 9 | 10 | # @return [nil] 11 | def fatal(*) 12 | nil 13 | end 14 | 15 | # @return [nil] 16 | def error(*) 17 | nil 18 | end 19 | 20 | # @return [nil] 21 | def warn(*) 22 | nil 23 | end 24 | 25 | # @return [nil] 26 | def info(*) 27 | nil 28 | end 29 | 30 | # @return [nil] 31 | def debug(*) 32 | nil 33 | end 34 | end 35 | end 36 | -------------------------------------------------------------------------------- /lib/proxy_fetcher/providers/base.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ProxyFetcher 4 | module Providers 5 | # Base class for all the ProxyFetcher providers. 6 | class Base 7 | # Loads proxy provider page content, extract proxy list from it 8 | # and convert every entry to proxy object. 9 | def fetch_proxies(filters = {}) 10 | raw_proxies = load_proxy_list(filters) 11 | proxies = raw_proxies.map { |html_node| build_proxy(html_node) }.compact 12 | proxies.reject { |proxy| proxy.addr.nil? } 13 | end 14 | 15 | # For retro-compatibility 16 | alias fetch_proxies! 
fetch_proxies 17 | 18 | def provider_url 19 | raise NotImplementedError, "#{__method__} must be implemented in a descendant class!" 20 | end 21 | 22 | def provider_method 23 | :get 24 | end 25 | 26 | def provider_params 27 | {} 28 | end 29 | 30 | # @return [Hash] 31 | # Provider headers required to fetch the proxy list 32 | # 33 | def provider_headers 34 | {} 35 | end 36 | 37 | def xpath 38 | raise NotImplementedError, "#{__method__} must be implemented in a descendant class!" 39 | end 40 | 41 | # Just syntactic sugar to make it easier to call #fetch_proxies! method. 42 | def self.fetch_proxies!(*args) 43 | new.fetch_proxies!(*args) 44 | end 45 | 46 | protected 47 | 48 | # Loads raw provider HTML with proxies. 49 | # 50 | # @param url [String] 51 | # Provider URL 52 | # 53 | # @param filters [#to_h] 54 | # Provider filters (Hash-like object) 55 | # 56 | # @return [String] 57 | # HTML body from the response 58 | # 59 | def load_html(url, filters = {}) 60 | unless filters.respond_to?(:to_h) 61 | raise ArgumentError, "filters must be a Hash or respond to #to_h" 62 | end 63 | 64 | if filters&.any? 65 | # TODO: query for post request? 66 | uri = URI.parse(url) 67 | uri.query = URI.encode_www_form(provider_params.merge(filters.to_h)) 68 | url = uri.to_s 69 | end 70 | 71 | ProxyFetcher.config.http_client.fetch( 72 | url, 73 | method: provider_method, 74 | headers: provider_headers, 75 | params: provider_params 76 | ) 77 | end 78 | 79 | # Loads provider HTML and parses it with internal document object.
80 | # 81 | # @param url [String] 82 | # URL to fetch 83 | # 84 | # @param filters [Hash] 85 | # filters for proxy provider 86 | # 87 | # @return [ProxyFetcher::Document] 88 | # ProxyFetcher document object 89 | # 90 | def load_document(url, filters = {}) 91 | html = load_html(url, filters) 92 | ProxyFetcher::Document.parse(html) 93 | end 94 | 95 | # Fetches HTML content by sending HTTP request to the provider URL and 96 | # parses the document (built as abstract ProxyFetcher::Document) 97 | # to return all the proxy entries (HTML nodes). 98 | # 99 | # @return [Array] 100 | # Collection of extracted HTML nodes with full proxy info 101 | # 102 | def load_proxy_list(filters = {}) 103 | doc = load_document(provider_url, filters) 104 | doc.xpath(xpath) 105 | end 106 | 107 | def build_proxy(*args) 108 | to_proxy(*args) 109 | rescue StandardError => e 110 | ProxyFetcher.logger.warn( 111 | "Failed to build Proxy for #{self.class.name.split("::").last} " \ 112 | "due to error: #{e.message}" 113 | ) 114 | 115 | nil 116 | end 117 | 118 | # Convert HTML element with proxy info to ProxyFetcher::Proxy instance. 119 | # 120 | # Abstract method. Must be implemented in a descendant class 121 | # 122 | # @return [Proxy] 123 | # new proxy object from the HTML node 124 | # 125 | def to_proxy(*) 126 | raise NotImplementedError, "#{__method__} must be implemented in a descendant class!" 127 | end 128 | end 129 | end 130 | end 131 | -------------------------------------------------------------------------------- /lib/proxy_fetcher/providers/free_proxy_list.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ProxyFetcher 4 | module Providers 5 | # FreeProxyList provider class. 
6 | class FreeProxyList < Base 7 | # Provider URL to fetch proxy list 8 | def provider_url 9 | "https://free-proxy-list.net/" 10 | end 11 | 12 | # [NOTE] Doesn't support filtering 13 | def xpath 14 | "//table[./thead/tr/th[contains(text(), 'IP')]]/tbody/tr" 15 | end 16 | 17 | # Converts HTML node (entry of N tags) to ProxyFetcher::Proxy 18 | # object. 19 | # 20 | # @param html_node [Object] 21 | # HTML node from the ProxyFetcher::Document DOM model. 22 | # 23 | # @return [ProxyFetcher::Proxy] 24 | # Proxy object 25 | # 26 | def to_proxy(html_node) 27 | ProxyFetcher::Proxy.new.tap do |proxy| 28 | proxy.addr = html_node.content_at("td[1]") 29 | proxy.port = Integer(html_node.content_at("td[2]").gsub(/^0+/, "")) 30 | proxy.country = html_node.content_at("td[4]") 31 | proxy.anonymity = html_node.content_at("td[5]") 32 | proxy.type = parse_type(html_node) 33 | end 34 | end 35 | 36 | private 37 | 38 | # Parses HTML node to extract proxy type. 39 | # 40 | # @param html_node [Object] 41 | # HTML node from the ProxyFetcher::Document DOM model. 42 | # 43 | # @return [String] 44 | # Proxy type 45 | # 46 | def parse_type(html_node) 47 | https = html_node.content_at("td[6]") 48 | https&.casecmp("yes")&.zero? ? ProxyFetcher::Proxy::HTTPS : ProxyFetcher::Proxy::HTTP 49 | end 50 | end 51 | 52 | ProxyFetcher::Configuration.register_provider(:free_proxy_list, FreeProxyList) 53 | end 54 | end 55 | -------------------------------------------------------------------------------- /lib/proxy_fetcher/providers/free_proxy_list_socks.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ProxyFetcher 4 | module Providers 5 | # FreeProxyListSocks provider class. 
6 | class FreeProxyListSocks < Base 7 | # Provider URL to fetch proxy list 8 | def provider_url 9 | "https://www.socks-proxy.net/" 10 | end 11 | 12 | # [NOTE] Doesn't support filtering 13 | def xpath 14 | "//table[./thead/tr/th[contains(text(), 'IP')]]/tbody/tr" 15 | end 16 | 17 | # Converts HTML node (entry of N tags) to ProxyFetcher::Proxy 18 | # object. 19 | # 20 | # @param html_node [Object] 21 | # HTML node from the ProxyFetcher::Document DOM model. 22 | # 23 | # @return [ProxyFetcher::Proxy] 24 | # Proxy object 25 | # 26 | def to_proxy(html_node) 27 | ProxyFetcher::Proxy.new.tap do |proxy| 28 | proxy.addr = html_node.content_at("td[1]") 29 | proxy.port = Integer(html_node.content_at("td[2]").gsub(/^0+/, "")) 30 | proxy.country = html_node.content_at("td[4]") 31 | proxy.type = parse_type(html_node) 32 | proxy.anonymity = html_node.content_at("td[6]") 33 | end 34 | end 35 | 36 | private 37 | 38 | # Parses HTML node to extract proxy type. 39 | # 40 | # @param html_node [Object] 41 | # HTML node from the ProxyFetcher::Document DOM model. 42 | # 43 | # @return [String] 44 | # Proxy type 45 | # 46 | def parse_type(html_node) 47 | https = html_node.content_at("td[5]") 48 | 49 | return ProxyFetcher::Proxy::SOCKS4 if https&.casecmp("socks4")&.zero? 50 | return ProxyFetcher::Proxy::SOCKS5 if https&.casecmp("socks5")&.zero? 51 | 52 | "Unknown" 53 | end 54 | end 55 | 56 | ProxyFetcher::Configuration.register_provider(:free_proxy_list_socks, FreeProxyListSocks) 57 | end 58 | end 59 | -------------------------------------------------------------------------------- /lib/proxy_fetcher/providers/free_proxy_list_ssl.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ProxyFetcher 4 | module Providers 5 | # FreeProxyListSSL provider class. 
6 | class FreeProxyListSSL < Base 7 | # Provider URL to fetch proxy list 8 | def provider_url 9 | "https://www.sslproxies.org/" 10 | end 11 | 12 | # [NOTE] Doesn't support filtering 13 | def xpath 14 | "//table[./thead/tr/th[contains(text(), 'IP')]]/tbody/tr" 15 | end 16 | 17 | # Converts HTML node (entry of N tags) to ProxyFetcher::Proxy 18 | # object. 19 | # 20 | # @param html_node [Object] 21 | # HTML node from the ProxyFetcher::Document DOM model. 22 | # 23 | # @return [ProxyFetcher::Proxy] 24 | # Proxy object 25 | # 26 | def to_proxy(html_node) 27 | ProxyFetcher::Proxy.new.tap do |proxy| 28 | proxy.addr = html_node.content_at("td[1]") 29 | proxy.port = Integer(html_node.content_at("td[2]").gsub(/^0+/, "")) 30 | proxy.country = html_node.content_at("td[4]") 31 | proxy.anonymity = html_node.content_at("td[5]") 32 | proxy.type = ProxyFetcher::Proxy::HTTPS 33 | end 34 | end 35 | end 36 | 37 | ProxyFetcher::Configuration.register_provider(:free_proxy_list_ssl, FreeProxyListSSL) 38 | end 39 | end 40 | -------------------------------------------------------------------------------- /lib/proxy_fetcher/providers/free_proxy_list_us.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ProxyFetcher 4 | module Providers 5 | # FreeProxyListUS provider class. 6 | class FreeProxyListUS < Base 7 | # Provider URL to fetch proxy list 8 | def provider_url 9 | "https://www.us-proxy.org/" 10 | end 11 | 12 | # [NOTE] Doesn't support filtering 13 | def xpath 14 | "//table[./thead/tr/th[contains(text(), 'IP')]]/tbody/tr" 15 | end 16 | 17 | # Converts HTML node (entry of N tags) to ProxyFetcher::Proxy 18 | # object. 19 | # 20 | # @param html_node [Object] 21 | # HTML node from the ProxyFetcher::Document DOM model. 
22 | # 23 | # @return [ProxyFetcher::Proxy] 24 | # Proxy object 25 | # 26 | def to_proxy(html_node) 27 | ProxyFetcher::Proxy.new.tap do |proxy| 28 | proxy.addr = html_node.content_at("td[1]") 29 | proxy.port = Integer(html_node.content_at("td[2]").gsub(/^0+/, "")) 30 | proxy.country = html_node.content_at("td[4]") 31 | proxy.anonymity = html_node.content_at("td[5]") 32 | proxy.type = parse_type(html_node) 33 | end 34 | end 35 | 36 | private 37 | 38 | # Parses HTML node to extract proxy type. 39 | # 40 | # @param html_node [Object] 41 | # HTML node from the ProxyFetcher::Document DOM model. 42 | # 43 | # @return [String] 44 | # Proxy type 45 | # 46 | def parse_type(html_node) 47 | https = html_node.content_at("td[7]") 48 | https&.casecmp("yes")&.zero? ? ProxyFetcher::Proxy::HTTPS : ProxyFetcher::Proxy::HTTP 49 | end 50 | end 51 | 52 | ProxyFetcher::Configuration.register_provider(:free_proxy_list_us, FreeProxyListUS) 53 | end 54 | end 55 | -------------------------------------------------------------------------------- /lib/proxy_fetcher/providers/http_tunnel.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ProxyFetcher 4 | module Providers 5 | # HTTPTunnel provider class. 6 | class HTTPTunnel < Base 7 | # Provider URL to fetch proxy list 8 | def provider_url 9 | "http://www.httptunnel.ge/ProxyListForFree.aspx" 10 | end 11 | 12 | def xpath 13 | '//table[contains(@id, "GridView")]/tr[(count(td)>2)]' 14 | end 15 | 16 | # Converts HTML node (entry of N tags) to ProxyFetcher::Proxy 17 | # object. 18 | # 19 | # @param html_node [Object] 20 | # HTML node from the ProxyFetcher::Document DOM model. 
21 | # 22 | # @return [ProxyFetcher::Proxy] 23 | # Proxy object 24 | # 25 | def to_proxy(html_node) 26 | ProxyFetcher::Proxy.new.tap do |proxy| 27 | uri = parse_proxy_uri(html_node) 28 | proxy.addr = uri.host 29 | proxy.port = uri.port 30 | 31 | proxy.country = parse_country(html_node) 32 | proxy.anonymity = parse_anonymity(html_node) 33 | proxy.type = ProxyFetcher::Proxy::HTTP 34 | end 35 | end 36 | 37 | private 38 | 39 | # Parses HTML node to extract URI object with proxy host and port. 40 | # 41 | # @param html_node [Object] 42 | # HTML node from the ProxyFetcher::Document DOM model. 43 | # 44 | # @return [URI] 45 | # URI object 46 | # 47 | def parse_proxy_uri(html_node) 48 | full_addr = html_node.content_at("td[1]") 49 | URI.parse("http://#{full_addr}") 50 | end 51 | 52 | # Parses HTML node to extract proxy country. 53 | # 54 | # @param html_node [Object] 55 | # HTML node from the ProxyFetcher::Document DOM model. 56 | # 57 | # @return [String] 58 | # Country code 59 | # 60 | def parse_country(html_node) 61 | html_node.find(".//img").attr("title") 62 | end 63 | 64 | # Parses HTML node to extract proxy anonymity level. 65 | # 66 | # @param html_node [Object] 67 | # HTML node from the ProxyFetcher::Document DOM model. 68 | # 69 | # @return [String] 70 | # Anonymity level 71 | # 72 | def parse_anonymity(html_node) 73 | transparency = html_node.content_at("td[5]").to_sym 74 | 75 | { 76 | A: "Anonymous", 77 | E: "Elite", 78 | T: "Transparent", 79 | U: "Unknown" 80 | }.fetch(transparency, "Unknown") 81 | end 82 | end 83 | 84 | ProxyFetcher::Configuration.register_provider(:http_tunnel, HTTPTunnel) 85 | end 86 | end 87 | -------------------------------------------------------------------------------- /lib/proxy_fetcher/providers/mtpro.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "json" 4 | 5 | module ProxyFetcher 6 | module Providers 7 | # MTPro provider class. 
8 | class MTPro < Base 9 | # Provider URL to fetch proxy list 10 | def provider_url 11 | "https://mtpro.xyz/api/?type=socks" 12 | end 13 | 14 | def load_proxy_list(filters = {}) 15 | html = load_html(provider_url, filters) 16 | JSON.parse(html) 17 | rescue JSON::ParserError 18 | [] 19 | end 20 | 21 | # Converts HTML node (entry of N tags) to ProxyFetcher::Proxy 22 | # object. 23 | # 24 | # @param node [Object] 25 | # HTML node from the ProxyFetcher::Document DOM model. 26 | # 27 | # @return [ProxyFetcher::Proxy] 28 | # Proxy object 29 | # 30 | def to_proxy(node) 31 | ProxyFetcher::Proxy.new.tap do |proxy| 32 | proxy.addr = node["ip"] 33 | proxy.port = Integer(node["port"]) 34 | proxy.country = node["country"] 35 | proxy.anonymity = "Unknown" 36 | proxy.type = ProxyFetcher::Proxy::SOCKS5 37 | end 38 | end 39 | end 40 | 41 | ProxyFetcher::Configuration.register_provider(:mtpro, MTPro) 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /lib/proxy_fetcher/providers/proxy_list.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "base64" 4 | 5 | module ProxyFetcher 6 | module Providers 7 | # ProxyList provider class. 8 | class ProxyList < Base 9 | # Provider URL to fetch proxy list 10 | def provider_url 11 | "https://proxy-list.org/english/index.php" 12 | end 13 | 14 | def xpath 15 | '//div[@class="table-wrap"]/div[@class="table"]/ul' 16 | end 17 | 18 | # Converts HTML node (entry of N tags) to ProxyFetcher::Proxy 19 | # object. 20 | # 21 | # @param html_node [Object] 22 | # HTML node from the ProxyFetcher::Document DOM model. 
23 | # 24 | # @return [ProxyFetcher::Proxy] 25 | # Proxy object 26 | # 27 | def to_proxy(html_node) 28 | ProxyFetcher::Proxy.new.tap do |proxy| 29 | uri = parse_proxy_uri(html_node) 30 | proxy.addr = uri.host 31 | proxy.port = uri.port 32 | 33 | proxy.type = html_node.content_at("li[2]") 34 | proxy.anonymity = html_node.content_at("li[4]") 35 | proxy.country = html_node.find("li[5]//span[@class='country']").attr("title") 36 | end 37 | end 38 | 39 | private 40 | 41 | # Parses HTML node to extract URI object with proxy host and port. 42 | # 43 | # @param html_node [Object] 44 | # HTML node from the ProxyFetcher::Document DOM model. 45 | # 46 | # @return [URI] 47 | # URI object 48 | # 49 | def parse_proxy_uri(html_node) 50 | full_addr = ::Base64.decode64(html_node.at_css("li script").html.match(/'(.+)'/)[1]) 51 | URI.parse("http://#{full_addr}") 52 | end 53 | end 54 | 55 | ProxyFetcher::Configuration.register_provider(:proxy_list, ProxyList) 56 | end 57 | end 58 | -------------------------------------------------------------------------------- /lib/proxy_fetcher/providers/proxypedia.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ProxyFetcher 4 | module Providers 5 | # Proxypedia provider class. 6 | class Proxypedia < Base 7 | # Provider URL to fetch proxy list 8 | def provider_url 9 | "https://proxypedia.org" 10 | end 11 | 12 | # [NOTE] Doesn't support filtering 13 | def xpath 14 | "//main/ul/li[position()>1]" 15 | end 16 | 17 | # Converts HTML node (entry of N tags) to ProxyFetcher::Proxy 18 | # object. 19 | # 20 | # @param html_node [Object] 21 | # HTML node from the ProxyFetcher::Document DOM model.
22 | # 23 | # @return [ProxyFetcher::Proxy] 24 | # Proxy object 25 | # 26 | def to_proxy(html_node) 27 | addr, port = html_node.content_at("a").to_s.split(":") 28 | 29 | ProxyFetcher::Proxy.new.tap do |proxy| 30 | proxy.addr = addr 31 | proxy.port = Integer(port) 32 | proxy.country = parse_country(html_node) 33 | proxy.anonymity = "Unknown" 34 | proxy.type = ProxyFetcher::Proxy::HTTP 35 | end 36 | end 37 | 38 | private 39 | 40 | def parse_country(html_node) 41 | text = html_node.content.to_s 42 | text[/\((.+?)\)/, 1] || "Unknown" 43 | end 44 | end 45 | 46 | ProxyFetcher::Configuration.register_provider(:proxypedia, Proxypedia) 47 | end 48 | end 49 | -------------------------------------------------------------------------------- /lib/proxy_fetcher/providers/xroxy.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ProxyFetcher 4 | module Providers 5 | # XRoxy provider class. 6 | class XRoxy < Base 7 | # Provider URL to fetch proxy list 8 | def provider_url 9 | "https://www.xroxy.com/proxylist.htm" 10 | end 11 | 12 | def xpath 13 | "//tr[@class='row1' or @class='row0']" 14 | end 15 | 16 | # Converts HTML node (entry of N tags) to ProxyFetcher::Proxy 17 | # object. 18 | # 19 | # @param html_node [Object] 20 | # HTML node from the ProxyFetcher::Document DOM model. 
21 | # 22 | # @return [ProxyFetcher::Proxy] 23 | # Proxy object 24 | # 25 | def to_proxy(html_node) 26 | ProxyFetcher::Proxy.new.tap do |proxy| 27 | proxy.addr = html_node.content_at("td[1]") 28 | proxy.port = Integer(html_node.content_at("td[2]").gsub(/^0+/, "")) 29 | proxy.anonymity = html_node.content_at("td[3]") 30 | proxy.country = html_node.content_at("td[5]") 31 | proxy.response_time = Integer(html_node.content_at("td[6]")) 32 | proxy.type = html_node.content_at("td[3]") 33 | end 34 | end 35 | end 36 | 37 | ProxyFetcher::Configuration.register_provider(:xroxy, XRoxy) 38 | end 39 | end 40 | -------------------------------------------------------------------------------- /lib/proxy_fetcher/proxy.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ProxyFetcher 4 | # Proxy object 5 | class Proxy 6 | # @!attribute [rw] addr 7 | # @return [String] address (IP or domain) 8 | attr_accessor :addr 9 | 10 | # @!attribute [rw] port 11 | # @return [Integer] port 12 | attr_accessor :port 13 | 14 | # @!attribute [rw] type 15 | # @return [String] type (SOCKS, HTTP(S)) 16 | attr_accessor :type 17 | 18 | # @!attribute [rw] country 19 | # @return [String] country or country code 20 | attr_accessor :country 21 | 22 | # @!attribute [rw] response_time 23 | # @return [Integer] response time (value and measurements depends on the provider) 24 | attr_accessor :response_time 25 | 26 | # @!attribute [rw] anonymity 27 | # @return [String] anonymity level (high, elite, transparent, etc) 28 | attr_accessor :anonymity 29 | 30 | # Proxy types 31 | TYPES = [ 32 | HTTP = "HTTP", 33 | HTTPS = "HTTPS", 34 | SOCKS4 = "SOCKS4", 35 | SOCKS5 = "SOCKS5" 36 | ].freeze 37 | 38 | # Proxy type predicates (#socks4?, #https?) 39 | # 40 | # @return [Boolean] 41 | # true if proxy of requested type, otherwise false. 42 | # 43 | TYPES.each do |proxy_type| 44 | define_method "#{proxy_type.downcase}?" do 45 | !type.nil? 
&& type.upcase.include?(proxy_type) 46 | end 47 | end 48 | 49 | # Returns true if proxy is secure (works through https, socks4 or socks5). 50 | # 51 | # @return [Boolean] 52 | # true if proxy is secure, otherwise false. 53 | # 54 | def ssl? 55 | https? || socks4? || socks5? 56 | end 57 | 58 | # Initialize new Proxy 59 | # 60 | # @param attributes [Hash] 61 | # proxy attributes 62 | # 63 | # @return [Proxy] 64 | # 65 | def initialize(attributes = {}) 66 | attributes.each do |attr, value| 67 | public_send("#{attr}=", value) 68 | end 69 | end 70 | 71 | # Checks if proxy object is connectable (can be used as a proxy for 72 | # HTTP requests). 73 | # 74 | # @return [Boolean] 75 | # true if proxy connectable, otherwise false. 76 | # 77 | def connectable? 78 | ProxyFetcher.config.proxy_validator.connectable?(addr, port) 79 | end 80 | 81 | alias valid? connectable? 82 | 83 | # Returns URI::Generic object with host and port values of the proxy. 84 | # 85 | # @return [URI::Generic] 86 | # URI object. 87 | # 88 | def uri 89 | URI::Generic.build(host: addr, port: port) 90 | end 91 | 92 | # Returns String object with addr:port values of the proxy. 93 | # 94 | # @param scheme [Boolean] 95 | # Indicates if URL must include proxy type 96 | # 97 | # @return [String] 98 | # proxy URL (includes the proxy type as scheme when requested). 
99 | # 100 | def url(scheme: false) 101 | if scheme 102 | URI::Generic.build(scheme: type, host: addr, port: port).to_s 103 | else 104 | URI::Generic.build(host: addr, port: port).to_s 105 | end 106 | end 107 | 108 | def ==(other) 109 | other.is_a?(Proxy) && addr == other.addr && port == other.port 110 | end 111 | 112 | def eql?(other) 113 | hash.eql?(other.hash) 114 | end 115 | 116 | def hash 117 | [addr.hash, port.hash].hash 118 | end 119 | end 120 | end 121 | -------------------------------------------------------------------------------- /lib/proxy_fetcher/utils/http_client.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ProxyFetcher 4 | # Default ProxyFetcher HTTP client used to fetch proxy lists from 5 | # the different providers. Uses ProxyFetcher configuration options 6 | # for sending HTTP requests to providers URLs. 7 | class HTTPClient 8 | # @!attribute [r] url 9 | # @return [String] URL 10 | attr_reader :url 11 | 12 | # @!attribute [r] HTTP method 13 | # @return [Symbol] HTTP method verb 14 | attr_reader :method 15 | 16 | # @!attribute [r] HTTP params 17 | # @return [Hash] params 18 | attr_reader :params 19 | 20 | # @!attribute [r] HTTP headers 21 | # @return [Hash] headers 22 | attr_reader :headers 23 | 24 | # @!attribute [r] http 25 | # @return [HTTP::Client] HTTP client 26 | attr_reader :http 27 | 28 | # @!attribute [r] ssl_ctx 29 | # @return [OpenSSL::SSL::SSLContext] SSL context 30 | attr_reader :ssl_ctx 31 | 32 | # @!attribute [r] timeout 33 | # @return [Integer] Request timeout 34 | attr_reader :timeout 35 | 36 | # Fetches resource content by sending HTTP request to it. 37 | # Syntactic sugar to simplify fetching of URIs. 
38 | # 39 | # @param url [String] URL 40 | # 41 | # @return [String] 42 | # resource content 43 | # 44 | def self.fetch(*args, **kwargs, &block) 45 | new(*args, **kwargs, &block).fetch 46 | end 47 | 48 | # Initialize HTTP client instance 49 | # 50 | # @return [HTTPClient] 51 | # 52 | def initialize(url, method: :get, params: {}, headers: {}) 53 | @url = url.to_s 54 | @method = method.to_sym 55 | @params = params 56 | @headers = headers 57 | 58 | unless HTTP::Request::METHODS.include?(@method) 59 | raise ArgumentError, "'#{@method}' is a wrong HTTP method name" 60 | end 61 | 62 | @timeout = ProxyFetcher.config.provider_proxies_load_timeout 63 | @http = build_http_engine 64 | @ssl_ctx = build_ssl_context 65 | end 66 | 67 | # Fetches resource content by sending HTTP request to it. 68 | # 69 | # @return [String] 70 | # response body 71 | # 72 | def fetch(**options) 73 | response = perform_http_request 74 | return response if options.fetch(:raw, false) 75 | 76 | response.body.to_s 77 | rescue StandardError => e 78 | ProxyFetcher.config.logger.warn("Failed to process request to #{url} (#{e.message})") 79 | "" 80 | end 81 | 82 | protected 83 | 84 | def build_ssl_context 85 | OpenSSL::SSL::SSLContext.new.tap do |context| 86 | context.verify_mode = OpenSSL::SSL::VERIFY_NONE 87 | end 88 | end 89 | 90 | def build_http_engine 91 | HTTP.headers(default_headers.merge(headers)).timeout(connect: timeout, read: timeout) 92 | end 93 | 94 | def perform_http_request(http_method: method, http_params: params) 95 | http.public_send( 96 | http_method, 97 | url, 98 | form: http_params, 99 | ssl_context: ssl_ctx 100 | ) 101 | end 102 | 103 | # Default HTTP client headers 104 | # 105 | # @return [Hash] 106 | # hash of HTTP headers 107 | # 108 | def default_headers 109 | { 110 | "User-Agent" => ProxyFetcher.config.user_agent 111 | } 112 | end 113 | end 114 | end 115 | -------------------------------------------------------------------------------- 
/lib/proxy_fetcher/utils/proxy_list_validator.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ProxyFetcher 4 | # This class validates list of proxies. 5 | # Each proxy is validated using ProxyFetcher::ProxyValidator. 6 | class ProxyListValidator 7 | # @!attribute [r] proxies 8 | # @return [Array] Source array of proxies 9 | attr_reader :proxies 10 | # @!attribute [r] valid_proxies 11 | # @return [Array] Array of valid proxies after validation 12 | attr_reader :valid_proxies 13 | 14 | # @param [Array] *proxies 15 | # Any number of ProxyFetcher::Proxy to validate 16 | def initialize(*proxies) 17 | @proxies = proxies.flatten 18 | end 19 | 20 | # Performs validation 21 | # 22 | # @return [Array] 23 | # list of valid proxies 24 | def validate 25 | target_proxies = @proxies.dup 26 | target_proxies_lock = Mutex.new 27 | connectable_proxies = [] 28 | connectable_proxies_lock = Mutex.new 29 | threads = [] 30 | 31 | ProxyFetcher.config.pool_size.times do 32 | threads << Thread.new do 33 | loop do 34 | proxy = target_proxies_lock.synchronize { target_proxies.shift } 35 | break unless proxy 36 | 37 | if proxy.connectable? 38 | connectable_proxies_lock.synchronize { connectable_proxies << proxy } 39 | end 40 | end 41 | end 42 | end 43 | 44 | threads.each(&:join) 45 | 46 | @valid_proxies = connectable_proxies 47 | end 48 | end 49 | end 50 | -------------------------------------------------------------------------------- /lib/proxy_fetcher/utils/proxy_validator.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ProxyFetcher 4 | # Default ProxyFetcher proxy validator that checks either proxy 5 | # connectable or not. It tries to send HEAD request to default 6 | # URL to check if proxy can be used (aka connectable?). 7 | class ProxyValidator 8 | # Default URL that will be used to check if proxy can be used. 
9 | URL_TO_CHECK = "https://google.com" 10 | 11 | # Short variant to validate proxy. 12 | # 13 | # @param address [String] proxy address or IP 14 | # @param port [String, Integer] proxy port 15 | # 16 | # @return [Boolean] 17 | # true if connection to the server using proxy established, otherwise false 18 | # 19 | def self.connectable?(address, port) 20 | new(address, port).connectable? 21 | end 22 | 23 | # Initialize new ProxyValidator instance 24 | # 25 | # @param address [String] Proxy address or IP 26 | # @param port [String, Integer] Proxy port 27 | # @param options [Hash] proxy options 28 | # @option username [String] Proxy authentication username 29 | # @option password [String] Proxy authentication password 30 | # @option headers [Hash] Proxy headers 31 | # 32 | # @return [ProxyValidator] 33 | # 34 | def initialize(address, port, options: {}) 35 | timeout = ProxyFetcher.config.proxy_validation_timeout 36 | proxy = [address, port.to_i] 37 | 38 | if options[:username] && options[:password] 39 | proxy << options[:username] 40 | proxy << options[:password] 41 | end 42 | 43 | proxy << options[:headers].to_h if options[:headers] 44 | 45 | @http = HTTP.follow.via(*proxy).timeout(connect: timeout, read: timeout) 46 | end 47 | 48 | # Checks if proxy is connectable (can be used to connect 49 | # resources via proxy server). 50 | # 51 | # @return [Boolean] 52 | # true if connection to the server using proxy established, otherwise false 53 | # 54 | def connectable? 55 | ssl_context = OpenSSL::SSL::SSLContext.new 56 | ssl_context.verify_mode = OpenSSL::SSL::VERIFY_NONE 57 | 58 | @http.head(URL_TO_CHECK, ssl_context: ssl_context).status.success? 
59 | rescue StandardError 60 | false 61 | end 62 | end 63 | end 64 | -------------------------------------------------------------------------------- /lib/proxy_fetcher/version.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module ProxyFetcher 4 | ## 5 | # ProxyFetcher gem version. 6 | def self.gem_version 7 | Gem::Version.new VERSION::STRING 8 | end 9 | 10 | ## 11 | # ProxyFetcher gem semantic versioning. 12 | module VERSION 13 | # Major version number 14 | MAJOR = 0 15 | # Minor version number 16 | MINOR = 17 17 | # Smallest version number 18 | TINY = 0 19 | 20 | # Full version number 21 | STRING = [MAJOR, MINOR, TINY].compact.join(".") 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /proxy_fetcher.gemspec: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), "lib")) 4 | 5 | require "proxy_fetcher/version" 6 | 7 | Gem::Specification.new do |gem| 8 | gem.name = "proxy_fetcher" 9 | gem.version = ProxyFetcher.gem_version 10 | gem.summary = "Ruby gem for dealing with proxy lists from different providers" 11 | gem.description = <<-TEXT.strip.gsub(/[\s\n]+/, " ") 12 | This gem can help your Ruby application to make HTTP(S) requests 13 | using proxies by fetching and validating proxy lists from 14 | the different providers. 
15 | TEXT 16 | gem.authors = ["Nikita Bulai"] 17 | gem.email = "bulajnikita@gmail.com" 18 | gem.require_paths = ["lib"] 19 | gem.bindir = "bin" 20 | gem.files = `git ls-files`.split($RS) - %w[README.md .travis.yml .rubocop.yml] 21 | gem.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) } 22 | gem.homepage = "http://github.com/nbulaj/proxy_fetcher" 23 | gem.license = "MIT" 24 | gem.required_ruby_version = ">= 2.3.0" 25 | 26 | gem.add_runtime_dependency "http", ">= 3" 27 | 28 | gem.add_development_dependency "rake", ">= 12.0" 29 | gem.add_development_dependency "rspec", "~> 3.9" 30 | end 31 | -------------------------------------------------------------------------------- /spec/fixtures/proxies.txt: -------------------------------------------------------------------------------- 1 | 139.162.59.9:3128 2 | 176.62.187.158:56351 3 | 144.217.22.142:8080 4 | 176.55.108.21:3128 5 | 157.225.214.251:3128 6 | 202.51.49.52:48298 7 | 104.244.75.26:8080 8 | 163.172.28.20:80 9 | 187.56.191.12:3128 10 | 129.194.12.26:80 11 | 178.128.39.39:8080 12 | 181.30.28.15:8080 13 | 51.181.96.185:8080 14 | 148.134.10.13 15 | -------------------------------------------------------------------------------- /spec/proxy_fetcher/client/client_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | require "json" 5 | 6 | begin 7 | require "webrick" 8 | rescue LoadError 9 | # nop 10 | end 11 | require "evil-proxy" 12 | require "evil-proxy/async" 13 | 14 | describe ProxyFetcher::Client do 15 | before :all do 16 | ProxyFetcher.configure do |config| 17 | config.provider = :xroxy 18 | config.client_timeout = 5 19 | config.logger = ProxyFetcher::NullLogger.new 20 | end 21 | 22 | quiet = ENV.key?("LOG_MITM") ? 
ENV["LOG_MITM"] == "false" : true 23 | 24 | @server = EvilProxy::MITMProxyServer.new Port: 3128, Quiet: quiet 25 | @server.start 26 | end 27 | 28 | after :all do 29 | @server.shutdown 30 | end 31 | 32 | let(:local_proxy) { ProxyFetcher::Proxy.new(addr: "127.0.0.1", port: 3128, type: "HTTP, HTTPS") } 33 | 34 | # Use local proxy server in order to avoid side effects, non-working proxies, etc 35 | before :each do 36 | ProxyFetcher::Client::ProxiesRegistry.manager.instance_variable_set(:'@proxies', [local_proxy]) 37 | allow_any_instance_of(ProxyFetcher::Providers::Base).to receive(:fetch_proxies).and_return([local_proxy]) 38 | end 39 | 40 | context "GET request with the valid proxy" do 41 | it "successfully returns page content for HTTP" do 42 | content = ProxyFetcher::Client.get("http://httpbin.org/get") 43 | 44 | expect(content).not_to be_empty 45 | end 46 | 47 | # TODO: oh this SSL / MITM proxies .... 48 | xit "successfully returns page content for HTTPS" do 49 | content = ProxyFetcher::Client.get("https://httpbin.org/get") 50 | 51 | expect(content).not_to be_empty 52 | end 53 | 54 | it "successfully returns page content using custom proxy" do 55 | content = ProxyFetcher::Client.get("http://httpbin.org/get", options: { proxy: local_proxy }) 56 | 57 | expect(content).not_to be_empty 58 | end 59 | end 60 | 61 | context "POST request with the valid proxy" do 62 | it "successfully returns page content for HTTP" do 63 | headers = { 64 | "X-Proxy-Fetcher-Version" => ProxyFetcher::VERSION::STRING 65 | } 66 | 67 | content = ProxyFetcher::Client.post( 68 | "http://httpbin.org/post", 69 | { param: "value" }, 70 | headers: headers 71 | ) 72 | 73 | expect(content).not_to be_empty 74 | 75 | json = JSON.parse(content) 76 | 77 | expect(json["headers"]["X-Proxy-Fetcher-Version"]).to eq(ProxyFetcher::VERSION::STRING) 78 | expect(json["headers"]["User-Agent"]).to eq(ProxyFetcher.config.user_agent) 79 | end 80 | end 81 | 82 | # TODO: EvilProxy incompatible with latest Ruby/Webrick 83 
| # @see https://github.com/bbtfr/evil-proxy/issues/10 84 | if Gem::Version.new(RUBY_VERSION) < Gem::Version.new("2.6") 85 | context "PUT request with the valid proxy" do 86 | it "successfully returns page content for HTTP" do 87 | content = ProxyFetcher::Client.put("http://httpbin.org/put", "param=PutValue") 88 | 89 | expect(content).not_to be_empty 90 | 91 | json = JSON.parse(content) 92 | 93 | expect(json["form"]["param"]).to eq("PutValue") 94 | end 95 | end 96 | 97 | context "PATCH request with the valid proxy" do 98 | it "successfully returns page content for HTTP" do 99 | content = ProxyFetcher::Client.patch("http://httpbin.org/patch", param: "value") 100 | 101 | expect(content).not_to be_empty 102 | 103 | json = JSON.parse(content) 104 | 105 | expect(json["form"]["param"]).to eq("value") 106 | end 107 | end 108 | end 109 | 110 | context "DELETE request with the valid proxy" do 111 | it "successfully returns page content for HTTP" do 112 | content = ProxyFetcher::Client.delete("http://httpbin.org/delete") 113 | 114 | expect(content).not_to be_empty 115 | end 116 | end 117 | 118 | context "HEAD request with the valid proxy" do 119 | it "successfully works" do 120 | content = ProxyFetcher::Client.head("http://httpbin.org") 121 | 122 | expect(content).to be_empty 123 | end 124 | end 125 | 126 | context "retries" do 127 | it "raises an error when reaches max retries limit" do 128 | allow(ProxyFetcher::Client::Request).to receive(:execute).and_raise(StandardError) 129 | 130 | expect { ProxyFetcher::Client.get("http://httpbin.org", options: { max_retries: 10 }) } 131 | .to raise_error(ProxyFetcher::Exceptions::MaximumRetriesReached) 132 | end 133 | 134 | xit "raises an error when http request returns an error" do 135 | allow_any_instance_of(HTTP::Client).to receive(:get).and_return(StandardError.new) 136 | 137 | expect { ProxyFetcher::Client.get("http://httpbin.org") } 138 | .to raise_error(ProxyFetcher::Exceptions::MaximumRetriesReached) 139 | end 140 | 141 | it 
"refreshes proxy lists if no proxy found" do 142 | allow(ProxyFetcher::Manager.new).to receive(:proxies).and_return([]) 143 | 144 | expect { ProxyFetcher::Client.get("http://httpbin.org") } 145 | .not_to raise_error 146 | end 147 | end 148 | 149 | xcontext "redirects" do 150 | it "follows redirect when present" do 151 | content = ProxyFetcher::Client.get("http://httpbin.org/absolute-redirect/2") 152 | 153 | expect(content).not_to be_empty 154 | end 155 | 156 | it "raises an error when reaches max redirects limit" do 157 | expect { ProxyFetcher::Client.get("http://httpbin.org/absolute-redirect/11") } 158 | .to raise_error(ProxyFetcher::Exceptions::MaximumRedirectsReached) 159 | end 160 | end 161 | end 162 | -------------------------------------------------------------------------------- /spec/proxy_fetcher/configuration_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe ProxyFetcher::Configuration do 6 | before { ProxyFetcher.config.reset! } 7 | after { ProxyFetcher.config.reset! 
} 8 | 9 | context "custom HTTP client" do 10 | it "successfully setups if class has all the required methods" do 11 | class MyHTTPClient 12 | def self.fetch(url) 13 | url 14 | end 15 | end 16 | 17 | expect { ProxyFetcher.config.http_client = MyHTTPClient }.not_to raise_error 18 | end 19 | 20 | it "failed on setup if required methods are missing" do 21 | MyWrongHTTPClient = Class.new 22 | 23 | expect { ProxyFetcher.config.http_client = MyWrongHTTPClient } 24 | .to raise_error(ProxyFetcher::Exceptions::WrongCustomClass) 25 | end 26 | end 27 | 28 | context "custom proxy validator" do 29 | it "successfully setups if class has all the required methods" do 30 | class MyProxyValidator 31 | def self.connectable?(*) 32 | true 33 | end 34 | end 35 | 36 | expect { ProxyFetcher.config.proxy_validator = MyProxyValidator }.not_to raise_error 37 | end 38 | 39 | it "failed on setup if required methods are missing" do 40 | MyWrongProxyValidator = Class.new 41 | 42 | expect { ProxyFetcher.config.proxy_validator = MyWrongProxyValidator } 43 | .to raise_error(ProxyFetcher::Exceptions::WrongCustomClass) 44 | end 45 | end 46 | 47 | context "custom provider" do 48 | it "fails on registration if provider class already registered" do 49 | expect { ProxyFetcher::Configuration.register_provider(:xroxy, Class.new) } 50 | .to raise_error(ProxyFetcher::Exceptions::RegisteredProvider) 51 | end 52 | 53 | it "fails on proxy list fetching if provider doesn't registered" do 54 | ProxyFetcher.config.provider = :not_existing_provider 55 | 56 | expect { ProxyFetcher::Manager.new } 57 | .to raise_error(ProxyFetcher::Exceptions::UnknownProvider) 58 | end 59 | end 60 | 61 | context "custom HTML parsing adapter" do 62 | it "fails if adapter can't be installed" do 63 | old_adapter = ProxyFetcher.config.adapter.dup 64 | 65 | class CustomAdapter < ProxyFetcher::Document::AbstractAdapter 66 | def self.install_requirements! 
67 | require "not_existing_gem" 68 | end 69 | end 70 | 71 | ProxyFetcher.config.adapter = CustomAdapter 72 | 73 | expect { ProxyFetcher.config.adapter_class } 74 | .to raise_error(ProxyFetcher::Exceptions::AdapterSetupError) 75 | 76 | ProxyFetcher.config.adapter = old_adapter 77 | end 78 | end 79 | end 80 | -------------------------------------------------------------------------------- /spec/proxy_fetcher/document/adapters_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe ProxyFetcher::Document::Adapters do 6 | describe "#lookup" do 7 | it "returns predefined adapters if symbol or string passed" do 8 | expect(described_class.lookup("nokogiri")).to eq(ProxyFetcher::Document::NokogiriAdapter) 9 | 10 | expect(described_class.lookup(:oga)).to eq(ProxyFetcher::Document::OgaAdapter) 11 | end 12 | 13 | it "returns self if class passed" do 14 | expect(described_class.lookup(Struct)).to eq(Struct) 15 | end 16 | 17 | it "raises an exception if passed value is blank" do 18 | expect { described_class.lookup(nil) }.to raise_error(ProxyFetcher::Exceptions::BlankAdapter) 19 | expect { described_class.lookup("") }.to raise_error(ProxyFetcher::Exceptions::BlankAdapter) 20 | end 21 | 22 | it "raises an exception if adapter doesn't exist" do 23 | expect { described_class.lookup("wrong") }.to raise_error(ProxyFetcher::Exceptions::UnknownAdapter) 24 | end 25 | end 26 | end 27 | -------------------------------------------------------------------------------- /spec/proxy_fetcher/document/node_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe ProxyFetcher::Document::Node do 6 | context "overridable methods" do 7 | it "raises an error" do 8 | node = ProxyFetcher::Document::Node.new("") 9 | 10 | %w[content html].each do |method| 11 | expect { 
node.public_send(method) }.to raise_error do |error| 12 | expect(error.message).to include("`#{method}` must be implemented") 13 | end 14 | end 15 | end 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /spec/proxy_fetcher/manager_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe ProxyFetcher::Manager do 6 | it "can initialize with a proxies from file(s)" do 7 | manager = described_class.new(refresh: false, file: "spec/fixtures/proxies.txt") 8 | 9 | expect(manager.proxies.size).to be(14) 10 | 11 | manager = described_class.new( 12 | refresh: false, 13 | file: ["spec/fixtures/proxies.txt", "spec/fixtures/proxies.txt"] 14 | ) 15 | 16 | expect(manager.proxies.size).to be(14) 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /spec/proxy_fetcher/providers/base_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe ProxyFetcher::Providers::Base do 6 | before { ProxyFetcher.config.reset! } 7 | after { ProxyFetcher.config.reset! 
} 8 | 9 | it "does not allows to use not implemented methods" do 10 | NotImplementedCustomProvider = Class.new(ProxyFetcher::Providers::Base) 11 | 12 | ProxyFetcher::Configuration.register_provider(:provider_without_methods, NotImplementedCustomProvider) 13 | ProxyFetcher.config.provider = :provider_without_methods 14 | 15 | expect { ProxyFetcher::Manager.new }.to raise_error(NotImplementedError) do |error| 16 | expect(error.message).to include("provider_url") 17 | end 18 | 19 | # implement one of the methods 20 | NotImplementedCustomProvider.class_eval do 21 | def provider_url 22 | "http://provider.com" 23 | end 24 | end 25 | 26 | expect { ProxyFetcher::Manager.new }.to raise_error(NotImplementedError) do |error| 27 | expect(error.message).to include("xpath") 28 | end 29 | end 30 | 31 | it "logs failed to load proxy providers" do 32 | CustomProvider = Class.new(ProxyFetcher::Providers::Base) do 33 | def load_proxy_list(*) 34 | doc = load_document("https://google.com", {}) 35 | doc.xpath('//table[contains(@class, "table")]/tr[(not(@id="proxy-table-header")) and (count(td)>2)]') 36 | end 37 | end 38 | 39 | logger = Logger.new(StringIO.new) 40 | 41 | ProxyFetcher::Configuration.register_provider(:custom_provider, CustomProvider) 42 | ProxyFetcher.config.provider = :custom_provider 43 | ProxyFetcher.config.logger = logger 44 | 45 | allow_any_instance_of(HTTP::Client).to receive(:get).and_raise(StandardError) 46 | 47 | expect(logger).to receive(:warn).with(%r{Failed to process request to http[s:/]}) 48 | 49 | ProxyFetcher::Manager.new 50 | end 51 | end 52 | -------------------------------------------------------------------------------- /spec/proxy_fetcher/providers/multiple_providers_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe "Multiple proxy providers" do 6 | before { ProxyFetcher.config.reset! } 7 | after { ProxyFetcher.config.reset! 
} 8 | 9 | it "combine proxies from multiple providers" do 10 | proxy_stub = ProxyFetcher::Proxy.new(addr: "192.168.1.1", port: 8080) 11 | 12 | # Each proxy provider will return 2 proxies 13 | ProxyFetcher::Configuration.providers_registry.providers.each do |_name, klass| 14 | allow_any_instance_of(klass).to receive(:load_proxy_list).and_return([1, 2]) 15 | allow_any_instance_of(klass).to receive(:to_proxy).and_return(proxy_stub) 16 | end 17 | 18 | all_providers = ProxyFetcher::Configuration.registered_providers 19 | ProxyFetcher.config.providers = all_providers 20 | 21 | expect(ProxyFetcher::Manager.new.proxies.size).to eq(all_providers.size * 2) 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /spec/proxy_fetcher/providers/proxy_classes_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe "Proxy classes" do 6 | [ 7 | [:free_proxy_list, "FreeProxyList"], 8 | [:free_proxy_list_socks, "FreeProxyListSocks"], 9 | [:free_proxy_list_ssl, "FreeProxyListSSL"], 10 | [:free_proxy_list_us, "FreeProxyListUS"], 11 | [:http_tunnel, "HTTPTunnel"], 12 | [:mtpro, "MTPro"], 13 | [:proxy_list, "ProxyList"], 14 | [:proxypedia, "Proxypedia"], 15 | [:xroxy, "XRoxy"] 16 | ].each do |(provider_name, provider_klass)| 17 | describe Object.const_get("ProxyFetcher::Providers::#{provider_klass}") do 18 | before :all do 19 | ProxyFetcher.config.provider = provider_name 20 | end 21 | 22 | it_behaves_like "a manager" 23 | end 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /spec/proxy_fetcher/proxy_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | describe ProxyFetcher::Proxy do 6 | let(:proxy) { described_class.new(addr: "192.169.1.1", port: 8080, type: "HTTP") 
} 7 | 8 | it "can initialize a new proxy object" do 9 | proxy = described_class.new(addr: "192.169.1.1", port: 8080, type: "HTTP") 10 | 11 | expect(proxy).not_to be_nil 12 | expect(proxy.addr).to eq("192.169.1.1") 13 | expect(proxy.port).to eq(8080) 14 | expect(proxy.type).to eq("HTTP") 15 | end 16 | 17 | it "checks schema" do 18 | proxy.type = ProxyFetcher::Proxy::HTTP 19 | expect(proxy.http?).to be_truthy 20 | expect(proxy.https?).to be_falsey 21 | expect(proxy.ssl?).to be_falsey 22 | 23 | proxy.type = ProxyFetcher::Proxy::HTTPS 24 | expect(proxy.https?).to be_truthy 25 | expect(proxy.http?).to be_truthy 26 | expect(proxy.ssl?).to be_truthy 27 | 28 | proxy.type = ProxyFetcher::Proxy::SOCKS4 29 | expect(proxy.socks4?).to be_truthy 30 | expect(proxy.ssl?).to be_truthy 31 | 32 | proxy.type = ProxyFetcher::Proxy::SOCKS5 33 | expect(proxy.socks5?).to be_truthy 34 | expect(proxy.ssl?).to be_truthy 35 | end 36 | 37 | it "not connectable if IP addr is wrong" do 38 | proxy.addr = "192.168.1.0" 39 | expect(proxy.connectable?).to be_falsey 40 | end 41 | 42 | it "not connectable if there are some error during connection request" do 43 | allow_any_instance_of(HTTP::Client).to receive(:head).and_raise(HTTP::TimeoutError) 44 | expect(proxy.connectable?).to be_falsey 45 | end 46 | 47 | it "returns URI::Generic" do 48 | expect(proxy.uri).to be_a(URI::Generic) 49 | 50 | expect(proxy.uri.host).not_to be_empty 51 | expect(proxy.uri.port).not_to be_nil 52 | end 53 | 54 | it "returns URL" do 55 | expect(proxy.url).to be_a(String) 56 | end 57 | 58 | it "returns URL with scheme" do 59 | expect(proxy.url(scheme: true)).to include("://") 60 | end 61 | end 62 | -------------------------------------------------------------------------------- /spec/proxy_fetcher/version_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | RSpec.describe ProxyFetcher::VERSION do 4 | it { expect(ProxyFetcher::VERSION::STRING).to 
match(/^\d+\.\d+\.\d+(\.\w+)?$/) } 5 | end 6 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "simplecov" 4 | SimpleCov.add_filter "spec" 5 | SimpleCov.add_filter "version" 6 | 7 | if ENV["CI"] || ENV["TRAVIS"] || ENV["COVERALLS"] || ENV["JENKINS_URL"] 8 | require "coveralls" 9 | Coveralls.wear! 10 | else 11 | SimpleCov.start 12 | end 13 | 14 | require "bundler/setup" 15 | Bundler.setup 16 | 17 | require "proxy_fetcher" 18 | 19 | Dir["./spec/support/**/*.rb"].sort.each { |f| require f } 20 | 21 | adapter = ENV["BUNDLE_GEMFILE"][%r{.+/(.+)\.gemfile}i, 1] || :nokogiri 22 | puts "Configured adapter: '#{adapter}'" 23 | 24 | ProxyFetcher.configure do |config| 25 | config.adapter = adapter 26 | end 27 | 28 | RSpec.configure do |config| 29 | config.order = "random" 30 | end 31 | -------------------------------------------------------------------------------- /spec/support/manager_examples.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | RSpec.shared_examples "a manager" do 4 | before :all do 5 | @cached_manager = ProxyFetcher::Manager.new 6 | end 7 | 8 | it "loads proxy list on initialization by default" do 9 | expect(@cached_manager.proxies).not_to be_empty 10 | end 11 | 12 | it "doesn't load proxy list on initialization if `refresh` argument was set to false" do 13 | manager = ProxyFetcher::Manager.new(refresh: false) 14 | expect(manager.proxies).to be_empty 15 | end 16 | 17 | it "returns valid Proxy objects" do 18 | expect(@cached_manager.proxies).to all(be_a(ProxyFetcher::Proxy)) 19 | 20 | @cached_manager.proxies.each do |proxy| 21 | expect(proxy.addr).to match(/\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/i) 22 | expect(proxy.port).to be_a_kind_of(Numeric) 23 | expect(proxy.type).not_to be_empty 24 | 
expect(proxy.country).not_to be_empty 25 | expect(proxy.anonymity).not_to be_empty 26 | expect(proxy.response_time).to be_nil.or(be_a_kind_of(Numeric)) 27 | end 28 | end 29 | 30 | it "returns raw proxies (HOST:PORT)" do 31 | expect(@cached_manager.raw_proxies).to all(be_a(String)) 32 | end 33 | 34 | it "cleanup proxy list from dead servers" do 35 | allow_any_instance_of(ProxyFetcher::Proxy).to receive(:connectable?).and_return(false) 36 | 37 | manager = ProxyFetcher::Manager.new 38 | 39 | expect do 40 | manager.cleanup! 41 | end.to change { manager.proxies }.to([]) 42 | end 43 | 44 | it "doesn't pollute the output with array of proxies" do 45 | manager = ProxyFetcher::Manager.new(refresh: false) 46 | expect(manager.inspect).to eq(manager.to_s) 47 | end 48 | 49 | it "returns first proxy" do 50 | manager = ProxyFetcher::Manager.new 51 | 52 | first_proxy = manager.proxies.first 53 | 54 | expect(manager.get).to eq(first_proxy) 55 | expect(manager.proxies.first).not_to eq(first_proxy) 56 | end 57 | 58 | it "returns first valid proxy" do 59 | manager = ProxyFetcher::Manager.new(refresh: false) 60 | 61 | proxies = Array.new(5) { instance_double("ProxyFetcher::Proxy", connectable?: false) } 62 | manager.instance_variable_set(:@proxies, proxies) 63 | 64 | connectable_proxy = instance_double("ProxyFetcher::Proxy") 65 | allow(connectable_proxy).to receive(:connectable?).and_return(true) 66 | 67 | manager.proxies[0..2].each { |proxy| proxy.instance_variable_set(:@addr, "192.168.1.1") } 68 | manager.proxies[2] = connectable_proxy 69 | 70 | expect(manager.get!).to eq(connectable_proxy) 71 | expect(manager.proxies.size).to be(3) 72 | 73 | expect(manager.get!).to eq(connectable_proxy) 74 | expect(manager.proxies.size).to be(1) 75 | end 76 | 77 | it "returns nothing if proxy list is empty" do 78 | manager = ProxyFetcher::Manager.new(refresh: false) 79 | 80 | expect(manager.get).to be_nil 81 | expect(manager.get!).to be_nil 82 | end 83 | 84 | it "returns random proxy" do 85 | 
expect(@cached_manager.random).to be_an_instance_of(ProxyFetcher::Proxy) 86 | end 87 | end 88 | --------------------------------------------------------------------------------