├── .gitignore ├── CHANGELOG.md ├── Dockerfile ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── TODO ├── async_requests.py ├── build_docker.sh ├── check_from_stdin.py ├── checkers ├── __init__.py ├── base_checker.py ├── d3d_info_checker.py ├── google_com_checker.py └── ipinfo_io_checker.py ├── collectors ├── __init__.py ├── abstract_collector.py ├── pages_collector.py └── web │ ├── cn │ └── 89ip │ │ └── collector.py │ ├── com │ ├── freeproxylists │ │ └── collector.py │ ├── nordvpn │ │ └── collector.py │ └── premproxy │ │ └── collector.py │ ├── net │ ├── checkerproxy │ │ └── collector.py │ ├── free_proxy_list │ │ └── collector.py │ └── freeproxylists │ │ ├── collector.py │ │ └── fake_data │ └── org │ └── proxy_list │ └── collector.py ├── collectors_list.py ├── config_examples ├── docker_settings.py ├── proxy_py.nginx.conf ├── proxy_py.supervisor.conf └── settings.py ├── docker-compose-with-local.yml ├── docker-compose.yml ├── docs ├── Makefile ├── source │ ├── api_overview.md │ ├── conf.py │ ├── guides │ │ ├── guides.rst │ │ └── how_to_create_collector.rst │ ├── index.rst │ ├── modules │ │ ├── async_requests.rst │ │ ├── check_from_stdin.rst │ │ ├── checkers.base_checker.rst │ │ ├── checkers.d3d_info_checker.rst │ │ ├── checkers.google_com_checker.rst │ │ ├── checkers.ipinfo_io_checker.rst │ │ ├── checkers.rst │ │ ├── collectors.abstract_collector.rst │ │ ├── collectors.checkerproxy_net.collector_checkerproxy_net.rst │ │ ├── collectors.checkerproxy_net.collector_checkerproxy_net_today.rst │ │ ├── collectors.checkerproxy_net.rst │ │ ├── collectors.collector.rst │ │ ├── collectors.free_proxy_list_net.base_collector_free_proxy_list_net.rst │ │ ├── collectors.free_proxy_list_net.collector_free_proxy_list_net.rst │ │ ├── collectors.free_proxy_list_net.collector_free_proxy_list_net_anonymous_proxy.rst │ │ ├── collectors.free_proxy_list_net.collector_free_proxy_list_net_uk_proxy.rst │ │ ├── collectors.free_proxy_list_net.collector_socks_proxy_net.rst │ │ ├── 
collectors.free_proxy_list_net.collector_sslproxies_org.rst │ │ ├── collectors.free_proxy_list_net.collector_us_proxy_org.rst │ │ ├── collectors.free_proxy_list_net.rst │ │ ├── collectors.freeproxylists_net.freeproxylists_net.rst │ │ ├── collectors.freeproxylists_net.rst │ │ ├── collectors.gatherproxy_com.collector_gatherproxy_com.rst │ │ ├── collectors.gatherproxy_com.rst │ │ ├── collectors.nordvpn_com.nordvpn_com.rst │ │ ├── collectors.nordvpn_com.rst │ │ ├── collectors.pages_collector.rst │ │ ├── collectors.premproxy_com.base_collector_premproxy_com.rst │ │ ├── collectors.premproxy_com.collector_premproxy_com.rst │ │ ├── collectors.premproxy_com.collector_premproxy_com_socks_list.rst │ │ ├── collectors.premproxy_com.rst │ │ ├── collectors.proxy_list_org.collector_proxy_list_org.rst │ │ ├── collectors.proxy_list_org.rst │ │ ├── collectors.rst │ │ ├── collectors_list.rst │ │ ├── dump_db.rst │ │ ├── fill_db.rst │ │ ├── http_client.rst │ │ ├── main.rst │ │ ├── materialized_view_updater.rst │ │ ├── models.rst │ │ ├── modules.rst │ │ ├── parsers.regex_parser.rst │ │ ├── parsers.rst │ │ ├── processor.rst │ │ ├── proxy_py.rst │ │ ├── proxy_py.settings.rst │ │ ├── proxy_utils.rst │ │ ├── proxy_validator.rst │ │ ├── server.api_v1.api_request_handler.rst │ │ ├── server.api_v1.app.rst │ │ ├── server.api_v1.requests_to_models.request.rst │ │ ├── server.api_v1.requests_to_models.request_executor.rst │ │ ├── server.api_v1.requests_to_models.request_parser.rst │ │ ├── server.api_v1.requests_to_models.rst │ │ ├── server.api_v1.rst │ │ ├── server.base_app.rst │ │ ├── server.frontend.app.rst │ │ ├── server.frontend.rst │ │ ├── server.proxy_provider_server.rst │ │ ├── server.rst │ │ ├── setup.rst │ │ ├── statistics.rst │ │ ├── statistics.statistics.rst │ │ ├── tests.rst │ │ ├── tests.test_api.rst │ │ ├── tests.test_http_client.rst │ │ └── tests.test_proxy_validation_regex.rst │ └── readme_link.rst └── update_doc.sh ├── http_client.py ├── init_db.sql ├── logs └── .gitignore ├── 
main.py ├── materialized_view_updater.py ├── models.py ├── parsers ├── __init__.py └── regex_parser.py ├── processor.py ├── proxy_py ├── __init__.py └── _settings.py ├── proxy_utils.py ├── proxy_validator.py ├── requirements.txt ├── run.sh ├── server ├── __init__.py ├── api_v1 │ ├── __init__.py │ ├── api_request_handler.py │ ├── app.py │ └── requests_to_models │ │ ├── __init__.py │ │ ├── request.py │ │ ├── request_executor.py │ │ └── request_parser.py ├── api_v2 │ ├── api_request_handler.py │ └── app.py ├── base_app.py ├── frontend │ ├── __init__.py │ └── app.py ├── proxy_provider_server.py └── templates │ ├── base.html │ ├── collector_state.html │ ├── index.html │ ├── number_of_proxies_to_process.html │ ├── processor_proxies_queue_size.html │ ├── proxies.html │ └── proxy_count_items.html ├── setup.cfg ├── setup.py ├── statistics ├── __init__.py └── statistics.py ├── tests ├── __init__.py ├── test_api.py ├── test_http_client.py └── test_proxy_validation_regex.py ├── tools └── test_collector.py └── www └── robots.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .ropeproject 2 | __pycache__ 3 | .idea/ 4 | db.sqlite3* 5 | *.swp 6 | proxy_py/settings.py 7 | env/ 8 | dist/ 9 | proxy_py.egg-info 10 | proxypy.egg-info/ 11 | build/ 12 | local/ 13 | .pytest_cache 14 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # CHANGELOG 2 | 3 | ## v2.0 4 | 5 | * Changed ORM to peewee_async(for performance purposes) 6 | * Documentation (http://proxy-py.readthedocs.io) 7 | * Improved logging 8 | * Changed server API URL. new is /api/v1/ 9 | * Logging client's IP 10 | * Now recently updated proxies won't update if they are gotten from collector 11 | * Fixed bug with printing exceptions to stdout 12 | 13 | ## v1.0.2 14 | 15 | ### Improved proxy checking 16 | 17 | Multiple sites for checking proxies. 
New information about proxies(real IP, country, region, city) 18 | 19 | 20 | ## v1.0.1 21 | 22 | ### Fix server API 23 | 24 | Now it's sending you status code 400/500 if error happened. 25 | 26 | 27 | ## v1.0 28 | 29 | first release 30 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-slim 2 | 3 | RUN apt-get update \ 4 | && apt-get install -y wget \ 5 | && rm -rf /var/lib/apt/lists/* \ 6 | && rm /bin/sh \ 7 | && ln -s /bin/bash /bin/sh \ 8 | && groupadd -r user \ 9 | && useradd --create-home --no-log-init -r -g user user \ 10 | && mkdir /proxy_py \ 11 | && chown user:user /proxy_py 12 | 13 | WORKDIR /proxy_py 14 | USER user 15 | 16 | ARG VERSION=1f186bc451781047175655656c0bcb655e174660 17 | 18 | RUN echo "Downloading proxy_py sources..." \ 19 | && wget https://github.com/DevAlone/proxy_py/archive/$VERSION.tar.gz -O sources.tar.gz 2> /dev/null \ 20 | && tar -xf sources.tar.gz && rm sources.tar.gz \ 21 | && mv proxy_py-*/.[!.]* ./ && mv proxy_py-*/* ./ \ 22 | && rmdir proxy_py-* \ 23 | && python3 -m venv env \ 24 | # they became too greedy to allow free downloading 25 | # && echo "Creating IP:Location database..." \ 26 | # && mkdir /tmp/proxy_py_9910549a_7d41_4102_9e9d_15d39418a5cb \ 27 | # && cd /tmp/proxy_py_9910549a_7d41_4102_9e9d_15d39418a5cb \ 28 | # && wget "https://geolite.maxmind.com/download/geoip/database/GeoLite2-City.tar.gz" 2> /dev/null \ 29 | # && tar xf GeoLite2-City.tar.gz \ 30 | # && mv GeoLite2-City_*/GeoLite2-City.mmdb ./ \ 31 | # && rm -r GeoLite2-City_* \ 32 | && cd /proxy_py \ 33 | && cp config_examples/settings.py proxy_py/settings.py \ 34 | && echo "Installing dependencies..." 
\ 35 | && source ./env/bin/activate \ 36 | && pip3 install -r requirements.txt --no-cache-dir 37 | 38 | EXPOSE 55555 39 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include the README 2 | include *.md 3 | 4 | # Include the license file 5 | include LICENSE.txt 6 | 7 | # Include the data files 8 | # recursive-include data * 9 | 10 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | proxy_py README 2 | =============== 3 | 4 | proxy_py is a program which collects proxies, saves them in 5 | a database and makes periodically checks. 6 | It has a server for getting proxies with nice API(see below). 7 | 8 | Where is the documentation? 9 | *************************** 10 | 11 | It's here -> https://proxy-py.readthedocs.io 12 | 13 | How to support this project? 14 | **************************** 15 | 16 | You can donate here -> https://www.patreon.com/join/2313433 17 | 18 | Thank you :) 19 | 20 | How to install? 21 | *************** 22 | 23 | There is a prepared docker image. 24 | 25 | 1 Install docker and docker compose. If you're using ubuntu: 26 | 27 | .. code-block:: bash 28 | 29 | sudo apt install docker.io docker-compose 30 | 31 | 2 Download docker compose config: 32 | 33 | .. code-block:: bash 34 | 35 | wget "https://raw.githubusercontent.com/DevAlone/proxy_py/master/docker-compose.yml" 36 | 37 | 2 Create a container 38 | 39 | .. code-block:: bash 40 | 41 | docker-compose build 42 | 43 | 3 Run 44 | 45 | .. code-block:: bash 46 | 47 | docker-compose up 48 | 49 | It will give you a server on address localhost:55555 50 | 51 | To see running containers use 52 | 53 | .. code-block:: bash 54 | 55 | docker-compose ps 56 | 57 | To stop proxy_py use 58 | 59 | .. 
code-block:: bash 60 | 61 | docker-compose stop 62 | 63 | How to get proxies? 64 | ******************* 65 | 66 | proxy_py has a server, based on aiohttp, which is listening 127.0.0.1:55555 67 | (you can change it in the settings file) and provides proxies. 68 | To get proxies you should send the following json request 69 | on address `http://127.0.0.1:55555/api/v1/` 70 | (or other domain if behind reverse proxy): 71 | 72 | .. code-block:: json 73 | 74 | { 75 | "model": "proxy", 76 | "method": "get", 77 | "order_by": "response_time, uptime" 78 | } 79 | 80 | Note: order_by makes the result sorted 81 | by one or more fields(separated by comma). 82 | You can skip it. 83 | The required fields are `model` and `method`. 84 | 85 | It's gonna return you the json response like this: 86 | 87 | .. code-block:: json 88 | 89 | { 90 | "count": 1, 91 | "data": [{ 92 | "address": "http://127.0.0.1:8080", 93 | "auth_data": "", 94 | "bad_proxy": false, 95 | "domain": "127.0.0.1", 96 | "last_check_time": 1509466165, 97 | "number_of_bad_checks": 0, 98 | "port": 8080, 99 | "protocol": "http", 100 | "response_time": 461691, 101 | "uptime": 1509460949 102 | }], 103 | "has_more": false, 104 | "status": "ok", 105 | "status_code": 200 106 | } 107 | 108 | Note: All fields except *protocol*, *domain*, *port*, *auth_data*, 109 | *checking_period* and *address* CAN be null 110 | 111 | Or error if something went wrong: 112 | 113 | .. code-block:: json 114 | 115 | { 116 | "error_message": "You should specify \"model\"", 117 | "status": "error", 118 | "status_code": 400 119 | } 120 | 121 | Note: status_code is also duplicated in HTTP status code 122 | 123 | Example using curl: 124 | 125 | .. code-block:: bash 126 | 127 | curl -X POST http://127.0.0.1:55555/api/v1/ -H "Content-Type: application/json" --data '{"model": "proxy", "method": "get"}' 128 | 129 | Example using httpie: 130 | 131 | .. 
code-block:: bash 132 | 133 | http POST http://127.0.0.1:55555/api/v1/ model=proxy method=get 134 | 135 | Example using python's *requests* library: 136 | 137 | .. code-block:: python 138 | 139 | import requests 140 | import json 141 | 142 | 143 | def get_proxies(): 144 | result = [] 145 | json_data = { 146 | "model": "proxy", 147 | "method": "get", 148 | } 149 | url = "http://127.0.0.1:55555/api/v1/" 150 | 151 | response = requests.post(url, json=json_data) 152 | if response.status_code == 200: 153 | response = json.loads(response.text) 154 | for proxy in response["data"]: 155 | result.append(proxy["address"]) 156 | else: 157 | # check error here 158 | pass 159 | 160 | return result 161 | 162 | Example using aiohttp library: 163 | 164 | .. code-block:: python 165 | 166 | import aiohttp 167 | 168 | 169 | async def get_proxies(): 170 | result = [] 171 | json_data = { 172 | "model": "proxy", 173 | "method": "get", 174 | } 175 | 176 | url = "http://127.0.0.1:55555/api/v1/" 177 | 178 | async with aiohttp.ClientSession() as session: 179 | async with session.post(url, json=json_data) as response: 180 | if response.status == 200: 181 | response = json.loads(await response.text()) 182 | for proxy in response["data"]: 183 | result.append(proxy["address"]) 184 | else: 185 | # check error here 186 | pass 187 | 188 | return result 189 | 190 | How to interact with API? 191 | ************************* 192 | 193 | Read more about API here -> https://proxy-py.readthedocs.io/en/latest/api_v1_overview.html 194 | 195 | # TODO: add readme about API v2 196 | 197 | What about WEB interface? 198 | ************************* 199 | 200 | There is lib.ru inspired web interface which consists of these pages(with slash at the end): 201 | 202 | - http://localhost:55555/i/get/proxy/ 203 | - http://localhost:55555/i/get/proxy_count_item/ 204 | - http://localhost:55555/i/get/number_of_proxies_to_process/ 205 | - http://localhost:55555/i/get/collector_state/ 206 | 207 | How to contribute? 
208 | ****************** 209 | 210 | Just fork, make your changes(implement new collector, fix a bug 211 | or whatever you want) and create pull request. 212 | 213 | Here are some useful guides: 214 | 215 | - `How to create a collector `_ 216 | 217 | How to test it? 218 | *************** 219 | 220 | If you've made changes to the code and want to check that you didn't break 221 | anything, just run 222 | 223 | .. code-block:: bash 224 | 225 | py.test 226 | 227 | inside virtual environment in proxy_py project directory. 228 | 229 | How to use custom checkers/collectors? 230 | ************************************** 231 | 232 | If you wan't to collect proxies from your source or you need proxies to work with particular site, 233 | you can write your own collectors or/and checkers. 234 | 235 | 1. Create your checkers/collectors in current directory following the next directory structure: 236 | 237 | // TOOD: add more detailed readme about it 238 | 239 | .. code-block:: 240 | 241 | local/ 242 | ├── requirements.txt 243 | ├── checkers 244 | │   └── custom_checker.py 245 | └── collectors 246 |    └── custom_collector.py 247 | 248 | You can create only checker or collector if you want so 249 | 250 | 2. Create `proxy_py/settings.py` in current dir with the following content 251 | 252 | .. code-block:: python3 253 | 254 | from ._settings import * 255 | from local.checkers.custom_checker import CustomChecker 256 | 257 | PROXY_CHECKERS = [CustomChecker] 258 | 259 | COLLECTORS_DIRS = ['local/collectors'] 260 | 261 | you can append your checker to PROXY_CHECKERS or COLLECTORS_DIRS instead of overriding to use built in ones as well, it's just normal python file. 262 | See `proxy_py/_settings.py` for more detailed instructions on options. 263 | 264 | 3. Follow the steps in "How to install?" but download this docker-compose config instead 265 | 266 | .. 
code-block:: bash 267 | 268 | wget "https://raw.githubusercontent.com/DevAlone/proxy_py/master/docker-compose-with-local.yml" 269 | 270 | and run with command 271 | 272 | .. code-block:: bash 273 | 274 | docker-compose -f docker-compose-with-local.yml up 275 | 276 | 4. ...? 277 | 278 | 5. Profit! 279 | 280 | How to build from scratch? 281 | ************************** 282 | 283 | 1. Clone this repository 284 | 285 | .. code-block:: bash 286 | 287 | git clone https://github.com/DevAlone/proxy_py.git 288 | 289 | 2. Install requirements 290 | 291 | .. code-block:: bash 292 | 293 | cd proxy_py 294 | pip3 install -r requirements.txt 295 | 296 | 3. Create settings file 297 | 298 | .. code-block:: bash 299 | 300 | cp config_examples/settings.py proxy_py/settings.py 301 | 302 | 4. Install postgresql and change database configuration in settings.py file 303 | 304 | 5. (Optional) Configure alembic 305 | 306 | 6. Run your application 307 | 308 | .. code-block:: bash 309 | 310 | python3 main.py 311 | 312 | 7. Enjoy! 
313 | 314 | 315 | Mirrors 316 | ******* 317 | 318 | * https://github.com/DevAlone/proxy_py 319 | * https://gitlab.com/DevAlone/proxy_py 320 | * https://bitbucket.org/d3dev/proxy_py 321 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | new web interface 2 | statistics for proxies and for collectors 3 | log slow sql queries 4 | set limit for total number of proxies 5 | do not update never working proxies 6 | more statistics 7 | api for getting proxy near location 8 | client library 9 | api for filtering proxies 10 | floating updating period for collectors 11 | p2p 12 | log memory consumption 13 | stop using ipinfo.io because of rate limits 14 | remove processor proxies queue model 15 | write number of asyncio tasks to DB to be able to draw more grahps(everybody loves statistics) 16 | fix issue with domains like this 08.8.8.8 17 | refresh materialized view periodically 18 | send request to proxy multiple times 19 | consider moving to PyPy 20 | -------------------------------------------------------------------------------- /async_requests.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import aiohttp 4 | from aiosocks.connector import ProxyClientRequest, ProxyConnector 5 | 6 | 7 | async def get(url, **kwargs): 8 | return await request("get", url, **kwargs) 9 | 10 | 11 | async def post(url, data, **kwargs): 12 | if data is dict or data is str: 13 | kwargs["json"] = data 14 | else: 15 | kwargs["data"] = data 16 | 17 | return await request("post", url, **kwargs) 18 | 19 | 20 | async def request(method, url, **kwargs): 21 | session_kwargs = {} 22 | if "proxy" in kwargs and kwargs["proxy"].startswith("socks"): 23 | session_kwargs["connector"] = ProxyConnector(remote_resolve=False) 24 | session_kwargs["request_class"] = ProxyClientRequest 25 | 26 | if "cookies" in kwargs: 27 | 
session_kwargs["cookies"] = kwargs["cookies"] 28 | del kwargs["cookies"] 29 | 30 | if "timeout" not in kwargs: 31 | kwargs["timeout"] = 10 32 | 33 | # headers={'User-Agent': get_random_user_agent()} 34 | if "headers" not in kwargs: 35 | kwargs["headers"] = {"User-Agent": get_random_user_agent()} 36 | elif "User-Agent" not in kwargs["headers"]: 37 | kwargs["headers"]["User-Agent"] = get_random_user_agent() 38 | 39 | if "override_session" in kwargs: 40 | session = kwargs["override_session"] 41 | del kwargs["override_session"] 42 | async with session.request(method, url, **kwargs) as response: 43 | return await Response.from_aiohttp_response(response) 44 | 45 | async with aiohttp.ClientSession(**session_kwargs) as session: 46 | async with session.request(method, url, **kwargs) as response: 47 | return await Response.from_aiohttp_response(response) 48 | 49 | 50 | class Response: 51 | def __init__(self, status, text, aiohttp_response=None): 52 | self.status = status 53 | self.text = text 54 | self.aiohttp_response = aiohttp_response 55 | 56 | @staticmethod 57 | async def from_aiohttp_response(aiohttp_response): 58 | return Response( 59 | status=aiohttp_response.status, 60 | text=await aiohttp_response.text(), 61 | aiohttp_response=aiohttp_response, 62 | ) 63 | 64 | def __str__(self): 65 | return json.dumps( 66 | { 67 | "status": self.status, 68 | "text": self.text, 69 | } 70 | ) 71 | 72 | __repr__ = __str__ 73 | 74 | 75 | def get_random_user_agent(): 76 | return "Mozilla/5.0 (Windows NT;) Gecko/20100101 Firefox/58.0" 77 | # return 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0' 78 | # TODO: do it 79 | # return UserAgent().random 80 | -------------------------------------------------------------------------------- /build_docker.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | if [ $# -eq 0 ] 3 | then 4 | tag='latest' 5 | git checkout master || exit 1 6 | else 7 | tag=$1 8 | git 
checkout $tag || exit 1 9 | fi 10 | 11 | docker build -t devalone/proxy_py:$tag . 12 | docker push devalone/proxy_py:$tag 13 | -------------------------------------------------------------------------------- /check_from_stdin.py: -------------------------------------------------------------------------------- 1 | """ 2 | just a helper script for testing proxies 3 | """ 4 | import asyncio 5 | import re 6 | import sys 7 | 8 | import proxy_utils 9 | from checkers.base_checker import BaseChecker 10 | from models import Proxy 11 | from proxy_py import settings 12 | 13 | proxy_find_regex = ( 14 | r"([0-9]{1,3})[^0-9]+([0-9]{1,3})[^0-9]+([0-9]{1,3})[^0-9]+([0-9]{1,3})" 15 | r"[^0-9]+([0-9]{1,5})" 16 | ) 17 | semaphore = asyncio.BoundedSemaphore(settings.NUMBER_OF_CONCURRENT_TASKS) 18 | tasks = [] 19 | 20 | 21 | async def check_task(ip, port): 22 | async with semaphore: 23 | for raw_protocol in range(len(Proxy.PROTOCOLS)): 24 | proxy_url = "{}://{}:{}".format(Proxy.PROTOCOLS[raw_protocol], ip, port) 25 | check_result, _ = await proxy_utils.check_proxy(proxy_url) 26 | if check_result: 27 | break 28 | # if check_result: 29 | # print('proxy {} works'.format(proxy_url)) 30 | print("+" if check_result else "-", end="", file=sys.stderr) 31 | sys.stderr.flush() 32 | 33 | 34 | async def main(): 35 | for line in sys.stdin: 36 | line = line.strip() 37 | try: 38 | groups = re.search(proxy_find_regex, line).groups() 39 | except: 40 | continue 41 | ip = ".".join(groups[:4]) 42 | port = groups[4] 43 | 44 | tasks.append(asyncio.ensure_future(check_task(ip, port))) 45 | 46 | await asyncio.gather(*tasks) 47 | print() 48 | BaseChecker.clean() 49 | 50 | 51 | if __name__ == "__main__": 52 | asyncio.get_event_loop().run_until_complete(main()) 53 | -------------------------------------------------------------------------------- /checkers/__init__.py: -------------------------------------------------------------------------------- 1 | from checkers.d3d_info_checker import D3DInfoChecker 2 | from 
checkers.google_com_checker import GoogleComChecker 3 | from checkers.ipinfo_io_checker import IPInfoIOChecker 4 | -------------------------------------------------------------------------------- /checkers/base_checker.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import ssl 3 | 4 | import aiohttp 5 | import aiosocks 6 | from aiosocks.connector import ProxyClientRequest, ProxyConnector 7 | 8 | import async_requests 9 | from proxy_py import settings 10 | 11 | 12 | class CheckerResult: 13 | # TODO: change to properties with validation 14 | ipv4 = None 15 | ipv6 = None 16 | city = None 17 | region = None 18 | country_code = None 19 | # tuple 20 | location_coordinates = None 21 | organization_name = None 22 | 23 | def update_from_other(self, other): 24 | def set_attr_if_is_not_none(attribute_name, first_obj, second_obj): 25 | if hasattr(second_obj, attribute_name): 26 | second_val = getattr(second_obj, attribute_name) 27 | setattr(first_obj, attribute_name, second_val) 28 | 29 | set_attr_if_is_not_none("ipv4", self, other) 30 | set_attr_if_is_not_none("ipv6", self, other) 31 | set_attr_if_is_not_none("city", self, other) 32 | set_attr_if_is_not_none("region", self, other) 33 | set_attr_if_is_not_none("country_code", self, other) 34 | set_attr_if_is_not_none("location_coordinates", self, other) 35 | set_attr_if_is_not_none("organization_name", self, other) 36 | 37 | 38 | class BaseChecker: 39 | # TODO: rewrite using HttpClient 40 | aiohttp_connector = None 41 | 42 | def __init__(self, url=None, request_type="GET", timeout=None): 43 | if BaseChecker.aiohttp_connector is None: 44 | BaseChecker.aiohttp_connector = ProxyConnector( 45 | remote_resolve=True, 46 | limit=settings.NUMBER_OF_SIMULTANEOUS_REQUESTS, 47 | limit_per_host=settings.NUMBER_OF_SIMULTANEOUS_REQUESTS_PER_HOST, 48 | ) 49 | self.request_type = request_type 50 | self.timeout = ( 51 | timeout if timeout is not None else 
settings.PROXY_CHECKING_TIMEOUT 52 | ) 53 | self.url = url 54 | 55 | @staticmethod 56 | async def init(): 57 | """ 58 | Override to do some initialization. called once per program life 59 | 60 | :return: 61 | """ 62 | 63 | @staticmethod 64 | def get_aiohttp_connector(): 65 | return BaseChecker.aiohttp_connector 66 | 67 | @staticmethod 68 | def clean(): 69 | """ 70 | Should be called at the end of the program 71 | 72 | :return: 73 | """ 74 | BaseChecker.aiohttp_connector.close() 75 | 76 | async def check(self, proxy_address: str, timeout: int = None) -> tuple: 77 | """ 78 | Checks proxy and returns additional information if such was provided by checker server 79 | 80 | :param proxy_address: string representing proxy ("http://user@qwerty@127.0.0.1:8080") 81 | :param timeout: overrides timeout if not None 82 | :return: tuple where first item is bool indication whether proxy is working or not 83 | and second one is additional information structure with information like white ip address, country and so on 84 | """ 85 | 86 | timeout = timeout if timeout is not None else self.timeout 87 | 88 | try: 89 | return await self._request(proxy_address, timeout) 90 | except ( 91 | aiohttp.client_exceptions.ServerDisconnectedError, 92 | aiohttp.client_exceptions.ClientHttpProxyError, 93 | aiohttp.client_exceptions.ClientProxyConnectionError, 94 | aiohttp.client_exceptions.ClientResponseError, 95 | aiohttp.client_exceptions.ClientPayloadError, 96 | aiosocks.errors.SocksError, 97 | aiosocks.SocksError, 98 | asyncio.TimeoutError, 99 | ssl.CertificateError, 100 | aiohttp.client_exceptions.ClientOSError, 101 | ) as ex: 102 | message = str(ex).lower() 103 | 104 | if "too many open file" in message: 105 | raise OSError("Too many open files") 106 | 107 | if settings.DEBUG: 108 | # TODO: move to logs! 
109 | print( 110 | f"proxy {proxy_address} doesn't work because of exception {type(ex)}, message is {message}" 111 | ) 112 | 113 | return False, None 114 | 115 | async def _request(self, proxy_address, timeout) -> tuple: 116 | checker_result = CheckerResult() 117 | 118 | if self.url is None: 119 | raise Exception() 120 | 121 | headers = {"User-Agent": async_requests.get_random_user_agent()} 122 | conn = BaseChecker.get_aiohttp_connector() 123 | 124 | async with aiohttp.ClientSession( 125 | connector=conn, connector_owner=False, request_class=ProxyClientRequest 126 | ) as session: 127 | async with session.request( 128 | self.request_type, 129 | self.url, 130 | proxy=proxy_address, 131 | timeout=timeout, 132 | headers=headers, 133 | ) as response: 134 | is_working = await self.validate(response, checker_result) 135 | 136 | return is_working, checker_result 137 | 138 | async def validate( 139 | self, response: aiohttp.ClientResponse, checker_result: CheckerResult 140 | ) -> bool: 141 | """ 142 | Implement this method. 
It will get response from url with http method you provided in constructor 143 | 144 | :param response: aiohttp response 145 | :param checker_result: fill this structure with information like ip address 146 | :return: whether proxy is working or not 147 | """ 148 | raise NotImplemented() 149 | -------------------------------------------------------------------------------- /checkers/d3d_info_checker.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | 3 | from checkers.base_checker import BaseChecker, CheckerResult 4 | 5 | 6 | class D3DInfoChecker(BaseChecker): 7 | def __init__(self, timeout=None): 8 | super(D3DInfoChecker, self).__init__( 9 | "https://test.d3d.info/ok.html", timeout=timeout 10 | ) 11 | 12 | async def validate( 13 | self, response: aiohttp.ClientResponse, checker_result: CheckerResult 14 | ): 15 | return (await response.text()).strip().lower() == "ok" 16 | -------------------------------------------------------------------------------- /checkers/google_com_checker.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | 3 | from checkers.base_checker import BaseChecker, CheckerResult 4 | 5 | 6 | class GoogleComChecker(BaseChecker): 7 | def __init__(self, timeout=None): 8 | super(GoogleComChecker, self).__init__( 9 | "https://www.google.com/humans.txt", timeout=timeout 10 | ) 11 | 12 | async def validate( 13 | self, response: aiohttp.ClientResponse, checker_result: CheckerResult 14 | ): 15 | """ 16 | We have already done the request and it was successful, 17 | Google returned something(maybe good response, maybe captcha, we don't care) 18 | """ 19 | return True 20 | -------------------------------------------------------------------------------- /checkers/ipinfo_io_checker.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | 3 | from checkers.base_checker import BaseChecker, CheckerResult 4 
# ===== checkers/ipinfo_io_checker.py (continued; the file header with imports
# for aiohttp / BaseChecker / CheckerResult is above this chunk) =====

class IPInfoIOChecker(BaseChecker):
    """Proxy checker that validates a proxy by fetching https://ipinfo.io/json
    through it and, on success, copies the returned geo/organization fields
    into the CheckerResult."""

    def __init__(self, timeout=None):
        super(IPInfoIOChecker, self).__init__("https://ipinfo.io/json", timeout=timeout)

    async def validate(
        self, response: aiohttp.ClientResponse, checker_result: CheckerResult
    ) -> bool:
        """Return True iff the response is an HTTP 200 with a JSON body.

        Side effect: fills ``checker_result`` with any of the
        ip/city/region/country/loc/org fields present in the payload.
        """
        if response.status != 200:
            return False

        json_result = await response.json()
        if "ip" in json_result:
            checker_result.ipv4 = json_result["ip"]
        if "city" in json_result:
            checker_result.city = json_result["city"]
        if "region" in json_result:
            checker_result.region = json_result["region"]
        if "country" in json_result:
            checker_result.country_code = json_result["country"]
        if "loc" in json_result:
            # ipinfo.io returns "loc" as "<latitude>,<longitude>";
            # store it as a tuple of floats
            checker_result.location_coordinates = tuple(
                float(x) for x in json_result["loc"].split(",")
            )
        if "org" in json_result:
            checker_result.organization_name = json_result["org"]

        return True


# ===== collectors/__init__.py =====

from collectors.abstract_collector import AbstractCollector
from collectors.pages_collector import PagesCollector


# ===== collectors/abstract_collector.py =====

# TODO: add wrapper for doing requests and saving its cookies and UserAgent
import asyncio
import json

import models
from proxy_py import settings


class AbstractCollector:
    """Base class for all types of collectors"""

    __collector__ = False
    """Set this variable to True in your collector's implementation"""

    def __init__(self):
        self.data = {}
        self.saved_variables = set()

    async def collect(self):
        """
        This method should return proxies in any of the following formats:

        ::

            ip:port
            domain:port
            protocol://ip:port
            protocol://domain:port


        ip can be both ipv4 and ipv6

        return either list or async generator:

        ::

            >> async def collect(self):
            >>     for proxy in something:
            >>         yield proxy

        """

        return []

    async def _collect(self):
        """Do not call yourself! It is called on collector's processing automatically"""
        collect = self.collect()
        if asyncio.iscoroutine(collect):
            # collect() may be a plain coroutine returning a list; adapt it
            # to an async generator so both styles are consumed uniformly
            async def wrapper(f):
                for item in await f:
                    yield item

            collect = wrapper(collect)

        i = 0
        async for proxy in collect:
            # BUGFIX: was `i > MAX`, an off-by-one that let MAX + 1 proxies
            # through; `>=` enforces the configured cap exactly and matches
            # the `[:MAX]` slice used by PagesCollector.collect()
            if i >= settings.COLLECTOR_MAXIMUM_NUMBER_OF_PROXIES_PER_REQUEST:
                break

            yield proxy
            i += 1

        self.last_processing_proxies_count = i

    async def load_state(self, state: models.CollectorState):
        """
        Function for loading collector's state from database model.
        It's called automatically, don't worry. All you can do is
        to override without forgetting to call parent's method like this:

        ::

            async def load_state(self, state):
                super(MyCollector, self).load_state(state)
                # do something here
        """
        self.last_processing_time = state.last_processing_time
        self.processing_period = state.processing_period
        self.last_processing_proxies_count = state.last_processing_proxies_count
        self.data = (
            json.loads(state.data) if state.data is not None and state.data else {}
        )
        # restore the attributes that save_state() stashed under "_variables"
        if "_variables" in self.data:
            for var_name in self.data["_variables"]:
                setattr(self, var_name, self.data["_variables"][var_name])

    async def save_state(self, state: models.CollectorState):
        """
        Function for saving collector's state to database model.
        It's called automatically, don't worry about it.
        """
        state.last_processing_time = self.last_processing_time
        state.processing_period = self.processing_period
        state.last_processing_proxies_count = self.last_processing_proxies_count

        # mirror every registered attribute into data["_variables"] so that
        # load_state() can restore it on the next run
        if self.saved_variables is not None:
            if "_variables" not in self.data:
                self.data["_variables"] = {}
            for var_name in self.saved_variables:
                self.data["_variables"][var_name] = getattr(self, var_name)

        state.data = json.dumps(self.data)

    last_processing_time = 0
    """time in unix timestamp(seconds from 01.01.1970)"""

    last_processing_proxies_count = 0
    """how many proxies we got on last request, do not change manually"""

    processing_period = 60 * 60
    """processing period in seconds"""

    # TODO: create this feature
    floating_processing_period = True
    """
    this means processing period may be changed
    if collector returns too few proxies, it will be increased
    """

    override_maximum_processing_period = None
    """
    ignore settings' maximum processing period and set
    it to the value of this variable
    """

    override_minimum_processing_period = None
    """
    ignore settings' minimum processing period and set
    it to the value of this variable, for example
    when some collector has requests time limit
    """

    data = None
    """
    here you can store some information,
    it will be written into and read from database
    by magic, don't worry about it :)
    If you're curious, see process_collector_of_state() function
    from processor.py file

    Don't use names starting with the underscore
    like this one: _last_page
    """

    saved_variables = None
    """
    Set of variables which are saved to database automatically(inside data dict)
    """
# ===== collectors/pages_collector.py =====

from collectors.abstract_collector import AbstractCollector
from proxy_py import settings


# TODO: save pages to collector state
class PagesCollector(AbstractCollector):
    """
    Collector for paginated APIs. Pages are started from 0.
    Here you should override ``process_page(page_index)`` method.
    This collector will care about pages, increment it on each processing
    and will reset it to 0 if there is no proxies on the page or if proxies
    are the same as those on the previous one. If you don't want such smart
    behavior, just set dynamic_pages_count to false
    and set pages_count manually.
    """

    def __init__(self):
        super(PagesCollector, self).__init__()
        self.last_proxies_list = []
        # these attributes survive restarts via AbstractCollector.save_state()
        self.saved_variables.add("current_page")
        self.saved_variables.add("pages_count")
        self.saved_variables.add("last_proxies_list")

    async def collect(self):
        """Fetch the current page, update the page counters, return proxies."""
        proxies = list(await self.process_page(self.current_page))[
            : settings.COLLECTOR_MAXIMUM_NUMBER_OF_PROXIES_PER_REQUEST
        ]

        if self.dynamic_pages_count:
            if proxies:
                self.pages_count = self.current_page + 2
                """
                for those APIs which returns
                the last page for nonexistent ones
                """
                proxies_set = set(proxies)

                # identical page content means we ran off the end: stop growing
                if set(self.last_proxies_list) == proxies_set:
                    self.pages_count = self.current_page + 1

                self.last_proxies_list = list(proxies_set)
            else:
                self.pages_count = self.current_page + 1

        self.current_page += 1
        if self.current_page >= self.pages_count:
            self.current_page = 0

        return proxies

    async def process_page(self, page_index):
        """
        you should override this in your class derived from PagesCollector.

        `page_index` changes from 0 to pages_count(excluded)
        """
        return []

    pages_count = 0
    """set this value or use dynamic pages count"""
    current_page = 0

    dynamic_pages_count = True
    """use dynamic pages count"""

    processing_period = 60 * 10

    last_proxies_list = None


# ===== collectors/web/cn/89ip/collector.py =====

import http_client
from collectors import AbstractCollector
from parsers import RegexParser


class Collector(AbstractCollector):
    __collector__ = True

    def __init__(self):
        super(Collector, self).__init__()
        # 30 minutes
        self.processing_period = 30 * 60
        """
        floating period means proxy_py will be changing
        period to not make extra requests and handle
        new proxies in time, you don't need to change
        it in most cases
        """
        # self.floating_processing_period = False

    async def collect(self):
        url = "http://www.89ip.cn/tqdl.html?num=9999&address=&kill_address=&port=&kill_port=&isp="
        # send a request to get html code of the page
        html = await http_client.get_text(url)
        # and just parse it using regex parser with a default rule to parse
        # proxies like this:
        # 8.8.8.8:8080
        return RegexParser().parse(html)


# ===== collectors/web/com/freeproxylists/collector.py =====

import asyncio
import re

from bs4 import BeautifulSoup

import http_client
from collectors import AbstractCollector

SLEEP_BETWEEN_PAGES_SECONDS = 1


class Collector(AbstractCollector):
    """Scrapes freeproxylists.com: index page -> per-category pages -> the
    raw table endpoint (load_<type>_<id>.html) that holds ip/port rows."""

    __collector__ = True

    def __init__(self):
        super(Collector, self).__init__()
        # it provides really a lot of proxies so we'll check it rarely
        # 24 hours
        self.processing_period = 24 * 3600
        self.url = "http://freeproxylists.com"

    async def collect(self):
        html = await http_client.get_text(self.url)
        soup = BeautifulSoup(html, features="lxml")

        for link in soup.select("a"):
            # BUGFIX: `link["href"]` raised KeyError for anchors without an
            # href attribute and aborted the whole collect; .get() skips them
            link = link.get("href", "").strip()

            if re.match(r"^/[a-zA-Z0-9_-]+\.html$", link):
                async for proxy in self.collect_from_page(link):
                    yield proxy

    async def collect_from_page(self, page_link):
        html = await http_client.get_text(self.url + page_link)

        soup = BeautifulSoup(html, features="lxml")

        for link in soup.select("a"):
            link = link.get("href", "").strip()

            regex = r"^([a-zA-Z0-9_-]+)/([0-9]+)\.html$"
            match = re.match(regex, link)

            if match:
                type_of_proxies, proxies_id = match.groups()
                url = f"{self.url}/load_{type_of_proxies}_{proxies_id}.html"

                async for proxy in self.collect_from_table(url):
                    yield proxy

    async def collect_from_table(self, table_url):
        html = await http_client.get_text(table_url)

        soup = BeautifulSoup(html, features="lxml")

        # the table markup is delivered escaped inside a <quote> element
        table_text = soup.find("quote").contents[0]
        soup = BeautifulSoup(table_text, features="lxml")

        for tr in soup.find_all("tr"):
            children = tr.find_all("td")
            if len(children) != 2:
                continue

            ip, port = [child.contents[0] for child in children]
            proxy = f"{ip}:{port}"
            yield proxy

        # be polite: rate-limit successive table fetches
        await asyncio.sleep(SLEEP_BETWEEN_PAGES_SECONDS)


# ===== collectors/web/com/nordvpn/collector.py (header; class continues in the next chunk) =====

import json

import async_requests
from collectors.pages_collector import PagesCollector

URL_PATTERN = (
    "https://nordvpn.com/wp-admin/admin-ajax.php?searchParameters[0][name]=proxy-country"
    "&searchParameters[0][value]=&searchParameters[1][name]=proxy-ports&searchParameters[1][value]="
    "&offset={}&limit={}&action=getProxies"
)
# ===== collectors/web/com/nordvpn/collector.py =====

import json

import async_requests
from collectors.pages_collector import PagesCollector

URL_PATTERN = (
    "https://nordvpn.com/wp-admin/admin-ajax.php?searchParameters[0][name]=proxy-country"
    "&searchParameters[0][value]=&searchParameters[1][name]=proxy-ports&searchParameters[1][value]="
    "&offset={}&limit={}&action=getProxies"
)


class Collector(PagesCollector):
    """Paginated collector for nordvpn.com's public proxy AJAX endpoint."""

    # this collector gives a lot of bad proxies
    # TODO: do something
    __collector__ = False
    processing_period = 10 * 60

    def __init__(self):
        super(Collector, self).__init__()
        self.pages_count = 10
        self.limit = 100

    async def process_page(self, page_index):
        offset = page_index * self.limit
        resp = await async_requests.get(URL_PATTERN.format(offset, self.limit))
        json_response = json.loads(resp.text)

        result = ["{}:{}".format(item["ip"], item["port"]) for item in json_response]

        # grow the page window while the API keeps returning data
        if result:
            self.pages_count = page_index + 2
        else:
            self.pages_count = page_index

        return result


# ===== collectors/web/com/premproxy/collector.py =====

import re

import async_requests
import lxml.html
from collectors.pages_collector import PagesCollector
from lxml import etree
from py_mini_racer import py_mini_racer


class BaseCollectorPremProxyCom(PagesCollector):
    """premproxy.com obfuscates ports: each cell holds "<ip>|<css-class>" and
    a JS file maps css classes to real port numbers. We run that JS through
    py_mini_racer to rebuild the class -> port table, then decode each row."""

    def __init__(self, url, pages_count):
        super(BaseCollectorPremProxyCom, self).__init__()
        self.url = url
        self.pages_count = pages_count

    async def process_page(self, page_index):
        result = []

        # pages are named 01.htm, 02.htm, ...
        url = self.url + "%02d.htm" % (page_index + 1)

        resp = await async_requests.get(url=url)
        html = resp.text
        tree = lxml.html.fromstring(html)
        elements = tree.xpath(".//td[starts-with(@data-label, 'IP:port')]")

        code_table_url = re.findall(r'script src="(/js(-socks)?/.+?\.js)', html)[0][0]

        # strip "eval" so MiniRacer returns the deobfuscated source instead
        # of executing it
        code_table = (
            await async_requests.get("https://premproxy.com" + code_table_url)
        ).text.replace("eval", "")
        ports_code_table = {
            match[0]: match[1]
            for match in re.findall(
                r"\$\('.([a-z0-9]+)'\)\.html\(([0-9]+)\)",
                py_mini_racer.MiniRacer().execute(code_table),
            )
        }
        for el in elements:
            element_html = str(etree.tostring(el))
            # BUGFIX: the named groups had lost their names ("(?P[...]" is
            # not valid regex syntax); restore (?P<address>...) / (?P<port>...)
            address, port = re.search(
                r"(?P<address>[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3})"
                r"\|(?P<port>[a-z0-9]+)",
                element_html,
            ).groups()
            try:
                port = ports_code_table[port]
            except KeyError as ex:
                raise Exception(
                    "symbol is not present in code table: {}. address: {}".format(
                        str(ex), address
                    )
                )
            proxy = "{}:{}".format(address, port)
            result.append(proxy)

        return result


class Collector(BaseCollectorPremProxyCom):
    __collector__ = True

    def __init__(self):
        super(Collector, self).__init__("https://premproxy.com/list/", 20)


class CollectorSocksList(BaseCollectorPremProxyCom):
    __collector__ = True

    def __init__(self):
        super(CollectorSocksList, self).__init__(
            "https://premproxy.com/socks-list/", 20
        )


# ===== collectors/web/net/checkerproxy/collector.py =====

import datetime
import json

import async_requests
from collectors.abstract_collector import AbstractCollector


class Collector(AbstractCollector):
    """Fetches checkerproxy.net's JSON archive; time_delta selects which day
    (yesterday for the base class, today for CollectorToday)."""

    __collector__ = True

    def __init__(self):
        super(Collector, self).__init__()
        self.processing_period = 3600 * 12
        self.time_delta = datetime.timedelta(-1)

    async def collect(self):
        url = "https://checkerproxy.net/api/archive/{}".format(
            str(datetime.date.today() + self.time_delta)
        )

        res = await async_requests.get(url)
        text = res.text

        json_data = json.loads(text)

        return [proxy["addr"] for proxy in json_data]


class CollectorToday(Collector):
    __collector__ = True

    def __init__(self):
        super(CollectorToday, self).__init__()
        self.processing_period = 3600 * 3
        self.time_delta = datetime.timedelta(0)
# ===== collectors/web/net/free_proxy_list/collector.py =====

import lxml.etree
import lxml.html

import async_requests
from collectors.abstract_collector import AbstractCollector


class BaseCollectorFreeProxyListNet(AbstractCollector):
    """Shared scraper for the free-proxy-list.net family of sites: each
    renders its proxies inside a <table id="proxylisttable">."""

    def __init__(self, url):
        super(BaseCollectorFreeProxyListNet, self).__init__()
        self.url = url

    async def collect(self):
        result = []

        res = await async_requests.get(self.url)
        html = res.text
        tree = lxml.html.fromstring(html)
        table_element = tree.xpath(".//table[@id='proxylisttable']")[0]
        rows = table_element.xpath(".//tbody/tr")
        for row in rows:
            try:
                ip = row.xpath(".//td")[0].text
                port = row.xpath(".//td")[1].text
                result.append(str(ip) + ":" + str(port))
            # BUGFIX: was a bare `except:`, which also swallows SystemExit,
            # KeyboardInterrupt and (pre-3.8) asyncio.CancelledError; narrow
            # to Exception so malformed rows are still skipped safely
            except Exception:
                pass

        return result


class CollectorFreeProxyListNet(BaseCollectorFreeProxyListNet):
    __collector__ = True

    def __init__(self):
        super(CollectorFreeProxyListNet, self).__init__("https://free-proxy-list.net/")


class CollectorFreeProxyListNetAnonymousProxy(BaseCollectorFreeProxyListNet):
    __collector__ = True

    def __init__(self):
        super(CollectorFreeProxyListNetAnonymousProxy, self).__init__(
            "https://free-proxy-list.net/anonymous-proxy.html"
        )


class CollectorFreeProxyListNetUkProxy(BaseCollectorFreeProxyListNet):
    __collector__ = True

    def __init__(self):
        super(CollectorFreeProxyListNetUkProxy, self).__init__(
            "https://free-proxy-list.net/uk-proxy.html"
        )


class CollectorSocksProxyNet(BaseCollectorFreeProxyListNet):
    __collector__ = True

    def __init__(self):
        super(CollectorSocksProxyNet, self).__init__("https://socks-proxy.net/")


class CollectorSslproxiesOrg(BaseCollectorFreeProxyListNet):
    __collector__ = True

    def __init__(self):
        super(CollectorSslproxiesOrg, self).__init__("https://www.sslproxies.org/")


class CollectorUsProxyOrg(BaseCollectorFreeProxyListNet):
    __collector__ = True

    def __init__(self):
        super(CollectorUsProxyOrg, self).__init__("https://www.us-proxy.org/")


# ===== collectors/web/net/freeproxylists/collector.py (header; class continues in the next chunk) =====

import random
import re
import string
from urllib import parse

import aiohttp
import lxml
import lxml.html

import async_requests
from collectors.pages_collector import PagesCollector

BASE_URL = "http://freeproxylists.net/?page={}"
# ===== collectors/web/net/freeproxylists/collector.py =====

import random
import re
import string
from urllib import parse

import aiohttp
import lxml
import lxml.html

import async_requests
from collectors.pages_collector import PagesCollector

BASE_URL = "http://freeproxylists.net/?page={}"


class Collector(PagesCollector):
    """pages from 1"""

    # __collector__ = True
    # TODO: doesn't work, fix!
    # recaptcha accepts any word
    __collector__ = False

    def __init__(self):
        super(Collector, self).__init__()
        self.dynamic_pages_count = True

    async def process_page(self, page_index):
        # a single session keeps cookies across the captcha round-trips
        async with aiohttp.ClientSession() as session:
            result = []

            resp = await async_requests.get(
                BASE_URL.format(page_index + 1), override_session=session
            )
            text = resp.text
            captcha_match = re.search(r"src=\"(.+?recaptcha/api/challenge.+?)\"", text)

            if captcha_match:
                # the page is gated by an (old-style) recaptcha challenge;
                # walk through its challenge/reload dance before retrying
                captcha_url = captcha_match.groups()[0]
                print("requesting captcha...")
                print(captcha_url)
                captcha_resp = await async_requests.get(
                    captcha_url, override_session=session
                )
                print(captcha_resp.text)
                challenge = re.search(
                    r"challenge.+'(.+?)'", captcha_resp.text
                ).groups()[0]
                site_key = re.search(r"site.+'(.+?)'", captcha_resp.text).groups()[0]

                headers = {
                    "Referer": "http://freeproxylists.net",
                }

                reload_url = (
                    "http://www.google.com/recaptcha/api/reload?"
                    "c={}&k={}&reason=i&type=image&lang=en"
                    "&th=,2ypXKguwFW6UvSXto1a2LsWhsMJrZAjwAAAAaKAAAAB0awOHXxPK1pI0y7YA7ty_"
                    "-fFe_ORT1_CMpFfrYwONVsEWIfVcS0089_1-"
                    "eTnzeazXyZjH8zBe0B2BHoYf2tykdxDiJ6cRBLpWFx8eFpO81EDv24FV8vfwMT-"
                    "HgejYnzg1sArxJtXC-U1bauomd8Qb1DmM8_ssEdReaiyy2Mz8h2s3eyE0kUE-"
                    "ggt_gY5sDGo8v4_OkxHrYS16EEoDIHEs_HTXyXngZC-97bEL4nSPU0sE09TGSwRhztg6YOeDlYPOkgZ8MH-"
                    "RDuDZbrIed0biaklXM4RSXF8PiKSkSy8iQj8hNxG_fhd51pjSd3bvVAgyhj9Nx_cFoyKYMC5cmTgSSNCPcM_"
                    "deLNuReh33dySmp3Z6sPHlPK0ZSUR6oQo8RIhDQRE3aQ8nwOJLH7DLYGoSIzOtaqOVaf9xwLrIJtBxaWAZV4N"
                    "z0W6OqUhDmVYFBIBepLI-Xit-Q1RzMMj1y50OvuGMWWnbA2zph7f5--"
                    "KwG6njjK98cGUtojBB_F2CgeTbPaAXj3iB8Xl-oyThq6OjR38JbASJEud16K9HH2HHJNr8q1DbvFTp0wPQ"
                    "4p6l0MYSAgxLLVnAh42sghUaW4q0Nt57cEh6t3DeQE7Dp87jIUozOveGAtxIEF5ab5D3M4WWysd7OAMgAic80"
                    "4TBcABHP7_BPdDFd2uvITMe1PSQRZNknkeqatlf4LT0d0KmsjPpIzcoK2F1KsIdVAE3HK0GLp1Wyk9O7NHEfK"
                    "bWHTq3GxwCWhEs0xBuRCQX7KJJ9TUf36evwkhFn2sBSTMeZTu-VvaaJMljMoXOAwG0dqvgoJcEGjGtSFjpWEi"
                    "pUxi9ZHuOxUDb40mZh-CoE6EpjdTub01oTCapr1uItFgeuHQv15FMJbdxpsq5OtZLOyEbdyaz7IW_0waqKErO"
                    "yM8ZAc5AWRtATapgChkGP37E_gIbr7OqCZBqIththvaTWYBG_JKiV_yAlD4tJj5JYtdHlqIZ1ivjxGWl9DuwR"
                    "gFBlWqqKjM-CGY8diVjJXZDijFXbPpP1Y7a4IY5nhanpgaWTjueNBnhag0AHs_tT6ZcH7AD_f7NDYEllZzIdx1"
                    "OBhb9zAcG_BJPnbtns4IDObBSKiwJi3eV8HWMscNA5k5mdgEYVSqc91l4EtSW6oYhuq-cDqQ7oeG2dTLKdt4Gn"
                    "mDC1X10IIg4CkSUrF3NHuFqjzA09k_9AeTCStN3CLV".format(
                        challenge, site_key
                    )
                )

                reload_resp = await async_requests.get(
                    reload_url, headers=headers, override_session=session
                )

                print(reload_resp.text)

                challenge = re.search(
                    r"finish_reload\('(.+?)'", reload_resp.text
                ).groups()[0]

                print(challenge)

                headers = {
                    "Referer": "http://freeproxylists.net/",
                }

                # the response field is irrelevant: any random word passes
                data = {
                    "recaptcha_challenge_field": challenge,
                    "recaptcha_response_field": "".join(
                        random.choice(string.ascii_letters + string.digits)
                        for _ in range(random.randint(4, 12))
                    ),
                }

                retry_resp = await async_requests.get(
                    BASE_URL.format(page_index + 1),
                    data=data,
                    headers=headers,
                    override_session=session,
                )
                text = retry_resp.text

            try:
                tree = lxml.html.fromstring(text)
                table_element = tree.xpath(".//table[@class='DataGrid']")[0]
            except BaseException:
                raise Exception("table not found: {}".format(text))

            for row in table_element.xpath(".//tr"):
                try:
                    raw_ip = row.xpath(".//td/script")[0].text
                    port = row.xpath(".//td")[1].text
                    if raw_ip is None or port is None:
                        continue

                    # the IP is url-encoded inside an inline <script> tag
                    ip_match = re.search(
                        r">([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}).*?",
                        parse.unquote(raw_ip),
                    )
                    if ip_match is None:
                        continue

                    result.append("{}:{}".format(ip_match.groups()[0], port))
                except IndexError:
                    pass

            return result
# ===== collectors/web/org/proxy_list/collector.py =====

import base64
import re

import lxml.etree
import lxml.html

import async_requests
from collectors.pages_collector import PagesCollector


class Collector(PagesCollector):
    """Scrapes proxy-list.org, where each row stores its proxy as a
    base64-encoded "ip:port" inside a Proxy('...') JS call."""

    __collector__ = True

    def __init__(self):
        super(Collector, self).__init__()
        self.pages_count = 10

    async def process_page(self, page_index):
        result = []
        res = await async_requests.get(
            "http://proxy-list.org/english/index.php?p={0}".format(page_index + 1)
        )
        html = res.text
        tree = lxml.html.fromstring(html)
        proxy_elements = tree.xpath(
            ".//div[@id='proxy-table']//div[@class='table']//li[@class='proxy']"
        )
        for element in proxy_elements:
            # noinspection PyUnresolvedReferences
            element_data = lxml.etree.tostring(element)
            base64_proxy = re.search(r"Proxy\('(.+?)'\)", element_data.decode()).group(
                1
            )
            proxy = base64.b64decode(base64_proxy).decode()
            result.append(proxy)
        return result


# ===== collectors_list.py =====

import asyncio
import importlib.util
import inspect
import os

from models import CollectorState, db
from proxy_py import settings

# module_name -> collector instance, populated by init() at import time
collectors = {}


async def init():
    """Discover collector classes under settings.COLLECTORS_DIRS, instantiate
    every class flagged with __collector__, and ensure each has a
    CollectorState row in the database."""
    global collectors

    _collectors_dirs = settings.COLLECTORS_DIRS
    if not isinstance(_collectors_dirs, list):
        _collectors_dirs = [_collectors_dirs]

    for collectors_dir in _collectors_dirs:
        # keep the scan confined to the project tree
        if collectors_dir.startswith("/"):
            raise Exception("Collector's dir cannot be absolute")
        if collectors_dir.startswith(".."):
            raise Exception("Collector's dir cannot be in parent directory")

        for root, dirs, files in os.walk(collectors_dir):
            for file in files:
                if file.endswith(".py"):
                    file_path = os.path.join(root, file)
                    if file_path.startswith("./"):
                        file_path = file_path[2:]
                    module_name = os.path.splitext(file_path)[0].replace("/", ".")
                    spec = importlib.util.spec_from_file_location(
                        module_name, file_path
                    )
                    collector_module = importlib.util.module_from_spec(spec)
                    spec.loader.exec_module(collector_module)

                    # TODO: iterate through all classes independent of their names
                    for name, member in inspect.getmembers(
                        collector_module, inspect.isclass
                    ):
                        # only classes defined in this module and explicitly
                        # marked as collectors
                        if (
                            member.__module__ == collector_module.__name__
                            and hasattr(member, "__collector__")
                            and member.__collector__
                        ):
                            collectors[module_name + "." + member.__name__] = member()

    # init db

    for module_name, collector in collectors.items():
        try:
            await db.get(
                CollectorState.select().where(CollectorState.identifier == module_name)
            )
        except CollectorState.DoesNotExist:
            await db.create(
                CollectorState,
                identifier=module_name,
                processing_period=collector.processing_period,
                last_processing_time=0,
            )


def get_collector_of_module_name(module_name: str):
    """Return the collector instance registered for module_name or raise
    CollectorNotFoundException."""
    if module_name not in collectors:
        raise CollectorNotFoundException(
            "Probably some collector exists in database but not in filesystem. "
            "module_name = {}".format(module_name)
        )

    return collectors[module_name]


async def load_collector(state: CollectorState):
    collector = get_collector_of_module_name(state.identifier)
    await collector.load_state(state)
    return collector


async def save_collector(state: CollectorState):
    collector = get_collector_of_module_name(state.identifier)
    await collector.save_state(state)
    await db.update(state)


# BUGFIX: derived from BaseException before, which `except Exception` handlers
# cannot catch; BaseException is reserved for interpreter-exit signals
# (SystemExit, KeyboardInterrupt). Existing `except CollectorNotFoundException`
# callers are unaffected.
class CollectorNotFoundException(Exception):
    pass


asyncio.get_event_loop().run_until_complete(init())


# ===== config_examples/docker_settings.py =====

from ._settings import *

DEBUG = False

# override db settings here
DATABASE_CONNECTION_KWARGS["host"] = "localhost"
DATABASE_CONNECTION_KWARGS["database"] = "proxy_py"
DATABASE_CONNECTION_KWARGS["user"] = "proxy_py"
DATABASE_CONNECTION_KWARGS["password"] = "proxy_py"

PROXY_PROVIDER_SERVER_ADDRESS = {
    "HOST": "0.0.0.0",
    "PORT": 55555,
}
-------------------------------------------------------------------------------- 1 | server { 2 | listen 80; 3 | server_name proxy.example.com; 4 | 5 | location / { 6 | proxy_pass http://127.0.0.1:55555; 7 | proxy_set_header X-Real-IP $remote_addr; 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /config_examples/proxy_py.supervisor.conf: -------------------------------------------------------------------------------- 1 | [program:proxy_py] 2 | command=/home/proxy_py/Env/proxy_py/bin/python3 /home/proxy_py/proxy_py/main.py 3 | directory=/home/proxy_py/proxy_py 4 | user=proxy_py 5 | autorestart=true 6 | redirect_stderr=true 7 | -------------------------------------------------------------------------------- /config_examples/settings.py: -------------------------------------------------------------------------------- 1 | from ._settings import * 2 | 3 | # it should be empty to let environment variables be the most important ones 4 | -------------------------------------------------------------------------------- /docker-compose-with-local.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | db: 5 | image: "postgres:12" 6 | restart: always 7 | environment: 8 | - POSTGRES_USER=proxy_py 9 | - POSTGRES_PASSWORD=proxy_py 10 | - POSTGRES_DB=proxy_py 11 | volumes: 12 | - db_data:/var/lib/postgresql/data 13 | - ./init_db.sql:/docker-entrypoint-initdb.d/init_db.sql 14 | core: 15 | image: "devalone/proxy_py:latest" 16 | command: > 17 | bash -c " 18 | source /proxy_py/env/bin/activate && 19 | pip3 install -r /proxy_py/local/requirements.txt && 20 | /proxy_py/run.sh 21 | " 22 | restart: always 23 | ports: 24 | - "55555:55555" 25 | environment: 26 | - PROXY_PY_PROXY_PROVIDER_SERVER_ADDRESS={'HOST':'0.0.0.0','PORT':55555} 27 | - PROXY_PY_DATABASE_CONNECTION_KWARGS={'host':'db','database':'proxy_py','user':'proxy_py','password':'proxy_py'} 28 | depends_on: 29 | - 
db 30 | volumes: 31 | - ./local/:/proxy_py/local/ 32 | - ./proxy_py/settings.py:/proxy_py/proxy_py/settings.py 33 | 34 | volumes: 35 | db_data: 36 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | db: 5 | image: "postgres:12" 6 | restart: always 7 | environment: 8 | - POSTGRES_USER=proxy_py 9 | - POSTGRES_PASSWORD=proxy_py 10 | - POSTGRES_DB=proxy_py 11 | volumes: 12 | - db_data:/var/lib/postgresql/data 13 | - ./init_db.sql:/docker-entrypoint-initdb.d/init_db.sql 14 | core: 15 | image: "devalone/proxy_py:latest" 16 | command: ./run.sh 17 | restart: always 18 | ports: 19 | - "55555:55555" 20 | environment: 21 | - PROXY_PY_PROXY_PROVIDER_SERVER_ADDRESS={'HOST':'0.0.0.0','PORT':55555} 22 | - PROXY_PY_DATABASE_CONNECTION_KWARGS={'host':'db','database':'proxy_py','user':'proxy_py','password':'proxy_py'} 23 | depends_on: 24 | - db 25 | 26 | volumes: 27 | db_data: 28 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | # SPHINXBUILD = sphinx-build 7 | SPHINXBUILD = python $(shell which sphinx-build) 8 | PAPER = 9 | BUILDDIR = build 10 | 11 | # User-friendly check for sphinx-build 12 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 13 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 14 | endif 15 | 16 | # Internal variables. 
17 | PAPEROPT_a4 = -D latex_paper_size=a4 18 | PAPEROPT_letter = -D latex_paper_size=letter 19 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 20 | # the i18n builder cannot share the environment and doctrees with the others 21 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 22 | 23 | .PHONY: help 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " applehelp to make an Apple Help Book" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | 51 | .PHONY: clean 52 | clean: 53 | rm -rf $(BUILDDIR)/* 54 | 55 | .PHONY: html 56 | html: 57 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) 
$(BUILDDIR)/html 58 | @echo 59 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 60 | 61 | .PHONY: dirhtml 62 | dirhtml: 63 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 64 | @echo 65 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 66 | 67 | .PHONY: singlehtml 68 | singlehtml: 69 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 70 | @echo 71 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 72 | 73 | .PHONY: pickle 74 | pickle: 75 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 76 | @echo 77 | @echo "Build finished; now you can process the pickle files." 78 | 79 | .PHONY: json 80 | json: 81 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 82 | @echo 83 | @echo "Build finished; now you can process the JSON files." 84 | 85 | .PHONY: htmlhelp 86 | htmlhelp: 87 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 88 | @echo 89 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 90 | ".hhp project file in $(BUILDDIR)/htmlhelp." 91 | 92 | .PHONY: qthelp 93 | qthelp: 94 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 95 | @echo 96 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 97 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 98 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/proxy_py.qhcp" 99 | @echo "To view the help file:" 100 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/proxy_py.qhc" 101 | 102 | .PHONY: applehelp 103 | applehelp: 104 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 105 | @echo 106 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 107 | @echo "N.B. You won't be able to view it unless you put it in" \ 108 | "~/Library/Documentation/Help or install it in your application" \ 109 | "bundle." 
110 | 111 | .PHONY: devhelp 112 | devhelp: 113 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 114 | @echo 115 | @echo "Build finished." 116 | @echo "To view the help file:" 117 | @echo "# mkdir -p $$HOME/.local/share/devhelp/proxy_py" 118 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/proxy_py" 119 | @echo "# devhelp" 120 | 121 | .PHONY: epub 122 | epub: 123 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 124 | @echo 125 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 126 | 127 | .PHONY: latex 128 | latex: 129 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 130 | @echo 131 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 132 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 133 | "(use \`make latexpdf' here to do that automatically)." 134 | 135 | .PHONY: latexpdf 136 | latexpdf: 137 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 138 | @echo "Running LaTeX files through pdflatex..." 139 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 140 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 141 | 142 | .PHONY: latexpdfja 143 | latexpdfja: 144 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 145 | @echo "Running LaTeX files through platex and dvipdfmx..." 146 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 147 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 148 | 149 | .PHONY: text 150 | text: 151 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 152 | @echo 153 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 154 | 155 | .PHONY: man 156 | man: 157 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 158 | @echo 159 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 160 | 161 | .PHONY: texinfo 162 | texinfo: 163 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 164 | @echo 165 | @echo "Build finished. 
The Texinfo files are in $(BUILDDIR)/texinfo." 166 | @echo "Run \`make' in that directory to run these through makeinfo" \ 167 | "(use \`make info' here to do that automatically)." 168 | 169 | .PHONY: info 170 | info: 171 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 172 | @echo "Running Texinfo files through makeinfo..." 173 | make -C $(BUILDDIR)/texinfo info 174 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 175 | 176 | .PHONY: gettext 177 | gettext: 178 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 179 | @echo 180 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 181 | 182 | .PHONY: changes 183 | changes: 184 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 185 | @echo 186 | @echo "The overview file is in $(BUILDDIR)/changes." 187 | 188 | .PHONY: linkcheck 189 | linkcheck: 190 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 191 | @echo 192 | @echo "Link check complete; look for any errors in the above output " \ 193 | "or in $(BUILDDIR)/linkcheck/output.txt." 194 | 195 | .PHONY: doctest 196 | doctest: 197 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 198 | @echo "Testing of doctests in the sources finished, look at the " \ 199 | "results in $(BUILDDIR)/doctest/output.txt." 200 | 201 | .PHONY: coverage 202 | coverage: 203 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 204 | @echo "Testing of coverage in the sources finished, look at the " \ 205 | "results in $(BUILDDIR)/coverage/python.txt." 206 | 207 | .PHONY: xml 208 | xml: 209 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 210 | @echo 211 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 212 | 213 | .PHONY: pseudoxml 214 | pseudoxml: 215 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 216 | @echo 217 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
218 | -------------------------------------------------------------------------------- /docs/source/api_overview.md: -------------------------------------------------------------------------------- 1 | # proxy_py API v1 2 | 3 | proxy_py expects HTTP POST requests with JSON as a body, so you need 4 | to add the header `Content-Type: application/json` and send a correct 5 | JSON document. 6 | 7 | Example of correct request: 8 | ```json 9 | { 10 | "method": "get", 11 | "model": "proxy" 12 | } 13 | ``` 14 | 15 | Response is also HTTP with JSON and status code depending on whether 16 | an error happened or not. 17 | 18 | * 200 if there wasn't an error 19 | * 400 if you sent a bad request 20 | * 500 if there was an error during the execution of your request or in some 21 | other cases 22 | 23 | status_code is also duplicated in the JSON body. 24 | 25 | ## Possible keys 26 | 27 | * `model` - specifies what you will work with. 28 | Currently, only the `proxy` model is supported. 29 | * `method` - what you're gonna do with it 30 | * `get` - get model items as json objects. 31 | Detailed description is below 32 | * `count` - count how many items there are. 33 | Detailed description is below 34 | 35 | 36 | ### get method 37 | 38 | `get` method supports the following keys: 39 | * `order_by` (string) - specifies ordering fields as comma separated ("response_time" if not provided) 40 | value. 41 | 42 | Explanation: 43 | 44 | `"uptime"` just sorts proxies by uptime field ascending. 45 | 46 | Note: `uptime` is the timestamp from which proxy is working, 47 | NOT proxy's working time 48 | 49 | To sort descending use `-`(minus) before the field name. 
50 | 51 | `"-response_time"` returns proxies with maximum response_time first 52 | (in microseconds) 53 | 54 | It's also possible to sort using multiple fields 55 | 56 | `"number_of_bad_checks, response_time"` returns proxies with minimum 57 | `number_of_bad_checks` first, if there are proxies with the same 58 | `number_of_bad_checks`, sorts them by `response_time` 59 | 60 | * `limit` (integer) - specifies how many proxies to return (1024 if not provided) 61 | * `offset` (integer) - specifies how many proxies to skip (0 if not provided) 62 | 63 | Example of `get` request: 64 | 65 | ```json 66 | 67 | { 68 | "model": "proxy", 69 | "method": "get", 70 | "order_by": "number_of_bad_checks, response_time", 71 | "limit": 100, 72 | "offset": 200 73 | } 74 | ``` 75 | 76 | Response 77 | 78 | ```json 79 | { 80 | "count": 6569, 81 | "data": [ 82 | { 83 | "address": "socks5://localhost:9999", 84 | "auth_data": "", 85 | "bad_proxy": false, 86 | "domain": "localhost", 87 | "last_check_time": 1517089048, 88 | "number_of_bad_checks": 0, 89 | "port": 9999, 90 | "protocol": "socks5", 91 | "response_time": 1819186, 92 | "uptime": 1517072132 93 | }, 94 | 95 | ... 
96 | 97 | ], 98 | "has_more": true, 99 | "status": "ok", 100 | "status_code": 200 101 | } 102 | ``` 103 | 104 | Response fields: 105 | 106 | * `count` (integer) - total number of proxies for that request (how many you can fetch increasing offset) 107 | * `data` (array) - list of proxies 108 | * `has_more` (boolean) - value indicating whether you can increase 109 | offset to get more proxies or not 110 | * `status` (string) - "error" if error happened, "ok" otherwise 111 | 112 | Example of error: 113 | 114 | Request: 115 | 116 | ```json 117 | { 118 | "model": "user", 119 | "method": "get", 120 | "order_by": "number_of_bad_checks, response_time", 121 | "limit": 100, 122 | "offset": 200 123 | } 124 | ``` 125 | 126 | Response: 127 | 128 | ```json 129 | { 130 | "error_message": "Model \"user\" doesn't exist or isn't allowed", 131 | "status": "error", 132 | "status_code": 400 133 | } 134 | ``` 135 | 136 | ### count method 137 | 138 | Same as get, but doesn't return data 139 | 140 | # proxy_py API v2 141 | 142 | Second version of API has only 2 methods so far 143 | 144 | ```bash 145 | curl https://localhost:55555/api/v2/get_proxy_for_id --data '{"id": "ID"}' 146 | ``` 147 | ```bash 148 | curl https://localhost:55555/api/v2/get_proxies_for_id --data '{"id": "ID", "number": 2}' 149 | ``` 150 | 151 | get_proxy_for_id should return the best proxy for a given ID avoiding overlapping with other IDs, but so far it just returns a random one ignoring ID at all. 152 | get_proxies_for_id is the same, but also has a parameter `number` to specify the number of proxies to return. 153 | -------------------------------------------------------------------------------- /docs/source/guides/guides.rst: -------------------------------------------------------------------------------- 1 | proxy_py Guides 2 | =============== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | how_to_create_collector.rst 8 | 9 | -------------------------------------------------------------------------------- /docs/source/guides/how_to_create_collector.rst: -------------------------------------------------------------------------------- 1 | proxy_py How to create a collector 2 | ================================== 3 | 4 | Collector is a class which is used to parse proxies from web page or another source. 5 | All collectors are inherited from `collectors.abstract_collector.AbstractCollector`, 6 | also there is `collectors.pages_collector.PagesCollector` which is used for paginated sources. 7 | It's always better to learn through the examples. 8 | 9 | Simple collector 10 | **************** 11 | 12 | Let's start with the simplest collector we can imagine, 13 | it will be collecting from the page http://www.89ip.cn/ti.html 14 | as you can see, it sends form as GET request to this url 15 | http://www.89ip.cn/tqdl.html?num=9999&address=&kill_address=&port=&kill_port=&isp= 16 | 17 | Firstly we can try to check that these proxies are really good. 18 | Just copy and paste list of proxies to file say /tmp/proxies and run this command inside virtual environment 19 | 20 | .. 
code-block:: bash 21 | 22 | cat /tmp/proxies | python3 check_from_stdin.py 23 | 24 | You're gonna get something like this: 25 | 26 | `++++++++++++++++++++++-+++++-+++++++++++++++++++++++++++-++++++-++++-+++++++++++++++++++++++++++++++--+++++++-+++++++-++-+-+++-+++++++++-+++++++++++++++++++++--++--+-++++++++++++++++-+++--+++-+-+++++++++++++++++--++++++++++++-+++++-+++-++++++++-+++++-+-+++++++-++-+--++++-+++-++++++++++-++++--+++++++-+++++++-++--+++++-+-+++++++++++++++++++++-++-+++-+++--++++--+++-+++++++-+++++++-+++++++++++++++---+++++-+++++++++-+++++-+-++++++++++++-+--+++--+-+-+-++-+++++-+++--++++++-+++++++++++--+-+++-+-++++--+++++--+++++++++-+-+-++++-+-++++++++++++++-++-++++++--+--++++-+-++--++--+++++-++-+++-++++--++--+---------+--+--++--------+++-++-+--++++++++++++++++-+++++++++-+++++++--+--+--+-+-+++---++------------------+--+----------+-+-+--++-+----------+-------+--+------+----+-+--+--++----+--+-++++++-++-+++` 27 | 28 | "\+" means working proxy with at least one protocol, "\-" means not working, the result above is perfect, so many good proxies. 29 | 30 | Note: working means proxy respond with timeout set in settings, 31 | if you increase it, you're likely to get more proxies. 32 | 33 | Alright, let's code! 34 | 35 | We need to place our collector inside `collectors/web/` 36 | directory using reversed domain path, 37 | it will be `collectors/web/cn/89ip/collector.py` 38 | 39 | To make class be a collector we need to declare a variable 40 | `__collector__` and set it to `True` 41 | 42 | Note: name of file and name of class don't make sense, 43 | you can declare as many files and classes in each file 44 | per domain as you want 45 | 46 | .. code-block:: python 47 | 48 | from collectors import AbstractCollector 49 | 50 | 51 | class Collector(AbstractCollector): 52 | __collector__ = True 53 | 54 | We can override default processing period in constructor 55 | like this: 56 | 57 | .. 
code-block:: python 58 | 59 | def __init__(self): 60 | super(Collector, self).__init__() 61 | # 30 minutes 62 | self.processing_period = 30 * 60 63 | ''' 64 | floating period means proxy_py will be changing 65 | period to not make extra requests and handle 66 | new proxies in time, you don't need to disable 67 | it in most cases 68 | ''' 69 | # self.floating_processing_period = False 70 | 71 | 72 | The last step is to implement `collect()` method. 73 | Import useful things 74 | 75 | .. code-block:: python 76 | 77 | from parsers import RegexParser 78 | 79 | import http_client 80 | 81 | 82 | and implement method like this: 83 | 84 | .. code-block:: python 85 | 86 | async def collect(self): 87 | url = 'http://www.89ip.cn/tqdl.html?num=9999&address=&kill_address=&port=&kill_port=&isp=' 88 | # send a request to get html code of the page 89 | html = await http_client.get_text(url) 90 | # and just parse it using regex parser with a default rule to parse 91 | # proxies like this: 92 | # 8.8.8.8:8080 93 | return RegexParser().parse(html) 94 | 95 | That's all! 96 | 97 | Now is time for a little test, to be sure your collector is working 98 | you can run proxy_py with `--test-collector` option: 99 | 100 | .. code-block:: bash 101 | 102 | python3 main.py core --test-collector collectors/web/cn/89ip/collector.py:Collector 103 | 104 | which means to take class Collector from the file `collectors/web/cn/89ip/collector.py` 105 | 106 | It's gonna draw you a pattern like this: 107 | 108 | .. image:: https://i.imgur.com/fmVp3Iz.png 109 | 110 | Where red cell means not working proxy 111 | 112 | - cyan - respond within a second 113 | - green - slower than 5 seconds 114 | - yellow - up to 10 seconds 115 | - magenta - slower than 10 seconds 116 | 117 | Note: don't forget that settings.py limits amount of time 118 | for proxy to respond. 119 | You can override proxy checking timeout by using 120 | `--proxy-checking-timeout` option. For example 121 | 122 | .. 
code-block:: bash 123 | 124 | python3 main.py --test-collector collectors/web/cn/89ip/collector.py:Collector --proxy-checking-timeout 60 125 | 126 | With 60 seconds timeout it looks better 127 | 128 | .. image:: https://i.imgur.com/DmNuzOI.png 129 | 130 | Paginated collector 131 | ******************* 132 | 133 | Alright, you've done with a simple collector, 134 | you're almost a pro, let's now dive a little deeper 135 | 136 | # TODO: complete this guide 137 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. proxy_py documentation master file, created by 2 | sphinx-quickstart on Fri Apr 6 00:53:21 2018. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to proxy_py's documentation! 7 | ==================================== 8 | 9 | Contents: 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | readme_link 15 | api_overview.md 16 | 17 | guides/guides 18 | 19 | modules/modules 20 | 21 | Indices and tables 22 | ================== 23 | 24 | * :ref:`genindex` 25 | * :ref:`modindex` 26 | * :ref:`search` 27 | -------------------------------------------------------------------------------- /docs/source/modules/async_requests.rst: -------------------------------------------------------------------------------- 1 | async\_requests module 2 | ====================== 3 | 4 | .. automodule:: async_requests 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/check_from_stdin.rst: -------------------------------------------------------------------------------- 1 | check\_from\_stdin module 2 | ========================= 3 | 4 | .. 
automodule:: check_from_stdin 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/checkers.base_checker.rst: -------------------------------------------------------------------------------- 1 | checkers.base\_checker module 2 | ============================= 3 | 4 | .. automodule:: checkers.base_checker 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/checkers.d3d_info_checker.rst: -------------------------------------------------------------------------------- 1 | checkers.d3d\_info\_checker module 2 | ================================== 3 | 4 | .. automodule:: checkers.d3d_info_checker 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/checkers.google_com_checker.rst: -------------------------------------------------------------------------------- 1 | checkers.google\_com\_checker module 2 | ==================================== 3 | 4 | .. automodule:: checkers.google_com_checker 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/checkers.ipinfo_io_checker.rst: -------------------------------------------------------------------------------- 1 | checkers.ipinfo\_io\_checker module 2 | =================================== 3 | 4 | .. automodule:: checkers.ipinfo_io_checker 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/checkers.rst: -------------------------------------------------------------------------------- 1 | checkers package 2 | ================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. 
toctree:: 8 | 9 | checkers.base_checker 10 | checkers.d3d_info_checker 11 | checkers.google_com_checker 12 | checkers.ipinfo_io_checker 13 | 14 | Module contents 15 | --------------- 16 | 17 | .. automodule:: checkers 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.abstract_collector.rst: -------------------------------------------------------------------------------- 1 | collectors.abstract\_collector module 2 | ===================================== 3 | 4 | .. automodule:: collectors.abstract_collector 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.checkerproxy_net.collector_checkerproxy_net.rst: -------------------------------------------------------------------------------- 1 | collectors.checkerproxy\_net.collector\_checkerproxy\_net module 2 | ================================================================ 3 | 4 | .. automodule:: collectors.checkerproxy_net.collector_checkerproxy_net 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.checkerproxy_net.collector_checkerproxy_net_today.rst: -------------------------------------------------------------------------------- 1 | collectors.checkerproxy\_net.collector\_checkerproxy\_net\_today module 2 | ======================================================================= 3 | 4 | .. 
automodule:: collectors.checkerproxy_net.collector_checkerproxy_net_today 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.checkerproxy_net.rst: -------------------------------------------------------------------------------- 1 | collectors.checkerproxy\_net package 2 | ==================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | 9 | collectors.checkerproxy_net.collector_checkerproxy_net 10 | collectors.checkerproxy_net.collector_checkerproxy_net_today 11 | 12 | Module contents 13 | --------------- 14 | 15 | .. automodule:: collectors.checkerproxy_net 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.collector.rst: -------------------------------------------------------------------------------- 1 | collectors.collector module 2 | =========================== 3 | 4 | .. automodule:: collectors.collector 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.free_proxy_list_net.base_collector_free_proxy_list_net.rst: -------------------------------------------------------------------------------- 1 | collectors.free\_proxy\_list\_net.base\_collector\_free\_proxy\_list\_net module 2 | ================================================================================ 3 | 4 | .. 
automodule:: collectors.free_proxy_list_net.base_collector_free_proxy_list_net 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.free_proxy_list_net.collector_free_proxy_list_net.rst: -------------------------------------------------------------------------------- 1 | collectors.free\_proxy\_list\_net.collector\_free\_proxy\_list\_net module 2 | ========================================================================== 3 | 4 | .. automodule:: collectors.free_proxy_list_net.collector_free_proxy_list_net 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.free_proxy_list_net.collector_free_proxy_list_net_anonymous_proxy.rst: -------------------------------------------------------------------------------- 1 | collectors.free\_proxy\_list\_net.collector\_free\_proxy\_list\_net\_anonymous\_proxy module 2 | ============================================================================================ 3 | 4 | .. automodule:: collectors.free_proxy_list_net.collector_free_proxy_list_net_anonymous_proxy 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.free_proxy_list_net.collector_free_proxy_list_net_uk_proxy.rst: -------------------------------------------------------------------------------- 1 | collectors.free\_proxy\_list\_net.collector\_free\_proxy\_list\_net\_uk\_proxy module 2 | ===================================================================================== 3 | 4 | .. 
automodule:: collectors.free_proxy_list_net.collector_free_proxy_list_net_uk_proxy 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.free_proxy_list_net.collector_socks_proxy_net.rst: -------------------------------------------------------------------------------- 1 | collectors.free\_proxy\_list\_net.collector\_socks\_proxy\_net module 2 | ===================================================================== 3 | 4 | .. automodule:: collectors.free_proxy_list_net.collector_socks_proxy_net 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.free_proxy_list_net.collector_sslproxies_org.rst: -------------------------------------------------------------------------------- 1 | collectors.free\_proxy\_list\_net.collector\_sslproxies\_org module 2 | =================================================================== 3 | 4 | .. automodule:: collectors.free_proxy_list_net.collector_sslproxies_org 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.free_proxy_list_net.collector_us_proxy_org.rst: -------------------------------------------------------------------------------- 1 | collectors.free\_proxy\_list\_net.collector\_us\_proxy\_org module 2 | ================================================================== 3 | 4 | .. 
automodule:: collectors.free_proxy_list_net.collector_us_proxy_org 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.free_proxy_list_net.rst: -------------------------------------------------------------------------------- 1 | collectors.free\_proxy\_list\_net package 2 | ========================================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | 9 | collectors.free_proxy_list_net.base_collector_free_proxy_list_net 10 | collectors.free_proxy_list_net.collector_free_proxy_list_net 11 | collectors.free_proxy_list_net.collector_free_proxy_list_net_anonymous_proxy 12 | collectors.free_proxy_list_net.collector_free_proxy_list_net_uk_proxy 13 | collectors.free_proxy_list_net.collector_socks_proxy_net 14 | collectors.free_proxy_list_net.collector_sslproxies_org 15 | collectors.free_proxy_list_net.collector_us_proxy_org 16 | 17 | Module contents 18 | --------------- 19 | 20 | .. automodule:: collectors.free_proxy_list_net 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.freeproxylists_net.freeproxylists_net.rst: -------------------------------------------------------------------------------- 1 | collectors.freeproxylists\_net.freeproxylists\_net module 2 | ========================================================= 3 | 4 | .. automodule:: collectors.freeproxylists_net.freeproxylists_net 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.freeproxylists_net.rst: -------------------------------------------------------------------------------- 1 | collectors.freeproxylists\_net package 2 | ====================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. 
toctree:: 8 | 9 | collectors.freeproxylists_net.freeproxylists_net 10 | 11 | Module contents 12 | --------------- 13 | 14 | .. automodule:: collectors.freeproxylists_net 15 | :members: 16 | :undoc-members: 17 | :show-inheritance: 18 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.gatherproxy_com.collector_gatherproxy_com.rst: -------------------------------------------------------------------------------- 1 | collectors.gatherproxy\_com.collector\_gatherproxy\_com module 2 | ============================================================== 3 | 4 | .. automodule:: collectors.gatherproxy_com.collector_gatherproxy_com 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.gatherproxy_com.rst: -------------------------------------------------------------------------------- 1 | collectors.gatherproxy\_com package 2 | =================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | 9 | collectors.gatherproxy_com.collector_gatherproxy_com 10 | 11 | Module contents 12 | --------------- 13 | 14 | .. automodule:: collectors.gatherproxy_com 15 | :members: 16 | :undoc-members: 17 | :show-inheritance: 18 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.nordvpn_com.nordvpn_com.rst: -------------------------------------------------------------------------------- 1 | collectors.nordvpn\_com.nordvpn\_com module 2 | =========================================== 3 | 4 | .. 
automodule:: collectors.nordvpn_com.nordvpn_com 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.nordvpn_com.rst: -------------------------------------------------------------------------------- 1 | collectors.nordvpn\_com package 2 | =============================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | 9 | collectors.nordvpn_com.nordvpn_com 10 | 11 | Module contents 12 | --------------- 13 | 14 | .. automodule:: collectors.nordvpn_com 15 | :members: 16 | :undoc-members: 17 | :show-inheritance: 18 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.pages_collector.rst: -------------------------------------------------------------------------------- 1 | collectors.pages\_collector module 2 | ================================== 3 | 4 | .. automodule:: collectors.pages_collector 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.premproxy_com.base_collector_premproxy_com.rst: -------------------------------------------------------------------------------- 1 | collectors.premproxy\_com.base\_collector\_premproxy\_com module 2 | ================================================================ 3 | 4 | .. automodule:: collectors.premproxy_com.base_collector_premproxy_com 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.premproxy_com.collector_premproxy_com.rst: -------------------------------------------------------------------------------- 1 | collectors.premproxy\_com.collector\_premproxy\_com module 2 | ========================================================== 3 | 4 | .. 
automodule:: collectors.premproxy_com.collector_premproxy_com 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.premproxy_com.collector_premproxy_com_socks_list.rst: -------------------------------------------------------------------------------- 1 | collectors.premproxy\_com.collector\_premproxy\_com\_socks\_list module 2 | ======================================================================= 3 | 4 | .. automodule:: collectors.premproxy_com.collector_premproxy_com_socks_list 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.premproxy_com.rst: -------------------------------------------------------------------------------- 1 | collectors.premproxy\_com package 2 | ================================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | 9 | collectors.premproxy_com.base_collector_premproxy_com 10 | collectors.premproxy_com.collector_premproxy_com 11 | collectors.premproxy_com.collector_premproxy_com_socks_list 12 | 13 | Module contents 14 | --------------- 15 | 16 | .. automodule:: collectors.premproxy_com 17 | :members: 18 | :undoc-members: 19 | :show-inheritance: 20 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.proxy_list_org.collector_proxy_list_org.rst: -------------------------------------------------------------------------------- 1 | collectors.proxy\_list\_org.collector\_proxy\_list\_org module 2 | ============================================================== 3 | 4 | .. 
automodule:: collectors.proxy_list_org.collector_proxy_list_org 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.proxy_list_org.rst: -------------------------------------------------------------------------------- 1 | collectors.proxy\_list\_org package 2 | =================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | 9 | collectors.proxy_list_org.collector_proxy_list_org 10 | 11 | Module contents 12 | --------------- 13 | 14 | .. automodule:: collectors.proxy_list_org 15 | :members: 16 | :undoc-members: 17 | :show-inheritance: 18 | -------------------------------------------------------------------------------- /docs/source/modules/collectors.rst: -------------------------------------------------------------------------------- 1 | collectors package 2 | ================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | 9 | collectors.abstract_collector 10 | collectors.pages_collector 11 | 12 | Module contents 13 | --------------- 14 | 15 | .. automodule:: collectors 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | -------------------------------------------------------------------------------- /docs/source/modules/collectors_list.rst: -------------------------------------------------------------------------------- 1 | collectors\_list module 2 | ======================= 3 | 4 | .. automodule:: collectors_list 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/dump_db.rst: -------------------------------------------------------------------------------- 1 | dump\_db module 2 | =============== 3 | 4 | .. 
automodule:: dump_db 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/fill_db.rst: -------------------------------------------------------------------------------- 1 | fill\_db module 2 | =============== 3 | 4 | .. automodule:: fill_db 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/http_client.rst: -------------------------------------------------------------------------------- 1 | http\_client module 2 | =================== 3 | 4 | .. automodule:: http_client 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/main.rst: -------------------------------------------------------------------------------- 1 | main module 2 | =========== 3 | 4 | .. automodule:: main 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/materialized_view_updater.rst: -------------------------------------------------------------------------------- 1 | materialized\_view\_updater module 2 | ================================== 3 | 4 | .. automodule:: materialized_view_updater 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/models.rst: -------------------------------------------------------------------------------- 1 | models module 2 | ============= 3 | 4 | .. 
automodule:: models 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/modules.rst: -------------------------------------------------------------------------------- 1 | proxy_py Modules 2 | ======== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | async_requests 8 | check_from_stdin 9 | checkers 10 | collectors 11 | collectors_list 12 | http_client 13 | main 14 | materialized_view_updater 15 | models 16 | parsers 17 | processor 18 | proxy_py 19 | proxy_utils 20 | proxy_validator 21 | server 22 | setup 23 | statistics 24 | tests 25 | -------------------------------------------------------------------------------- /docs/source/modules/parsers.regex_parser.rst: -------------------------------------------------------------------------------- 1 | parsers.regex\_parser module 2 | ============================ 3 | 4 | .. automodule:: parsers.regex_parser 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/parsers.rst: -------------------------------------------------------------------------------- 1 | parsers package 2 | =============== 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | 9 | parsers.regex_parser 10 | 11 | Module contents 12 | --------------- 13 | 14 | .. automodule:: parsers 15 | :members: 16 | :undoc-members: 17 | :show-inheritance: 18 | -------------------------------------------------------------------------------- /docs/source/modules/processor.rst: -------------------------------------------------------------------------------- 1 | processor module 2 | ================ 3 | 4 | .. 
automodule:: processor 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/proxy_py.rst: -------------------------------------------------------------------------------- 1 | proxy\_py package 2 | ================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | 9 | proxy_py.settings 10 | 11 | Module contents 12 | --------------- 13 | 14 | .. automodule:: proxy_py 15 | :members: 16 | :undoc-members: 17 | :show-inheritance: 18 | -------------------------------------------------------------------------------- /docs/source/modules/proxy_py.settings.rst: -------------------------------------------------------------------------------- 1 | proxy\_py.settings module 2 | ========================= 3 | 4 | .. automodule:: proxy_py.settings 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/proxy_utils.rst: -------------------------------------------------------------------------------- 1 | proxy\_utils module 2 | =================== 3 | 4 | .. automodule:: proxy_utils 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/proxy_validator.rst: -------------------------------------------------------------------------------- 1 | proxy\_validator module 2 | ======================= 3 | 4 | .. automodule:: proxy_validator 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/server.api_v1.api_request_handler.rst: -------------------------------------------------------------------------------- 1 | server.api\_v1.api\_request\_handler module 2 | =========================================== 3 | 4 | .. 
automodule:: server.api_v1.api_request_handler 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/server.api_v1.app.rst: -------------------------------------------------------------------------------- 1 | server.api\_v1.app module 2 | ========================= 3 | 4 | .. automodule:: server.api_v1.app 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/server.api_v1.requests_to_models.request.rst: -------------------------------------------------------------------------------- 1 | server.api\_v1.requests\_to\_models.request module 2 | ================================================== 3 | 4 | .. automodule:: server.api_v1.requests_to_models.request 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/server.api_v1.requests_to_models.request_executor.rst: -------------------------------------------------------------------------------- 1 | server.api\_v1.requests\_to\_models.request\_executor module 2 | ============================================================ 3 | 4 | .. automodule:: server.api_v1.requests_to_models.request_executor 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/server.api_v1.requests_to_models.request_parser.rst: -------------------------------------------------------------------------------- 1 | server.api\_v1.requests\_to\_models.request\_parser module 2 | ========================================================== 3 | 4 | .. 
automodule:: server.api_v1.requests_to_models.request_parser 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/server.api_v1.requests_to_models.rst: -------------------------------------------------------------------------------- 1 | server.api\_v1.requests\_to\_models package 2 | =========================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | 9 | server.api_v1.requests_to_models.request 10 | server.api_v1.requests_to_models.request_executor 11 | server.api_v1.requests_to_models.request_parser 12 | 13 | Module contents 14 | --------------- 15 | 16 | .. automodule:: server.api_v1.requests_to_models 17 | :members: 18 | :undoc-members: 19 | :show-inheritance: 20 | -------------------------------------------------------------------------------- /docs/source/modules/server.api_v1.rst: -------------------------------------------------------------------------------- 1 | server.api\_v1 package 2 | ====================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | server.api_v1.requests_to_models 10 | 11 | Submodules 12 | ---------- 13 | 14 | .. toctree:: 15 | 16 | server.api_v1.api_request_handler 17 | server.api_v1.app 18 | 19 | Module contents 20 | --------------- 21 | 22 | .. automodule:: server.api_v1 23 | :members: 24 | :undoc-members: 25 | :show-inheritance: 26 | -------------------------------------------------------------------------------- /docs/source/modules/server.base_app.rst: -------------------------------------------------------------------------------- 1 | server.base\_app module 2 | ======================= 3 | 4 | .. 
automodule:: server.base_app 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/server.frontend.app.rst: -------------------------------------------------------------------------------- 1 | server.frontend.app module 2 | ========================== 3 | 4 | .. automodule:: server.frontend.app 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/server.frontend.rst: -------------------------------------------------------------------------------- 1 | server.frontend package 2 | ======================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | 9 | server.frontend.app 10 | 11 | Module contents 12 | --------------- 13 | 14 | .. automodule:: server.frontend 15 | :members: 16 | :undoc-members: 17 | :show-inheritance: 18 | -------------------------------------------------------------------------------- /docs/source/modules/server.proxy_provider_server.rst: -------------------------------------------------------------------------------- 1 | server.proxy\_provider\_server module 2 | ===================================== 3 | 4 | .. automodule:: server.proxy_provider_server 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/server.rst: -------------------------------------------------------------------------------- 1 | server package 2 | ============== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | server.api_v1 10 | server.frontend 11 | 12 | Submodules 13 | ---------- 14 | 15 | .. toctree:: 16 | 17 | server.base_app 18 | server.proxy_provider_server 19 | 20 | Module contents 21 | --------------- 22 | 23 | .. 
automodule:: server 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | -------------------------------------------------------------------------------- /docs/source/modules/setup.rst: -------------------------------------------------------------------------------- 1 | setup module 2 | ============ 3 | 4 | .. automodule:: setup 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/statistics.rst: -------------------------------------------------------------------------------- 1 | statistics package 2 | ================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | 9 | statistics.statistics 10 | 11 | Module contents 12 | --------------- 13 | 14 | .. automodule:: statistics 15 | :members: 16 | :undoc-members: 17 | :show-inheritance: 18 | -------------------------------------------------------------------------------- /docs/source/modules/statistics.statistics.rst: -------------------------------------------------------------------------------- 1 | statistics.statistics module 2 | ============================ 3 | 4 | .. automodule:: statistics.statistics 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/tests.rst: -------------------------------------------------------------------------------- 1 | tests package 2 | ============= 3 | 4 | Submodules 5 | ---------- 6 | 7 | .. toctree:: 8 | 9 | tests.test_api 10 | tests.test_http_client 11 | tests.test_proxy_validation_regex 12 | 13 | Module contents 14 | --------------- 15 | 16 | .. 
automodule:: tests 17 | :members: 18 | :undoc-members: 19 | :show-inheritance: 20 | -------------------------------------------------------------------------------- /docs/source/modules/tests.test_api.rst: -------------------------------------------------------------------------------- 1 | tests.test\_api module 2 | ====================== 3 | 4 | .. automodule:: tests.test_api 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/tests.test_http_client.rst: -------------------------------------------------------------------------------- 1 | tests.test\_http\_client module 2 | =============================== 3 | 4 | .. automodule:: tests.test_http_client 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/modules/tests.test_proxy_validation_regex.rst: -------------------------------------------------------------------------------- 1 | tests.test\_proxy\_validation\_regex module 2 | =========================================== 3 | 4 | .. automodule:: tests.test_proxy_validation_regex 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/readme_link.rst: -------------------------------------------------------------------------------- 1 | .. 
import json

import aiohttp
from aiosocks.connector import ProxyClientRequest, ProxyConnector
from fake_useragent import UserAgent

from proxy_py import settings


class HttpClientResult:
    """Wrapper around an aiohttp response with the body already read.

    Create instances through the async factory :meth:`make` (not the
    constructor) so the body can be awaited exactly once.
    """

    # response body decoded as text
    text = None
    # underlying aiohttp ClientResponse
    aiohttp_response = None

    @staticmethod
    async def make(aiohttp_response):
        """Build a result object, eagerly reading the response body.

        :param aiohttp_response: aiohttp response object
        :return: HttpClientResult with ``text`` populated
        """
        obj = HttpClientResult()
        obj.aiohttp_response = aiohttp_response
        obj.text = await obj.aiohttp_response.text()

        return obj

    def as_text(self):
        """Return the response body as a str."""
        return self.text

    def as_json(self):
        """Parse the response body as JSON.

        :raises json.JSONDecodeError: if the body is not valid JSON
        """
        return json.loads(self.text)


# TODO: complete cookies saving
class HttpClient:
    """
    Simple class for making http requests,
    user-agent is set to random one in constructor
    """

    # one connector shared by every HttpClient instance so the
    # simultaneous-request limits apply process-wide
    _aiohttp_connector = None

    def __init__(self):
        self.user_agent = UserAgent().random
        self.timeout = 60
        if HttpClient._aiohttp_connector is None:
            HttpClient._aiohttp_connector = ProxyConnector(
                remote_resolve=True,
                limit=settings.NUMBER_OF_SIMULTANEOUS_REQUESTS,
                limit_per_host=settings.NUMBER_OF_SIMULTANEOUS_REQUESTS_PER_HOST,
            )
        # proxy url (e.g. "socks5://host:port") or None for direct connection
        self.proxy_address = None

    async def get(self, url):
        """
        send HTTP GET request

        :param url:
        :return:
        """
        return await self.request("GET", url, None)

    async def post(self, url, data):
        """
        send HTTP POST request

        :param url:
        :param data:
        :return:
        """
        return await self.request("POST", url, data)

    async def request(self, method, url, data) -> HttpClientResult:
        """Send an HTTP request through the shared connector.

        :param method: HTTP verb, e.g. "GET"
        :param url: target url
        :param data: request body or None
        :return: HttpClientResult with the body pre-read
        """
        headers = {
            "User-Agent": self.user_agent,
        }

        async with aiohttp.ClientSession(
            # connector_owner=False keeps the session from closing the
            # shared connector when the session is disposed
            connector=HttpClient._aiohttp_connector,
            connector_owner=False,
            request_class=ProxyClientRequest,
        ) as session:
            async with session.request(
                method,
                url=url,
                data=data,
                proxy=self.proxy_address,
                timeout=self.timeout,
                headers=headers,
            ) as response:
                return await HttpClientResult.make(response)

    @staticmethod
    async def clean():
        """Close the shared connector.

        Fix: previously raised AttributeError when called before any
        HttpClient had been constructed (connector still None).
        """
        if HttpClient._aiohttp_connector is not None:
            HttpClient._aiohttp_connector.close()


async def get_text(url):
    """
    fast method for sending get response without creating extra objects

    :param url:
    :return:
    """
    return (await HttpClient().get(url)).as_text()


async def get_json(url):
    """Fetch ``url`` with GET and parse the body as JSON."""
    return (await HttpClient().get(url)).as_json()
def init_uvloop():
    """Install uvloop as the asyncio event-loop policy.

    Must run before anything else touches asyncio, hence the immediate
    call right below, ahead of the remaining imports.
    """
    import uvloop

    uvloop.install()


init_uvloop()

import argparse
import asyncio
import logging
import subprocess
import sys
from statistics import statistics

import collectors_list
import materialized_view_updater
from checkers.base_checker import BaseChecker
from processor import Processor
from proxy_py import settings
from server.proxy_provider_server import ProxyProviderServer
from tools import test_collector

# path passed via --test-collector; None during a normal run
test_collector_path = None
# configured by prepare_loggers()
main_logger = None


def process_cmd_arguments():
    """Parse command-line options and apply overrides to settings.

    Side effects: may change settings.DEBUG and
    settings.PROXY_CHECKING_TIMEOUT; sets module-level
    test_collector_path.

    :raises ValueError: if --proxy-checking-timeout is negative
    """
    global test_collector_path

    def str_to_bool(value):
        # accept the usual textual spellings of booleans
        if value.lower() in ("yes", "true", "t", "y", "1"):
            return True
        elif value.lower() in ("no", "false", "f", "n", "0"):
            return False
        else:
            raise argparse.ArgumentTypeError("Boolean value expected.")

    cmd_parser = argparse.ArgumentParser()
    cmd_parser.add_argument(
        "--debug", type=str_to_bool, help="override settings' debug value"
    )
    cmd_parser.add_argument(
        "--proxy-checking-timeout",
        type=float,
        help="override settings' proxy checking timeout",
    )
    cmd_parser.add_argument("--test-collector", help="test collector with a given path")

    args = cmd_parser.parse_args()

    if args.debug is not None:
        settings.DEBUG = args.debug

    if args.proxy_checking_timeout is not None:
        if args.proxy_checking_timeout < 0:
            raise ValueError("--proxy-checking-timeout should be positive")

        settings.PROXY_CHECKING_TIMEOUT = args.proxy_checking_timeout

    test_collector_path = args.test_collector


def prepare_loggers():
    """Configure stdout logging for asyncio and the main logger.

    Log level follows settings.DEBUG; also enables asyncio debug mode
    on the event loop when DEBUG is set.
    """
    global main_logger

    asyncio_logger = logging.getLogger("asyncio")
    asyncio_logger.setLevel(logging.DEBUG if settings.DEBUG else logging.INFO)
    asyncio_logger_handler = logging.StreamHandler(sys.stdout)
    asyncio_logger_handler.setLevel(logging.DEBUG if settings.DEBUG else logging.INFO)
    asyncio_logger_handler.setFormatter(logging.Formatter(settings.LOG_FORMAT_STRING))
    asyncio_logger.addHandler(asyncio_logger_handler)
    asyncio.get_event_loop().set_debug(settings.DEBUG)

    main_logger = logging.getLogger("proxy_py/main")
    main_logger.setLevel(logging.DEBUG if settings.DEBUG else logging.INFO)
    logger_handler = logging.StreamHandler(sys.stdout)
    logger_handler.setFormatter(logging.Formatter(settings.LOG_FORMAT_STRING))
    logger_handler.setLevel(logging.DEBUG if settings.DEBUG else logging.INFO)

    main_logger.addHandler(logger_handler)


async def core():
    """Run the core part: argument parsing, logging setup, workers.

    When --test-collector was given, only runs that collector test.
    Otherwise runs the processor, statistics and materialized-view
    workers concurrently until they stop or an error occurs.

    :return: 0 on clean exit, 1 on unexpected error, otherwise the
        gathered worker results
    """
    process_cmd_arguments()
    prepare_loggers()

    if test_collector_path is not None:
        return await test_collector.run(test_collector_path)

    proxy_processor = Processor.get_instance()

    try:
        code = await asyncio.gather(
            *[
                proxy_processor.worker(),
                statistics.worker(),
                materialized_view_updater.worker(),
            ]
        )
        BaseChecker.clean()
        return code
    except KeyboardInterrupt:
        pass
    except BaseException as ex:
        # catch-all boundary: log and signal failure instead of crashing
        main_logger.exception(ex)
        print("critical error happened, see logs/main.log")
        return 1

    return 0


async def print_collectors():
    """Print the name of every registered collector, one per line."""
    for collector_name in collectors_list.collectors.keys():
        print(collector_name)


def server():
    """Start the API server on the configured host/port (blocking)."""
    proxy_provider_server = ProxyProviderServer(
        settings.PROXY_PROVIDER_SERVER_ADDRESS["HOST"],
        settings.PROXY_PROVIDER_SERVER_ADDRESS["PORT"],
    )

    return proxy_provider_server.start(asyncio.get_event_loop())


def print_help():
    """Print command-line usage."""
    print(
        """Usage: ./main.py [COMMAND] [OPTION]...
Runs proxy_py

The commands are:

"core" - runs core part of the project (proxies parsing and processing)
"print_collectors" - prints collectors
"server" - runs server for providing API
"" - runs both

use ./main.py COMMAND --help to get more information

Project's page: https://github.com/DevAlone/proxy_py
"""
    )


def main():
    """Entry point: dispatch on the first command-line argument.

    With no command, spawns a child process for the server and runs
    core in this process.
    """
    if len(sys.argv) < 2:
        # run default configuration
        # server
        p = subprocess.Popen(["python3", sys.argv[0], "server"])

        # and core
        code = asyncio.get_event_loop().run_until_complete(core())
        p.wait()
        return code

    command = sys.argv[1].strip()
    # shift argv so the sub-command's own --help etc. parse correctly
    sys.argv = sys.argv[1:]
    try:
        return {
            "core": lambda: asyncio.get_event_loop().run_until_complete(core()),
            "print_collectors": lambda: asyncio.get_event_loop().run_until_complete(
                print_collectors()
            ),
            "server": server,
        }[command]()
    except KeyError:
        print_help()
        return 0


if __name__ == "__main__":
    exit(main())


# --- materialized_view_updater.py ---
import asyncio
import sys


from models import raw_db


async def worker():
    """Refresh the working_proxies materialized view once a minute.

    On any error, reports it to stderr and backs off for 10 minutes
    before retrying; never exits.
    """
    while True:
        try:
            raw_db.execute_sql("REFRESH MATERIALIZED VIEW working_proxies")
            await asyncio.sleep(60)
        except BaseException as ex:
            sys.stderr.write(str(ex) + "\n")
            await asyncio.sleep(60 * 10)


# --- models.py (imports; definitions continue below) ---
import logging
import os.path

import geoip2.database
import peewee
import peewee_async

from proxy_py import settings
log = logging.getLogger("proxy_py/main")

# shared async-capable PostgreSQL connection pool
raw_db = peewee_async.PooledPostgresqlDatabase(
    *settings.DATABASE_CONNECTION_ARGS,
    **settings.DATABASE_CONNECTION_KWARGS,
)

# geoip2 city reader; stays None when the GeoLite2 file is absent
location_database_reader = None


def init_location_db_reader():
    """Open the GeoLite2 city database if its file exists.

    Sets the module-level location_database_reader; logs a warning
    (and leaves it None) when the file is missing.
    """
    global location_database_reader
    if os.path.isfile(settings.GEOLITE2_CITY_FILE_LOCATION):
        location_database_reader = geoip2.database.Reader(
            settings.GEOLITE2_CITY_FILE_LOCATION
        )
    else:
        # the DB doesn't exist
        log.warning(
            "Public IP Database is not found. See GEOLITE2_CITY_FILE_LOCATION in settings.py"
        )


init_location_db_reader()


class Proxy(peewee.Model):
    """A single proxy and its checking statistics."""

    class Meta:
        database = raw_db
        db_table = "proxies"
        # (fields, is_unique) pairs; the 4-column one is the identity
        indexes = (
            (("raw_protocol", "auth_data", "domain", "port"), True),
            (("auth_data", "domain", "port"), False),  # important!
            (("raw_protocol",), False),
            (("auth_data",), False),
            (("domain",), False),
            (("port",), False),
            (("number_of_bad_checks",), False),
            (("next_check_time",), False),
            (("last_check_time",), False),
            (("checking_period",), False),
            (("uptime",), False),
            (("bad_uptime",), False),
            (("response_time",), False),
            (("_white_ipv4",), False),
            (("_white_ipv6",), False),
        )

    # index into this tuple is stored in raw_protocol
    PROTOCOLS = (
        "http",
        "socks4",
        "socks5",
    )

    raw_protocol = peewee.SmallIntegerField(null=False)
    domain = peewee.CharField(settings.DB_MAX_DOMAIN_LENGTH, null=False)
    port = peewee.IntegerField(null=False)
    # "login:password" or empty string for no auth
    auth_data = peewee.CharField(
        settings.DB_AUTH_DATA_MAX_LENGTH, default="", null=False
    )

    checking_period = peewee.IntegerField(
        default=settings.MIN_PROXY_CHECKING_PERIOD, null=False
    )
    last_check_time = peewee.IntegerField(default=0, null=False)
    next_check_time = peewee.IntegerField(default=0, null=False)
    number_of_bad_checks = peewee.IntegerField(default=0, null=False)
    uptime = peewee.IntegerField(default=None, null=True)
    bad_uptime = peewee.IntegerField(default=None, null=True)
    # in microseconds
    response_time = peewee.IntegerField(default=None, null=True)
    # TODO: consider storing as binary
    _white_ipv4 = peewee.CharField(16, null=True)
    _white_ipv6 = peewee.CharField(45, null=True)

    def get_raw_protocol(self):
        """Return the protocol as its raw integer index."""
        return self.raw_protocol

    @property
    def location(self):
        """Geo information for this proxy's address, or None when the
        GeoLite2 database is unavailable.

        NOTE(review): geoip2 may raise for unresolvable/private
        addresses — callers appear to rely on the reader being present.
        """
        if location_database_reader is None:
            return None

        response = location_database_reader.city(self.domain)

        return {
            "latitude": response.location.latitude,
            "longitude": response.location.longitude,
            "country_code": response.country.iso_code,
            "country": response.country.name,
            "city": response.city.name,
        }

    @property
    def address(self):
        """Full proxy url, e.g. "socks5://user:pass@host:port"."""
        return self.to_url()

    @property
    def protocol(self):
        """Protocol name ("http"/"socks4"/"socks5") decoded from raw_protocol."""
        return self.PROTOCOLS[int(self.raw_protocol)]

    @protocol.setter
    def protocol(self, protocol):
        self.raw_protocol = self.PROTOCOLS.index(protocol)

    @property
    def bad_proxy(self):
        # any failed check marks the proxy as bad
        return self.number_of_bad_checks > 0

    @property
    def white_ipv4(self):
        return self._white_ipv4

    @white_ipv4.setter
    def white_ipv4(self, value):
        self._white_ipv4 = value

    @property
    def white_ipv6(self):
        return self._white_ipv6

    @white_ipv6.setter
    def white_ipv6(self, value):
        self._white_ipv6 = value

    def to_url(self, protocol=None):
        """Format as a url, optionally forcing a protocol name.

        :param protocol: protocol to use instead of the stored one
        :return: "protocol://[auth@]domain:port"
        """
        address = (
            protocol if protocol is not None else self.PROTOCOLS[int(self.raw_protocol)]
        )
        address += "://"
        if self.auth_data:
            address += self.auth_data + "@"

        address += "{}:{}".format(self.domain, self.port)

        return address

    def __str__(self):
        return self.to_url()

    __repr__ = __str__


class ProxyCountItem(peewee.Model):
    """Snapshot of proxy counts at a point in time."""

    class Meta:
        database = raw_db
        db_table = "proxy_count_items"

    timestamp = peewee.IntegerField(primary_key=True)
    good_proxies_count = peewee.IntegerField(null=False)
    bad_proxies_count = peewee.IntegerField(null=False)
    dead_proxies_count = peewee.IntegerField(null=False)


class CollectorState(peewee.Model):
    """Persisted processing state for one collector module."""

    class Meta:
        database = raw_db
        db_table = "collector_states"
        indexes = (
            (("processing_period",), False),
            (("last_processing_time",), False),
        )

    # python module name
    identifier = peewee.TextField(unique=True)
    processing_period = peewee.IntegerField(null=False)
    last_processing_time = peewee.IntegerField(null=False)
    last_processing_proxies_count = peewee.IntegerField(default=0, null=False)
    # TODO: add new proxies
    last_processing_new_proxies_count = peewee.IntegerField(default=0, null=False)
    data = peewee.TextField(default=None, null=True)


class StatBaseModel(peewee.Model):
    """Base for time-series statistics models keyed by timestamp."""

    class Meta:
        database = raw_db

    timestamp = peewee.BigIntegerField(primary_key=True)


class NumberOfProxiesToProcess(StatBaseModel):
    class Meta:
        db_table = "number_of_proxies_to_process"

    good_proxies = peewee.IntegerField(null=False)
    bad_proxies = peewee.IntegerField(null=False)
    dead_proxies = peewee.IntegerField(null=False)


class NumberOfCollectorsToProcess(StatBaseModel):
    class Meta:
        db_table = "number_of_collectors_to_process"

    value = peewee.IntegerField(null=False)


class ProcessorProxiesQueueSize(StatBaseModel):
    class Meta:
        db_table = "processor_proxies_queue_size"

    value = peewee.IntegerField(null=False)


# create tables on import; _silent suppresses "already exists" errors
_silent = True
Proxy.create_table(_silent)
ProxyCountItem.create_table(_silent)
CollectorState.create_table(_silent)
NumberOfProxiesToProcess.create_table(_silent)
NumberOfCollectorsToProcess.create_table(_silent)
ProcessorProxiesQueueSize.create_table(_silent)

db = peewee_async.Manager(raw_db)

raw_db.execute_sql(
    """CREATE MATERIALIZED VIEW IF NOT EXISTS working_proxies
AS SELECT * FROM proxies WHERE number_of_bad_checks = 0;"""
)
db.allow_sync()


# --- parsers/__init__.py ---
from parsers.regex_parser import RegexParser


# --- parsers/regex_parser.py (beginning; the module continues in the
# original file with PROXY_FIND_REGEX and the RegexParser class) ---
import re

# TODO: add ipv6 addresses, make domain checking better
_0_TO_255_REGEX = r"([0-9]|[1-8][0-9]|9[0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])"
DOMAIN_LETTER_REGEX = r"[a-zA-Z0-9_\-]"
# --- parsers/regex_parser.py (complete module) ---
import re
from typing import Iterator

# TODO: add ipv6 addresses, make domain checking better
_0_TO_255_REGEX = r"([0-9]|[1-8][0-9]|9[0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])"
DOMAIN_LETTER_REGEX = r"[a-zA-Z0-9_\-]"
# Matches "[protocol://][login:password@](ip|domain):port" anywhere in text.
# Fix: "\.){3}" was a non-raw string with an invalid "\." escape
# (SyntaxWarning on modern Python) — now a raw string.
PROXY_FIND_REGEX = (
    r"((?P<protocol>(http|socks4|socks5))://)?"
    r"((?P<auth_data>[a-zA-Z0-9_\.]+:[a-zA-Z0-9_\.]+)@)?"
    r"(?P<domain>"
    + r"("
    + _0_TO_255_REGEX
    + r"\.){3}"
    + _0_TO_255_REGEX
    + r"|"
    + DOMAIN_LETTER_REGEX
    + r"+(\.[a-zA-Z]"
    + DOMAIN_LETTER_REGEX
    + r"+)+):"
    r"(?P<port>(6553[0-5]|655[0-2][0-9]|65[0-4][0-9]{2}|6[0-4][0-9]{3}|[1-5][0-9]{4}|999[0-9]|99[0-8][0-9]|9[0-8][0-9]{2}|[1-8][0-9]{3}|99[0-9]|9[0-8][0-9]|[1-8][0-9]{2}|9[0-9]|[1-8][0-9]|[1-9]))"
)
# same pattern anchored to the whole string, optional trailing slash
PROXY_VALIDATE_REGEX = "^" + PROXY_FIND_REGEX + "/?$"


class RegexParser:
    """
    It's used to scratch proxies from text by regular expression,
    you can pass your own expression(see docstring for parse method)
    """

    def __init__(self, expression=PROXY_FIND_REGEX, flags=0):
        """

        :param expression: expression which is used to parse proxies with named groups:
        (?P<protocol>) - for proxy protocol (socks4/5, http)
        (?P<auth_data>) - authentication data (login and password)
        (?P<domain>) - IP or domain
        (?P<port>) - port
        There is a default value which parses proxies like these ones:
        127.0.0.1:9060
        test.proxy.com:8080
        socks4://8.8.8.8:65000
        http://user:password@proxy.test.info/

        :param flags: flags that are passed to re.compile
        """

        # MULTILINE is always on so ^/$ in custom expressions work per line
        flags |= re.MULTILINE
        self.expression = expression
        self.regex = re.compile(expression, flags)

    def parse(self, text: str) -> Iterator[str]:
        """Yield "domain:port" for every proxy-like substring of text.

        Fix: was annotated ``-> list`` although it is a generator.
        """
        for match in self.regex.finditer(text):
            match = match.groupdict()
            yield match["domain"] + ":" + match["port"]
evaluation order: 3 | 4 | 1. default values from proxy_py/_settings.py 5 | 2. environment values which are the same but with prefix "PROXY_PY_" 6 | 3. overriden values from proxy_py/settings.py 7 | 4. command line arguments as for example "--debug" or "--proxy-checking-timeout" 8 | 9 | """ 10 | import ast 11 | import os 12 | import string 13 | 14 | from checkers.google_com_checker import GoogleComChecker 15 | 16 | # enable to get more information in logs 17 | DEBUG = False 18 | LOG_FORMAT_STRING = "%(levelname)s ~ [%(name)s] ~ %(asctime)s ~ %(funcName)30s():L%(lineno)d - %(message)s" 19 | 20 | 21 | """ 22 | Database settings (do not try to change after creation of the database) 23 | """ 24 | GEOLITE2_CITY_FILE_LOCATION = ( 25 | "/tmp/proxy_py_9910549a_7d41_4102_9e9d_15d39418a5cb/GeoLite2-City.mmdb" 26 | ) 27 | 28 | DATABASE_CONNECTION_ARGS = () 29 | DATABASE_CONNECTION_KWARGS = { 30 | "host": "localhost", 31 | "database": "proxy_py", 32 | "user": "proxy_py", 33 | "password": "proxy_py", 34 | "max_connections": 20, 35 | } 36 | 37 | DB_MAX_DOMAIN_LENGTH = 128 38 | DB_AUTH_DATA_MAX_LENGTH = 64 39 | 40 | 41 | """ 42 | Fetcher settings 43 | """ 44 | 45 | # it allows you to override or add your own collectors 46 | # for example if you're making proxy checker for particular site 47 | # you can override COLLECTORS_DIR and PROXY_CHECKERS 48 | COLLECTORS_DIRS = [ 49 | "collectors", 50 | # 'local/collectors', # use to add your own collectors 51 | ] 52 | 53 | NUMBER_OF_CONCURRENT_TASKS = 128 54 | # makes aiohttp to not send more 55 | # than this number of simultaneous requests 56 | # works by common connector 57 | NUMBER_OF_SIMULTANEOUS_REQUESTS = 128 58 | # the same, but per host 59 | NUMBER_OF_SIMULTANEOUS_REQUESTS_PER_HOST = NUMBER_OF_SIMULTANEOUS_REQUESTS 60 | 61 | MIN_PROXY_CHECKING_PERIOD = 10 * 60 62 | MAX_PROXY_CHECKING_PERIOD = 30 * 60 63 | BAD_PROXY_CHECKING_PERIOD = MAX_PROXY_CHECKING_PERIOD * 2 64 | DEAD_PROXY_THRESHOLD = 12 65 | DEAD_PROXY_CHECKING_PERIOD = 1 * 24 
* 60 * 60 66 | DO_NOT_CHECK_ON_N_BAD_CHECKS = DEAD_PROXY_THRESHOLD + 14 67 | # how many seconds to wait for response from proxy 68 | PROXY_CHECKING_TIMEOUT = 30 69 | # do not check proxy from collector if it has been checked recently 70 | PROXY_NOT_CHECKING_PERIOD = 15 * 60 71 | # limiter for maximum number of proxies gotten from collector 72 | # to fix potential issue with collectors' spamming 73 | COLLECTOR_MAXIMUM_NUMBER_OF_PROXIES_PER_REQUEST = 2 * 65536 74 | SLEEP_AFTER_ERROR_PERIOD = 10 75 | # how many collectors to process concurrently 76 | NUMBER_OF_CONCURRENT_COLLECTORS = 1 77 | 78 | # how many checkers should say that proxy is working 79 | # to consider it so 80 | # should be in range from 1 to len(PROXY_CHECKERS) 81 | # Note: the order of the checkers won't be the same, 82 | # they're shuffled for each proxy 83 | MINIMUM_NUMBER_OF_CHECKERS_PER_PROXY = 1 84 | 85 | PROXY_CHECKERS = [ 86 | GoogleComChecker, 87 | ] 88 | 89 | 90 | """ 91 | Server settings 92 | """ 93 | 94 | PROXY_PROVIDER_SERVER_ADDRESS = { 95 | "HOST": "localhost", 96 | "PORT": 55555, 97 | } 98 | 99 | PROXY_PROVIDER_SERVER_MAXIMUM_REQUEST_LENGTH = 1024 100 | PROXY_PROVIDER_SERVER_MAXIMUM_STRING_FIELD_SIZE = 128 101 | 102 | PROXY_PROVIDER_SERVER_API_CONFIG_FETCH_CONFIG = { 103 | "fields": [ 104 | "address", 105 | "protocol", 106 | "auth_data", 107 | "domain", 108 | "port", 109 | "last_check_time", 110 | "next_check_time", 111 | "number_of_bad_checks", 112 | "bad_proxy", 113 | "uptime", 114 | "response_time", 115 | "white_ipv4", 116 | "white_ipv6", 117 | "location", 118 | ], 119 | "filter_fields": [ 120 | "last_check_time", 121 | "protocol", 122 | "number_of_bad_checks", 123 | "bad_proxy", 124 | "uptime", 125 | "response_time", 126 | ], 127 | "order_by_fields": [ 128 | "last_check_time", 129 | "number_of_bad_checks", 130 | "uptime", 131 | "response_time", 132 | ], 133 | "default_order_by_fields": [ 134 | "response_time", 135 | ], 136 | } 137 | 138 | PROXY_PROVIDER_SERVER_API_CONFIG = { 139 | 
"proxy": { 140 | "model_class": ["models", "Proxy"], 141 | "methods": { 142 | "get": PROXY_PROVIDER_SERVER_API_CONFIG_FETCH_CONFIG, 143 | "count": PROXY_PROVIDER_SERVER_API_CONFIG_FETCH_CONFIG, 144 | }, 145 | } 146 | } 147 | 148 | TEMPLATES_PATH = "server/templates" 149 | 150 | 151 | """ 152 | Loading from the environment 153 | """ 154 | 155 | 156 | def load_settings_from_environment(): 157 | for key, val in globals().items(): 158 | # filter only variables with capital letters or digits or undescore 159 | rest = "".join( 160 | [ 161 | ch 162 | for ch in key 163 | if ch not in string.ascii_uppercase 164 | and ch not in string.digits 165 | and ch != "_" 166 | ] 167 | ) 168 | if len(rest) > 0: 169 | continue 170 | 171 | env_key = "PROXY_PY_" + key 172 | if env_key in os.environ: 173 | env_value = os.environ[env_key] 174 | try: 175 | globals()[key] = ast.literal_eval(env_value) 176 | except: 177 | raise Exception( 178 | f"An error happened during parsing environment value. " 179 | + f"Key = {env_key}, Value = {env_value}" 180 | ) 181 | 182 | 183 | load_settings_from_environment() 184 | -------------------------------------------------------------------------------- /proxy_utils.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import random 3 | 4 | from checkers.base_checker import CheckerResult 5 | from proxy_py import settings 6 | 7 | 8 | async def check_proxy(proxy_url: str, timeout=None) -> tuple: 9 | if not settings.PROXY_CHECKERS: 10 | raise Exception("add at least one checker") 11 | 12 | checkers = copy.copy(settings.PROXY_CHECKERS) 13 | random.shuffle(checkers) 14 | results = [] 15 | 16 | for checker, _ in zip( 17 | checkers, range(settings.MINIMUM_NUMBER_OF_CHECKERS_PER_PROXY) 18 | ): 19 | checker() 20 | result = await checker().check(proxy_url, timeout=timeout) 21 | if not result[0]: 22 | return False, None 23 | 24 | results.append(result) 25 | 26 | additional_information = CheckerResult() 27 | 28 | for 
class ValidationError(Exception):
    """Raised when a proxy string or dict fails validation."""
    pass


def retrieve(proxy) -> tuple:
    """
    Validate *proxy* and split it into its components.

    :param proxy: either a proxy string such as "socks5://user:pass@host:port"
        or a dict with optional "auth_data", "domain" and "port" keys
    :return: tuple (protocol, auth_data, domain, port); protocol is None when
        absent, auth_data is "" when absent
    :raises ValidationError: on wrong type or format
    """
    protocol = auth_data = domain = port = None

    if type(proxy) is str:
        matches = re.match(PROXY_VALIDATE_REGEX, proxy)
        if not matches:
            raise ValidationError("Proxy doesn't match regex")

        matches = matches.groupdict()
        # BUG FIX: the protocol group was parsed by the regex but never
        # extracted, so protocol was always None and the protocol check
        # below was dead code
        protocol = matches["protocol"]
        auth_data = matches["auth_data"]
        domain = matches["domain"]
        port = matches["port"]
    elif type(proxy) is dict:
        auth_data = proxy.get("auth_data")
        domain = proxy.get("domain")
        port = proxy.get("port")

        str_proxy = ""
        if auth_data:
            str_proxy += auth_data + "@"

        str_proxy += domain + ":" + port
        # normalize through the string code path so both input kinds
        # are validated identically
        return retrieve(str_proxy)
    else:
        # BUG FIX: format() used to receive two arguments for a single
        # placeholder; the second was silently ignored
        raise ValidationError('Bad type. Type is "{}"'.format(type(proxy)))

    if protocol is not None:
        if protocol not in ("socks", "socks4", "socks5", "http"):
            raise ValidationError("Bad protocol")

    if auth_data is None:
        auth_data = ""

    if type(domain) is not str:
        raise ValidationError("Bad proxy(domain isn't string)")

    if type(port) is not str:
        raise ValidationError("Bad proxy(port isn't string)")

    return protocol, auth_data, domain, port
class ApiRequestHandler:
    """Parses incoming API requests, executes them and builds JSON-able replies."""

    def __init__(self, app: BaseApp):
        self.request_parser = RequestParser(settings.PROXY_PROVIDER_SERVER_API_CONFIG)
        self.request_executor = RequestExecutor()
        self.app = app

    async def handle(self, request: aiohttp.ClientRequest, post_data: dict):
        """
        Handle one parsed POST payload.

        Returns a dict with "status" == "ok" merged with the execution
        result, or an error dict carrying "status_code" and "error_message".
        """
        try:
            parsed = self.request_parser.parse(post_data)

            result = {
                "status": "ok",
            }
            result.update(await self.request_executor.execute(parsed))
            return result
        except ParseError as ex:
            # client sent something we could not parse: log and report 400
            self.app.log_info(
                request,
                "Error during parsing request. Request: {} Exception: {}".format(
                    post_data, ex
                ),
            )
            return {"status": "error", "status_code": 400, "error_message": str(ex)}
        except ExecutionError as ex:
            # server-side failure while running the request: log and report 500
            self.app.log_error(
                request,
                "Error during execution request. Request: {} Exception: {}".format(
                    post_data, ex
                ),
            )
            self.app.log_exception(request, ex)

            return {
                "status": "error",
                "status_code": 500,
                "error_message": "error during execution request",
            }
class Request:
    """Base class for parsed API requests; carries the target model class."""

    def __init__(self, class_name):
        self.class_name = class_name


class FetchRequest(Request):
    """A data-reading request: fields, ordering and paging on top of Request."""

    def __init__(self, class_name, fields: list = None, order_by: list = None):
        super(FetchRequest, self).__init__(class_name)
        self.fields = [] if fields is None else fields
        self.order_by = [] if order_by is None else order_by
        self.limit = 0
        self.offset = 0


class GetRequest(FetchRequest):
    """Fetch request which returns rows."""

    @staticmethod
    def from_request(request: Request):
        # only the target class is carried over; the parser fills in the rest
        return GetRequest(request.class_name)


class CountRequest(FetchRequest):
    """Fetch request which returns only the number of rows."""

    @staticmethod
    def from_request(request: Request):
        return CountRequest(request.class_name)
def order_by_list_to_peewee(self, order_by_fields: list, class_name):
    """
    Convert "field" / "-field" names into peewee ordering expressions on
    *class_name*; a leading "-" means descending order.
    """
    ordering = []

    for field_name in order_by_fields:
        descending = field_name[0] == "-"
        if descending:
            field_name = field_name[1:]

        expression = getattr(class_name, field_name)
        if descending:
            expression = expression.desc()

        ordering.append(expression)

    return ordering
| import string 4 | 5 | from ..requests_to_models.request import CountRequest, GetRequest, Request 6 | 7 | 8 | class RequestParser: 9 | ALLOWED_CHARS = string.ascii_letters + string.digits + "/: !=><,-*" 10 | COMMA_SEPARATED_KEYS = {"fields", "order_by"} 11 | ALLOWED_KEYS = { 12 | "model", 13 | "method", 14 | "fields", 15 | "filter", 16 | "order_by", 17 | "limit", 18 | "offset", 19 | } 20 | MAXIMUM_KEY_LENGTH = 64 21 | MAXIMUM_VALUE_LENGTH = 512 22 | # TODO: move to settings 23 | MINIMUM_LIMIT_VALUE = 1 24 | MAXIMUM_LIMIT_VALUE = 1024 25 | 26 | def __init__(self, config): 27 | self.config = config 28 | self._validate_config() 29 | 30 | def parse(self, request: dict): 31 | for key in request.keys(): 32 | request[key] = str(request[key]) 33 | self.validate_key(key) 34 | 35 | if key in self.COMMA_SEPARATED_KEYS: 36 | request[key] = self.comma_separated_field_to_list(request[key]) 37 | if key in {"limit", "offset"}: 38 | try: 39 | request[key] = int(request[key]) 40 | except ValueError: 41 | raise ValidationError( 42 | 'Value of key "{}" should be integer'.format(key) 43 | ) 44 | 45 | self.validate_value(key, request[key]) 46 | 47 | if "limit" not in request: 48 | request["limit"] = self.MAXIMUM_LIMIT_VALUE 49 | 50 | return self.parse_dict(request) 51 | 52 | def validate_value(self, key: str, value): 53 | if type(value) not in [str, int, list]: 54 | raise ValidationError("Value type should be string, integer or list") 55 | 56 | if type(value) in [str, list] and len(value) > self.MAXIMUM_VALUE_LENGTH: 57 | raise ValidationError( 58 | "Some value is too big. 
Maximum allowed length is {}".format( 59 | self.MAXIMUM_VALUE_LENGTH 60 | ) 61 | ) 62 | 63 | # validate list types 64 | if type(value) is list: 65 | for value_item in value: 66 | self.validate_value(key, value_item) 67 | return 68 | 69 | if key == "order_by": 70 | self._validate_value_type(key, value, str) 71 | self._validate_value_regex(key, value, r"^-?[a-zA-Z][a-zA-Z0-9_]+$") 72 | elif key in {"model", "method", "fields"}: 73 | self._validate_value_type(key, value, str) 74 | self._validate_value_regex(key, value, r"^[a-zA-Z][a-zA-Z0-9_]+$") 75 | elif key in {"filter"}: 76 | self._validate_value_type(key, value, str) 77 | self._validate_value_regex(key, value, r"^[a-zA-Z0-9_]+$") 78 | elif key in {"limit", "offset"}: 79 | self._validate_value_type(key, value, int) 80 | if value < 0: 81 | raise ValidationError( 82 | 'Value of key "{}" should be positive'.format(key) 83 | ) 84 | 85 | if key == "limit" and ( 86 | value < self.MINIMUM_LIMIT_VALUE or value > self.MAXIMUM_LIMIT_VALUE 87 | ): 88 | raise ValidationError( 89 | "Value of key limit should be from {} to {} (inclusive)".format( 90 | self.MINIMUM_LIMIT_VALUE, self.MAXIMUM_LIMIT_VALUE 91 | ) 92 | ) 93 | else: 94 | # It means I forget to add validation of field 95 | raise ValidationError("Server Error") 96 | 97 | def _validate_value_regex(self, key, value, pattern): 98 | if not re.match(pattern, value): 99 | raise ValidationError( 100 | "Value of key '{}' doesn't match to pattern {}".format(key, pattern) 101 | ) 102 | 103 | def _validate_value_type(self, key, value, expected_type): 104 | if type(value) is not expected_type: 105 | raise ValidationError( 106 | 'Value of key "{}" should be {}'.format(key, expected_type) 107 | ) 108 | 109 | def validate_key(self, key: str): 110 | if type(key) is not str: 111 | raise ValidationError("Key {} is not string".format(key)) 112 | if len(key) > self.MAXIMUM_KEY_LENGTH: 113 | raise ValidationError( 114 | "Some key is too big. 
Maximum allowed length is {}".format( 115 | self.MAXIMUM_KEY_LENGTH 116 | ) 117 | ) 118 | if not re.match(r"^[a-zA-Z][a-zA-Z0-9_]+$", key): 119 | raise ValidationError( 120 | 'Key "{}" doesn\'t match to pattern ^[a-zA-Z][a-zA-Z0-9_]+$'.format(key) 121 | ) 122 | if key not in self.ALLOWED_KEYS: 123 | raise ValidationError('Key "{}" isn\'t allowed'.format(key)) 124 | 125 | def comma_separated_field_to_list(self, string_field): 126 | result = [] 127 | for val in string_field.split(","): 128 | val = val.strip() 129 | if val: 130 | result.append(val) 131 | return result 132 | 133 | def parse_dict(self, req_dict): 134 | if "model" not in req_dict: 135 | raise ParseError('You should specify "model"') 136 | 137 | if req_dict["model"] not in self.config: 138 | raise ParseError( 139 | "Model \"{}\" doesn't exist or isn't allowed".format(req_dict["model"]) 140 | ) 141 | 142 | config = self.config[req_dict["model"]] 143 | 144 | result_request = Request(config["model_class"]) 145 | 146 | if "method" not in req_dict: 147 | raise ParseError('You should specify "method"') 148 | 149 | method = req_dict["method"] 150 | 151 | if method not in config["methods"]: 152 | raise ParseError("Method doesn't exist or isn't allowed") 153 | 154 | config = config["methods"][method] 155 | 156 | return {"get": self.method_get, "count": self.method_count}[method]( 157 | req_dict, config, result_request 158 | ) 159 | 160 | def method_get(self, req_dict, config, result_request): 161 | result_request = GetRequest.from_request(result_request) 162 | result_request.fields = self.parse_fields(req_dict, config) 163 | result_request.order_by = self.parse_order_by_fields(req_dict, config) 164 | if "limit" in req_dict: 165 | result_request.limit = req_dict["limit"] 166 | if "offset" in req_dict: 167 | result_request.offset = req_dict["offset"] 168 | 169 | return result_request 170 | 171 | def method_count(self, req_dict, config, result_request): 172 | result_request = CountRequest.from_request(result_request) 
def parse_list(self, req_dict, config, request_key, config_key, default_value):
    """
    Extract a list value from the request and check every item against
    the allowed values in config[config_key].

    :param req_dict: parsed request dict
    :param config: method config from the API settings
    :param request_key: key in req_dict ("fields" / "order_by")
    :param config_key: key in config holding the allowed values
    :param default_value: returned (copied) when request_key is absent
    :raises ParseError: when some field doesn't exist or isn't allowed
    """
    if request_key not in req_dict:
        # copy so callers can't mutate the shared default from settings
        return copy.copy(default_value)

    result = []

    for field in req_dict[request_key]:
        if config_key == "order_by_fields":
            # order_by items may carry a leading "-" for descending order
            if (field[1:] if field[0] == "-" else field) not in config[config_key]:
                # BUG FIX: the message used to end with a stray "1"
                # ("isn't allowed1")
                raise ParseError(
                    "Field \"{}\" doesn't exist or isn't allowed".format(field)
                )
        else:
            if field not in config[config_key]:
                raise ParseError(
                    "Field \"{}\" doesn't exist or isn't allowed".format(field)
                )

        result.append(field)

    return result
7 | from server.api_v1.requests_to_models.request_parser import ParseError 8 | from server.base_app import BaseApp 9 | 10 | 11 | class ApiRequestHandler: 12 | def __init__(self, app: BaseApp): 13 | self.app = app 14 | self.methods = { 15 | "get_model": self.get_model, 16 | "get_proxies_for_id": self.get_proxies_for_id, 17 | "get_proxy_for_id": self.get_proxy_for_id, 18 | } 19 | 20 | async def handle( 21 | self, request: aiohttp.ClientRequest, method_name: str, post_data: dict 22 | ): 23 | try: 24 | response = { 25 | "status": "ok", 26 | } 27 | if method_name not in self.methods: 28 | response = { 29 | "status": "error", 30 | "status_code": 400, 31 | "error_message": "there is no any method with this name", 32 | } 33 | else: 34 | response.update(await self.methods[method_name](post_data)) 35 | except ParseError as ex: 36 | self.app.log_info( 37 | request, 38 | "Error during parsing request. Request: {} Exception: {}".format( 39 | post_data, ex 40 | ), 41 | ) 42 | 43 | response = {"status": "error", "status_code": 400, "error_message": str(ex)} 44 | except ValueError as ex: 45 | response = { 46 | "status": "error", 47 | "status_code": 400, 48 | "error_message": "something's wrong with your request", 49 | } 50 | self.app.log_error(request, f"ValueError: {ex}") 51 | except BaseException as ex: 52 | response = { 53 | "status": "error", 54 | "status_code": 500, 55 | "error_message": "something bad happened, call the admin", 56 | } 57 | self.app.log_error( 58 | request, 59 | f"Error during execution request. Method: {method_name}. Request: {request}. Exception: {ex}", 60 | ) 61 | # except ExecutionError as ex: 62 | # self.app.log_error( 63 | # request, 64 | # "Error during execution request. 
Request: {} Exception: {}".format( 65 | # post_data, 66 | # ex 67 | # ) 68 | # ) 69 | # 70 | # response = { 71 | # 'status': 'error', 72 | # 'status_code': 500, 73 | # 'error_message': 'error during execution request' 74 | # } 75 | 76 | return response 77 | 78 | async def get_model(self, data: dict) -> dict: 79 | # TODO: implement 80 | validate_dict_must_have_key(data, "name") 81 | 82 | model_name = data["name"] 83 | validate_letters_digits_undescores(model_name) 84 | if model_name not in settings.PROXY_PROVIDER_SERVER_API_CONFIG: 85 | raise ParseError("You're not allowed to see this model") 86 | 87 | return { 88 | "result": "model " + model_name, 89 | } 90 | 91 | async def get_proxy_for_id(self, data: dict) -> dict: 92 | data["number"] = 1 93 | res = await self.get_proxies_for_id(data) 94 | res["result"] = res["results"][0] 95 | del res["results"] 96 | del res["number_of_results"] 97 | return res 98 | 99 | async def get_proxies_for_id(self, data: dict) -> dict: 100 | validate_dict_must_have_key(data, "id") 101 | validate_dict_must_have_key(data, "number") 102 | number = int(data["number"]) 103 | validate_uint(number) 104 | 105 | # TODO: validate id 106 | results = [] 107 | 108 | for item in await db.execute( 109 | Proxy.raw( 110 | f"SELECT * FROM working_proxies TABLESAMPLE SYSTEM_ROWS({number});" 111 | ) 112 | ): 113 | obj = {} 114 | 115 | for field_name in settings.PROXY_PROVIDER_SERVER_API_CONFIG_FETCH_CONFIG[ 116 | "fields" 117 | ]: 118 | obj[field_name] = getattr(item, field_name) 119 | 120 | results.append(obj) 121 | 122 | return { 123 | "number_of_results": len(results), 124 | "results": results, 125 | } 126 | 127 | 128 | def validate_dict_must_have_key(data: dict, key: str): 129 | if key not in data: 130 | raise ParseError(f'please, specify the "{key}" key') 131 | 132 | 133 | def validate_letters_digits_undescores(value): 134 | if len(value) > settings.PROXY_PROVIDER_SERVER_MAXIMUM_STRING_FIELD_SIZE: 135 | raise ParseError(f'value "{value}" is too big') 
def validate_regex(value: str, regex: str):
    """
    Check that *value* is a string matching *regex*.

    :raises ParseError: when value isn't a string or doesn't match the regex
    """
    if type(value) is not str:
        raise ParseError(f'value "{value}" should be string')

    if not re.match(regex, value):
        raise ParseError(f'value "{value}" doesn\'t match to regex "{regex}"')


def validate_uint(value):
    """
    Check that *value* is a non-negative integer (zero is allowed).

    :raises ParseError: when value isn't an int or is negative
    """
    if type(value) is not int:
        raise ParseError("value should be integer")
    if value < 0:
        # BUG FIX: the check accepts zero, but the old message claimed
        # the value had to be "positive"
        raise ParseError("value should be non-negative")
"error_message": "your request doesn't look like request", 51 | } 52 | 53 | if "status_code" in response: 54 | status_code = response["status_code"] 55 | else: 56 | if response["status"] != "ok": 57 | status_code = 500 58 | else: 59 | status_code = 200 60 | 61 | response["status_code"] = status_code 62 | 63 | return aiohttp.web.json_response(response, status=status_code) 64 | -------------------------------------------------------------------------------- /server/base_app.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | import aiohttp_jinja2 4 | import jinja2 5 | from aiohttp import web 6 | 7 | from proxy_py import settings 8 | 9 | 10 | class BaseApp: 11 | def __init__(self, logger=None): 12 | self.logger = logger 13 | self._app = web.Application() 14 | 15 | aiohttp_jinja2.setup( 16 | self.app, loader=jinja2.FileSystemLoader(settings.TEMPLATES_PATH) 17 | ) 18 | 19 | async def init(self): 20 | """Call it before anything else""" 21 | await self.setup_router() 22 | await self.setup_middlewares() 23 | 24 | @abc.abstractmethod 25 | async def setup_router(self): 26 | pass 27 | 28 | async def setup_middlewares(self): 29 | pass 30 | 31 | @property 32 | def app(self): 33 | return self._app 34 | 35 | def log_critical(self, *args, **kwargs): 36 | self.log("critical", *args, **kwargs) 37 | 38 | def log_error(self, *args, **kwargs): 39 | self.log("error", *args, **kwargs) 40 | 41 | def log_exception(self, *args, **kwargs): 42 | self.log("exception", *args, **kwargs) 43 | 44 | def log_warning(self, *args, **kwargs): 45 | self.log("warning", *args, **kwargs) 46 | 47 | def log_info(self, *args, **kwargs): 48 | self.log("info", *args, **kwargs) 49 | 50 | def log_debug(self, *args, **kwargs): 51 | self.log("debug", *args, **kwargs) 52 | 53 | def log(self, level, request, message): 54 | # behind nginx or other reverse proxy 55 | client_ip = str(request.headers.get("X-Real-IP", "None")) 56 | 57 | if client_ip == "None" or 
client_ip.startswith("127.0.0.1"): 58 | self.logger.error( 59 | "Your reverse proxy doesn't present user's IP", 60 | extra={"client_ip": client_ip}, 61 | ) 62 | 63 | getattr(self.logger, level)( 64 | message, 65 | extra={ 66 | "client_ip": client_ip, 67 | }, 68 | ) 69 | -------------------------------------------------------------------------------- /server/frontend/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /server/frontend/app.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import functools 3 | import time 4 | 5 | import aiohttp_jinja2 6 | from aiohttp import web 7 | 8 | from models import ( 9 | CollectorState, 10 | NumberOfProxiesToProcess, 11 | ProcessorProxiesQueueSize, 12 | Proxy, 13 | ProxyCountItem, 14 | db, 15 | ) 16 | from proxy_py import settings 17 | from server.base_app import BaseApp 18 | 19 | 20 | def get_response_wrapper(template_name): 21 | def decorator_wrapper(func): 22 | @functools.wraps(func) 23 | @aiohttp_jinja2.template(template_name) 24 | async def wrap(self, *args, **kwargs): 25 | good_proxies_count = await db.count( 26 | Proxy.select().where(Proxy.number_of_bad_checks == 0) 27 | ) 28 | 29 | bad_proxies_count = await db.count( 30 | Proxy.select().where( 31 | Proxy.number_of_bad_checks > 0, 32 | Proxy.number_of_bad_checks < settings.DEAD_PROXY_THRESHOLD, 33 | ) 34 | ) 35 | 36 | dead_proxies_count = await db.count( 37 | Proxy.select().where( 38 | Proxy.number_of_bad_checks >= settings.DEAD_PROXY_THRESHOLD, 39 | Proxy.number_of_bad_checks < settings.DO_NOT_CHECK_ON_N_BAD_CHECKS, 40 | ) 41 | ) 42 | 43 | not_checked_proxies_count = await db.count( 44 | Proxy.select().where( 45 | Proxy.number_of_bad_checks >= settings.DO_NOT_CHECK_ON_N_BAD_CHECKS, 46 | ) 47 | ) 48 | 49 | response = { 50 | "bad_proxies_count": bad_proxies_count, 51 | 
class App(BaseApp):
    """Frontend (HTML) application: read-only dashboards over the proxy DB."""

    # history window shown on all graph pages (one week, in seconds);
    # previously this expression was duplicated in four handlers
    WEEK_SECONDS = 3600 * 24 * 7

    async def setup_router(self):
        self.app.router.add_get("/get/proxy/", self.get_proxies_html)
        self.app.router.add_get(
            "/get/proxy_count_item/", self.get_proxy_count_items_html
        )
        self.app.router.add_get(
            "/get/number_of_proxies_to_process/",
            self.get_number_of_proxies_to_process_html,
        )
        self.app.router.add_get(
            "/get/processor_proxies_queue_size/",
            self.get_processor_proxies_queue_size_html,
        )
        self.app.router.add_get("/get/collector_state/", self.get_collector_state_html)
        self.app.router.add_get("/get/best/http/proxy/", self.get_best_http_proxy)

    @get_response_wrapper("collector_state.html")
    async def get_collector_state_html(self, request):
        """Table of all collectors and their scheduling state."""
        return {
            "collector_states": list(await db.execute(CollectorState.select())),
        }

    @staticmethod
    def _elapsed(current_timestamp, since):
        """timedelta since *since*, or None when the timestamp is not set."""
        if since is None:
            return None
        return datetime.timedelta(seconds=int(current_timestamp - since))

    @get_response_wrapper("proxies.html")
    async def get_proxies_html(self, request):
        """All good proxies ordered by response time, shaped for the template."""
        proxies = list(
            await db.execute(
                Proxy.select()
                .where(Proxy.number_of_bad_checks == 0)
                .order_by(Proxy.response_time)
            )
        )
        current_timestamp = time.time()

        return {
            "proxies": [
                {
                    "address": proxy.address,
                    # divided by 1000 for display (presumably µs -> ms;
                    # TODO confirm stored unit against the checker code)
                    "response_time": proxy.response_time / 1000
                    if proxy.response_time is not None
                    else None,
                    "uptime": self._elapsed(current_timestamp, proxy.uptime),
                    "bad_uptime": self._elapsed(current_timestamp, proxy.bad_uptime),
                    "last_check_time": proxy.last_check_time,
                    "checking_period": proxy.checking_period,
                    "number_of_bad_checks": proxy.number_of_bad_checks,
                    "bad_proxy": proxy.bad_proxy,
                    "white_ipv4": proxy.white_ipv4,
                    "location": proxy.location,
                }
                for proxy in proxies
            ]
        }

    @get_response_wrapper("proxy_count_items.html")
    async def get_proxy_count_items_html(self, request):
        """Good/bad/dead proxy counts over the last week."""
        week_ago = time.time() - self.WEEK_SECONDS
        return {
            "proxy_count_items": list(
                await db.execute(
                    ProxyCountItem.select()
                    .where(ProxyCountItem.timestamp >= week_ago)
                    .order_by(ProxyCountItem.timestamp)
                )
            )
        }

    @get_response_wrapper("number_of_proxies_to_process.html")
    async def get_number_of_proxies_to_process_html(self, request):
        """Backlog of proxies awaiting a check, over the last week."""
        week_ago = time.time() - self.WEEK_SECONDS
        return {
            "number_of_proxies_to_process": list(
                await db.execute(
                    NumberOfProxiesToProcess.select()
                    .where(NumberOfProxiesToProcess.timestamp >= week_ago)
                    .order_by(NumberOfProxiesToProcess.timestamp)
                )
            )
        }

    @get_response_wrapper("processor_proxies_queue_size.html")
    async def get_processor_proxies_queue_size_html(self, request):
        """Processor queue size samples over the last week."""
        week_ago = time.time() - self.WEEK_SECONDS
        return {
            "data": list(
                await db.execute(
                    ProcessorProxiesQueueSize.select()
                    .where(ProcessorProxiesQueueSize.timestamp >= week_ago)
                    .order_by(ProcessorProxiesQueueSize.timestamp)
                )
            )
        }

    async def get_best_http_proxy(self, request):
        """Plain-text address of the fastest good http proxy.

        NOTE(review): db.get raises when no good http proxy exists, which
        surfaces as a 500 to the client — confirm whether that is intended.
        """
        best = await db.get(
            Proxy.select()
            .where(
                Proxy.number_of_bad_checks == 0,
                Proxy.raw_protocol == Proxy.PROTOCOLS.index("http"),
            )
            .order_by(Proxy.response_time)
        )

        return web.Response(text=best.address)
class ProxyProviderServer(BaseApp):
    """Root aiohttp application.

    Mounts api v1/v2 and the HTML frontend as sub-applications and installs
    request-logging and error-page middlewares.
    """

    def __init__(self, host, port):
        logger = logging.getLogger("proxy_py/server")
        level = logging.DEBUG if settings.DEBUG else logging.INFO
        logger.setLevel(level)

        logger_handler = logging.StreamHandler(sys.stdout)
        logger_handler.setLevel(level)
        logger_handler.setFormatter(logging.Formatter(settings.LOG_FORMAT_STRING))

        logger.addHandler(logger_handler)

        super().__init__(logger)

        self.host = host
        self.port = port
        # monotonically increasing id used to correlate request/response logs
        self._request_number = 0

    def start(self, loop):
        """Initialize sub-apps and block running the aiohttp server."""
        loop.run_until_complete(self.init())

        return web.run_app(self._app, host=self.host, port=self.port, loop=loop)

    async def setup_router(self):
        api_v1_app = ApiV1App(logger=self.logger)
        await api_v1_app.init()
        api_v2_app = ApiV2App(logger=self.logger)
        await api_v2_app.init()
        frontend_app = FrontendApp(logger=self.logger)
        await frontend_app.init()

        self._app.add_subapp("/api/v1/", api_v1_app.app)
        self._app.add_subapp("/api/v2/", api_v2_app.app)
        self._app.add_subapp("/i/", frontend_app.app)

    async def setup_middlewares(self):
        error_middleware = self.error_pages_handler(
            {
                404: self.handle_404,
                500: self.handle_500,
            }
        )

        self.app.middlewares.append(error_middleware)
        self.app.middlewares.append(self.logging_middleware)

    @web.middleware
    async def logging_middleware(self, request: web.Request, handler):
        """Log every request and, in ``finally``, its status or exception.

        Fix: the parameter was annotated ``aiohttp.ClientRequest`` — server
        handlers receive ``web.Request``.
        """
        self._request_number += 1

        current_request_number = self._request_number

        request_data = {
            "request_number": current_request_number,
            "method": request.method,
            "url": str(request.url),
            "user-agent": request.headers.get("User-Agent", None),
        }

        if request.body_exists:
            request_data["body"] = (await request.read()).decode()

        self.log_info(request, "-> data={}".format(json.dumps(request_data)))

        status_code = None
        exc = None

        try:
            response = await handler(request)
            status_code = response.status
        except web.web_exceptions.HTTPException as ex:
            status_code = ex.status
            exc = ex
            # bare raise preserves the original traceback ("raise ex" resets it)
            raise
        except BaseException as ex:
            exc = ex
            raise
        finally:
            self.log_info(
                request,
                "<- data={}".format(
                    json.dumps(
                        {
                            "request_number": current_request_number,
                            "status_code": status_code,
                            "exception": str(exc),
                        }
                    )
                ),
            )

        return response

    def error_pages_handler(self, overrides):
        """Build a middleware replacing responses/HTTP errors per *overrides*."""

        @web.middleware
        async def middleware(request, handler):
            try:
                response = await handler(request)
                override = overrides.get(response.status)
                if override is None:
                    return response
                else:
                    return await override(request, response)
            except aiohttp.web.HTTPException as ex:
                override = overrides.get(ex.status)
                if override is None:
                    raise
                else:
                    return await override(request, ex)

        return middleware

    async def handle_404(self, request, _):
        resp = aiohttp_jinja2.render_template("index.html", request, {})

        resp.set_status(404)

        return resp

    async def handle_500(self, *_):
        # use the already-imported web alias for consistency with the rest
        # of the class
        return web.Response(status=500, text="Server error")

Good: {{ good_proxies_count }}

21 |

Bad: {{ bad_proxies_count }}

22 |

Dead: {{ dead_proxies_count }}

23 |

Not checked: {{ not_checked_proxies_count }}

24 | {% block body %} 25 | 26 | {% endblock %} 27 | -------------------------------------------------------------------------------- /server/templates/collector_state.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% block body %} 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | {% for collector_state in collector_states %} 16 | 17 | 18 | 19 | 20 | 21 | 24 | 25 | 26 | 27 | 28 | 32 | {% endfor %} 33 | 34 |
processing_periodlast_processing_timelast_processing_proxies_countlast_processing_new_proxies_countdata
{{ collector_state.identifier }}
{{ collector_state.processing_period }} 22 | {{ collector_state.last_processing_time }} 23 | {{ collector_state.last_processing_proxies_count }}Not implemented{{ collector_state.data }}
35 | {% endblock %} 36 | -------------------------------------------------------------------------------- /server/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Go away! 6 | 16 | 17 | 18 | 19 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /server/templates/number_of_proxies_to_process.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block head %} 4 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | {% endblock %} 16 | 17 | {% block body %} 18 | 19 | 212 | 213 |
214 |
215 |
216 | {% endblock %} 217 | -------------------------------------------------------------------------------- /server/templates/processor_proxies_queue_size.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block head %} 4 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | {% endblock %} 16 | 17 | {% block body %} 18 | 19 | 84 | 85 |
86 | {% endblock %} 87 | -------------------------------------------------------------------------------- /server/templates/proxies.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | {% block body %} 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | {% for proxy in proxies %} 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 38 | {% endfor %} 39 | 40 |
addressresponse_timeuptimebad_uptimelast_check_timechecking_periodnumber_of_bad_checksbad_proxyipcitycountry
{{ proxy.address }}{{ proxy.response_time }} ms{{ proxy.uptime }}{{ proxy.bad_uptime }}{{ proxy.last_check_time }}{{ proxy.checking_period }}{{ proxy.number_of_bad_checks }}{{ proxy.bad_proxy }}{{ proxy.white_ipv4 }}{{ proxy.location['city'] }}{{ proxy.location['country_code'] }}
41 | {% endblock %} 42 | -------------------------------------------------------------------------------- /server/templates/proxy_count_items.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block head %} 4 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | {% endblock %} 16 | 17 | {% block body %} 18 | 19 | 212 | 213 |
214 |
215 |
"""A setuptools based setup module.

See:
https://packaging.python.org/en/latest/distributing.html
https://github.com/pypa/sampleproject
"""

# To use a consistent encoding
from codecs import open
from os import path

# Always prefer setuptools over distutils
from setuptools import find_packages, setup

here = path.abspath(path.dirname(__file__))

# Get the long description from the README file.
# Fix: the repository ships README.rst (there is no README.md), so reading
# "README.md" crashed every build with FileNotFoundError.
with open(path.join(here, "README.rst"), encoding="utf-8") as f:
    long_description = f.read()

setup(
    # PyPI project name; restrictions on valid names:
    # https://packaging.python.org/specifications/core-metadata/#name
    name="proxypy",  # Required
    # Versions should comply with PEP 440:
    # https://www.python.org/dev/peps/pep-0440/
    version="2.0",  # Required
    # One-line summary ("Summary" metadata field)
    description="Proxy collector",  # Required
    # Body text shown on the PyPI project page
    long_description=long_description,  # Optional
    # README.rst is reStructuredText, not Markdown
    long_description_content_type="text/x-rst",  # Optional
    # Project homepage ("Home-Page" metadata field)
    url="https://github.com/DevAlone/proxy_py",  # Optional
    author="DevAlone",  # Optional
    author_email="dev@d3d.info",  # Optional
    # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
    classifiers=[  # Optional
        "Development Status :: 5 - Production/Stable",
        "Intended Audience :: Developers",
        "Topic :: Internet",
        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.5",
        "Programming Language :: Python :: 3.6",
    ],
    # Whitespace-separated keywords shown on the project page
    keywords="proxy proxies proxy_collector aiohttp peewee_async",  # Optional
    packages=find_packages(exclude=["contrib", "docs", "tests"]),  # Required
    # Installed automatically by pip alongside this project
    install_requires=[
        "yarl",
        "aiohttp",
        "aiosocks",
        "lxml",
        "PySocks",
        "fake-useragent",
        "aiohttp_jinja2",
        "jinja2",
        "peewee-async",
        "aiopg",
    ],  # Optional
    # Installable via the extras syntax, e.g. `pip install proxypy[dev]`
    extras_require={  # Optional
        "dev": ["check-manifest"],
        "test": ["coverage"],
    },
    package_data={  # Optional
        # 'sample': ['package_data.dat'],
    },
    entry_points={  # Optional
        "console_scripts": [
            # 'proxy_py=proxy_py:main',
        ],
    },
    # "Project-URL" metadata fields rendered as links on PyPI
    project_urls={  # Optional
        "Bug Reports": "https://github.com/DevAlone/proxy_py/issues",
        "Source": "https://github.com/DevAlone/proxy_py",
    },
)
async def process_graph(model, period, func):
    """Create a new *model* sample via *func* when the current time has
    moved past the *period*-second bucket of the newest stored sample
    (or when the table is still empty)."""
    now = time.time()

    if (await db.count(model.select())) == 0:
        await func(now)
        return

    newest = await db.get(
        model.select().order_by(model.timestamp.desc()).limit(1)
    )

    # end of the period-aligned bucket containing the newest sample
    bucket_end = int(newest.timestamp // period) * period + period
    if bucket_end < now:
        await func(now)


async def create_proxy_count_item(timestamp):
    """Store one ProxyCountItem sample: counts of good, bad and dead
    proxies at *timestamp* (thresholds come from settings)."""
    good_query = Proxy.select().where(Proxy.number_of_bad_checks == 0)
    bad_query = Proxy.select().where(
        Proxy.number_of_bad_checks > 0,
        Proxy.number_of_bad_checks < settings.DEAD_PROXY_THRESHOLD,
    )
    dead_query = Proxy.select().where(
        Proxy.number_of_bad_checks >= settings.DEAD_PROXY_THRESHOLD
    )

    await db.create(
        ProxyCountItem,
        timestamp=timestamp,
        good_proxies_count=await db.count(good_query),
        bad_proxies_count=await db.count(bad_query),
        dead_proxies_count=await db.count(dead_query),
    )
async def check_complex_ordering(session, *args):
    """Return True iff the API returns proxies sorted lexicographically
    by the given field names (in order of priority).

    Bug fix: the old loop iterated ``range(1, len(fields))`` but only ever
    compared ``fields[i - 1]``, so the LAST ordering field was never
    verified (and ``field = fields[i]`` was dead code). Now every field is
    compared, moving to the next one only on a tie.
    """
    fields = args

    request_data = {"method": "get", "model": "proxy", "order_by": ", ".join(fields)}

    previous_proxy = None

    for proxy in await get_proxies(session, request_data):
        if previous_proxy is not None:
            for field in fields:
                if previous_proxy[field] > proxy[field]:
                    return False
                if previous_proxy[field] < proxy[field]:
                    # a higher-priority field already strictly orders
                    # this pair; lower-priority fields are irrelevant
                    break
                # equal on this field -> compare the next one

        previous_proxy = proxy

    return True
def check_proxy(proxy: str, should_be_valid=True):
    """Run *proxy* through the validator and assert the expected outcome.

    Raises AssertionError when a proxy that should be valid is rejected,
    or when one that should be invalid is accepted.
    """
    try:
        proxy_validator.retrieve(proxy)
    except proxy_validator.ValidationError as ex:
        if should_be_valid:
            raise AssertionError(
                "Proxy should be considered as valid. Message: {}".format(ex)
            )
        # rejected, as expected for an invalid proxy
        return

    if not should_be_valid:
        raise AssertionError("Proxy shouldn't be considered as valid")
async def process_proxy(proxy_url: str):
    """Check *proxy_url* against every supported protocol and print a
    single colored marker describing the result:

    red = not working, cyan/green/yellow/magenta = working, from fastest
    to slowest.

    Raises ValueError when the collector produced a syntactically invalid
    proxy string.
    """
    async with proxies_semaphore:
        try:
            _, auth_data, domain, port = proxy_validator.retrieve(proxy_url)
        except proxy_validator.ValidationError as ex:
            # chain the cause so the original validation error is visible
            raise ValueError(
                'Your collector returned bad proxy "{}". Message: "{}"'.format(
                    proxy_url, ex
                )
            ) from ex

        is_working = False
        # iterate protocol names directly instead of range(len(...)); also
        # build the candidate in its own variable instead of clobbering the
        # proxy_url parameter as the old code did
        for protocol in Proxy.PROTOCOLS:
            candidate_url = "{}://".format(protocol)
            if auth_data:
                candidate_url += auth_data + "@"

            candidate_url += domain + ":" + str(port)

            start_checking_time = time.time()
            # second tuple element (checker info) is unused here
            check_result, _ = await proxy_utils.check_proxy(candidate_url)
            end_checking_time = time.time()

            if check_result:
                is_working = True
                break

        # timing of the last attempted protocol (the working one, if any)
        response_time = end_checking_time - start_checking_time

        if not is_working:
            color = "red"
        elif response_time < 1:
            color = "cyan"
        elif response_time < 5:
            color = "green"
        elif response_time < 10:
            color = "yellow"
        else:
            color = "magenta"

        print(colored(" ", on_color="on_" + color), end="")

        sys.stdout.flush()