├── .gitattributes ├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── README.md ├── _config.yml ├── api ├── __init__.py └── proxyApi.py ├── db ├── __init__.py ├── dbClient.py ├── redisClient.py └── ssdbClient.py ├── docker-compose.yml ├── docs ├── Makefile ├── changelog.rst ├── conf.py ├── dev │ ├── ext_fetcher.rst │ ├── ext_validator.rst │ └── index.rst ├── index.rst ├── make.bat └── user │ ├── how_to_config.rst │ ├── how_to_run.rst │ ├── how_to_use.rst │ └── index.rst ├── fetcher ├── __init__.py └── proxyFetcher.py ├── handler ├── __init__.py ├── configHandler.py ├── logHandler.py └── proxyHandler.py ├── helper ├── __init__.py ├── check.py ├── fetch.py ├── launcher.py ├── proxy.py ├── scheduler.py └── validator.py ├── proxyList ├── http.txt ├── https.txt ├── socks4.txt └── socks5.txt ├── proxyPool.py ├── requirements.txt ├── setting.py ├── start.sh ├── test.py ├── test ├── __init__.py ├── testConfigHandler.py ├── testDbClient.py ├── testLogHandler.py ├── testProxyClass.py ├── testProxyFetcher.py ├── testProxyValidator.py ├── testRedisClient.py └── testSsdbClient.py ├── tmp ├── http.txt ├── https.txt ├── mix.txt ├── proxiesCheckPool.py ├── randomProxy.py ├── run.sh ├── socks4.txt └── socks5.txt └── util ├── __init__.py ├── lazyProperty.py ├── singleton.py ├── six.py └── webRequest.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *.pyc 3 | *.log 4 | tmp/fofa.py 5 | tmp/fofa_config.py 6 | proxyPool/ 7 | backup/dump.rdb 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.5" 5 | - "3.6" 6 | - "3.7" 7 | - "3.8" 8 | - "3.9" 9 | - "3.10" 10 | - "3.11" 11 | os: 12 | - linux 13 | install: 14 | - pip install -r requirements.txt 15 | 16 | script: python test.py 17 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6-alpine 2 | 3 | MAINTAINER jhao104 4 | 5 | WORKDIR /app 6 | 7 | COPY ./requirements.txt . 8 | 9 | # apk repository 10 | RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.ustc.edu.cn/g' /etc/apk/repositories 11 | 12 | # timezone 13 | RUN apk add -U tzdata && cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && apk del tzdata 14 | 15 | # runtime environment 16 | RUN apk add musl-dev gcc libxml2-dev libxslt-dev && \ 17 | pip install --no-cache-dir -r requirements.txt && \ 18 | apk del gcc musl-dev 19 | 20 | COPY . . 
21 | 22 | EXPOSE 5010 23 | 24 | ENTRYPOINT [ "sh", "start.sh" ] 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ProxyPool 代理IP池 3 | ======= 4 | 5 | ______ ______ _ 6 | | ___ \_ | ___ \ | | 7 | | |_/ / \__ __ __ _ __ _ | |_/ /___ ___ | | 8 | | __/| _// _ \ \ \/ /| | | || __// _ \ / _ \ | | 9 | | | | | | (_) | > < \ |_| || | | (_) | (_) || |___ 10 | \_| |_| \___/ /_/\_\ \__ |\_| \___/ \___/ \_____\ 11 | __ / / 12 | /___ / 13 | 14 | ### ProxyPool 15 | 16 |

        This proxy IP pool project was cloned from https://github.com/jhao104/proxy_pool.git and is a reworked fork of ProxyPool. Beyond the original HTTP proxies it adds support for the SOCKS4, SOCKS5 and HTTPS proxy protocols, expands and improves the proxy fetch sources, refines several features (for example the /count endpoint and daemonized server/schedule modes), fixes a number of bugs, and makes it quick to build up and continuously monitor a large pool of live, usable proxies.

17 |

        The proxy IP pool's main job is to periodically collect free proxies that are published online or added by hand, validate them before storing them, and then re-validate the stored proxies on a schedule so they stay usable. It offers both an API and a CLI, and you can plug in additional proxy sources to raise the quality and quantity of IPs in the pool.

18 | 19 | PS:本项目目的:人人可建立属于自己的高质量免费代理池。 20 | 21 | * 文档: [document](https://proxy-pool.readthedocs.io/zh/latest/) [![Documentation Status](https://readthedocs.org/projects/proxy-pool/badge/?version=latest)](https://proxy-pool.readthedocs.io/zh/latest/?badge=latest) 22 | 23 | * 支持版本: [![](https://img.shields.io/badge/Python-2.7-green.svg)](https://docs.python.org/2.7/) 24 | [![](https://img.shields.io/badge/Python-3.5-blue.svg)](https://docs.python.org/3.5/) 25 | [![](https://img.shields.io/badge/Python-3.6-blue.svg)](https://docs.python.org/3.6/) 26 | [![](https://img.shields.io/badge/Python-3.7-blue.svg)](https://docs.python.org/3.7/) 27 | [![](https://img.shields.io/badge/Python-3.8-blue.svg)](https://docs.python.org/3.8/) 28 | [![](https://img.shields.io/badge/Python-3.9-blue.svg)](https://docs.python.org/3.9/) 29 | [![](https://img.shields.io/badge/Python-3.10-blue.svg)](https://docs.python.org/3.10/) 30 | [![](https://img.shields.io/badge/Python-3.11-blue.svg)](https://docs.python.org/3.11/) 31 | 32 | ### 项目效果 33 | 34 | ![image](https://github.com/user-attachments/assets/284f6600-d634-433f-b464-c6d175382a36)![image](https://github.com/user-attachments/assets/e9c7de0b-ee25-4aa5-8fb6-4331b16b5631)![image](https://github.com/user-attachments/assets/a178d3b0-2545-4b1e-80aa-74cb557c0f14) 35 | ![image](https://github.com/user-attachments/assets/94d97304-8c12-4ed3-8d2d-af57bfc2e875) 36 | 37 | ### 运行项目 38 | 39 | ##### 下载代码: 40 | 41 | * git clone 42 | 43 | ```bash 44 | git clone git@github.com:RanSecWlx/proxyPool.git 45 | ``` 46 | 47 | * releases 48 | 49 | ```bash 50 | https://github.com/RanSecWlx/proxyPool/releases 下载对应zip文件 51 | ``` 52 | 53 | ##### 安装依赖: 54 | 55 | ```bash 56 | pip install -r requirements.txt 57 | ``` 58 | 59 | ##### 更新配置: 60 | 61 | 62 | ```python 63 | # setting.py 为项目配置文件 64 | 65 | # 配置API服务 66 | 67 | HOST = "0.0.0.0" # IP 68 | PORT = 5000 # 监听端口 69 | 70 | 71 | # 配置数据库 72 | 73 | DB_CONN = 'redis://:pwd@127.0.0.1:8888/0' 74 | 75 | 76 | # 配置 ProxyFetcher 77 | 78 | PROXY_FETCHER = [ 79 | "freeProxy01", # 这里是启用的代理抓取方法名,所有fetch方法位于fetcher/proxyFetcher.py 80 | "freeProxy02", 81 | # .... 
82 | ] 83 | 84 | # ############# proxy validator ################# 85 | # 代理验证目标网站 86 | HTTP_URL = "http://ifconfig.me/ip" 87 | HTTPS_URL = "https://ifconfig.me/ip" 88 | # 代理验证时超时时间 89 | VERIFY_TIMEOUT = 5 90 | 91 | # 近PROXY_CHECK_COUNT次校验中允许的最大失败次数,超过则剔除代理 92 | MAX_FAIL_COUNT = 0 93 | 94 | # 近PROXY_CHECK_COUNT次校验中允许的最大失败率,超过则剔除代理 95 | MAX_FAIL_RATE = 0.3 96 | 97 | # proxyCheck时代理数量少于POOL_SIZE_MIN触发抓取 98 | POOL_SIZE_MIN = 2000 99 | 100 | # getCount时,check_count_xxx_proxies 过滤条件下限,筛选出来质量比较高的代理 101 | MIN_AVAIL_LIMIT = 20 102 | 103 | # 解决代理源需fanqiang访问问题 104 | API_PROXY_CONFIG = "http://127.0.0.1:1080" 105 | 106 | # 每个proxy采集进程启动的线程数量 107 | RAW_THREADS_NUM = 50 108 | 109 | # 每个proxy检查进程启动的线程数量 110 | USE_THREADS_NUM = 10 111 | 112 | # proxy采集任务每间隔多少分钟执行一次 113 | RAW_INTERVAL_MIN = 4 114 | 115 | # proxy检查任务每间隔多少分钟执行一次 116 | USE_INTERVAL_MIN = 2 117 | 118 | # 控制 执行器(如线程池或进程池)中 最大并发执行任务的数量。 119 | THREADPOOL_WORKS_NUM = 60 120 | 121 | # 一个进程池(ProcessPoolExecutor),最多可以同时运行多少个进程 122 | PROCESSPOOL_WORKS_NUM = 5 123 | 124 | # 控制 同一个任务 在调度器中的最大实例数量 125 | JOB_INSTANCES_NUM = 30 126 | 127 | ``` 128 | 129 | #### 免费代理源 130 | 131 | 1、目前实现的采集免费代理网站有(排名不分先后, 下面仅是对其发布的免费代理情况, 付费代理测评可以参考[这里](https://zhuanlan.zhihu.com/p/33576641)): 132 | 133 | https://github.com/proxifly/free-proxy-list 134 | https://github.com/TheSpeed/SOCKS-List 135 | https://github.com/monosans/proxy-list 136 | https://proxy5.net 137 | https://api.openproxylist.xyz/ 138 | https://fineproxy.org/ 139 | https://api.proxyscrape.com/ 140 | https://www.freeproxy.world/ 141 | https://proxyelite.info 142 | https://geoxy.io/ 143 | https://proxylist.geonode.com 144 | 145 | 146 | 2、此外,还可以从网络空间测绘引擎上获取免费代理,比如FOFA引擎:
147 |     SOCKS4 proxies: protocol="socks4" && "Results:request granted";<br>
148 |     SOCKS5 proxies: protocol=="socks5" && "Version:5 Method:No Authentication(0x00)" && after="2022-02-01";<br>
149 |     HTTP proxies: server=="Mikrotik HttpProxy" && status_code=="401";<br>
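For reference, below is a minimal sketch of turning one of these FOFA queries into `host:port` candidates through FOFA's search API. The credentials are placeholders, and the endpoint and parameter names are taken from FOFA's public API documentation, so verify them against your own account and plan:

```python
# Minimal sketch: pull candidate proxies from the FOFA search API.
# FOFA_EMAIL / FOFA_KEY are placeholder credentials; the endpoint and
# parameter names follow FOFA's public API docs and should be verified.
import base64
import requests

FOFA_EMAIL = "you@example.com"
FOFA_KEY = "your-api-key"

def fofa_proxies(query, size=100):
    """Yield host:port strings matching a FOFA query."""
    qbase64 = base64.b64encode(query.encode()).decode()
    resp = requests.get(
        "https://fofa.info/api/v1/search/all",
        params={"email": FOFA_EMAIL, "key": FOFA_KEY,
                "qbase64": qbase64, "size": size, "fields": "ip,port"},
        timeout=10,
    )
    for ip, port in resp.json().get("results", []):
        yield "{}:{}".format(ip, port)

for proxy in fofa_proxies('protocol="socks4" && "Results:request granted"'):
    print(proxy)
```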
150 | 151 | 152 | 3、Running bash run.sh inside the tmp directory fetches HTTP, SOCKS4 and SOCKS5 proxies online and automatically refreshes the list files under the proxyList directory.<br>
153 | 154 | 4、You are welcome to keep extending the proxy sources configured in tmp/run.sh (a Python sketch of the same fetch-and-merge idea follows the source lists below):<br>
155 | "HTTP": [
156 | "https://raw.githubusercontent.com/B4RC0DE-TM/proxy-list/main/HTTP.txt",
157 | "https://raw.githubusercontent.com/mmpx12/proxy-list/master/http.txt",
158 | "https://api.proxyscrape.com/v2/?request=getproxies&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all",
159 | "https://api.proxyscrape.com/?request=getproxies&proxytype=https&timeout=10000&country=all&ssl=all&anonymity=all",
160 | "https://api.openproxylist.xyz/http.txt",
161 | "https://multiproxy.org/txt_all/proxy.txt",
162 | "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/http.txt",
163 | "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies_anonymous/http.txt",
164 | "https://raw.githubusercontent.com/shiftytr/proxy-list/master/proxy.txt",
165 | "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt",
166 | "https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/http.txt",
167 | "https://raw.githubusercontent.com/jetkai/proxy-list/main/online-proxies/txt/proxies-http.txt",
168 | "https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list-raw.txt",
169 | "https://raw.githubusercontent.com/sunny9577/proxy-scraper/master/proxies.txt",
170 | "https://raw.githubusercontent.com/roosterkid/openproxylist/main/HTTPS_RAW.txt",
171 | "https://raw.githubusercontent.com/opsxcq/proxy-list/master/list.txt",
172 | "https://raw.githubusercontent.com/proxy4parsing/proxy-list/main/http.txt",
173 | "https://rootjazz.com/proxies/proxies.txt",
174 | "https://spys.me/proxy.txt",
175 | "https://raw.githubusercontent.com/rdavydov/proxy-list/main/proxies/http.txt",
176 | "https://raw.githubusercontent.com/rdavydov/proxy-list/main/proxies_anonymous/http.txt",
177 | "https://raw.githubusercontent.com/zevtyardt/proxy-list/main/http.txt",
178 | "https://sunny9577.github.io/proxy-scraper/proxies.txt",
179 | "https://raw.githubusercontent.com/MuRongPIG/Proxy-Master/main/http.txt",
180 | "https://raw.githubusercontent.com/Zaeem20/FREE_PROXIES_LIST/master/http.txt",
181 | "https://raw.githubusercontent.com/prxchk/proxy-list/main/http.txt",
182 | "https://raw.githubusercontent.com/ALIILAPRO/Proxy/main/http.txt",
183 | "https://raw.githubusercontent.com/zloi-user/hideip.me/main/http.txt", 184 | "https://raw.githubusercontent.com/saisuiu/Lionkings-Http-Proxys-Proxies/main/cnfree.txt",
185 | "https://raw.githubusercontent.com/Anonym0usWork1221/Free-Proxies/main/proxy_files/http_proxies.txt",
186 | "https://raw.githubusercontent.com/officialputuid/KangProxy/KangProxy/http/http.txt",
187 | "https://raw.githubusercontent.com/ErcinDedeoglu/proxies/main/proxies/http.txt",
188 | "https://raw.githubusercontent.com/elliottophellia/yakumo/master/results/http/global/http_checked.txt",
189 | "https://raw.githubusercontent.com/ProxyScraper/ProxyScraper/refs/heads/main/http.txt",
190 | "https://raw.githubusercontent.com/proxifly/free-proxy-list/refs/heads/main/proxies/protocols/http/data.txt"
191 | ],
192 | "SOCKS4": [ 193 | "https://api.proxyscrape.com/v2/?request=displayproxies&protocol=socks4",
194 | "https://api.proxyscrape.com/?request=displayproxies&proxytype=socks4&country=all",
195 | "https://api.openproxylist.xyz/socks4.txt",
196 | "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks4.txt",
197 | "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies_anonymous/socks4.txt",
198 | "https://raw.githubusercontent.com/jetkai/proxy-list/main/online-proxies/txt/proxies-socks4.txt",
199 | "https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/socks4.txt",
200 | "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt",
201 | "https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS4_RAW.txt",
202 | "https://raw.githubusercontent.com/mmpx12/proxy-list/master/socks4.txt",
203 | "https://raw.githubusercontent.com/B4RC0DE-TM/proxy-list/main/SOCKS4.txt",
204 | "https://raw.githubusercontent.com/rdavydov/proxy-list/main/proxies/socks4.txt",
205 | "https://raw.githubusercontent.com/rdavydov/proxy-list/main/proxies_anonymous/socks4.txt",
206 | "https://raw.githubusercontent.com/zevtyardt/proxy-list/main/socks4.txt",
207 | "https://raw.githubusercontent.com/MuRongPIG/Proxy-Master/main/socks4.txt",
208 | "https://raw.githubusercontent.com/Zaeem20/FREE_PROXIES_LIST/master/socks4.txt",
209 | "https://raw.githubusercontent.com/prxchk/proxy-list/main/socks4.txt",
210 | "https://raw.githubusercontent.com/ALIILAPRO/Proxy/main/socks4.txt",
211 | "https://raw.githubusercontent.com/zloi-user/hideip.me/main/socks4.txt",
212 | "https://raw.githubusercontent.com/mmpx12/proxy-list/master/socks4.txt",
213 | "https://raw.githubusercontent.com/B4RC0DE-TM/proxy-list/main/SOCKS4.txt",
214 | "https://raw.githubusercontent.com/ErcinDedeoglu/proxies/main/proxies/socks4.txt",
215 | "https://raw.githubusercontent.com/ProxyScraper/ProxyScraper/refs/heads/main/socks4.txt",
216 | "https://raw.githubusercontent.com/proxifly/free-proxy-list/refs/heads/main/proxies/protocols/socks4/data.txt"
217 | ],
218 | "SOCKS5": [
219 | "https://raw.githubusercontent.com/B4RC0DE-TM/proxy-list/main/SOCKS5.txt",
220 | "https://raw.githubusercontent.com/mmpx12/proxy-list/master/socks5.txt",
221 | "https://api.openproxylist.xyz/socks5.txt",
222 | "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies/socks5.txt",
223 | "https://raw.githubusercontent.com/monosans/proxy-list/main/proxies_anonymous/socks5.txt",
224 | "https://raw.githubusercontent.com/ShiftyTR/Proxy-List/master/socks5.txt",
225 | "https://raw.githubusercontent.com/jetkai/proxy-list/main/online-proxies/txt/proxies-socks5.txt",
226 | "https://raw.githubusercontent.com/roosterkid/openproxylist/main/SOCKS5_RAW.txt",
227 | "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks5.txt",
228 | "https://raw.githubusercontent.com/hookzof/socks5_list/master/proxy.txt",
229 | "https://raw.githubusercontent.com/rdavydov/proxy-list/main/proxies/socks5.txt",
230 | "https://raw.githubusercontent.com/rdavydov/proxy-list/main/proxies_anonymous/socks5.txt",
231 | "https://raw.githubusercontent.com/zevtyardt/proxy-list/main/socks5.txt",
232 | "https://raw.githubusercontent.com/MuRongPIG/Proxy-Master/main/socks5.txt",
233 | "https://raw.githubusercontent.com/prxchk/proxy-list/main/socks5.txt",
234 | "https://spys.me/socks.txt",
235 | "https://raw.githubusercontent.com/zloi-user/hideip.me/main/socks5.txt",
236 | "https://raw.githubusercontent.com/ErcinDedeoglu/proxies/main/proxies/socks5.txt",
237 | "https://raw.githubusercontent.com/elliottophellia/yakumo/master/results/socks5/global/socks5_checked.txt",
238 | "https://raw.githubusercontent.com/ProxyScraper/ProxyScraper/refs/heads/main/socks5.txt",
239 | "https://raw.githubusercontent.com/proxifly/free-proxy-list/refs/heads/main/proxies/protocols/socks5/data.txt"
240 | ],
241 | "HTTPS": [
242 | "https://raw.githubusercontent.com/jetkai/proxy-list/main/online-proxies/txt/proxies-https.txt",
243 | "https://raw.githubusercontent.com/Zaeem20/FREE_PROXIES_LIST/master/https.txt",
244 | "https://api.proxyscrape.com/?request=getproxies&proxytype=https&timeout=10000&country=all&ssl=all&anonymity=all",
245 | "https://raw.githubusercontent.com/zloi-user/hideip.me/main/https.txt",
246 | "https://raw.githubusercontent.com/Anonym0usWork1221/Free-Proxies/main/proxy_files/https_proxies.txt",
247 | "https://raw.githubusercontent.com/officialputuid/KangProxy/KangProxy/https/https.txt",
248 | "https://raw.githubusercontent.com/ErcinDedeoglu/proxies/main/proxies/https.txt",
249 | "https://raw.githubusercontent.com/proxifly/free-proxy-list/refs/heads/main/proxies/protocols/https/data.txt"
250 | ]
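
As a reference, here is a minimal Python sketch of what the run.sh refresh boils down to: download each source list, merge and deduplicate the host:port entries, and write one file per protocol into proxyList/. The SOURCES dict is abbreviated; fill it with the full URL lists above:

```python
# Minimal sketch of the run.sh refresh: download each source list,
# merge + deduplicate host:port entries, write one file per protocol.
# SOURCES is abbreviated; proxyList/*.txt matches the repository layout.
import re
import requests

SOURCES = {
    "http": ["https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt"],
    "socks4": ["https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks4.txt"],
    "socks5": ["https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/socks5.txt"],
}
PROXY_RE = re.compile(r"\d{1,3}(?:\.\d{1,3}){3}:\d{1,5}")

def refresh(out_dir="proxyList"):
    for protocol, urls in SOURCES.items():
        merged = set()
        for url in urls:
            try:
                text = requests.get(url, timeout=15).text
            except requests.RequestException:
                continue  # a dead source should not abort the whole refresh
            merged.update(PROXY_RE.findall(text))
        with open("{}/{}.txt".format(out_dir, protocol), "w") as f:
            f.write("\n".join(sorted(merged)))

if __name__ == "__main__":
    refresh()
```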
251 | 252 | 如果还有其他好的免费代理网站, 可以在提交在[issues](https://github.com/RanSecWlx/proxyPool/issues/), 下次更新时会考虑在项目中支持。 253 | 254 | 255 | #### 启动项目: 256 | 257 | ```bash 258 | # 如果已经具备运行条件, 可用通过proxyPool.py启动。 259 | # 程序分为: schedule 调度程序 和 server Api服务 260 | python3 proxyPool.py 261 | Usage: proxyPool.py [OPTIONS] COMMAND [ARGS]... 262 | 263 | ProxyPool cli工具 264 | 265 | Options: 266 | --version Show the version and exit. 267 | -h, --help Show this message and exit. 268 | 269 | Commands: 270 | startScheduler 启动调度程序 271 | startServer 启动api服务 272 | stopScheduler 停止调度程序 273 | stopServer 停止api服务 274 | 275 | # 启停调度程序 276 | python3 proxyPool.py startSchedule 277 | python3 proxyPool.py stopSchedule 278 | 279 | # 启停webApi服务 280 | python3 proxyPool.py startServer 281 | python3 proxyPool.py stopServer 282 | ``` 283 | 284 | * Api 285 | 286 | 启动web服务后, 默认配置下会开启 http://127.0.0.1:5010 的api接口服务: 287 | 288 | | api | method | Description | params| 289 | | ----| ---- | ---- | ----| 290 | | / | GET | api介绍 | None | 291 | | /get | GET | 随机获取一个代理| 可选参数: `?type=https` 过滤支持https的代理| 292 | | /pop | GET | 获取并删除一个代理| 可选参数: `?type=https` 过滤支持https的代理| 293 | | /all | GET | 获取所有代理 |可选参数: `?type=https` 过滤支持https的代理| 294 | | /count | GET | 查看代理数量 |None| 295 | | /delete | GET | 删除代理 |`?proxy=host:ip`| 296 | | /clear | GET | 清空代理 |None| 297 | 298 | * 爬虫使用 299 | 300 |   如果要在爬虫代码中使用的话, 可以将此api封装成函数直接使用,例如: 301 | 302 | ```python 303 | import requests 304 | 305 | def get_proxy(): 306 | return requests.get("http://127.0.0.1:5010/get/").json() 307 | 308 | def delete_proxy(proxy): 309 | requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) 310 | 311 | # your spider code 312 | 313 | def getHtml(): 314 | # .... 315 | retry_count = 5 316 | proxy = get_proxy().get("proxy") 317 | while retry_count > 0: 318 | try: 319 | html = requests.get('http://www.example.com', proxies={"http": "http://{}".format(proxy)}) 320 | # 使用代理访问 321 | return html 322 | except Exception: 323 | retry_count -= 1 324 | # 删除代理池中代理 325 | delete_proxy(proxy) 326 | return None 327 | ``` 328 | 329 | ### 扩展代理 330 | 331 |   项目默认包含几个免费的代理获取源,但是免费的毕竟质量有限,所以如果直接运行可能拿到的代理质量不理想。所以,提供了代理获取的扩展方法。 332 | 333 |   添加一个新的代理源方法如下: 334 | 335 | * 1、首先在[ProxyFetcher](https://github.com/RanSecWlx/proxyPool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/fetcher/proxyFetcher.py#L21)类中添加自定义的获取代理的静态方法, 336 | 该方法需要以生成器(yield)形式返回`host:ip`格式的代理,例如: 337 | 338 | ```python 339 | 340 | class ProxyFetcher(object): 341 | # .... 342 | 343 | # 自定义代理源获取方法 344 | @classmethod 345 | def freeProxyCustom1(cls): # 命名不和已有重复即可 346 | 347 | # 通过某网站或者某接口或某数据库获取代理 348 | # 假设你已经拿到了一个代理列表 349 | proxies = ["x.x.x.x:3128", "x.x.x.x:80"] 350 | for proxy in proxies: 351 | yield proxy 352 | # 确保每个proxy都是 host:ip正确的格式返回 353 | ``` 354 | 355 | * 2、添加好方法后,修改[setting.py](https://github.com/RanSecWlx/proxyPool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/setting.py#L47)文件中的`PROXY_FETCHER`项: 356 | 357 |   在`PROXY_FETCHER`下添加自定义方法的名字: 358 | 359 | ```python 360 | PROXY_FETCHER = [ 361 | "freeProxy01", 362 | "freeProxy02", 363 | # .... 
364 | "freeProxyCustom1" # # 确保名字和你添加方法名字一致 365 | ] 366 | ``` 367 | 368 | 369 |   `schedule` 进程会每隔一段时间抓取一次代理,下次抓取时会自动识别调用你定义的方法。 370 | 371 | 372 | ### 问题反馈 373 | 374 |   任何问题欢迎在[Issues](https://github.com/RanSecWlx/proxyPool/issues) 中反馈。 375 | 376 |   你的反馈会让此项目变得更加完美。 377 | 378 | ### 贡献代码 379 | 380 |   这里感谢以下contributor的无私奉献: 381 | 382 |  [@jhao104](https://github.com/jhao104) | [@kangnwh](https://github.com/kangnwh) | [@bobobo80](https://github.com/bobobo80) | [@halleywj](https://github.com/halleywj) | [@newlyedward](https://github.com/newlyedward) | [@wang-ye](https://github.com/wang-ye) | [@gladmo](https://github.com/gladmo) | [@bernieyangmh](https://github.com/bernieyangmh) | [@PythonYXY](https://github.com/PythonYXY) | [@zuijiawoniu](https://github.com/zuijiawoniu) | [@netAir](https://github.com/netAir) | [@scil](https://github.com/scil) | [@tangrela](https://github.com/tangrela) | [@highroom](https://github.com/highroom) | [@luocaodan](https://github.com/luocaodan) | [@vc5](https://github.com/vc5) | [@1again](https://github.com/1again) | [@obaiyan](https://github.com/obaiyan) | [@zsbh](https://github.com/zsbh) | [@jiannanya](https://github.com/jiannanya) | [@Jerry12228](https://github.com/Jerry12228) 383 | 384 | 385 | ### Release Notes 386 | 387 | [changelog](https://github.com/RanSecWlx/proxyPool/blob/master/docs/changelog.rst) 388 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /api/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py 5 | Description : 6 | Author : 007x 7 | date: 2016/12/3 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/3: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = '007x' 14 | 15 | -------------------------------------------------------------------------------- /api/proxyApi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | """ 4 | ------------------------------------------------- 5 | File Name: ProxyApi.py 6 | Description : WebApi 7 | Author : 007x 8 | date: 2016/12/4 9 | ------------------------------------------------- 10 | Change Activity: 11 | 2016/12/04: WebApi 12 | 2019/08/14: 集成Gunicorn启动方式 13 | 2020/06/23: 新增pop接口 14 | 2022/07/21: 更新count接口 15 | ------------------------------------------------- 16 | """ 17 | __author__ = '007x' 18 | import re 19 | import platform 20 | from werkzeug.wrappers import Response 21 | from flask import Flask, jsonify, request 22 | 23 | from util.six import iteritems 24 | from helper.proxy import Proxy 25 | from handler.proxyHandler import ProxyHandler 26 | from handler.configHandler import ConfigHandler 27 | import json 28 | from collections import OrderedDict 29 | 30 | app = Flask(__name__) 31 | conf = ConfigHandler() 32 | proxy_handler = ProxyHandler() 33 | 34 | 35 | class JsonResponse(Response): 36 | @classmethod 37 | def force_type(cls, response, environ=None): 38 | if isinstance(response, (dict, list)): 39 | response = jsonify(response) 40 | 41 | return super(JsonResponse, cls).force_type(response, environ) 42 | 43 | 44 | app.response_class = JsonResponse 45 | 46 
| api_list = [ 47 | {"url": "/get", "params": "type: ''https'|''", "desc": "get a proxy"}, 48 | {"url": "/pop", "params": "", "desc": "get and delete a proxy"}, 49 | {"url": "/delete", "params": "proxy: 'e.g. 127.0.0.1:8080'", "desc": "delete an unable proxy"}, 50 | {"url": "/all", "params": "type: ''https'|''", "desc": "get all proxy from proxy pool"}, 51 | {"url": "/count", "params": "", "desc": "return proxy count"}, 52 | {"url": "/clear", "params": "", "desc": "clear proxy"} 53 | # 'refresh': 'refresh proxy pool', 54 | ] 55 | 56 | 57 | @app.after_request 58 | def apply_caching(response): 59 | response.headers['Connection'] = 'close' 60 | return response 61 | 62 | @app.route('/') 63 | def index(): 64 | return {'url': api_list} 65 | 66 | 67 | @app.route('/get/') 68 | def get(): 69 | https = request.args.get("type", "").lower() == 'https' 70 | proxy = proxy_handler.get(https) 71 | return proxy.to_dict if proxy else {"code": 0, "src": "no proxy"} 72 | 73 | 74 | @app.route('/pop/') 75 | def pop(): 76 | https = request.args.get("type", "").lower() == 'https' 77 | proxy = proxy_handler.pop(https) 78 | return proxy.to_dict if proxy else {"code": 0, "src": "no proxy"} 79 | 80 | 81 | @app.route('/refresh/') 82 | def refresh(): 83 | # TODO refresh会有守护程序定时执行,由api直接调用性能较差,暂不使用 84 | return 'success' 85 | 86 | 87 | @app.route('/all/') 88 | def getAll(): 89 | https = request.args.get("type", "").lower() == 'https' 90 | proxies = proxy_handler.getAll(https) 91 | proxies_json = [_.to_dict for _ in proxies] 92 | #print(type(proxies_json), dir(proxies_json)) 93 | # 按 value 字段倒序排序 94 | sort_proxies = sorted(proxies_json, key=lambda x: x["check_count"], reverse=True) 95 | return jsonify(sort_proxies) 96 | #return jsonify([_.to_dict for _ in proxies]) 97 | 98 | @app.route('/clear/') 99 | def clear(): 100 | status = proxy_handler.clear() 101 | return {"code": 0, "src": status} 102 | 103 | @app.route('/delete/', methods=['GET']) 104 | def delete(): 105 | proxy = request.args.get('proxy') 106 | status = proxy_handler.delete(Proxy(proxy)) 107 | return {"code": 0, "src": status} 108 | 109 | 110 | @app.route('/count/') 111 | def getCount(): 112 | proxies = proxy_handler.getAll() 113 | http_type_dict = {} 114 | source_type_dict = {} 115 | proxy_type_dict = {} 116 | check_count_dict = {} 117 | region_count_dict = {} 118 | avail_proxies_list = [] 119 | 120 | min_avail_limit = conf.minAvailLimit 121 | avild_proxies_count = 0 122 | for proxy in proxies: 123 | http_type = 'http' 124 | http_type_dict[http_type] = http_type_dict.get(http_type, 0) + 1 125 | if proxy.https is True: 126 | http_type = 'https' 127 | http_type_dict[http_type] = http_type_dict.get(http_type, 0) + 1 128 | for source in proxy.source.split('/'): 129 | source_type_dict[source] = source_type_dict.get(source, 0) + 1 130 | for proxy_type in re.split(r'[ /、,,]+', proxy.proxy_type): 131 | proxy_type_dict[proxy_type] = proxy_type_dict.get(proxy_type, 0) + 1 132 | check_count_dict[f"{(proxy.check_count//min_avail_limit)*min_avail_limit}-{((proxy.check_count//min_avail_limit)+1)*min_avail_limit-1}"] = \ 133 | check_count_dict.get(f"{(proxy.check_count//min_avail_limit)*min_avail_limit}-{((proxy.check_count//min_avail_limit)+1)*min_avail_limit-1}",0) + 1 134 | if proxy.check_count >= min_avail_limit: 135 | avail_proxies_list.append(f"{proxy.proxy_type}://{proxy.proxy}") 136 | avild_proxies_count += 1 137 | 138 | region = proxy.region.split()[0] 139 | if region not in region_count_dict: 140 | region_count_dict[region] = 1 141 | else: 142 | 
region_count_dict[region] += 1 143 | 144 | 145 | http_type_sort_dict = OrderedDict(sorted(http_type_dict.items())) 146 | proxy_type_sort_dict = OrderedDict(sorted(proxy_type_dict.items())) 147 | source_type_sort_dict = OrderedDict(sorted(source_type_dict.items(), key=lambda x: int(re.search(r'\d+', x[0]).group()))) 148 | check_count_sort_dict = dict(sorted(check_count_dict.items(), key=lambda x: int(x[0].split('-')[0]), reverse=True)) 149 | region_count_sort_dict = dict(sorted(region_count_dict.items(), key=lambda x: x[1], reverse=True)) 150 | 151 | #print(check_count_sort_dict) 152 | return json.dumps(OrderedDict([("count", len(proxies)), ("http_type", http_type_sort_dict), ("proxy_type", proxy_type_sort_dict), 153 | ("source", source_type_sort_dict), ("region", region_count_sort_dict), 154 | ("check_count", check_count_sort_dict), 155 | (f"check_count_{min_avail_limit}_count", avild_proxies_count), 156 | (f"check_count_{min_avail_limit}_proxies", avail_proxies_list)])) 157 | #return {"http_type": http_type_dict, "source": source_type_dict, "count": len(proxies), "proxy_type": proxy_type_dict, "check_count": check_count_sort_dict} 158 | 159 | def runFlask(): 160 | if platform.system() == "Windows": 161 | app.run(host=conf.serverHost, port=conf.serverPort) 162 | else: 163 | import gunicorn.app.base 164 | 165 | class StandaloneApplication(gunicorn.app.base.BaseApplication): 166 | 167 | def __init__(self, app, options=None): 168 | self.options = options or {} 169 | self.application = app 170 | super(StandaloneApplication, self).__init__() 171 | 172 | def load_config(self): 173 | _config = dict([(key, value) for key, value in iteritems(self.options) 174 | if key in self.cfg.settings and value is not None]) 175 | for key, value in iteritems(_config): 176 | self.cfg.set(key.lower(), value) 177 | 178 | def load(self): 179 | return self.application 180 | 181 | _options = { 182 | 'bind': '%s:%s' % (conf.serverHost, conf.serverPort), 183 | 'workers': 4, 184 | 'timeout': 60, 185 | 'keepalive': 0, 186 | #'accesslog': '-', # log to stdout 187 | 'accesslog': conf.serverAccessLogFile, 188 | 'errorlog': conf.serverErrorLogFile, 189 | 'access_log_format': '%(h)s %(l)s %(t)s "%(r)s" %(s)s "%(a)s"', 190 | 'daemon': True # 以 daemon 形式运行 191 | } 192 | StandaloneApplication(app, _options).run() 193 | 194 | 195 | if __name__ == '__main__': 196 | runFlask() 197 | -------------------------------------------------------------------------------- /db/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py.py 5 | Description : 6 | Author : 007x 7 | date: 2016/12/2 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/2: 11 | ------------------------------------------------- 12 | """ -------------------------------------------------------------------------------- /db/dbClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | """ 4 | ------------------------------------------------- 5 | File Name: DbClient.py 6 | Description : DB工厂类 7 | Author : 007x 8 | date: 2016/12/2 9 | ------------------------------------------------- 10 | Change Activity: 11 | 2016/12/02: DB工厂类 12 | 2020/07/03: 取消raw_proxy储存 13 | ------------------------------------------------- 14 | """ 15 | __author__ = '007x' 16 | 17 | import os 18 | import sys 19 | 20 | from 
util.six import urlparse, withMetaclass 21 | from util.singleton import Singleton 22 | 23 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 24 | 25 | 26 | class DbClient(withMetaclass(Singleton)): 27 | """ 28 | DbClient DB工厂类 提供get/put/update/pop/delete/exists/getAll/clean/getCount/changeTable方法 29 | 30 | 31 | 抽象方法定义: 32 | get(): 随机返回一个proxy; 33 | put(proxy): 存入一个proxy; 34 | pop(): 顺序返回并删除一个proxy; 35 | update(proxy): 更新指定proxy信息; 36 | delete(proxy): 删除指定proxy; 37 | exists(proxy): 判断指定proxy是否存在; 38 | getAll(): 返回所有代理; 39 | clean(): 清除所有proxy信息; 40 | getCount(): 返回proxy统计信息; 41 | changeTable(name): 切换操作对象 42 | 43 | 44 | 所有方法需要相应类去具体实现: 45 | ssdb: ssdbClient.py 46 | redis: redisClient.py 47 | mongodb: mongodbClient.py 48 | 49 | """ 50 | 51 | def __init__(self, db_conn): 52 | """ 53 | init 54 | :return: 55 | """ 56 | self.parseDbConn(db_conn) 57 | self.__initDbClient() 58 | 59 | @classmethod 60 | def parseDbConn(cls, db_conn): 61 | db_conf = urlparse(db_conn) 62 | cls.db_type = db_conf.scheme.upper().strip() 63 | cls.db_host = db_conf.hostname 64 | cls.db_port = db_conf.port 65 | cls.db_user = db_conf.username 66 | cls.db_pwd = db_conf.password 67 | cls.db_name = db_conf.path[1:] 68 | return cls 69 | 70 | def __initDbClient(self): 71 | """ 72 | init DB Client 73 | :return: 74 | """ 75 | __type = None 76 | if "SSDB" == self.db_type: 77 | __type = "ssdbClient" 78 | elif "REDIS" == self.db_type: 79 | __type = "redisClient" 80 | else: 81 | pass 82 | assert __type, 'type error, Not support DB type: {}'.format(self.db_type) 83 | self.client = getattr(__import__(__type), "%sClient" % self.db_type.title())(host=self.db_host, 84 | port=self.db_port, 85 | username=self.db_user, 86 | password=self.db_pwd, 87 | db=self.db_name) 88 | 89 | def get(self, https, **kwargs): 90 | return self.client.get(https, **kwargs) 91 | 92 | def put(self, key, **kwargs): 93 | return self.client.put(key, **kwargs) 94 | 95 | def update(self, key, value, **kwargs): 96 | return self.client.update(key, value, **kwargs) 97 | 98 | def delete(self, key, **kwargs): 99 | return self.client.delete(key, **kwargs) 100 | 101 | def exists(self, key, **kwargs): 102 | return self.client.exists(key, **kwargs) 103 | 104 | def pop(self, https, **kwargs): 105 | return self.client.pop(https, **kwargs) 106 | 107 | def getAll(self, https): 108 | return self.client.getAll(https) 109 | 110 | def clear(self): 111 | return self.client.clear() 112 | 113 | def changeTable(self, name): 114 | self.client.changeTable(name) 115 | 116 | def getCount(self): 117 | return self.client.getCount() 118 | 119 | def test(self): 120 | return self.client.test() 121 | -------------------------------------------------------------------------------- /db/redisClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ----------------------------------------------------- 4 | File Name: redisClient.py 5 | Description : 封装Redis相关操作 6 | Author : 007x 7 | date: 2019/8/9 8 | ------------------------------------------------------ 9 | Change Activity: 10 | 2019/08/09: 封装Redis相关操作 11 | 2020/06/23: 优化pop方法, 改用hscan命令 12 | 2021/05/26: 区别http/https代理 13 | ------------------------------------------------------ 14 | """ 15 | __author__ = '007x' 16 | 17 | from redis.exceptions import TimeoutError, ConnectionError, ResponseError 18 | from redis.connection import BlockingConnectionPool 19 | from handler.logHandler import LogHandler 20 | from random import choice 21 | from redis import Redis 22 | import 
json 23 | 24 | 25 | class RedisClient(object): 26 | """ 27 | Redis client 28 | 29 | Redis中代理存放的结构为hash: 30 | key为ip:port, value为代理属性的字典; 31 | 32 | """ 33 | 34 | def __init__(self, **kwargs): 35 | """ 36 | init 37 | :param host: host 38 | :param port: port 39 | :param password: password 40 | :param db: db 41 | :return: 42 | """ 43 | self.name = "" 44 | kwargs.pop("username") 45 | self.__conn = Redis(connection_pool=BlockingConnectionPool(decode_responses=True, 46 | timeout=5, 47 | socket_timeout=5, 48 | **kwargs)) 49 | 50 | def get(self, https): 51 | """ 52 | 返回一个代理 53 | :return: 54 | """ 55 | if https: 56 | items = self.__conn.hvals(self.name) 57 | proxies = list(filter(lambda x: json.loads(x).get("https"), items)) 58 | return choice(proxies) if proxies else None 59 | else: 60 | proxies = self.__conn.hkeys(self.name) 61 | proxy = choice(proxies) if proxies else None 62 | return self.__conn.hget(self.name, proxy) if proxy else None 63 | 64 | def put(self, proxy_obj): 65 | """ 66 | 将代理放入hash, 使用changeTable指定hash name 67 | :param proxy_obj: Proxy obj 68 | :return: 69 | """ 70 | data = self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) 71 | return data 72 | 73 | def pop(self, https): 74 | """ 75 | 弹出一个代理 76 | :return: dict {proxy: value} 77 | """ 78 | proxy = self.get(https) 79 | if proxy: 80 | self.__conn.hdel(self.name, json.loads(proxy).get("proxy", "")) 81 | return proxy if proxy else None 82 | 83 | def delete(self, proxy_str): 84 | """ 85 | 移除指定代理, 使用changeTable指定hash name 86 | :param proxy_str: proxy str 87 | :return: 88 | """ 89 | return self.__conn.hdel(self.name, proxy_str) 90 | 91 | def exists(self, proxy_str): 92 | """ 93 | 判断指定代理是否存在, 使用changeTable指定hash name 94 | :param proxy_str: proxy str 95 | :return: 96 | """ 97 | return self.__conn.hexists(self.name, proxy_str) 98 | 99 | def update(self, proxy_obj): 100 | """ 101 | 更新 proxy 属性 102 | :param proxy_obj: 103 | :return: 104 | """ 105 | return self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) 106 | 107 | def getAll(self, https): 108 | """ 109 | 字典形式返回所有代理, 使用changeTable指定hash name 110 | :return: 111 | """ 112 | items = self.__conn.hvals(self.name) 113 | if https: 114 | return list(filter(lambda x: json.loads(x).get("https"), items)) 115 | else: 116 | return items 117 | 118 | def clear(self): 119 | """ 120 | 清空所有代理, 使用changeTable指定hash name 121 | :return: 122 | """ 123 | return self.__conn.delete(self.name) 124 | 125 | def getCount(self): 126 | """ 127 | 返回代理数量 128 | :return: 129 | """ 130 | proxies = self.getAll(https=False) 131 | return {'total': len(proxies), 'https': len(list(filter(lambda x: json.loads(x).get("https"), proxies)))} 132 | 133 | def changeTable(self, name): 134 | """ 135 | 切换操作对象 136 | :param name: 137 | :return: 138 | """ 139 | self.name = name 140 | 141 | def test(self): 142 | log = LogHandler('redis_client') 143 | try: 144 | self.getCount() 145 | except TimeoutError as e: 146 | log.error('redis connection time out: %s' % str(e), exc_info=True) 147 | return e 148 | except ConnectionError as e: 149 | log.error('redis connection error: %s' % str(e), exc_info=True) 150 | return e 151 | except ResponseError as e: 152 | log.error('redis connection error: %s' % str(e), exc_info=True) 153 | return e 154 | 155 | 156 | -------------------------------------------------------------------------------- /db/ssdbClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | """ 4 | 
------------------------------------------------- 5 | File Name: ssdbClient.py 6 | Description : 封装SSDB操作 7 | Author : 007x 8 | date: 2016/12/2 9 | ------------------------------------------------- 10 | Change Activity: 11 | 2016/12/2: 12 | 2017/09/22: PY3中 redis-py返回的数据是bytes型 13 | 2017/09/27: 修改pop()方法 返回{proxy:value}字典 14 | 2020/07/03: 2.1.0 优化代码结构 15 | 2021/05/26: 区分http和https代理 16 | ------------------------------------------------- 17 | """ 18 | __author__ = '007x' 19 | from redis.exceptions import TimeoutError, ConnectionError, ResponseError 20 | from redis.connection import BlockingConnectionPool 21 | from handler.logHandler import LogHandler 22 | from random import choice 23 | from redis import Redis 24 | import json 25 | 26 | 27 | class SsdbClient(object): 28 | """ 29 | SSDB client 30 | 31 | SSDB中代理存放的结构为hash: 32 | key为代理的ip:por, value为代理属性的字典; 33 | """ 34 | 35 | def __init__(self, **kwargs): 36 | """ 37 | init 38 | :param host: host 39 | :param port: port 40 | :param password: password 41 | :return: 42 | """ 43 | self.name = "" 44 | kwargs.pop("username") 45 | self.__conn = Redis(connection_pool=BlockingConnectionPool(decode_responses=True, 46 | timeout=5, 47 | socket_timeout=5, 48 | **kwargs)) 49 | 50 | def get(self, https): 51 | """ 52 | 从hash中随机返回一个代理 53 | :return: 54 | """ 55 | if https: 56 | items_dict = self.__conn.hgetall(self.name) 57 | proxies = list(filter(lambda x: json.loads(x).get("https"), items_dict.values())) 58 | return choice(proxies) if proxies else None 59 | else: 60 | proxies = self.__conn.hkeys(self.name) 61 | proxy = choice(proxies) if proxies else None 62 | return self.__conn.hget(self.name, proxy) if proxy else None 63 | 64 | def put(self, proxy_obj): 65 | """ 66 | 将代理放入hash 67 | :param proxy_obj: Proxy obj 68 | :return: 69 | """ 70 | result = self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) 71 | return result 72 | 73 | def pop(self, https): 74 | """ 75 | 顺序弹出一个代理 76 | :return: proxy 77 | """ 78 | proxy = self.get(https) 79 | if proxy: 80 | self.__conn.hdel(self.name, json.loads(proxy).get("proxy", "")) 81 | return proxy if proxy else None 82 | 83 | def delete(self, proxy_str): 84 | """ 85 | 移除指定代理, 使用changeTable指定hash name 86 | :param proxy_str: proxy str 87 | :return: 88 | """ 89 | self.__conn.hdel(self.name, proxy_str) 90 | 91 | def exists(self, proxy_str): 92 | """ 93 | 判断指定代理是否存在, 使用changeTable指定hash name 94 | :param proxy_str: proxy str 95 | :return: 96 | """ 97 | return self.__conn.hexists(self.name, proxy_str) 98 | 99 | def update(self, proxy_obj): 100 | """ 101 | 更新 proxy 属性 102 | :param proxy_obj: 103 | :return: 104 | """ 105 | self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) 106 | 107 | def getAll(self, https): 108 | """ 109 | 字典形式返回所有代理, 使用changeTable指定hash name 110 | :return: 111 | """ 112 | item_dict = self.__conn.hgetall(self.name) 113 | if https: 114 | return list(filter(lambda x: json.loads(x).get("https"), item_dict.values())) 115 | else: 116 | return item_dict.values() 117 | 118 | def clear(self): 119 | """ 120 | 清空所有代理, 使用changeTable指定hash name 121 | :return: 122 | """ 123 | return self.__conn.delete(self.name) 124 | 125 | def getCount(self): 126 | """ 127 | 返回代理数量 128 | :return: 129 | """ 130 | proxies = self.getAll(https=False) 131 | return {'total': len(proxies), 'https': len(list(filter(lambda x: json.loads(x).get("https"), proxies)))} 132 | 133 | def changeTable(self, name): 134 | """ 135 | 切换操作对象 136 | :param name: 137 | :return: 138 | """ 139 | self.name = name 140 | 141 | def test(self): 142 | log 
= LogHandler('ssdb_client') 143 | try: 144 | self.getCount() 145 | except TimeoutError as e: 146 | log.error('ssdb connection time out: %s' % str(e), exc_info=True) 147 | return e 148 | except ConnectionError as e: 149 | log.error('ssdb connection error: %s' % str(e), exc_info=True) 150 | return e 151 | except ResponseError as e: 152 | log.error('ssdb connection error: %s' % str(e), exc_info=True) 153 | return e 154 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | proxy_pool: 4 | build: . 5 | container_name: proxy_pool 6 | ports: 7 | - "5010:5010" 8 | links: 9 | - proxy_redis 10 | environment: 11 | DB_CONN: "redis://@proxy_redis:6379/0" 12 | proxy_redis: 13 | image: "redis" 14 | container_name: proxy_redis -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | .. _changelog: 2 | 3 | ChangeLog 4 | ========== 5 | 2.8.0 (2024-11-26) 6 | ------------------ 7 | 8 | 1. 新增了对SOCKS4、SOCKS5、HTTPS代理协议支持; 9 | 2. 更换十余个高质量代理源; 10 | 3. 改善了部分功能,如/count请求、server和schedule模式形成守护进程等; 11 | 4. 优化修改了部分bug 12 | 13 | 2.4.2 (2024-01-18) 14 | ------------------ 15 | 16 | 1. 代理格式检查支持需认证的代理格式 `username:password@ip:port` ; (2023-03-10) 17 | 2. 新增代理源 **稻壳代理**; (2023-05-15) 18 | 3. 新增代理源 **冰凌代理**; (2023-01-18) 19 | 20 | 2.4.1 (2022-07-17) 21 | ------------------ 22 | 23 | 1. 新增代理源 **FreeProxyList**; (2022-07-21) 24 | 2. 新增代理源 **FateZero**; (2022-08-01) 25 | 3. 新增代理属性 ``region``; (2022-08-16) 26 | 27 | 2.4.0 (2021-11-17) 28 | ------------------ 29 | 30 | 1. 移除无效代理源 **神鸡代理**; (2021-11-16) 31 | 2. 移除无效代理源 **极速代理**; (2021-11-16) 32 | 3. 移除代理源 **西拉代理**; (2021-11-16) 33 | 4. 新增代理源 **蝶鸟IP**; (2021-11-16) 34 | 5. 新增代理源 **PROXY11**; (2021-11-16) 35 | 6. 多线程采集代理; (2021-11-17) 36 | 37 | 2.3.0 (2021-05-27) 38 | ------------------ 39 | 40 | 1. 修复Dockerfile时区问题; (2021-04-12) 41 | 2. 新增Proxy属性 ``source``, 标记代理来源; (2021-04-13) 42 | 3. 新增Proxy属性 ``https``, 标记支持https的代理; (2021-05-27) 43 | 44 | 2.2.0 (2021-04-08) 45 | ------------------ 46 | 47 | 1. 启动时检查数据库连通性; 48 | 2. 新增免费代理源 **米扑代理**; 49 | 3. 新增免费代理源 **Pzzqz**; 50 | 4. 新增免费代理源 **神鸡代理**; 51 | 5. 新增免费代理源 **极速代理**; 52 | 6. 新增免费代理源 **小幻代理**; 53 | 54 | 2.1.1 (2021-02-23) 55 | ------------------ 56 | 57 | 1. Fix Bug `#493`_, 新增时区配置; (2020-08-12) 58 | 2. 修复 **66代理** 采集; (2020-11-04) 59 | 3. 修复 **全网代理** 采集, 解决HTML端口加密问题; (2020-11-04) 60 | 4. 新增 **代理盒子** 免费源; (2020-11-04) 61 | 5. 
新增 ``POOL_SIZE_MIN`` 配置项, runProxyCheck时, 剩余代理少于POOL_SIZE_MIN触发抓取; (2021-02-23) 62 | 63 | .. _#493: https://github.com/jhao104/proxy_pool/issues/493 64 | 65 | 2.1.0 (2020.07) 66 | ------------------ 67 | 68 | 1. 新增免费代理源 **西拉代理** (2020-03-30) 69 | 2. Fix Bug `#356`_ `#401`_ 70 | 3. 优化Docker镜像体积; (2020-06-19) 71 | 4. 优化配置方式; 72 | 5. 优化代码结构; 73 | 6. 不再储存raw_proxy, 抓取后直接验证入库; 74 | 75 | .. _#401: https://github.com/jhao104/proxy_pool/issues/401 76 | .. _#356: https://github.com/jhao104/proxy_pool/issues/356 77 | 78 | 2.0.1 (2019.10) 79 | ----------------- 80 | 81 | 1. 新增免费代理源 **89免费代理**; 82 | #. 新增免费代理源 **齐云代理** 83 | 84 | 2.0.0 (2019.08) 85 | ------------------ 86 | 87 | 1. WebApi集成Gunicorn方式启动, Windows平台暂不支持; 88 | #. 优化Proxy调度程序; 89 | #. 扩展Proxy属性; 90 | #. 新增cli工具, 更加方便启动proxyPool 91 | 92 | 1.14 (2019.07) 93 | ----------------- 94 | 95 | 1. 修复 Queue阻塞导致的 ``ProxyValidSchedule`` 假死bug; 96 | #. 修改代理源 **云代理** 抓取; 97 | #. 修改代理源 **码农代理** 抓取; 98 | #. 修改代理源 **代理66** 抓取, 引入 ``PyExecJS`` 模块破解加速乐动态Cookies加密; 99 | 100 | 1.13 (2019.02) 101 | ----------------- 102 | 103 | 1. 使用.py文件替换.ini作为配置文件; 104 | 105 | #. 优化代理采集部分; 106 | 107 | 1.12 (2018.04) 108 | ----------------- 109 | 110 | 1. 优化代理格式检查; 111 | 112 | #. 增加代理源; 113 | 114 | #. fix bug `#122`_ `#126`_ 115 | 116 | .. _#122: https://github.com/jhao104/proxy_pool/issues/122 117 | .. _#126: https://github.com/jhao104/proxy_pool/issues/126 118 | 119 | 1.11 (2017.08) 120 | ----------------- 121 | 122 | 1. 使用多线程验证useful_pool; 123 | 124 | 1.10 (2016.11) 125 | ----------------- 126 | 127 | 1. 第一版; 128 | 129 | #. 支持PY2/PY3; 130 | 131 | #. 代理池基本功能; 132 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | import sphinx_rtd_theme 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'ProxyPool' 21 | copyright = '2020, jhao104' 22 | author = 'jhao104' 23 | 24 | master_doc = 'index' 25 | 26 | # The full version, including alpha/beta/rc tags 27 | release = '2.1.0' 28 | 29 | # -- General configuration --------------------------------------------------- 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | ] 36 | 37 | # If true, sectionauthor and moduleauthor directives will be shown in the 38 | # output. They are ignored by default. 39 | show_authors = False 40 | 41 | # The name of the Pygments (syntax highlighting) style to use. 42 | pygments_style = "sphinx" 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ['_templates'] 46 | 47 | # The language for content autogenerated by Sphinx. 
Refer to documentation 48 | # for a list of supported languages. 49 | # 50 | # This is also used if you do content translation via gettext catalogs. 51 | # Usually you set "language" from the command line for these cases. 52 | language = 'zh_CN' 53 | 54 | # List of patterns, relative to source directory, that match files and 55 | # directories to ignore when looking for source files. 56 | # This pattern also affects html_static_path and html_extra_path. 57 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 58 | 59 | # -- Options for HTML output ------------------------------------------------- 60 | 61 | # The theme to use for HTML and HTML Help pages. See the documentation for 62 | # a list of builtin themes. 63 | # 64 | html_theme = 'sphinx_rtd_theme' 65 | 66 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 67 | 68 | # Add any paths that contain custom static files (such as style sheets) here, 69 | # relative to this directory. They are copied after the builtin static files, 70 | # so a file named "default.css" will overwrite the builtin "default.css". 71 | html_static_path = ['_static'] 72 | -------------------------------------------------------------------------------- /docs/dev/ext_fetcher.rst: -------------------------------------------------------------------------------- 1 | .. ext_fetcher 2 | 3 | 扩展代理源 4 | ----------- 5 | 6 | 项目默认包含几个免费的代理获取源,但是免费的毕竟质量有限,如果直接运行可能拿到的代理质量不理想。因此提供了用户自定义扩展代理获取的方法。 7 | 8 | 如果要添加一个新的代理获取方法, 过程如下: 9 | 10 | 1. 首先在 `ProxyFetcher`_ 类中添加自定义的获取代理的静态方法,该方法需要以生成器(yield)形式返回 ``host:ip`` 格式的代理字符串, 例如: 11 | 12 | .. code-block:: python 13 | 14 | class ProxyFetcher(object): 15 | # .... 16 | # 自定义代理源获取方法 17 | @staticmethod 18 | def freeProxyCustom01(): # 命名不和已有重复即可 19 | # 通过某网站或者某接口或某数据库获取代理 20 | # 假设你已经拿到了一个代理列表 21 | proxies = ["x.x.x.x:3128", "x.x.x.x:80"] 22 | for proxy in proxies: 23 | yield proxy 24 | # 确保每个proxy都是 host:ip正确的格式返回 25 | 26 | 2. 添加好方法后,修改配置文件 `setting.py`_ 中的 ``PROXY_FETCHER`` 项, 加入刚才添加的自定义方法的名字: 27 | 28 | .. code-block:: python 29 | 30 | PROXY_FETCHER = [ 31 | # .... 32 | "freeProxyCustom01" # # 确保名字和你添加方法名字一致 33 | ] 34 | 35 | .. _ProxyFetcher: https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/fetcher/proxyFetcher.py#L20 36 | .. _setting.py: https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/setting.py#L47 -------------------------------------------------------------------------------- /docs/dev/ext_validator.rst: -------------------------------------------------------------------------------- 1 | .. ext_validator 2 | 3 | 代理校验 4 | ----------- 5 | 6 | 内置校验 7 | >>>>>>>>> 8 | 9 | 项目中使用的代理校验方法全部定义在 `validator.py`_ 中, 通过 `ProxyValidator`_ 类中提供的装饰器来区分。校验方法返回 ``True`` 表示 10 | 校验通过, 返回 ``False`` 表示校验不通过。 11 | 12 | * 代理校验方法分为三类: ``preValidator`` 、 ``httpValidator`` 、 ``httpsValidator``: 13 | 14 | * **preValidator**: 预校验,在代理抓取后验证前调用,目前实现了 `formatValidator`_ 校验代理IP格式是否合法; 15 | * **httpValidator**: 代理可用性校验,通过则认为代理可用, 目前实现了 `httpTimeOutValidator`_ 校验; 16 | * **httpsValidator**: 校验代理是否支持https,目前实现了 `httpsTimeOutValidator`_ 校验。 17 | 18 | 19 | .. _validator.py: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py 20 | .. _ProxyValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L29 21 | .. _formatValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L51 22 | .. _httpTimeOutValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L58 23 | .. 
_httpsTimeOutValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L71 24 | 25 | 每种校验可以定义多个方法,只有 **所有** 方法都返回 ``True`` 的情况下才视为该校验通过,校验方法执行顺序为: 先执行 **httpValidator** , 前者通过后再执行 **httpsValidator** 。 26 | 只有 `preValidator` 校验通过的代理才会进入可用性校验, `httpValidator` 校验通过后认为代理可用准备更新入代理池, `httpValidator` 校验通过后视为代理支持https更新代理的 `https` 属性为 `True` 。 27 | 28 | 扩展校验 29 | >>>>>>>>> 30 | 31 | 在 `validator.py`_ 已有自定义校验的示例,自定义函数需返回True或者False,使用 `ProxyValidator`_ 中提供的装饰器来区分校验类型。 下面是两个例子: 32 | 33 | * 1. 自定义一个代理可用性的校验(``addHttpValidator``): 34 | 35 | .. code-block:: python 36 | 37 | @ProxyValidator.addHttpValidator 38 | def customValidatorExample01(proxy): 39 | """自定义代理可用性校验函数""" 40 | proxies = {"http": "http://{proxy}".format(proxy=proxy)} 41 | try: 42 | r = requests.get("http://www.baidu.com/", headers=HEADER, proxies=proxies, timeout=5) 43 | return True if r.status_code == 200 and len(r.content) > 200 else False 44 | except Exception as e: 45 | return False 46 | 47 | * 2. 自定义一个代理是否支持https的校验(``addHttpsValidator``): 48 | 49 | .. code-block:: python 50 | 51 | @ProxyValidator.addHttpsValidator 52 | def customValidatorExample02(proxy): 53 | """自定义代理是否支持https校验函数""" 54 | proxies = {"https": "https://{proxy}".format(proxy=proxy)} 55 | try: 56 | r = requests.get("https://www.baidu.com/", headers=HEADER, proxies=proxies, timeout=5, verify=False) 57 | return True if r.status_code == 200 and len(r.content) > 200 else False 58 | except Exception as e: 59 | return False 60 | 61 | 注意,比如在运行代理可用性校验时,所有被 ``ProxyValidator.addHttpValidator`` 装饰的函数会被依次按定义顺序执行,只有当所有函数都返回True时才会判断代理可用。 ``HttpsValidator`` 运行机制也是如此。 62 | -------------------------------------------------------------------------------- /docs/dev/index.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | 开发指南 3 | ========= 4 | 5 | .. module:: dev 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | 10 | ext_fetcher 11 | ext_validator 12 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. ProxyPool documentation master file, created by 2 | sphinx-quickstart on Wed Jul 8 16:13:42 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | ProxyPool 7 | ===================================== 8 | 9 | :: 10 | 11 | **************************************************************** 12 | *** ______ ********************* ______ *********** _ ******** 13 | *** | ___ \_ ******************** | ___ \ ********* | | ******** 14 | *** | |_/ / \__ __ __ _ __ _ | |_/ /___ * ___ | | ******** 15 | *** | __/| _// _ \ \ \/ /| | | || __// _ \ / _ \ | | ******** 16 | *** | | | | | (_) | > < \ |_| || | | (_) | (_) || |___ **** 17 | *** \_| |_| \___/ /_/\_\ \__ |\_| \___/ \___/ \_____/ **** 18 | **** __ / / ***** 19 | ************************* /___ / ******************************* 20 | ************************* ******************************** 21 | **************************************************************** 22 | 23 | Python爬虫代理IP池 24 | 25 | 安装 26 | ----- 27 | 28 | * 下载代码 29 | 30 | .. code-block:: console 31 | 32 | $ git clone git@github.com:jhao104/proxy_pool.git 33 | 34 | * 安装依赖 35 | 36 | .. code-block:: console 37 | 38 | $ pip install -r requirements.txt 39 | 40 | * 更新配置 41 | 42 | .. 
code-block:: python 43 | 44 | HOST = "0.0.0.0" 45 | PORT = 5000 46 | 47 | DB_CONN = 'redis://@127.0.0.1:8888' 48 | 49 | PROXY_FETCHER = [ 50 | "freeProxy01", 51 | "freeProxy02", 52 | # .... 53 | ] 54 | 55 | * 启动项目 56 | 57 | .. code-block:: console 58 | 59 | $ python proxyPool.py schedule 60 | $ python proxyPool.py server 61 | 62 | 使用 63 | ______ 64 | 65 | * API 66 | 67 | ============ ======== ================ ============== 68 | Api Method Description Params 69 | ============ ======== ================ ============== 70 | / GET API介绍 无 71 | /get GET 返回一个代理 可选参数: `?type=https` 过滤支持https的代理 72 | /pop GET 返回并删除一个代理 可选参数: `?type=https` 过滤支持https的代理 73 | /all GET 返回所有代理 可选参数: `?type=https` 过滤支持https的代理 74 | /count GET 返回代理数量 无 75 | /delete GET 删除指定代理 `?proxy=host:ip` 76 | ============ ======== ================ ============== 77 | 78 | 79 | * 爬虫 80 | 81 | .. code-block:: python 82 | 83 | import requests 84 | 85 | def get_proxy(): 86 | return requests.get("http://127.0.0.1:5010/get?type=https").json() 87 | 88 | def delete_proxy(proxy): 89 | requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) 90 | 91 | # your spider code 92 | 93 | def getHtml(): 94 | # .... 95 | retry_count = 5 96 | proxy = get_proxy().get("proxy") 97 | while retry_count > 0: 98 | try: 99 | html = requests.get('https://www.example.com', proxies={"http": "http://{}".format(proxy), "https": "https://{}".format(proxy)}) 100 | # 使用代理访问 101 | return html 102 | except Exception: 103 | retry_count -= 1 104 | # 删除代理池中代理 105 | delete_proxy(proxy) 106 | return None 107 | 108 | Contents 109 | -------- 110 | 111 | .. toctree:: 112 | :maxdepth: 2 113 | 114 | user/index 115 | dev/index 116 | changelog 117 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/user/how_to_config.rst: -------------------------------------------------------------------------------- 1 | .. how_to_config 2 | 3 | 配置参考 4 | --------- 5 | 6 | 配置文件 ``setting.py`` 位于项目的主目录下, 配置主要分为四类: **服务配置** 、 **数据库配置** 、 **采集配置** 、 **校验配置**. 7 | 8 | 服务配置 9 | >>>>>>>>> 10 | 11 | * ``HOST`` 12 | 13 | API服务监听的IP, 本机访问设置为 ``127.0.0.1``, 开启远程访问设置为: ``0.0.0.0``. 14 | 15 | * ``PORT`` 16 | 17 | API服务监听的端口. 18 | 19 | 数据库配置 20 | >>>>>>>>>>> 21 | 22 | * ``DB_CONN`` 23 | 24 | 用户存放代理IP的数据库URI, 配置格式为: ``db_type://[[user]:[pwd]]@ip:port/[db]``. 25 | 26 | 目前支持的db_type有: ``ssdb`` 、 ``redis``. 27 | 28 | 配置示例: 29 | 30 | .. 
code-block:: python 30 | 31 | 32 | # SSDB IP: 127.0.0.1 Port: 8888 33 | DB_CONN = 'ssdb://@127.0.0.1:8888' 34 | # SSDB IP: 127.0.0.1 Port: 8888 Password: 123456 35 | DB_CONN = 'ssdb://:123456@127.0.0.1:8888' 36 | 37 | # Redis IP: 127.0.0.1 Port: 6379 38 | DB_CONN = 'redis://@127.0.0.1:6379' 39 | # Redis IP: 127.0.0.1 Port: 6379 Password: 123456 40 | DB_CONN = 'redis://:123456@127.0.0.1:6379' 41 | # Redis IP: 127.0.0.1 Port: 6379 Password: 123456 DB: 15 42 | DB_CONN = 'redis://:123456@127.0.0.1:6379/15' 43 | 44 | 45 | * ``TABLE_NAME`` 46 | 47 | 存放代理的数据载体名称, ssdb和redis的存放结构为hash. 48 | 49 | 采集配置 50 | >>>>>>>>> 51 | 52 | * ``PROXY_FETCHER`` 53 | 54 | 启用的代理采集方法名, 代理采集方法位于 ``fetcher/proxyFetcher.py`` 类中. 55 | 56 | 由于各个代理源的稳定性不容易掌握, 当某个代理采集方法失效时, 可以在该配置中注释掉其名称. 57 | 58 | 如果有增加某些代理采集方法, 也请在该配置中添加其方法名, 具体请参考 :doc:`/dev/ext_fetcher`. 59 | 60 | 调度程序每次执行采集任务时都会再次加载该配置, 保证每次运行的采集方法都是有效的. 61 | 62 | 校验配置 63 | >>>>>>>>> 64 | 65 | * ``HTTP_URL`` 66 | 67 | 用于检验代理是否可用的地址, 默认为 ``http://httpbin.org``, 可根据使用场景修改为其他地址. 68 | 69 | * ``HTTPS_URL`` 70 | 71 | 用于检验代理是否支持HTTPS的地址, 默认为 ``https://www.qq.com``, 可根据使用场景修改为其他地址. 72 | 73 | * ``VERIFY_TIMEOUT`` 74 | 75 | 检验代理的超时时间, 默认为 ``10`` , 单位秒. 使用代理访问 ``HTTP(S)_URL`` 耗时超过 ``VERIFY_TIMEOUT`` 时, 视为代理不可用. 76 | 77 | * ``MAX_FAIL_COUNT`` 78 | 79 | 检验代理允许最大失败次数, 默认为 ``0``, 即出错一次即删除. 80 | 81 | * ``POOL_SIZE_MIN`` 82 | 83 | 代理检测定时任务运行前若代理数量小于 ``POOL_SIZE_MIN``, 则先运行抓取程序. -------------------------------------------------------------------------------- /docs/user/how_to_run.rst: -------------------------------------------------------------------------------- 1 | .. how_to_run 2 | 3 | 4 | 如何运行 5 | --------- 6 | 7 | 下载代码 8 | >>>>>>>>> 9 | 10 | 本项目需要下载代码到本地运行, 通过 ``git`` 下载: 11 | 12 | .. code-block:: console 13 | 14 | $ git clone git@github.com:jhao104/proxy_pool.git 15 | 16 | 或者下载特定的 ``release`` 版本: 17 | 18 | .. code-block:: console 19 | 20 | https://github.com/jhao104/proxy_pool/releases 21 | 22 | 安装依赖 23 | >>>>>>>>> 24 | 25 | 到项目目录下使用 ``pip`` 安装依赖库: 26 | 27 | .. code-block:: console 28 | 29 | $ pip install -r requirements.txt 30 | 31 | 32 | 更新配置 33 | >>>>>>>>> 34 | 35 | 配置文件 ``setting.py`` 位于项目的主目录下: 36 | 37 | .. code-block:: python 38 | 39 | # 配置API服务 40 | 41 | HOST = "0.0.0.0" # IP 42 | PORT = 5000 # 监听端口 43 | 44 | # 配置数据库 45 | 46 | DB_CONN = 'redis://@127.0.0.1:8888/0' 47 | 48 | # 配置 ProxyFetcher 49 | 50 | PROXY_FETCHER = [ 51 | "freeProxy01", # 这里是启用的代理抓取方法,所有fetch方法位于fetcher/proxyFetcher.py 52 | "freeProxy02", 53 | # .... 54 | ] 55 | 56 | 更多配置请参考 :doc:`/user/how_to_config` 57 | 58 | 启动项目 59 | >>>>>>>>> 60 | 61 | 如果已配置好运行环境, 具备运行条件, 可以通过 ``proxyPool.py`` 启动. ``proxyPool.py`` 是项目的CLI入口. 62 | 完整程序包含两部分: ``scheduler`` 调度程序和 ``server`` API服务, 调度程序负责采集和验证代理, API服务提供代理服务HTTP接口. 63 | 64 | 通过命令行程序分别启动调度程序和API服务: 65 | 66 | .. code-block:: console 67 | 68 | # 启动调度程序 69 | $ python proxyPool.py startScheduler 70 | 71 | # 启动webApi服务 72 | $ python proxyPool.py startServer 73 | 74 | 对应的 ``stopScheduler`` 、 ``stopServer`` 命令用于停止相应的后台进程. 75 | 76 | -------------------------------------------------------------------------------- /docs/user/how_to_use.rst: -------------------------------------------------------------------------------- 1 | .. how_to_use 2 | 3 | 如何使用 4 | ---------- 5 | 6 | 爬虫代码要对接代理池目前有两种方式: 一是通过调用API接口使用, 二是直接读取数据库. 
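其中第二种方式(直接读取数据库)可以参考下面这个最小示例草图, 这里假设数据库为本机 6379 端口的 Redis、``TABLE_NAME`` 配置为 ``use_proxy``(均为示意假设, 实际连接参数与键名请以 ``setting.py`` 为准):

.. code-block:: python

    import json
    import random

    import redis

    # 假设: DB_CONN = 'redis://@127.0.0.1:6379/0', TABLE_NAME = 'use_proxy'
    r = redis.Redis(host="127.0.0.1", port=6379, db=0, decode_responses=True)
    all_proxies = r.hgetall("use_proxy")            # 存储结构为 hash: {代理: 代理json}
    if all_proxies:
        proxy_json = random.choice(list(all_proxies.values()))
        print(json.loads(proxy_json)["proxy"])     # 形如 ip:port

两种方式的细节分别见下面的 "调用API" 与 "读数据库" 两节.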
7 | 8 | 调用API 9 | >>>>>>>>> 10 | 11 | 启动ProxyPool的 ``server`` 后会提供如下几个http接口: 12 | 13 | ============ ======== ================ ============== 14 | Api Method Description Arg 15 | ============ ======== ================ ============== 16 | / GET API介绍 无 17 | /get GET 随机返回一个代理 无 18 | /get_all GET 返回所有代理 无 19 | /get_status GET 返回代理数量 无 20 | /delete GET 删除指定代理 proxy=host:ip 21 | ============ ======== ================ ============== 22 | 23 | 在代码中可以通过封装上面的API接口来使用代理, 例子: 24 | 25 | .. code-block:: python 26 | 27 | import requests 28 | 29 | def get_proxy(): 30 | return requests.get("http://127.0.0.1:5010/get/").json() 31 | 32 | def delete_proxy(proxy): 33 | requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) 34 | 35 | # your spider code 36 | 37 | def getHtml(): 38 | # .... 39 | retry_count = 5 40 | proxy = get_proxy().get("proxy") 41 | while retry_count > 0: 42 | try: 43 | # 使用代理访问 44 | html = requests.get('http://www.example.com', proxies={"http": "http://{}".format(proxy)}) 45 | return html 46 | except Exception: 47 | retry_count -= 1 48 | # 删除代理池中代理 49 | delete_proxy(proxy) 50 | return None 51 | 52 | 本例中我们在本地 ``127.0.0.1`` 启动端口为 ``5010`` 的 ``server``, 使用 ``/get`` 接口获取代理, ``/delete`` 删除代理. 53 | 54 | 读数据库 55 | >>>>>>>>> 56 | 57 | 目前支持配置两种数据库: ``REDIS`` 、 ``SSDB``. 58 | 59 | * **REDIS** 储存结构为 ``hash``, hash name为配置项中的 **TABLE_NAME** 60 | 61 | * **SSDB** 储存结构为 ``hash``, hash name为配置项中的 **TABLE_NAME** 62 | 63 | 可以在代码中自行读取. 64 | -------------------------------------------------------------------------------- /docs/user/index.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | 用户指南 3 | ========= 4 | 5 | .. module:: user 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | 10 | how_to_run 11 | how_to_use 12 | how_to_config 13 | -------------------------------------------------------------------------------- /fetcher/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py 5 | Description : 6 | Author : 007x 7 | date: 2016/11/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/11/25: 11 | ------------------------------------------------- 12 | """ -------------------------------------------------------------------------------- /fetcher/proxyFetcher.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: proxyFetcher 5 | Description : 6 | Author : 007x 7 | date: 2016/11/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/11/25: proxyFetcher 11 | ------------------------------------------------- 12 | """ 13 | __author__ = '007x' 14 | 15 | import os 16 | import re 17 | import json 18 | import fileinput 19 | from time import sleep 20 | from lxml import html 21 | from util.webRequest import WebRequest 22 | from lxml.etree import tostring 23 | from handler.configHandler import ConfigHandler 24 | 25 | class ProxyFetcher(object): 26 | """ 27 | proxy getter 28 | """ 29 | 30 | conf = ConfigHandler() 31 | proxyInfor = conf.apiProxyConfig 32 | if len(proxyInfor) >= 8: 33 | proxiesInfor = {"http": proxyInfor, "https": proxyInfor} 34 | else: 35 | proxiesInfor = None 36 | 37 | @classmethod 38 | def freeProxy01(cls): 39 | """ 40 | 站大爷 https://www.zdaye.com/dayProxy.html 41 | """ 42 | start_url = 
"https://www.zdaye.com/dayProxy.html" 43 | html_tree = WebRequest().get(start_url, verify=False).tree 44 | latest_page_time = html_tree.xpath("//span[@class='thread_time_info']/text()")[0].strip() 45 | from datetime import datetime 46 | interval = datetime.now() - datetime.strptime(latest_page_time, "%Y/%m/%d %H:%M:%S") 47 | if interval.seconds < 300: # 只采集5分钟内的更新 48 | target_url = "https://www.zdaye.com/" + html_tree.xpath("//h3[@class='thread_title']/a/@href")[0].strip() 49 | while target_url: 50 | _tree = WebRequest().get(target_url, verify=False).tree 51 | for tr in _tree.xpath("//table//tr"): 52 | ip = "".join(tr.xpath("./td[1]/text()")).strip() 53 | port = "".join(tr.xpath("./td[2]/text()")).strip() 54 | yield "%s:%s" % (ip, port) 55 | next_page = _tree.xpath("//div[@class='page']/a[@title='下一页']/@href") 56 | target_url = "https://www.zdaye.com/" + next_page[0].strip() if next_page else False 57 | sleep(5) 58 | 59 | @classmethod 60 | def freeProxy02(cls): 61 | """ 62 | 代理66 http://www.66ip.cn/ 63 | """ 64 | url = "http://www.66ip.cn/" 65 | resp = WebRequest().get(url, timeout=10).tree 66 | for i, tr in enumerate(resp.xpath("(//table)[3]//tr")): 67 | if i > 0: 68 | ip = "".join(tr.xpath("./td[1]/text()")).strip() 69 | port = "".join(tr.xpath("./td[2]/text()")).strip() 70 | yield "%s:%s" % (ip, port) 71 | 72 | @classmethod 73 | def freeProxy03(cls): 74 | """ 开心代理 """ 75 | target_urls = ["http://www.kxdaili.com/dailiip.html", "http://www.kxdaili.com/dailiip/2/1.html"] 76 | for url in target_urls: 77 | tree = WebRequest().get(url).tree 78 | for tr in tree.xpath("//table[@class='active']//tr")[1:]: 79 | ip = "".join(tr.xpath('./td[1]/text()')).strip() 80 | port = "".join(tr.xpath('./td[2]/text()')).strip() 81 | yield "%s:%s" % (ip, port) 82 | 83 | @classmethod 84 | def freeProxy04(cls): 85 | """ FreeProxyList https://www.freeproxylists.net/zh/ """ 86 | url = "https://www.freeproxylists.net/zh/?c=CN&pt=&pr=&a%5B%5D=0&a%5B%5D=1&a%5B%5D=2&u=50" 87 | tree = WebRequest().get(url, verify=False).tree 88 | from urllib import parse 89 | 90 | def parse_ip(input_str): 91 | html_str = parse.unquote(input_str) 92 | ips = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', html_str) 93 | return ips[0] if ips else None 94 | 95 | for tr in tree.xpath("//tr[@class='Odd']") + tree.xpath("//tr[@class='Even']"): 96 | ip = parse_ip("".join(tr.xpath('./td[1]/script/text()')).strip()) 97 | port = "".join(tr.xpath('./td[2]/text()')).strip() 98 | if ip: 99 | yield "%s:%s" % (ip, port) 100 | 101 | @classmethod 102 | def freeProxy05(cls, page_count=1): 103 | """ 快代理 https://www.kuaidaili.com """ 104 | url_pattern = [ 105 | 'https://www.kuaidaili.com/free/inha/{}/', 106 | 'https://www.kuaidaili.com/free/intr/{}/' 107 | ] 108 | url_list = [] 109 | for page_index in range(1, page_count + 1): 110 | for pattern in url_pattern: 111 | url_list.append(pattern.format(page_index)) 112 | 113 | for url in url_list: 114 | tree = WebRequest().get(url).tree 115 | proxy_list = tree.xpath('.//table//tr') 116 | sleep(1) # 必须sleep 不然第二条请求不到数据 117 | for tr in proxy_list[1:]: 118 | yield ':'.join(tr.xpath('./td/text()')[0:2]) 119 | 120 | @classmethod 121 | def freeProxy06(cls): 122 | """ 冰凌代理 https://www.binglx.cn """ 123 | url = "https://www.binglx.cn/?page=1" 124 | try: 125 | tree = WebRequest().get(url).tree 126 | proxy_list = tree.xpath('.//table//tr') 127 | for tr in proxy_list[1:]: 128 | yield ':'.join(tr.xpath('./td/text()')[0:2]) 129 | except Exception as e: 130 | print(e) 131 | 132 | @classmethod 133 | def freeProxy07(cls): 134 
| """ 云代理 """ 135 | urls = ['http://www.ip3366.net/free/?stype=1', "http://www.ip3366.net/free/?stype=2"] 136 | for url in urls: 137 | r = WebRequest().get(url, timeout=10) 138 | proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) 139 | for proxy in proxies: 140 | yield ":".join(proxy) 141 | 142 | @classmethod 143 | def freeProxy08(cls): 144 | """ 小幻代理 """ 145 | urls = ['https://ip.ihuan.me/address/5Lit5Zu9.html'] 146 | for url in urls: 147 | r = WebRequest().get(url, timeout=10) 148 | proxies = re.findall(r'>\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?(\d+)', r.text) 149 | for proxy in proxies: 150 | yield ":".join(proxy) 151 | 152 | @classmethod 153 | def freeProxy09(cls, page_count=1): 154 | """ 免费代理库 """ 155 | for i in range(1, page_count + 1): 156 | url = 'http://ip.jiangxianli.com/?country=中国&page={}'.format(i) 157 | html_tree = WebRequest().get(url, verify=False).tree 158 | for index, tr in enumerate(html_tree.xpath("//table//tr")): 159 | if index == 0: 160 | continue 161 | yield ":".join(tr.xpath("./td/text()")[0:2]).strip() 162 | 163 | @classmethod 164 | def freeProxy10(cls): 165 | """ 89免费代理 """ 166 | r = WebRequest().get("https://www.89ip.cn/index_1.html", timeout=10) 167 | proxies = re.findall( 168 | r'[\s\S]*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?[\s\S]*?[\s\S]*?(\d+)[\s\S]*?', 169 | r.text) 170 | for proxy in proxies: 171 | yield ':'.join(proxy) 172 | 173 | @classmethod 174 | def freeProxy11(cls): 175 | """ 稻壳代理 https://www.docip.net/ """ 176 | r = WebRequest().get("https://www.docip.net/data/free.json", timeout=10) 177 | try: 178 | for each in r.json['data']: 179 | yield each['ip'], "HTTP" 180 | except Exception as e: 181 | print(e) 182 | 183 | @classmethod 184 | def freeProxy12(cls): 185 | """ 稻壳代理 https://proxylist.geonode.com/api/proxy-list """ 186 | target_urls=[ 187 | 'https://proxylist.geonode.com/api/proxy-list?protocols=http&limit=500&page=1&sort_by=lastChecked&sort_type=desc', 188 | 'https://proxylist.geonode.com/api/proxy-list?protocols=socks4&limit=500&page=1&sort_by=lastChecked&sort_type=desc', 189 | 'https://proxylist.geonode.com/api/proxy-list?protocols=socks5&limit=500&page=1&sort_by=lastChecked&sort_type=desc' 190 | ] 191 | 192 | for url in target_urls: 193 | try: 194 | #r = WebRequest().get(url, proxies, verify=False, timeout=10) 195 | r = WebRequest().get(url, verify=False, timeout=10) 196 | for each in r.json['data']: 197 | yield "%s:%s" % ( each['ip'], each['port'] ), each["protocols"][0] 198 | except Exception as e: 199 | print(e) 200 | pass 201 | 202 | @classmethod 203 | def freeProxy13(cls): 204 | """ 稻壳代理 https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/ """ 205 | 206 | target_urls = [ 207 | 'https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt', 208 | 'https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks4.txt', 209 | 'https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks5.txt' 210 | ] 211 | proxy_types = [ 212 | "HTTP", 213 | "HTTPS", 214 | "SOCKS4", 215 | "SOCKS5" 216 | ] 217 | 218 | for url in target_urls: 219 | try: 220 | if url.endswith("http.txt"): 221 | proxy_type = proxy_types[0] 222 | elif url.endswith("https.txt"): 223 | proxy_type = proxy_types[1] 224 | elif url.endswith("socks4.txt"): 225 | proxy_type = proxy_types[2] 226 | elif url.endswith("socks5.txt"): 227 | proxy_type = proxy_types[3] 228 | 229 | r = WebRequest().get(url, proxies = cls.proxiesInfor, verify=False, timeout=10) 230 | for each in r.text.splitlines(): 231 | yield each, 
proxy_type 232 | except Exception as e: 233 | print(e) 234 | pass 235 | 236 | @classmethod 237 | def freeProxy96(cls): 238 | target_urls = [ 239 | 'https://106.14.14.210/proxies_status', 240 | 'http://47.243.63.109:5000/proxies_status', 241 | 'http://142.171.31.40:5010/proxies_status', 242 | ] 243 | proxy_types = [ 244 | "HTTP", 245 | "HTTPS", 246 | "SOCKS4", 247 | "SOCKS5" 248 | ] 249 | 250 | for url in target_urls: 251 | try: 252 | #r = WebRequest().get(url, verify=False, timeout=10) 253 | import requests 254 | r = requests.get(url, verify=False, timeout=10) 255 | #print(json.dumps(r.text)) 256 | if r.status_code == 200: 257 | for item in r.json()['proxies']: 258 | each = f"{item['ip']}:{item['port']}" 259 | proxy_type = item['protocol'] 260 | yield each, proxy_type 261 | else: 262 | print(url, len(r.text)) 263 | 264 | except Exception as e: 265 | print(e) 266 | pass 267 | 268 | 269 | @classmethod 270 | def freeProxy97(cls): 271 | """ http://example.com/en/ """ 272 | # URL 273 | url = 'https://geoxy.io/proxies?page=%d&count=50' 274 | 275 | # 请求头 276 | headers = { 277 | 'accept': '*/*', 278 | 'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 279 | 'authorization': 'BgPXfhUc8CAhK7wGOqzqz9m77j3sH7', 280 | 'cache-control': 'no-cache', 281 | 'pragma': 'no-cache', 282 | 'priority': 'u=1, i', 283 | 'referer': 'https://geoxy.io/', 284 | 'sec-ch-ua': '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"', 285 | 'sec-ch-ua-mobile': '?1', 286 | 'sec-ch-ua-platform': '"Android"', 287 | 'sec-fetch-dest': 'empty', 288 | 'sec-fetch-mode': 'cors', 289 | 'sec-fetch-site': 'same-origin', 290 | 'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Mobile Safari/537.36', 291 | } 292 | 293 | pageid = 1 294 | target_url = url % pageid 295 | while True: 296 | try: 297 | r = WebRequest().get(target_url, header=headers, verify=False, timeout = 10) 298 | for each in r.json: 299 | proxy_str = each['address'] 300 | proxy_type = "/".join(each['protocols']) 301 | #print(proxy_str, proxy_type) 302 | yield proxy_str, proxy_type 303 | if pageid >= 16: 304 | break 305 | pageid += 1 306 | target_url = url % pageid 307 | 308 | except Exception as e: 309 | print(e) 310 | break 311 | 312 | 313 | @classmethod 314 | def freeProxy98(cls): 315 | """ http://example.com/en/ """ 316 | target_urls=[ 317 | "http://42.192.20.108:5000/all/", 318 | "http://117.72.44.211:5010/all/", 319 | "http://124.221.144.122:5010/all/", 320 | "http://119.45.214.228:5010/all/", 321 | "http://106.52.214.84:5010/all/", 322 | "http://152.69.217.58:5010/all/", 323 | "http://47.116.207.92:5010/all/" 324 | ] 325 | 326 | for url in target_urls: 327 | try: 328 | r = WebRequest().get(url, verify=False, timeout=10) 329 | for each in r.json: 330 | #print(each['proxy'], "HTTP") 331 | yield each['proxy'], "HTTP" 332 | except Exception as e: 333 | print(e) 334 | pass 335 | 336 | @classmethod 337 | def freeProxy99(cls): 338 | """ http://example.com/en/ """ 339 | 340 | # URL 341 | url = 'https://proxyelite.info/free-proxy-list/?utm_referrer=https%3A%2F%2Fproxyelite.info%2F' 342 | 343 | # 请求头 344 | headers = { 345 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 346 | 'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', 347 | 'cache-control': 'no-cache', 348 | 'cookie': 'bhnTfg=wFEMWaiPAUCrQezVRxkpgHfnoIZKmj; _ga=GA1.1.333839908.1730476180; 
_gcl_au=1.1.2036505239.1730476192; _ym_uid=1730476195347327949; _ym_d=1730476195; wordpress_test_cookie=WP%20Cookie%20check; _ym_isad=1; _ym_visorc=w; _ga_S5PS0TSJE6=GS1.1.1730552817.3.1.1730553129.0.0.0; bhnTfg_hits=48; wFEMWaiPAUCrQezVRxkpgHfnoIZKmj=b2960ca1472f8689fc92c3b448ea3dfc-1730553131-1730553128', 349 | 'pragma': 'no-cache', 350 | 'priority': 'u=0, i', 351 | 'referer': 'https://proxyelite.info/free-proxy-list/', 352 | 'sec-ch-ua': '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"', 353 | 'sec-ch-ua-arch': '""', 354 | 'sec-ch-ua-bitness': '"64"', 355 | 'sec-ch-ua-full-version': '"130.0.6723.70"', 356 | 'sec-ch-ua-full-version-list': '"Chromium";v="130.0.6723.70", "Google Chrome";v="130.0.6723.70", "Not?A_Brand";v="99.0.0.0"', 357 | 'sec-ch-ua-mobile': '?1', 358 | 'sec-ch-ua-model': '"Nexus 5"', 359 | 'sec-ch-ua-platform': '"Android"', 360 | 'sec-ch-ua-platform-version': '"6.0"', 361 | 'sec-fetch-dest': 'document', 362 | 'sec-fetch-mode': 'navigate', 363 | 'sec-fetch-site': 'same-origin', 364 | 'sec-fetch-user': '?1', 365 | 'upgrade-insecure-requests': '1', 366 | 'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Mobile Safari/537.36', 367 | } 368 | 369 | # 请求数据 370 | data = { 371 | 'action': 'proxylister_load_more', 372 | 'nonce': '3d3e163d5f', 373 | 'page': '1', 374 | 'atts[downloads]': 'true' 375 | } 376 | 377 | over = 0 378 | pageid = 1 379 | while True: 380 | # 发送 POST 请求 381 | try: 382 | r = WebRequest().post(url, header=headers, data=data, verify=False, timeout = 10) 383 | 384 | # 打印响应内容 385 | #print(r.text) 386 | #print(r.json['data']['rows']) 387 | #_tree = html.fromstring(r.json['data']['rows']) 388 | _tree = html.fromstring(r.text) 389 | #print(tostring(_tree, pretty_print=True, encoding='unicode')) 390 | for tr in _tree.xpath("//table//tr"): 391 | ip = "".join(tr.xpath("./td[1]/text()")).strip() 392 | port = "".join(tr.xpath("./td[2]/text()")).strip() 393 | proxy_type = "".join(tr.xpath("./td[3]/text()")).strip() 394 | etstr = "".join(tr.xpath("./td[10]/text()")).strip().split() 395 | #print(len(etstr), "".join(tr.xpath("./td[10]/text()")).strip()) 396 | if len(etstr) >= 2 and "分钟" not in etstr[1]: 397 | over = 1 398 | break 399 | #print(ip, port, proxy_type) 400 | yield "%s:%s" % (ip, port), proxy_type 401 | if over == 0: 402 | if pageid > 10: 403 | break 404 | pageid += 1 405 | data['page'] = '%d' % pageid 406 | else: 407 | break 408 | except Exception as e: 409 | print(e) 410 | break 411 | 412 | @classmethod 413 | def freeProxy100(cls): 414 | """ http://example.com/en/ """ 415 | 416 | over = 0 417 | pageid = 1 418 | target_url='https://www.freeproxy.world/?type=&anonymity=&country=&speed=&port=&page=%d' % pageid 419 | while target_url: 420 | #print(target_url) 421 | r = WebRequest().get(target_url, verify=False, timeout=10) 422 | _tree = html.fromstring(r.text) 423 | #print(tostring(_tree, pretty_print=True, encoding='unicode')) 424 | for tr in _tree.xpath("//table//tr"): 425 | #print(tostring(tr, pretty_print=True, encoding='unicode')[:64]) 426 | ip = "".join(tr.xpath("./td[1]/text()")).strip() 427 | if ip == "": 428 | continue 429 | port = "".join(tr.xpath("./td[2]/a/text()")).strip() 430 | if port == "": 431 | continue 432 | proxy_type = "".join(tr.xpath("./td[6]/a/text()")).strip() 433 | if proxy_type == "": 434 | continue 435 | etstr = "".join(tr.xpath("./td[8]/text()")).strip().split() 436 | if len(etstr) >= 2 and "minutes" not in etstr[1]: 437 | over = 1 438 | break 439 
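# 补充说明: 这里的翻页终止逻辑与 freeProxy99 相同: etstr 取自该行代理的
# "最后检测时间"列, 一旦其单位不再是 minutes(说明往后都是较旧的数据),
# 就置 over=1 跳出当前页, 外层循环据此停止翻页; 否则继续逐行 yield,
# 并以 pageid > 10 作为最大翻页数的兜底上限。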
| #print(ip, port, proxy_type) 440 | yield "%s:%s" % (ip, port), proxy_type 441 | if over == 0: 442 | if pageid > 10: 443 | break 444 | pageid += 1 445 | target_url = 'https://www.freeproxy.world/?type=&anonymity=&country=&speed=&port=&page=%d' % pageid 446 | else: 447 | break 448 | 449 | 450 | @classmethod 451 | def freeProxy101(cls): 452 | """ read proxy lists from file """ 453 | 454 | filelist = [ 455 | './proxyList/http.txt', 456 | './proxyList/https.txt', 457 | './proxyList/socks4.txt', 458 | './proxyList/socks5.txt', 459 | ] 460 | 461 | proxy_types = [ 462 | "HTTP", 463 | "HTTPS", 464 | "SOCKS4", 465 | "SOCKS5" 466 | ] 467 | 468 | try: 469 | for fl in filelist: 470 | if not os.path.exists(fl): 471 | continue 472 | if os.path.getsize(fl) < 8: 473 | continue 474 | if fl.endswith("http.txt"): 475 | proxy_type = proxy_types[0] 476 | elif fl.endswith("https.txt"): 477 | proxy_type = proxy_types[1] 478 | elif fl.endswith("socks4.txt"): 479 | proxy_type = proxy_types[2] 480 | elif fl.endswith("socks5.txt"): 481 | proxy_type = proxy_types[3] 482 | 483 | for line in fileinput.input(fl): 484 | proxy = line.strip() 485 | proxy = proxy.upper().replace(f"{proxy_type}://", "") 486 | yield proxy, proxy_type 487 | 488 | 489 | except Exception as e: 490 | print(e) 491 | pass 492 | 493 | 494 | 495 | """ 496 | latest_page_time = html_tree.xpath("//span[@class='thread_time_info']/text()")[0].strip() 497 | from datetime import datetime 498 | interval = datetime.now() - datetime.strptime(latest_page_time, "%Y/%m/%d %H:%M:%S") 499 | if interval.seconds < 300: # 只采集5分钟内的更新 500 | target_url = "https://www.zdaye.com/" + html_tree.xpath("//h3[@class='thread_title']/a/@href")[0].strip() 501 | while target_url: 502 | _tree = WebRequest().get(target_url, verify=False).tree 503 | for tr in _tree.xpath("//table//tr"): 504 | ip = "".join(tr.xpath("./td[1]/text()")).strip() 505 | port = "".join(tr.xpath("./td[2]/text()")).strip() 506 | yield "%s:%s" % (ip, port) 507 | next_page = _tree.xpath("//div[@class='page']/a[@title='下一页']/@href") 508 | target_url = "https://www.zdaye.com/" + next_page[0].strip() if next_page else False 509 | sleep(5) 510 | 511 | 512 | try: 513 | for each in r.text.splitlines(): 514 | yield each, "http" 515 | except Exception as e: 516 | print(e) 517 | """ 518 | # @classmethod 519 | # def wallProxy01(cls): 520 | # """ 521 | # PzzQz https://pzzqz.com/ 522 | # """ 523 | # from requests import Session 524 | # from lxml import etree 525 | # session = Session() 526 | # try: 527 | # index_resp = session.get("https://pzzqz.com/", timeout=20, verify=False).text 528 | # x_csrf_token = re.findall('X-CSRFToken": "(.*?)"', index_resp) 529 | # if x_csrf_token: 530 | # data = {"http": "on", "ping": "3000", "country": "cn", "ports": ""} 531 | # proxy_resp = session.post("https://pzzqz.com/", verify=False, 532 | # headers={"X-CSRFToken": x_csrf_token[0]}, json=data).json() 533 | # tree = etree.HTML(proxy_resp["proxy_html"]) 534 | # for tr in tree.xpath("//tr"): 535 | # ip = "".join(tr.xpath("./td[1]/text()")) 536 | # port = "".join(tr.xpath("./td[2]/text()")) 537 | # yield "%s:%s" % (ip, port) 538 | # except Exception as e: 539 | # print(e) 540 | 541 | # @classmethod 542 | # def freeProxy10(cls): 543 | # """ 544 | # 墙外网站 cn-proxy 545 | # :return: 546 | # """ 547 | # urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'] 548 | # request = WebRequest() 549 | # for url in urls: 550 | # r = request.get(url, timeout=10) 551 | # proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W](\d+)', 
r.text) 552 | # for proxy in proxies: 553 | # yield ':'.join(proxy) 554 | 555 | # @classmethod 556 | # def freeProxy11(cls): 557 | # """ 558 | # https://proxy-list.org/english/index.php 559 | # :return: 560 | # """ 561 | # urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)] 562 | # request = WebRequest() 563 | # import base64 564 | # for url in urls: 565 | # r = request.get(url, timeout=10) 566 | # proxies = re.findall(r"Proxy\('(.*?)'\)", r.text) 567 | # for proxy in proxies: 568 | # yield base64.b64decode(proxy).decode() 569 | 570 | # @classmethod 571 | # def freeProxy12(cls): 572 | # urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1'] 573 | # request = WebRequest() 574 | # for url in urls: 575 | # r = request.get(url, timeout=10) 576 | # proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) 577 | # for proxy in proxies: 578 | # yield ':'.join(proxy) 579 | 580 | 581 | if __name__ == '__main__': 582 | p = ProxyFetcher() 583 | for each in p.freeProxy96(): 584 | print(each) 585 | 586 | # http://nntime.com/proxy-list-01.htm 587 | -------------------------------------------------------------------------------- /handler/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py 5 | Description : 6 | Author : 007x 7 | date: 2016/12/3 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/3: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = '007x' 14 | 15 | # from handler.ProxyManager import ProxyManager 16 | -------------------------------------------------------------------------------- /handler/configHandler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: configHandler 5 | Description : 6 | Author : 007x 7 | date: 2020/6/22 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2020/6/22: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = '007x' 14 | 15 | import os 16 | import setting 17 | from util.singleton import Singleton 18 | from util.lazyProperty import LazyProperty 19 | from util.six import reload_six, withMetaclass 20 | 21 | 22 | class ConfigHandler(withMetaclass(Singleton)): 23 | 24 | def __init__(self): 25 | pass 26 | 27 | @LazyProperty 28 | def serverHost(self): 29 | return os.environ.get("HOST", setting.HOST) 30 | 31 | @LazyProperty 32 | def serverPort(self): 33 | return os.environ.get("PORT", setting.PORT) 34 | 35 | @LazyProperty 36 | def dbConn(self): 37 | return os.getenv("DB_CONN", setting.DB_CONN) 38 | 39 | @LazyProperty 40 | def tableName(self): 41 | return os.getenv("TABLE_NAME", setting.TABLE_NAME) 42 | 43 | @property 44 | def fetchers(self): 45 | reload_six(setting) 46 | return setting.PROXY_FETCHER 47 | 48 | @LazyProperty 49 | def httpUrl(self): 50 | return os.getenv("HTTP_URL", setting.HTTP_URL) 51 | 52 | @LazyProperty 53 | def httpsUrl(self): 54 | return os.getenv("HTTPS_URL", setting.HTTPS_URL) 55 | 56 | @LazyProperty 57 | def serverPidFile(self): 58 | return os.getenv("SERVER_PIDFILE", setting.SERVER_PIDFILE) 59 | 60 | @LazyProperty 61 | def serverAccessLogFile(self): 62 | return os.getenv("SERVER_ACCESS_LOGFILE", setting.SERVER_ACCESS_LOGFILE) 63 | 64 | @LazyProperty 65 | def 
serverErrorLogFile(self): 66 | return os.getenv("SERVER_ERROR_LOGFILE", setting.SERVER_ERROR_LOGFILE) 67 | 68 | @LazyProperty 69 | def schedulerPidFile(self): 70 | return os.getenv("SCHEDULER_PIDFILE", setting.SCHEDULER_PIDFILE) 71 | 72 | @LazyProperty 73 | def verifyTimeout(self): 74 | return int(os.getenv("VERIFY_TIMEOUT", setting.VERIFY_TIMEOUT)) 75 | 76 | @LazyProperty 77 | def proxyCheckCount(self): 78 | return int(os.getenv("PROXY_CHECK_COUNT", setting.PROXY_CHECK_COUNT)) 79 | 80 | @LazyProperty 81 | def maxFailCount(self): 82 | return int(os.getenv("MAX_FAIL_COUNT", setting.MAX_FAIL_COUNT)) 83 | 84 | @LazyProperty 85 | def maxFailRate(self): 86 | return float(os.getenv("MAX_FAIL_RATE", setting.MAX_FAIL_RATE)) 87 | 88 | @LazyProperty 89 | def minAvailLimit(self): 90 | return int(os.getenv("MIN_AVAIL_LIMIT", setting.MIN_AVAIL_LIMIT)) 91 | 92 | @LazyProperty 93 | def poolSizeMin(self): 94 | return int(os.getenv("POOL_SIZE_MIN", setting.POOL_SIZE_MIN)) 95 | 96 | @LazyProperty 97 | def proxyRegion(self): 98 | return bool(os.getenv("PROXY_REGION", setting.PROXY_REGION)) 99 | 100 | @LazyProperty 101 | def apiProxyConfig(self): 102 | return os.getenv("API_PROXY_CONFIG", setting.API_PROXY_CONFIG) 103 | 104 | @LazyProperty 105 | def rawThreadsNum(self): 106 | return os.getenv("RAW_THREADS_NUM", setting.RAW_THREADS_NUM) 107 | 108 | @LazyProperty 109 | def useThreadsNum(self): 110 | return os.getenv("USE_THREADS_NUM", setting.USE_THREADS_NUM) 111 | 112 | @LazyProperty 113 | def rawIntervalMin(self): 114 | return os.getenv("RAW_INTERVAL_MIN", setting.RAW_INTERVAL_MIN) 115 | 116 | @LazyProperty 117 | def useIntervalMin(self): 118 | return os.getenv("USE_INTERVAL_MIN", setting.USE_INTERVAL_MIN) 119 | 120 | @LazyProperty 121 | def threadPoolWorksNum(self): 122 | return os.getenv("THREADPOOL_WORKS_NUM", setting.THREADPOOL_WORKS_NUM) 123 | 124 | @LazyProperty 125 | def processPoolWorksNum(self): 126 | return os.getenv("PROCESSPOOL_WORKS_NUM", setting.PROCESSPOOL_WORKS_NUM) 127 | 128 | @LazyProperty 129 | def jobInstancesNum(self): 130 | return os.getenv("JOB_INSTANCES_NUM", setting.JOB_INSTANCES_NUM) 131 | 132 | @LazyProperty 133 | def timezone(self): 134 | return os.getenv("TIMEZONE", setting.TIMEZONE) 135 | 136 | -------------------------------------------------------------------------------- /handler/logHandler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: LogHandler.py 5 | Description : 日志操作模块 6 | Author : 007x 7 | date: 2017/3/6 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2017/03/06: log handler 11 | 2017/09/21: 屏幕输出/文件输出 可选(默认屏幕和文件均输出) 12 | 2020/07/13: Windows下TimedRotatingFileHandler线程不安全, 不再使用 13 | ------------------------------------------------- 14 | """ 15 | __author__ = '007x' 16 | 17 | import os 18 | import logging 19 | import platform 20 | 21 | from logging.handlers import TimedRotatingFileHandler 22 | 23 | # 日志级别 24 | CRITICAL = 50 25 | FATAL = CRITICAL 26 | ERROR = 40 27 | WARNING = 30 28 | WARN = WARNING 29 | INFO = 20 30 | DEBUG = 10 31 | NOTSET = 0 32 | 33 | CURRENT_PATH = os.path.dirname(os.path.abspath(__file__)) 34 | ROOT_PATH = os.path.join(CURRENT_PATH, os.pardir) 35 | LOG_PATH = os.path.join(ROOT_PATH, 'log') 36 | 37 | if not os.path.exists(LOG_PATH): 38 | try: 39 | os.mkdir(LOG_PATH) 40 | except FileExistsError: 41 | pass 42 | 43 | 44 | class LogHandler(logging.Logger): 45 | """ 46 | LogHandler 47 
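用法示意(即文件末尾 __main__ 中的用法, 假设在项目根目录下运行)::

        log = LogHandler('test')   # 输出到屏幕; 非 Windows 平台同时写入 log/test.log
        log.info('this is a test msg')

    注: Windows 下 TimedRotatingFileHandler 线程不安全, 因此仅保留屏幕输出。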
| """ 48 | 49 | def __init__(self, name, level=DEBUG, stream=True, file=True): 50 | self.name = name 51 | self.level = level 52 | logging.Logger.__init__(self, self.name, level=level) 53 | if stream: 54 | self.__setStreamHandler__() 55 | if file: 56 | if platform.system() != "Windows": 57 | self.__setFileHandler__() 58 | 59 | def __setFileHandler__(self, level=None): 60 | """ 61 | set file handler 62 | :param level: 63 | :return: 64 | """ 65 | file_name = os.path.join(LOG_PATH, '{name}.log'.format(name=self.name)) 66 | # 设置日志回滚, 保存在log目录, 一天保存一个文件, 保留15天 67 | file_handler = TimedRotatingFileHandler(filename=file_name, when='D', interval=1, backupCount=15) 68 | file_handler.suffix = '%Y%m%d.log' 69 | if not level: 70 | file_handler.setLevel(self.level) 71 | else: 72 | file_handler.setLevel(level) 73 | formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s') 74 | 75 | file_handler.setFormatter(formatter) 76 | self.file_handler = file_handler 77 | self.addHandler(file_handler) 78 | 79 | def __setStreamHandler__(self, level=None): 80 | """ 81 | set stream handler 82 | :param level: 83 | :return: 84 | """ 85 | stream_handler = logging.StreamHandler() 86 | formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s') 87 | stream_handler.setFormatter(formatter) 88 | if not level: 89 | stream_handler.setLevel(self.level) 90 | else: 91 | stream_handler.setLevel(level) 92 | self.addHandler(stream_handler) 93 | 94 | 95 | if __name__ == '__main__': 96 | log = LogHandler('test') 97 | log.info('this is a test msg') 98 | -------------------------------------------------------------------------------- /handler/proxyHandler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: ProxyHandler.py 5 | Description : 6 | Author : 007x 7 | date: 2016/12/3 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/03: 11 | 2020/05/26: 区分http和https 12 | ------------------------------------------------- 13 | """ 14 | __author__ = '007x' 15 | 16 | from helper.proxy import Proxy 17 | from db.dbClient import DbClient 18 | from handler.configHandler import ConfigHandler 19 | 20 | 21 | class ProxyHandler(object): 22 | """ Proxy CRUD operator""" 23 | 24 | def __init__(self): 25 | self.conf = ConfigHandler() 26 | self.db = DbClient(self.conf.dbConn) 27 | self.db.changeTable(self.conf.tableName) 28 | 29 | def get(self, https=False): 30 | """ 31 | return a proxy 32 | Args: 33 | https: True/False 34 | Returns: 35 | """ 36 | proxy = self.db.get(https) 37 | return Proxy.createFromJson(proxy) if proxy else None 38 | 39 | def update(self, key, value, **kwargs): 40 | """ 41 | upate a proxy 42 | """ 43 | self.db.update(key, value, **kwargs) 44 | 45 | def pop(self, https): 46 | """ 47 | return and delete a useful proxy 48 | :return: 49 | """ 50 | proxy = self.db.pop(https) 51 | if proxy: 52 | return Proxy.createFromJson(proxy) 53 | return None 54 | 55 | def put(self, proxy): 56 | """ 57 | put proxy into use proxy 58 | :return: 59 | """ 60 | self.db.put(proxy) 61 | 62 | def delete(self, proxy): 63 | """ 64 | delete useful proxy 65 | :param proxy: 66 | :return: 67 | """ 68 | return self.db.delete(proxy.proxy) 69 | 70 | def clear(self): 71 | """ 72 | clear proxy 73 | """ 74 | return self.db.clear() 75 | 76 | def getAll(self, https=False): 77 | """ 78 | get all proxy from pool as Proxy list 79 
| :return: 80 | """ 81 | proxies = self.db.getAll(https) 82 | return [Proxy.createFromJson(_) for _ in proxies] 83 | 84 | def exists(self, proxy): 85 | """ 86 | check proxy exists 87 | :param proxy: 88 | :return: 89 | """ 90 | return self.db.exists(proxy.proxy) 91 | 92 | def getCount(self): 93 | """ 94 | return raw_proxy and use_proxy count 95 | :return: 96 | """ 97 | total_use_proxy = self.db.getCount() 98 | return {'count': total_use_proxy} 99 | -------------------------------------------------------------------------------- /helper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RanSecWlx/proxyPool/094f763164b4e821b2f41f9153cb4e2b3f9945ec/helper/__init__.py -------------------------------------------------------------------------------- /helper/check.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: check 5 | Description : 执行代理校验 6 | Author : 007x 7 | date: 2019/8/6 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/08/06: 执行代理校验 11 | 2021/05/25: 分别校验http和https 12 | 2022/08/16: 获取代理Region信息 13 | ------------------------------------------------- 14 | """ 15 | __author__ = '007x' 16 | 17 | from util.six import Empty 18 | from threading import Thread 19 | from datetime import datetime 20 | from util.webRequest import WebRequest 21 | from handler.logHandler import LogHandler 22 | from helper.validator import ProxyValidator 23 | from handler.proxyHandler import ProxyHandler 24 | from handler.configHandler import ConfigHandler 25 | 26 | class DoValidator(object): 27 | """ 执行校验 """ 28 | 29 | conf = ConfigHandler() 30 | 31 | @classmethod 32 | def validator(cls, proxy, work_type): 33 | """ 34 | 校验入口 35 | Args: 36 | proxy: Proxy Object 37 | work_type: raw/use 38 | Returns: 39 | Proxy Object 40 | """ 41 | http_r = cls.httpValidator(proxy) 42 | https_r = False if not http_r else cls.httpsValidator(proxy) 43 | #print(cls.conf.maxFailRate) 44 | proxy.check_count += 1 45 | proxy.last_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 46 | proxy.last_status = True if http_r else False 47 | if http_r: 48 | #if proxy.fail_count > 0: 49 | # proxy.fail_count -= 1 50 | proxy.https = True if https_r else False 51 | if work_type == "raw": 52 | proxy.region = cls.regionGetter(proxy) if cls.conf.proxyRegion else "" 53 | else: 54 | proxy.fail_count += 1 55 | return proxy 56 | 57 | @classmethod 58 | def httpValidator(cls, proxy): 59 | for func in ProxyValidator.http_validator: 60 | if not func(proxy): 61 | return False 62 | return True 63 | 64 | @classmethod 65 | def httpsValidator(cls, proxy): 66 | for func in ProxyValidator.https_validator: 67 | if not func(proxy): 68 | return False 69 | return True 70 | 71 | @classmethod 72 | def preValidator(cls, proxy): 73 | for func in ProxyValidator.pre_validator: 74 | if not func(proxy): 75 | return False 76 | return True 77 | 78 | @classmethod 79 | def regionGetter(cls, proxy): 80 | try: 81 | 82 | url = 'https://searchplugin.csdn.net/api/v1/ip/get?ip=%s' % proxy.proxy.split(':')[0] 83 | r = WebRequest().get(url=url, retry_time=1, timeout=2).json 84 | return r['data']['address'] 85 | except: 86 | return 'error' 87 | 88 | 89 | class _ThreadChecker(Thread): 90 | """ 多线程检测 """ 91 | 92 | def __init__(self, work_type, target_queue, thread_name): 93 | Thread.__init__(self, name=thread_name) 94 | self.work_type = work_type 95 | 
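# 淘汰策略说明(对应下方 __ifUse, 其中 MAX_FAIL_RATE=0.6 仅为举例假设):
# 每次校验 check_count += 1, 失败时再 fail_count += 1;
# 当 check_count >= 5 且 fail_count / check_count >= MAX_FAIL_RATE 时删除代理,
# 例如: 5 次校验失败 3 次, 3/5 = 0.6 >= 0.6, 删除;
#       5 次校验失败 2 次, 2/5 = 0.4 < 0.6, 保留并写回代理池。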
self.log = LogHandler("checker") 96 | self.proxy_handler = ProxyHandler() 97 | self.target_queue = target_queue 98 | self.conf = ConfigHandler() 99 | 100 | def run(self): 101 | #self.log.info("{}ProxyCheck - {}: start".format(self.work_type.title(), self.name)) 102 | while True: 103 | try: 104 | proxy = self.target_queue.get(block=False) 105 | except Empty: 106 | #self.log.info("{}ProxyCheck - {}: complete".format(self.work_type.title(), self.name)) 107 | break 108 | 109 | if self.work_type == "raw" and self.proxy_handler.exists(proxy): 110 | continue 111 | 112 | proxy = DoValidator.validator(proxy, self.work_type) 113 | if self.work_type == "raw": 114 | self.__ifRaw(proxy) 115 | else: 116 | self.__ifUse(proxy) 117 | self.target_queue.task_done() 118 | 119 | def __ifRaw(self, proxy): 120 | if proxy.last_status: 121 | if self.proxy_handler.exists(proxy): 122 | self.log.info('RawProxyCheck - {}: {} {} {} exist'.format(self.name, proxy.proxy.ljust(23), proxy.proxy_type, proxy.source)) 123 | else: 124 | self.log.info('RawProxyCheck - {}: {} {} {} pass'.format(self.name, proxy.proxy.ljust(23), proxy.proxy_type, proxy.source)) 125 | proxy.create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 126 | self.proxy_handler.put(proxy) 127 | #else: 128 | # self.log.info('RawProxyCheck - {}: {} {} {} fail'.format(self.name, proxy.proxy.ljust(23), proxy.proxy_type, proxy.source)) 129 | 130 | def __ifUse(self, proxy): 131 | if proxy.last_status: 132 | self.log.info('UseProxyCheck - {}: {} {} {} pass'.format(self.name, proxy.proxy.ljust(23), proxy.proxy_type, proxy.source)) 133 | #proxy.create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 134 | self.proxy_handler.put(proxy) 135 | else: 136 | fail_rate = proxy.fail_count / proxy.check_count 137 | #print(fail_rate, self.conf.maxFailRate, proxy.fail_count, proxy.check_count) 138 | if proxy.check_count >= 5 and fail_rate >= self.conf.maxFailRate: 139 | #if proxy.fail_count > self.conf.maxFailCount: 140 | self.log.info('UseProxyCheck - {}: {} {} {} fail, count {} {} delete'.format(self.name, 141 | proxy.proxy.ljust(23), proxy.proxy_type, proxy.source, 142 | proxy.fail_count, proxy.check_count)) 143 | self.proxy_handler.delete(proxy) 144 | else: 145 | self.log.info('UseProxyCheck - {}: {} {} {} fail, count {} keep'.format(self.name, 146 | proxy.proxy.ljust(23), proxy.proxy_type, proxy.source, 147 | proxy.fail_count)) 148 | #proxy.create_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 149 | self.proxy_handler.put(proxy) 150 | 151 | 152 | def Checker(tp, queue): 153 | """ 154 | run Proxy ThreadChecker 155 | :param tp: raw/use 156 | :param queue: Proxy Queue 157 | :return: 158 | """ 159 | conf = ConfigHandler() 160 | 161 | thread_list = list() 162 | if "raw" in tp: 163 | threadsNum = conf.rawThreadsNum 164 | else: 165 | threadsNum = conf.useThreadsNum 166 | 167 | for index in range(threadsNum): 168 | thread_list.append(_ThreadChecker(tp, queue, "thread_%s" % str(index).zfill(2))) 169 | 170 | for thread in thread_list: 171 | thread.setDaemon(True) 172 | thread.start() 173 | 174 | for thread in thread_list: 175 | thread.join() 176 | -------------------------------------------------------------------------------- /helper/fetch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: fetchScheduler 5 | Description : 6 | Author : 007x 7 | date: 2019/8/6 8 | ------------------------------------------------- 9 | Change Activity: 
10 | 2021/11/18: 多线程采集 11 | ------------------------------------------------- 12 | """ 13 | __author__ = '007x' 14 | 15 | from threading import Thread 16 | from helper.proxy import Proxy 17 | from helper.check import DoValidator 18 | from handler.logHandler import LogHandler 19 | from handler.proxyHandler import ProxyHandler 20 | from fetcher.proxyFetcher import ProxyFetcher 21 | from handler.configHandler import ConfigHandler 22 | 23 | 24 | class _ThreadFetcher(Thread): 25 | 26 | def __init__(self, fetch_source, proxy_dict): 27 | Thread.__init__(self) 28 | self.fetch_source = fetch_source 29 | self.proxy_dict = proxy_dict 30 | self.fetcher = getattr(ProxyFetcher, fetch_source, None) 31 | self.log = LogHandler("fetcher") 32 | self.conf = ConfigHandler() 33 | self.proxy_handler = ProxyHandler() 34 | 35 | def run(self): 36 | self.log.info("ProxyFetch - {func}: start".format(func=self.fetch_source)) 37 | try: 38 | for proxy, proxy_type in self.fetcher(): 39 | #self.log.info('ProxyFetch - %s: %s ok' % (self.fetch_source, proxy.ljust(23))) 40 | proxy = proxy.strip() 41 | if proxy in self.proxy_dict: 42 | #print(self.fetch_source) 43 | self.proxy_dict[proxy].add_source(self.fetch_source, proxy_type.upper()) 44 | else: 45 | self.proxy_dict[proxy] = Proxy( 46 | proxy, source=self.fetch_source, proxy_type=proxy_type.upper()) 47 | except Exception as e: 48 | self.log.error("ProxyFetch - {func}: error".format(func=self.fetch_source)) 49 | self.log.error(str(e)) 50 | 51 | 52 | class Fetcher(object): 53 | name = "fetcher" 54 | 55 | def __init__(self): 56 | self.log = LogHandler(self.name) 57 | self.conf = ConfigHandler() 58 | 59 | def run(self): 60 | """ 61 | fetch proxy with proxyFetcher 62 | :return: 63 | """ 64 | proxy_dict = dict() 65 | thread_list = list() 66 | self.log.info("ProxyFetch : start") 67 | 68 | for fetch_source in self.conf.fetchers: 69 | #self.log.info("ProxyFetch - {func}: start".format(func=fetch_source)) 70 | fetcher = getattr(ProxyFetcher, fetch_source, None) 71 | if not fetcher: 72 | self.log.error("ProxyFetch - {func}: class method not exists!".format(func=fetch_source)) 73 | continue 74 | if not callable(fetcher): 75 | self.log.error("ProxyFetch - {func}: must be class method".format(func=fetch_source)) 76 | continue 77 | thread_list.append(_ThreadFetcher(fetch_source, proxy_dict)) 78 | 79 | for thread in thread_list: 80 | thread.setDaemon(True) 81 | thread.start() 82 | 83 | for thread in thread_list: 84 | thread.join() 85 | 86 | self.log.info("ProxyFetch - all complete!") 87 | for _ in proxy_dict.values(): 88 | if DoValidator.preValidator(_.proxy): 89 | yield _ 90 | -------------------------------------------------------------------------------- /helper/launcher.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: launcher 5 | Description : 启动器 6 | Author : 007x 7 | date: 2021/3/26 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2021/3/26: 启动器 11 | ------------------------------------------------- 12 | """ 13 | __author__ = '007x' 14 | 15 | import os 16 | import sys 17 | import signal 18 | import daemon 19 | from daemon.pidfile import PIDLockFile 20 | from db.dbClient import DbClient 21 | from handler.logHandler import LogHandler 22 | from handler.configHandler import ConfigHandler 23 | 24 | log = LogHandler('launcher') 25 | conf = ConfigHandler() 26 | 27 | def startServerProcess(): 28 | __beforeStart() 29 | 
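# 说明: 这里用 python-daemon 的 DaemonContext 将 API 服务放入后台运行,
# 并通过 PIDLockFile 记录 pid, 供 stopServerProcess 读取后发送 SIGTERM;
# 前台调试时可不进入 DaemonContext, 直接调用 runFlask()(即下方被注释掉的调用)。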
from api.proxyApi import runFlask 30 | with daemon.DaemonContext( 31 | #stdout=sys.stdout, stderr=sys.stderr, 32 | working_directory='./', 33 | pidfile=PIDLockFile(conf.serverPidFile), 34 | umask=0o022, 35 | detach_process=True): 36 | runFlask() 37 | #runFlask() 38 | 39 | def stopServerProcess(): 40 | try: 41 | with open(conf.serverPidFile, "r") as f: 42 | pid = int(f.read().strip()) 43 | os.kill(pid, signal.SIGTERM) 44 | os.remove(conf.serverPidFile) 45 | print(f"Server process: {pid} stopped") 46 | #print("Server pid file removed") 47 | except FileNotFoundError: 48 | pass 49 | except ProcessLookupError: 50 | pass 51 | 52 | def startSchedulerProcess(): 53 | __beforeStart() 54 | from helper.scheduler import runScheduler 55 | with daemon.DaemonContext( 56 | #stdout=sys.stdout, stderr=sys.stderr, 57 | working_directory='./', 58 | pidfile=PIDLockFile(conf.schedulerPidFile), 59 | umask=0o022, 60 | detach_process=True): 61 | runScheduler() 62 | 63 | def stopSchedulerProcess(): 64 | try: 65 | with open(conf.schedulerPidFile, "r") as f: 66 | pid = int(f.read().strip()) 67 | os.kill(pid, signal.SIGTERM) 68 | os.remove(conf.schedulerPidFile) 69 | print(f"Scheduler process: {pid} stopped") 70 | #print("Scheduler pid file removed") 71 | except FileNotFoundError: 72 | pass 73 | except ProcessLookupError: 74 | pass 75 | 76 | def __beforeStart(): 77 | __showVersion() 78 | __showConfigure() 79 | if __checkDBConfig(): 80 | log.info('exit!') 81 | sys.exit() 82 | 83 | 84 | def __showVersion(): 85 | from setting import VERSION 86 | log.info("ProxyPool Version: %s" % VERSION) 87 | 88 | 89 | def __showConfigure(): 90 | log.info("ProxyPool configure HOST: %s" % conf.serverHost) 91 | log.info("ProxyPool configure PORT: %s" % conf.serverPort) 92 | log.info("ProxyPool configure PROXY_FETCHER: %s" % conf.fetchers) 93 | 94 | 95 | def __checkDBConfig(): 96 | db = DbClient(conf.dbConn) 97 | log.info("============ DATABASE CONFIGURE ================") 98 | log.info("DB_TYPE: %s" % db.db_type) 99 | log.info("DB_HOST: %s" % db.db_host) 100 | log.info("DB_PORT: %s" % db.db_port) 101 | log.info("DB_NAME: %s" % db.db_name) 102 | log.info("DB_USER: %s" % db.db_user) 103 | log.info("=================================================") 104 | return db.test() 105 | -------------------------------------------------------------------------------- /helper/proxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: Proxy 5 | Description : 代理对象类型封装 6 | Author : 007x 7 | date: 2019/7/11 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/7/11: 代理对象类型封装 11 | ------------------------------------------------- 12 | """ 13 | __author__ = '007x' 14 | 15 | import json 16 | 17 | 18 | class Proxy(object): 19 | 20 | def __init__(self, proxy, fail_count=0, region="", anonymous="", 21 | source="", check_count=0, last_status="", last_time="", create_time="", 22 | https=False, resp_time=0, proxy_type=""): 23 | self._proxy = proxy 24 | self._fail_count = fail_count 25 | self._region = region 26 | self._anonymous = anonymous 27 | self._source = source.split('/') 28 | self._check_count = check_count 29 | self._last_status = last_status 30 | self._create_time = create_time 31 | self._last_time = last_time 32 | self._https = https 33 | self._resp_time = resp_time 34 | self._proxy_type = proxy_type 35 | 36 | @classmethod 37 | def createFromJson(cls, proxy_json): 38 | _dict = 
json.loads(proxy_json) 39 | return cls(proxy=_dict.get("proxy", ""), 40 | fail_count=_dict.get("fail_count", 0), 41 | region=_dict.get("region", ""), 42 | anonymous=_dict.get("anonymous", ""), 43 | source=_dict.get("source", ""), 44 | check_count=_dict.get("check_count", 0), 45 | last_status=_dict.get("last_status", ""), 46 | create_time=_dict.get("create_time", ""), 47 | last_time=_dict.get("last_time", ""), 48 | https=_dict.get("https", False), 49 | resp_time= _dict.get("resp_time", 0), 50 | proxy_type= _dict.get("proxy_type", "") 51 | ) 52 | 53 | @property 54 | def proxy(self): 55 | """ 代理 ip:port """ 56 | return self._proxy 57 | 58 | @property 59 | def fail_count(self): 60 | """ 检测失败次数 """ 61 | return self._fail_count 62 | 63 | @property 64 | def region(self): 65 | """ 地理位置(国家/城市) """ 66 | return self._region 67 | 68 | @property 69 | def anonymous(self): 70 | """ 匿名 """ 71 | return self._anonymous 72 | 73 | @property 74 | def source(self): 75 | """ 代理来源 """ 76 | return '/'.join(self._source) 77 | 78 | @property 79 | def check_count(self): 80 | """ 代理检测次数 """ 81 | return self._check_count 82 | 83 | @property 84 | def last_status(self): 85 | """ 最后一次检测结果 True -> 可用; False -> 不可用""" 86 | return self._last_status 87 | 88 | @property 89 | def create_time(self): 90 | """ 第一次检测时间 """ 91 | return self._create_time 92 | 93 | @property 94 | def last_time(self): 95 | """ 最后一次检测时间 """ 96 | return self._last_time 97 | 98 | @property 99 | def https(self): 100 | """ 是否支持https """ 101 | return self._https 102 | 103 | @property 104 | def resp_time(self): 105 | """ 响应时间 """ 106 | return self._resp_time 107 | 108 | @property 109 | def proxy_type(self): 110 | """ 代理类型 """ 111 | return self._proxy_type 112 | 113 | @property 114 | def to_dict(self): 115 | """ 属性字典 """ 116 | return {"proxy": self.proxy, 117 | "https": self.https, 118 | "fail_count": self.fail_count, 119 | "region": self.region, 120 | "anonymous": self.anonymous, 121 | "source": self.source, 122 | "check_count": self.check_count, 123 | "last_status": self.last_status, 124 | "create_time": self.create_time, 125 | "last_time": self.last_time, 126 | "resp_time": self.resp_time, 127 | "proxy_type": self.proxy_type} 128 | 129 | @property 130 | def to_json(self): 131 | """ 属性json格式 """ 132 | return json.dumps(self.to_dict, ensure_ascii=False) 133 | 134 | @fail_count.setter 135 | def fail_count(self, value): 136 | self._fail_count = value 137 | 138 | @check_count.setter 139 | def check_count(self, value): 140 | self._check_count = value 141 | 142 | @last_status.setter 143 | def last_status(self, value): 144 | self._last_status = value 145 | 146 | @create_time.setter 147 | def create_time(self, value): 148 | self._create_time = value 149 | 150 | @last_time.setter 151 | def last_time(self, value): 152 | self._last_time = value 153 | 154 | @https.setter 155 | def https(self, value): 156 | self._https = value 157 | 158 | @region.setter 159 | def region(self, value): 160 | self._region = value 161 | 162 | @resp_time.setter 163 | def resp_time(self, value): 164 | self._resp_time = value 165 | 166 | @proxy_type.setter 167 | def proxy_type(self, value): 168 | self._proxy_type = value 169 | 170 | def add_source(self, source_str, proxy_type=""): 171 | if source_str: 172 | self._source.append(source_str) 173 | self._source = list(set(self._source)) 174 | self._proxy_type = proxy_type 175 | -------------------------------------------------------------------------------- /helper/scheduler.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: proxyScheduler 5 | Description : 6 | Author : 007x 7 | date: 2019/8/5 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/08/05: proxyScheduler 11 | 2021/02/23: runProxyCheck时,剩余代理少于POOL_SIZE_MIN时执行抓取 12 | ------------------------------------------------- 13 | """ 14 | __author__ = '007x' 15 | 16 | from apscheduler.schedulers.blocking import BlockingScheduler 17 | from apscheduler.executors.pool import ProcessPoolExecutor 18 | 19 | from util.six import Queue 20 | from helper.fetch import Fetcher 21 | from helper.check import Checker 22 | from handler.logHandler import LogHandler 23 | from handler.proxyHandler import ProxyHandler 24 | from handler.configHandler import ConfigHandler 25 | 26 | 27 | def __runProxyFetch(): 28 | proxy_queue = Queue() 29 | proxy_fetcher = Fetcher() 30 | 31 | for proxy in proxy_fetcher.run(): 32 | proxy_queue.put(proxy) 33 | 34 | Checker("raw", proxy_queue) 35 | 36 | 37 | def __runProxyCheck(): 38 | proxy_handler = ProxyHandler() 39 | proxy_queue = Queue() 40 | if proxy_handler.db.getCount().get("total", 0) < proxy_handler.conf.poolSizeMin: 41 | __runProxyFetch() 42 | for proxy in proxy_handler.getAll(): 43 | proxy_queue.put(proxy) 44 | Checker("use", proxy_queue) 45 | 46 | 47 | def runScheduler(): 48 | __runProxyFetch() 49 | conf = ConfigHandler() 50 | 51 | timezone = ConfigHandler().timezone 52 | scheduler_log = LogHandler("scheduler") 53 | scheduler = BlockingScheduler(logger=scheduler_log, timezone=timezone) 54 | 55 | scheduler.add_job(__runProxyFetch, 'interval', minutes=conf.rawIntervalMin, id="proxy_fetch", name="proxy采集") 56 | scheduler.add_job(__runProxyCheck, 'interval', minutes=conf.useIntervalMin, id="proxy_check", name="proxy检查") 57 | executors = { 58 | 'default': {'type': 'threadpool', 'max_workers': conf.threadPoolWorksNum}, 59 | 'processpool': ProcessPoolExecutor(max_workers=conf.processPoolWorksNum) 60 | } 61 | job_defaults = { 62 | 'coalesce': False, 63 | 'max_instances': conf.jobInstancesNum 64 | } 65 | 66 | scheduler.configure(executors=executors, job_defaults=job_defaults, timezone=timezone) 67 | 68 | scheduler.start() 69 | 70 | 71 | if __name__ == '__main__': 72 | runScheduler() 73 | -------------------------------------------------------------------------------- /helper/validator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: _validators 5 | Description : 定义proxy验证方法 6 | Author : 007x 7 | date: 2021/5/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2023/03/10: 支持带用户认证的代理格式 username:password@ip:port 11 | ------------------------------------------------- 12 | """ 13 | __author__ = '007x' 14 | 15 | import re 16 | import time 17 | import requests 18 | import ipaddress 19 | from util.six import withMetaclass 20 | from util.singleton import Singleton 21 | from handler.configHandler import ConfigHandler 22 | 23 | conf = ConfigHandler() 24 | 25 | HEADER = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0', 26 | 'Accept': '*/*', 27 | 'Connection': 'keep-alive', 28 | 'Accept-Language': 'zh-CN,zh;q=0.8'} 29 | 30 | IP_REGEX = re.compile(r"(.*:.*@)?\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}") 31 | 32 | 33 | class 
ProxyValidator(withMetaclass(Singleton)): 34 | pre_validator = [] 35 | http_validator = [] 36 | https_validator = [] 37 | 38 | @classmethod 39 | def addPreValidator(cls, func): 40 | cls.pre_validator.append(func) 41 | return func 42 | 43 | @classmethod 44 | def addHttpValidator(cls, func): 45 | cls.http_validator.append(func) 46 | return func 47 | 48 | @classmethod 49 | def addHttpsValidator(cls, func): 50 | cls.https_validator.append(func) 51 | return func 52 | 53 | 54 | @ProxyValidator.addPreValidator 55 | def formatValidator(proxy): 56 | """检查代理格式""" 57 | return True if IP_REGEX.fullmatch(proxy) else False 58 | 59 | 60 | @ProxyValidator.addHttpValidator 61 | def httpTimeOutValidator(oproxy): 62 | """ http检测超时 """ 63 | proxy = oproxy.proxy 64 | if "HTTPS" in oproxy.proxy_type: 65 | proxies = {"http": "https://{proxy}".format(proxy=proxy), "https": "https://{proxy}".format(proxy=proxy)} 66 | elif "HTTP" in oproxy.proxy_type: 67 | proxies = {"http": "http://{proxy}".format(proxy=proxy), "https": "http://{proxy}".format(proxy=proxy)} 68 | elif "SOCKS4" in oproxy.proxy_type: 69 | proxies = {"http": "socks4://{proxy}".format(proxy=proxy), "https": "socks4://{proxy}".format(proxy=proxy)} 70 | elif "SOCKS5" in oproxy.proxy_type: 71 | proxies = {"http": "socks5://{proxy}".format(proxy=proxy), "https": "socks5://{proxy}".format(proxy=proxy)} 72 | else: 73 | proxies = {"http": "http://{proxy}".format(proxy=proxy), "https": "http://{proxy}".format(proxy=proxy)} 74 | 75 | start_time = time.perf_counter() 76 | try: 77 | #print(proxies, conf.httpUrl) 78 | r = requests.get(conf.httpUrl, headers=HEADER, proxies=proxies, timeout=conf.verifyTimeout) 79 | end_time = time.perf_counter() 80 | oproxy.resp_time = int((end_time - start_time) * 1000); 81 | #print(r.text) 82 | #if r.status_code != 200: 83 | # print(r.status_code, conf.httpsUrl, proxies) 84 | #print(oproxy.resp_time) 85 | """ 86 | test proxy test by ifconfig.me 87 | """ 88 | if r.status_code == 200: 89 | try: 90 | ip = ipaddress.ip_address(r.text) 91 | #print(ip) 92 | return True 93 | except Exception as e: 94 | #print(e) 95 | pass 96 | return False 97 | """ the original proxy test """ 98 | #return True if r.status_code == 200 else False 99 | except Exception as e: 100 | #print(e) 101 | return False 102 | 103 | 104 | @ProxyValidator.addHttpsValidator 105 | def httpsTimeOutValidator(oproxy): 106 | """https检测超时""" 107 | proxy = oproxy.proxy 108 | if "HTTPS" in oproxy.proxy_type : 109 | proxies = {"http": "https://{proxy}".format(proxy=proxy), "https": "https://{proxy}".format(proxy=proxy)} 110 | elif "HTTP" in oproxy.proxy_type : 111 | proxies = {"http": "http://{proxy}".format(proxy=proxy), "https": "http://{proxy}".format(proxy=proxy)} 112 | elif "SOCKS4" in oproxy.proxy_type: 113 | proxies = {"http": "socks4://{proxy}".format(proxy=proxy), "https": "socks4://{proxy}".format(proxy=proxy)} 114 | elif "SOCKS5" in oproxy.proxy_type: 115 | proxies = {"http": "socks5://{proxy}".format(proxy=proxy), "https": "socks5://{proxy}".format(proxy=proxy)} 116 | else: 117 | proxies = {"http": "http://{proxy}".format(proxy=proxy), "https": "http://{proxy}".format(proxy=proxy)} 118 | 119 | start_time = time.perf_counter() 120 | try: 121 | #print(proxies, conf.httpsUrl) 122 | r = requests.get(conf.httpsUrl, headers=HEADER, proxies=proxies, timeout=conf.verifyTimeout, verify=False) 123 | end_time = time.perf_counter() 124 | oproxy.resp_time = int((end_time - start_time) * 1000); 125 | #print(r.text) 126 | #if r.status_code != 200: 127 | # print(r.status_code, 
/proxyList/http.txt:
--------------------------------------------------------------------------------
1 | http://1.20.214.46:8080
2 | http://1.52.203.241:8080
3 |
--------------------------------------------------------------------------------
/proxyList/https.txt:
--------------------------------------------------------------------------------
1 | https://44.218.183.55:80
2 | https://8.220.136.174:31281
3 |
--------------------------------------------------------------------------------
/proxyList/socks4.txt:
--------------------------------------------------------------------------------
1 | socks4://38.54.116.9:9999
2 | socks4://47.252.18.37:8443
3 |
--------------------------------------------------------------------------------
/proxyList/socks5.txt:
--------------------------------------------------------------------------------
1 | socks5://101.37.12.43:5000
2 | socks5://101.37.12.43:8000
3 |
--------------------------------------------------------------------------------
/proxyPool.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: proxy_pool
5 | Description : proxy pool entry point
6 | Author : 007x
7 | date: 2020/6/19
8 | -------------------------------------------------
9 | Change Activity:
10 | 2020/6/19:
11 | -------------------------------------------------
12 | """
13 | __author__ = '007x'
14 |
15 | import click
16 | from helper.launcher import startServerProcess, stopServerProcess
17 | from helper.launcher import startSchedulerProcess, stopSchedulerProcess
18 | from setting import BANNER, VERSION
19 |
20 | CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])
21 |
22 |
23 | @click.group(context_settings=CONTEXT_SETTINGS)
24 | @click.version_option(version=VERSION)
25 | def cli():
26 |     """ProxyPool CLI tool"""
27 |
28 |
29 | @cli.command(name="startScheduler")
30 | def startSchedule():
31 |     """ start the scheduler process """
32 |     click.echo(BANNER)
33 |     startSchedulerProcess()
34 |
35 | @cli.command(name="stopScheduler")
36 | def stopSchedule():
37 |     """ stop the scheduler process """
38 |     stopSchedulerProcess()
39 |
40 |
41 | @cli.command(name="startServer")
42 | def startServer():
43 |     """ start the api server """
44 |     click.echo(BANNER)
45 |     startServerProcess()
46 |
47 | @cli.command(name="stopServer")
48 | def stopServer():
49 |     """ stop the api server """
50 |     stopServerProcess()
51 |
52 | if __name__ == '__main__':
53 |     cli()
54 |
--------------------------------------------------------------------------------
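With the entry point above, `startServer` brings up the HTTP API on the HOST/PORT configured in setting.py. A client-side sketch follows; the /get/ and /delete/ routes are an assumption based on the upstream proxy_pool API (api/proxyApi.py appears in the tree but not in this listing), so adjust the paths to whatever the server actually exposes:

# client-side sketch; the route names are assumed from the upstream
# proxy_pool API and may differ in this fork
import requests

API_BASE = "http://127.0.0.1:5010"  # HOST and PORT from setting.py


def get_proxy():
    return requests.get(API_BASE + "/get/").json()


def delete_proxy(proxy):
    requests.get(API_BASE + "/delete/", params={"proxy": proxy})


def fetch(url, retries=5):
    """fetch url through pool proxies, discarding ones that fail"""
    for _ in range(retries):
        candidate = get_proxy().get("proxy")
        if not candidate:
            break
        try:
            return requests.get(url,
                                proxies={"http": "http://" + candidate,
                                         "https": "http://" + candidate},
                                timeout=10)
        except Exception:
            delete_proxy(candidate)
    return None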
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests==2.20.0
2 | gunicorn==19.9.0
3 | lxml==4.9.2
4 | redis==3.5.3
5 | pysocks==1.7.1
6 | ps4==0.0.2
7 | APScheduler==3.10.0;python_version>="3.10"
8 | APScheduler==3.2.0;python_version<"3.10"
9 | click==8.0.1;python_version>"3.6"
10 | click==7.0;python_version<="3.6"
11 | Flask==2.1.1;python_version>"3.6"
12 | Flask==1.0;python_version<="3.6"
13 | werkzeug==2.1.0;python_version>"3.6"
14 | werkzeug==0.15.5;python_version<="3.6"
15 | daemon==1.2;python_version>"3.6"
16 |
17 |
18 |
--------------------------------------------------------------------------------
/setting.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: setting.py
5 | Description : configuration file
6 | Author : 007x
7 | date: 2019/2/15
8 | -------------------------------------------------
9 | Change Activity:
10 | 2024/11/27:
11 | -------------------------------------------------
12 | """
13 |
14 | BANNER = r"""
15 | ****************************************************************
16 | *** ______ ********************* ______ *********** _ ********
17 | *** | ___ \_ ******************** | ___ \ ********* | | ********
18 | *** | |_/ / \__ __ __ _ __ _ | |_/ /___ * ___ | | ********
19 | *** | __/| _// _ \ \ \/ /| | | || __// _ \ / _ \ | | ********
20 | *** | | | | | (_) | > < \ |_| || | | (_) | (_) || |___ ****
21 | *** \_| |_| \___/ /_/\_\ \__ |\_| \___/ \___/ \_____/ ****
22 | **** __ / / *****
23 | ************************* /___ / *******************************
24 | ************************* ********************************
25 | ****************************************************************
26 | """
27 |
28 | VERSION = "2.8.0"
29 |
30 | # ############### server config ###############
31 | HOST = "0.0.0.0"
32 |
33 | PORT = 5010
34 |
35 | # ############### database config ###################
36 | # db connection uri
37 | # example:
38 | #     Redis: redis://:password@ip:port/db
39 | #     Ssdb:  ssdb://:password@ip:port
40 | DB_CONN = 'redis://:123456@127.0.0.1:6379/0'
41 |
42 | # proxy table name
43 | TABLE_NAME = 'use_proxy'
44 | """ # not working
45 | "freeProxy01",
46 | "freeProxy02",
47 | "freeProxy03",
48 | "freeProxy04",
49 | "freeProxy05",
50 | "freeProxy06",
51 | "freeProxy07",
52 | "freeProxy08",
53 | "freeProxy09",
54 | "freeProxy10",
55 |
56 | # working
57 | "freeProxy11",
58 | "freeProxy12",
59 | "freeProxy13",
60 | "freeProxy96",
61 | "freeProxy97",
62 | "freeProxy98",
63 | "freeProxy99",
64 | "freeProxy100",
65 | "freeProxy101"
66 | """
67 | # ###### config the proxy fetch function ######
68 | PROXY_FETCHER = [
69 |     "freeProxy11",
70 |     "freeProxy12",
71 |     "freeProxy13",
72 |     "freeProxy96",
73 |     "freeProxy97",
74 |     "freeProxy98",
75 |     "freeProxy99",
76 |     "freeProxy100",
77 |     "freeProxy101"
78 | ]
79 |
80 | # ############# proxy validator #################
81 | # target sites used to validate proxies
82 | HTTP_URL = "http://ifconfig.me/ip"
83 | HTTPS_URL = "https://ifconfig.me/ip"
84 | # timeout (seconds) for a validation request
85 | VERIFY_TIMEOUT = 5
86 |
87 | # max failures allowed within the last PROXY_CHECK_COUNT checks; a proxy exceeding it is removed
88 | MAX_FAIL_COUNT = 0
89 |
90 | # max failure rate allowed within the last PROXY_CHECK_COUNT checks; a proxy exceeding it is removed
91 | MAX_FAIL_RATE = 0.3
92 |
93 | # a proxy check triggers a fetch when the pool size drops below POOL_SIZE_MIN
94 | POOL_SIZE_MIN = 2000
95 |
96 | # lower bound of the check_count filter used by getCount to single out higher-quality proxies
97 | MIN_AVAIL_LIMIT = 20
98 |
99 | # upstream proxy for reaching proxy sources that are blocked locally
100 | API_PROXY_CONFIG = "http://127.0.0.1:1080"
101 |
102 | # number of threads started by each proxy fetch process
103 | RAW_THREADS_NUM = 50
104 |
105 | # number of threads started by each proxy check process
106 | USE_THREADS_NUM = 10
107 |
108 | # interval in minutes between proxy fetch runs
109 | RAW_INTERVAL_MIN = 4
110 |
111 | # interval in minutes between proxy check runs
112 | USE_INTERVAL_MIN = 2
113 |
114 | # max number of tasks the executor (thread pool or process pool) runs concurrently
115 | THREADPOOL_WORKS_NUM = 60
116 |
117 | # max number of processes a ProcessPoolExecutor may run at the same time
118 | PROCESSPOOL_WORKS_NUM = 5
119 |
120 | # max number of instances of one job that the scheduler may run concurrently
121 | JOB_INSTANCES_NUM = 30
122 |
123 | # ############# proxy attributes #################
124 | # whether to resolve the region attribute of each proxy
125 | PROXY_REGION = True
126 |
127 | SERVER_PIDFILE = '/tmp/proxyPoolServer.pid'
128 | SERVER_ACCESS_LOGFILE = './log/access.log'
129 | SERVER_ERROR_LOGFILE = './log/error.log'
130 |
131 | SCHEDULER_PIDFILE = "/tmp/proxyPoolScheduler.pid"
132 |
133 | # ############# scheduler config #################
134 |
135 | # Force a timezone for the scheduler (optional).
136 | # If this runs on a VM and
137 | # "ValueError: Timezone offset does not match system offset"
138 | # is raised during scheduling, set an explicit timezone below.
139 | # Otherwise the timezone is detected from the system automatically.
140 |
141 | TIMEZONE = "Asia/Shanghai"
142 |
--------------------------------------------------------------------------------
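Every constant above is surfaced to the rest of the code through handler/configHandler.py under a camelCase attribute. A quick sanity-check sketch, using only attributes that the scheduler, validator, and tests in this listing actually read:

# print the effective configuration; every attribute here is read somewhere
# in this listing (scheduler, validator, or tests)
from handler.configHandler import ConfigHandler

conf = ConfigHandler()
print("db:", conf.dbConn, "table:", conf.tableName)
print("validate against:", conf.httpUrl, "and", conf.httpsUrl,
      "with timeout", conf.verifyTimeout)
print("fetch every", conf.rawIntervalMin, "min, check every",
      conf.useIntervalMin, "min")
print("refill when the pool drops below", conf.poolSizeMin)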
/start.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | python proxyPool.py startServer &
3 | python proxyPool.py startScheduler
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: test.py
5 | Description :
6 | Author : 007x
7 | date: 2017/3/7
8 | -------------------------------------------------
9 | Change Activity:
10 | 2017/3/7:
11 | -------------------------------------------------
12 | """
13 | __author__ = '007x'
14 |
15 | from test import testProxyValidator
16 | from test import testConfigHandler
17 | from test import testLogHandler
18 | from test import testDbClient
19 |
20 | if __name__ == '__main__':
21 |     print("ConfigHandler:")
22 |     testConfigHandler.testConfig()
23 |
24 |     print("LogHandler:")
25 |     testLogHandler.testLogHandler()
26 |
27 |     print("DbClient:")
28 |     testDbClient.testDbClient()
29 |
30 |     print("ProxyValidator:")
31 |     testProxyValidator.testProxyValidator()
32 |
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: __init__
5 | Description :
6 | Author : 007x
7 | date: 2019/2/15
8 | -------------------------------------------------
9 | Change Activity:
10 | 2019/2/15:
11 | -------------------------------------------------
12 | """
13 | __author__ = '007x'
14 |
--------------------------------------------------------------------------------
/test/testConfigHandler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: testConfigHandler
5 | Description : testConfigHandler
6 | Author : J_hao
7 | date: 2017/7/31
8 | -------------------------------------------------
9 | Change Activity:
10 | 2017/7/31:
11 | -------------------------------------------------
12 | """
13 | __author__ = 'J_hao'
14 |
15 | from handler.configHandler import ConfigHandler
16 | from time import sleep
17 |
18 |
19 | def testConfig():
20 |     """
21 |     :return:
22 |     """
23 |     conf = ConfigHandler()
24 |     print(conf.dbConn)
25 |     print(conf.serverPort)
26 |     print(conf.serverHost)
27 |     print(conf.tableName)
28 |     assert isinstance(conf.fetchers, list)
29 |     print(conf.fetchers)
30 |
31 |     for _ in range(2):
32 |         print(conf.fetchers)
33 |         sleep(5)
34 |
35 |
36 | if __name__ == '__main__':
37 |     testConfig()
38 |
39 |
--------------------------------------------------------------------------------
/test/testDbClient.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: testDbClient
5 | Description :
6 | Author : 007x
7 | date: 2020/6/23
8 | -------------------------------------------------
9 | Change Activity:
10 | 2020/6/23:
11 | -------------------------------------------------
12 | """
13 | __author__ = '007x'
14 |
15 | from db.dbClient import DbClient
16 |
17 |
18 | def testDbClient():
19 |     # ############### ssdb ###############
20 |     ssdb_uri = "ssdb://:password@127.0.0.1:8888"
21 |     s = DbClient.parseDbConn(ssdb_uri)
22 |     assert s.db_type == "SSDB"
23 |     assert s.db_pwd == "password"
24 |     assert s.db_host == "127.0.0.1"
25 |     assert s.db_port == 8888
26 |
27 |     # ############### redis ###############
28 |     redis_uri = "redis://:password@127.0.0.1:6379/1"
29 |     r = DbClient.parseDbConn(redis_uri)
30 |     assert r.db_type == "REDIS"
31 |     assert r.db_pwd == "password"
32 |     assert r.db_host == "127.0.0.1"
33 |     assert r.db_port == 6379
34 |     assert r.db_name == "1"
35 |     print("DbClient ok!")
36 |
37 |
38 | if __name__ == '__main__':
39 |     testDbClient()
40 |
--------------------------------------------------------------------------------
/test/testLogHandler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: testLogHandler
5 | Description :
6 | Author : J_hao
7 | date: 2017/8/2
8 | -------------------------------------------------
9 | Change Activity:
10 | 2017/8/2:
11 | -------------------------------------------------
12 | """
13 | __author__ = 'J_hao'
14 |
15 | from handler.logHandler import LogHandler
16 |
17 |
18 | def testLogHandler():
19 |     log = LogHandler('test')
20 |     log.info('this is info')
21 |     log.error('this is error')
22 |
23 |
24 | if __name__ == '__main__':
25 |     testLogHandler()
26 |
--------------------------------------------------------------------------------
/test/testProxyClass.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: testProxyClass
5 | Description :
6 | Author : 007x
7 | date: 2019/8/8
8 | -------------------------------------------------
9 | Change Activity:
10 | 2019/8/8:
11 | -------------------------------------------------
12 | """
13 | __author__ = '007x'
14 |
15 | import json
16 | from helper.proxy import Proxy
17 |
18 |
19 | def testProxyClass():
20 |     proxy = Proxy("127.0.0.1:8080")
21 |
22 |     print(proxy.to_json)
23 |
24 |     proxy.source = "test"
25 |
26 |     proxy_str = json.dumps(proxy.to_dict, ensure_ascii=False)
27 |
28 |     print(proxy_str)
29 |
30 |     print(Proxy.createFromJson(proxy_str).to_dict)
31 |
32 |
33 | if __name__ == '__main__':
34 |     testProxyClass()
35 |
--------------------------------------------------------------------------------
/test/testProxyFetcher.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | 
------------------------------------------------- 4 | File Name: testProxyFetcher 5 | Description : 6 | Author : 007x 7 | date: 2020/6/23 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2020/6/23: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = '007x' 14 | 15 | from fetcher.proxyFetcher import ProxyFetcher 16 | from handler.configHandler import ConfigHandler 17 | 18 | 19 | def testProxyFetcher(): 20 | conf = ConfigHandler() 21 | proxy_getter_functions = conf.fetchers 22 | proxy_counter = {_: 0 for _ in proxy_getter_functions} 23 | for proxyGetter in proxy_getter_functions: 24 | for proxy in getattr(ProxyFetcher, proxyGetter.strip())(): 25 | if proxy: 26 | print('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy)) 27 | proxy_counter[proxyGetter] = proxy_counter.get(proxyGetter) + 1 28 | for key, value in proxy_counter.items(): 29 | print(key, value) 30 | 31 | 32 | if __name__ == '__main__': 33 | testProxyFetcher() 34 | -------------------------------------------------------------------------------- /test/testProxyValidator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: testProxyValidator 5 | Description : 6 | Author : 007x 7 | date: 2021/5/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2021/5/25: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = '007x' 14 | 15 | from helper.validator import ProxyValidator 16 | 17 | 18 | def testProxyValidator(): 19 | for _ in ProxyValidator.pre_validator: 20 | print(_) 21 | for _ in ProxyValidator.http_validator: 22 | print(_) 23 | for _ in ProxyValidator.https_validator: 24 | print(_) 25 | 26 | 27 | if __name__ == '__main__': 28 | testProxyValidator() 29 | -------------------------------------------------------------------------------- /test/testRedisClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: testRedisClient 5 | Description : 6 | Author : 007x 7 | date: 2020/6/23 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2020/6/23: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = '007x' 14 | 15 | 16 | def testRedisClient(): 17 | from db.dbClient import DbClient 18 | from helper.proxy import Proxy 19 | 20 | uri = "redis://:pwd@127.0.0.1:6379" 21 | db = DbClient(uri) 22 | db.changeTable("use_proxy") 23 | proxy = Proxy.createFromJson('{"proxy": "118.190.79.36:8090", "https": false, "fail_count": 0, "region": "", "anonymous": "", "source": "freeProxy14", "check_count": 4, "last_status": true, "last_time": "2021-05-26 10:58:04"}') 24 | 25 | print("put: ", db.put(proxy)) 26 | 27 | print("get: ", db.get(https=None)) 28 | 29 | print("exists: ", db.exists("27.38.96.101:9797")) 30 | 31 | print("exists: ", db.exists("27.38.96.101:8888")) 32 | 33 | print("pop: ", db.pop(https=None)) 34 | 35 | print("getAll: ", db.getAll(https=None)) 36 | 37 | print("getCount", db.getCount()) 38 | 39 | 40 | if __name__ == '__main__': 41 | testRedisClient() 42 | -------------------------------------------------------------------------------- /test/testSsdbClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 
-------------------------------------------------
4 | File Name: testSsdbClient
5 | Description :
6 | Author : 007x
7 | date: 2020/7/3
8 | -------------------------------------------------
9 | Change Activity:
10 | 2020/7/3:
11 | -------------------------------------------------
12 | """
13 | __author__ = '007x'
14 |
15 |
16 | def testSsdbClient():
17 |     from db.dbClient import DbClient
18 |     from helper.proxy import Proxy
19 |
20 |     uri = "ssdb://@127.0.0.1:8888"
21 |     db = DbClient(uri)
22 |     db.changeTable("use_proxy")
23 |     proxy = Proxy.createFromJson('{"proxy": "118.190.79.36:8090", "https": false, "fail_count": 0, "region": "", "anonymous": "", "source": "freeProxy14", "check_count": 4, "last_status": true, "last_time": "2021-05-26 10:58:04"}')
24 |
25 |     print("put: ", db.put(proxy))
26 |
27 |     print("get: ", db.get(https=None))
28 |
29 |     print("exists: ", db.exists("27.38.96.101:9797"))
30 |
31 |     print("exists: ", db.exists("27.38.96.101:8888"))
32 |
33 |     print("getAll: ", db.getAll(https=None))
34 |
35 |     # print("pop: ", db.pop(https=None))
36 |
37 |     print("clear: ", db.clear())
38 |
39 |     print("getCount", db.getCount())
40 |
41 |
42 | if __name__ == '__main__':
43 |     testSsdbClient()
44 |
--------------------------------------------------------------------------------
/tmp/http.txt:
--------------------------------------------------------------------------------
1 | 3.124.133.93:80
2 | 20.187.77.5:80
3 |
--------------------------------------------------------------------------------
/tmp/https.txt:
--------------------------------------------------------------------------------
1 | -1-1
--------------------------------------------------------------------------------
/tmp/mix.txt:
--------------------------------------------------------------------------------
1 | http://64.227.46.7:8080
2 | http://71.86.129.152:8080
3 |
--------------------------------------------------------------------------------
/tmp/proxiesCheckPool.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import sys
4 | import shutil
5 | from randomProxy import randomProxy
6 | from multiprocessing import Queue, Process, Manager
7 |
8 | class proxiesCheckPool(object):
9 |     def __init__(self, fpath):
10 |         self._randomProxyData = randomProxy(fpath)
11 |
12 |     # consumer task: handle items taken from the task queue
13 |     def consumer_task(self, task_queue, result_queue):
14 |         while True:
15 |             item = task_queue.get()  # take one task
16 |             if item is None:  # None marks the end of the work
17 |                 break
18 |             # run the reachability check
19 |             result = self._randomProxyData.is_network_reachable(item)
20 |             #print("consumer_task:", result)
21 |             if result.endswith("failed"):
22 |                 continue
23 |             result_queue.put(result)  # push the result into the result queue
24 |
25 |     # producer task: feed the proxies into the task queue
26 |     def producer_task(self, task_queue, num_consumers):
27 |         for item in self._randomProxyData._proxiesList:
28 |             task_queue.put(item)  # enqueue one proxy
29 |
30 |         print(f"proxies pool:{task_queue.qsize()} proxies accessible")
31 |         # enqueue one end marker per consumer
32 |         for _ in range(num_consumers):
33 |             task_queue.put(None)
34 |
35 |     # result-collector process
36 |     def result_task(self, result_queue):
37 |         count = 0
38 |         fpath = self._randomProxyData._readFile
39 |         while True:
40 |             result = result_queue.get()
41 |             if result is None:  # exit marker
42 |                 break
43 |             #print(result)
44 |             proxy = result.strip().replace(" ok", "")
45 |             res = f"{proxy}\r\n"
46 |
47 |             if res.lower().startswith("http://"):
48 |                 proxy_type = "http"
49 |             elif res.lower().startswith("https://"):
50 |                 proxy_type = "https"
51 |             elif res.lower().startswith("socks4://"):
proxy_type = "socks4" 53 | elif res.lower().startswith("socks5://"): 54 | proxy_type = "socks5" 55 | else: 56 | continue 57 | #print(res.strip()) 58 | with open(f"{self._randomProxyData._outDir}/{proxy_type}.txt", "a") as tf: 59 | tf.write(res) 60 | count += 1 61 | #print(count) 62 | #shutil.move(f"{fpath}.new", fpath) 63 | print(f"proxies pool:{count} proxies ok") 64 | 65 | def do_proxies_load(self): 66 | # 初始化任务队列和结果队列 67 | task_queue = Queue() 68 | result_queue = Queue() 69 | 70 | # 定义消费者进程数量 71 | num_consumers = 300 72 | 73 | # 创建并启动消费者进程 74 | consumers = [Process(target=self.consumer_task, args=(task_queue, result_queue)) for _ in range(num_consumers)] 75 | for consumer in consumers: 76 | consumer.start() 77 | 78 | # 启动生产者任务 79 | #self.producer_task(task_queue, num_consumers) 80 | producer_processor = Process(target=self.producer_task, args=(task_queue, num_consumers)) 81 | producer_processor.start() 82 | 83 | # 启动结果处理进程 84 | result_processor = Process(target=self.result_task, args=(result_queue,)) 85 | result_processor.start() 86 | 87 | producer_processor.join() 88 | # 等待所有消费者完成任务 89 | for consumer in consumers: 90 | consumer.join() 91 | 92 | # 发送退出信号给结果处理进程并等待其完成 93 | result_queue.put(None) 94 | result_processor.join() 95 | 96 | # 在主进程中打印所有结果 97 | #print("所有任务处理结果:", len(shared_results)) 98 | #print("结果示例:", list(shared_results)[:10]) # 打印前10个结果 99 | 100 | if __name__ == "__main__": 101 | 102 | if len(sys.argv) != 2: 103 | print(f"Usage: python3 {os.path.basename(__file__)} proxyfile") 104 | sys.exit(1) 105 | proxiesCheckPoolData = proxiesCheckPool(sys.argv[1]) 106 | proxiesCheckPoolData.do_proxies_load() 107 | -------------------------------------------------------------------------------- /tmp/randomProxy.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import json 4 | import random 5 | import requests 6 | import fileinput 7 | import ipaddress 8 | import subprocess 9 | from urllib3.exceptions import InsecureRequestWarning 10 | 11 | class randomProxy(object): 12 | def __init__(self, fpath): 13 | self._proxiesList = [] 14 | self._outDir = "../proxyList/" 15 | self._readFile = fpath 16 | self.read_proxy_info() 17 | self._test_url_list = [ 18 | 'https://ifconfig.me/ip', 19 | #'https://ipinfo.io/ip', 20 | #'https://icanhazip.com', 21 | #'https://checkip.amazonaws.com', 22 | #'https://ipecho.net/plain', 23 | #'https://www.trackip.net/ip' 24 | ] 25 | self._test_count = 3 26 | self._request_timeout = 5 27 | 28 | def read_proxy_info(self): 29 | if not os.path.exists(self._readFile): 30 | return 31 | if os.path.getsize(self._readFile) < 8: 32 | return 33 | 34 | self._proxy_type = None 35 | if self._readFile.endswith("http.txt"): 36 | self._proxy_type = "http" 37 | elif self._readFile.endswith("https.txt"): 38 | self._proxy_type = "https" 39 | elif self._readFile.endswith("socks4.txt"): 40 | self._proxy_type = "socks4" 41 | elif self._readFile.endswith("socks5.txt"): 42 | self._proxy_type = "socks5" 43 | else: 44 | self._proxy_type = None 45 | 46 | for line in fileinput.input(self._readFile): 47 | if len(line) < 9: 48 | continue 49 | if self._proxy_type is None: 50 | proxy = line.strip() 51 | else: 52 | proxy = f"{self._proxy_type}://{line.strip()}" 53 | 54 | self._proxiesList.append(proxy) 55 | 56 | def get_random_proxy(self): 57 | if len(self._proxiesList) == 0: 58 | return None 59 | return random.choice(self._proxiesList) 60 | 61 | def is_network_reachable(self, proxy): 62 | 63 | id = 0 64 | proxies = {"http": proxy, 
"https": proxy} 65 | self._test_url = random.choice(self._test_url_list) 66 | requests.packages.urllib3.disable_warnings(InsecureRequestWarning) 67 | 68 | while id < self._test_count: 69 | try: 70 | id += 1 71 | #start_time = time.time() 72 | r = requests.get(self._test_url, proxies = proxies, verify=False, timeout=self._request_timeout) 73 | #end_time = time.time() 74 | #vist_time = end_time - start_time 75 | status_code = r.status_code 76 | #print(status_code, r.text) 77 | if status_code == 200: 78 | try: 79 | #print(status_code, r.text) 80 | ip = ipaddress.ip_address(r.text) 81 | #print(ip) 82 | #print(proxy, r.status_code, id) 83 | return ("%s ok") % proxy 84 | except Exception as e: 85 | #print(proxy, r.status_code, id) 86 | #print(e) 87 | pass 88 | except Exception as e: 89 | #print(proxy, "failed", id) 90 | #print(proxy, "failed") 91 | #print(e) 92 | pass 93 | 94 | return ("%s failed") % proxy 95 | 96 | if __name__ == "__main__": 97 | randomProxyData = randomProxy("./proxies.conf") 98 | print(randomProxyData.is_network_reachable(sys.argv[1])) 99 | -------------------------------------------------------------------------------- /tmp/run.sh: -------------------------------------------------------------------------------- 1 | #set proxy 2 | export https_proxy=http://127.0.0.1:1080 3 | 4 | #download raw data 5 | #质量很差 6 | #curl https://raw.githubusercontent.com/proxifly/free-proxy-list/refs/heads/main/proxies/protocols/socks5/data.txt -o socks5.txt 7 | #curl https://raw.githubusercontent.com/proxifly/free-proxy-list/refs/heads/main/proxies/protocols/socks4/data.txt -o socks4.txt 8 | #curl https://raw.githubusercontent.com/proxifly/free-proxy-list/refs/heads/main/proxies/protocols/http/data.txt -o http.txt 9 | #curl https://raw.githubusercontent.com/proxifly/free-proxy-list/refs/heads/main/proxies/protocols/https/data.txt -o https.txt 10 | 11 | #质量好 12 | curl https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks4.txt -o socks4.txt 13 | curl https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/socks5.txt -o socks5.txt 14 | curl https://raw.githubusercontent.com/TheSpeedX/SOCKS-List/master/http.txt -o http.txt 15 | 16 | curl https://raw.githubusercontent.com/monosans/proxy-list/refs/heads/main/proxies/http.txt >>http.txt 17 | curl https://raw.githubusercontent.com/monosans/proxy-list/refs/heads/main/proxies/socks4.txt >> socks4.txt 18 | curl https://raw.githubusercontent.com/monosans/proxy-list/refs/heads/main/proxies/socks5.txt >> socks5.txt 19 | curl https://raw.githubusercontent.com/monosans/proxy-list/refs/heads/main/proxies_anonymous/http.txt >>http.txt 20 | curl https://raw.githubusercontent.com/monosans/proxy-list/refs/heads/main/proxies_anonymous/socks4.txt >>socks4.txt 21 | curl https://raw.githubusercontent.com/monosans/proxy-list/refs/heads/main/proxies_anonymous/socks5.txt >>socks5.txt 22 | 23 | curl https://raw.githubusercontent.com/mmpx12/proxy-list/refs/heads/master/http.txt >> http.txt 24 | 25 | 26 | 27 | 28 | curl 'https://proxy5.net/wp-admin/admin-ajax.php?action=proxylister_download&nonce=8b12c3524c&format=txt&filter=\{%22protocols%22:%22HTTP%22,%22latency%22:0,%22page_size%22:20,%22page%22:1\}' \ 29 | -H 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7' \ 30 | -H 'accept-language: zh-CN,zh;q=0.9' \ 31 | -H 'cache-control: no-cache' \ 32 | -H 'cookie: _ga=GA1.1.1293367504.1732002488; _gcl_au=1.1.1875742973.1732002489; _ym_uid=1732002490428729031; 
_ym_d=1732002490; _ym_isad=2; _ym_visorc=w; _ga_2ZGKN4M0P5=GS1.1.1732002488.1.0.1732002499.0.0.0' \ 33 | -H 'pragma: no-cache' \ 34 | -H 'priority: u=0, i' \ 35 | -H 'referer: https://proxy5.net/free-proxy' \ 36 | -H 'sec-ch-ua: "Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"' \ 37 | -H 'sec-ch-ua-mobile: ?0' \ 38 | -H 'sec-ch-ua-platform: "Windows"' \ 39 | -H 'sec-fetch-dest: document' \ 40 | -H 'sec-fetch-mode: navigate' \ 41 | -H 'sec-fetch-site: same-origin' \ 42 | -H 'sec-fetch-user: ?1' \ 43 | -H 'upgrade-insecure-requests: 1' \ 44 | -H 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36' >>http.txt 45 | 46 | curl 'https://proxy5.net/wp-admin/admin-ajax.php?action=proxylister_download&nonce=8b12c3524c&format=txt&filter=\{%22protocols%22:%22SOCKS4%22,%22latency%22:0,%22page_size%22:20,%22page%22:1\}' \ 47 | -H 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7' \ 48 | -H 'accept-language: zh-CN,zh;q=0.9' \ 49 | -H 'cache-control: no-cache' \ 50 | -H 'cookie: _ga=GA1.1.1293367504.1732002488; _gcl_au=1.1.1875742973.1732002489; _ym_uid=1732002490428729031; _ym_d=1732002490; _ym_isad=2; _ym_visorc=w; _ga_2ZGKN4M0P5=GS1.1.1732002488.1.0.1732002499.0.0.0' \ 51 | -H 'pragma: no-cache' \ 52 | -H 'priority: u=0, i' \ 53 | -H 'referer: https://proxy5.net/free-proxy' \ 54 | -H 'sec-ch-ua: "Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"' \ 55 | -H 'sec-ch-ua-mobile: ?0' \ 56 | -H 'sec-ch-ua-platform: "Windows"' \ 57 | -H 'sec-fetch-dest: document' \ 58 | -H 'sec-fetch-mode: navigate' \ 59 | -H 'sec-fetch-site: same-origin' \ 60 | -H 'sec-fetch-user: ?1' \ 61 | -H 'upgrade-insecure-requests: 1' \ 62 | -H 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36' >>socks4.txt 63 | 64 | curl 'https://proxy5.net/wp-admin/admin-ajax.php?action=proxylister_download&nonce=8b12c3524c&format=txt&filter=\{%22protocols%22:%22SOCKS5%22,%22latency%22:0,%22page_size%22:20,%22page%22:1\}' \ 65 | -H 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7' \ 66 | -H 'accept-language: zh-CN,zh;q=0.9' \ 67 | -H 'cache-control: no-cache' \ 68 | -H 'cookie: _ga=GA1.1.1293367504.1732002488; _gcl_au=1.1.1875742973.1732002489; _ym_uid=1732002490428729031; _ym_d=1732002490; _ym_isad=2; _ym_visorc=w; _ga_2ZGKN4M0P5=GS1.1.1732002488.1.0.1732002499.0.0.0' \ 69 | -H 'pragma: no-cache' \ 70 | -H 'priority: u=0, i' \ 71 | -H 'referer: https://proxy5.net/free-proxy' \ 72 | -H 'sec-ch-ua: "Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"' \ 73 | -H 'sec-ch-ua-mobile: ?0' \ 74 | -H 'sec-ch-ua-platform: "Windows"' \ 75 | -H 'sec-fetch-dest: document' \ 76 | -H 'sec-fetch-mode: navigate' \ 77 | -H 'sec-fetch-site: same-origin' \ 78 | -H 'sec-fetch-user: ?1' \ 79 | -H 'upgrade-insecure-requests: 1' \ 80 | -H 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36' >>socks5.txt 81 | 82 | curl https://api.openproxylist.xyz/http.txt >> http.txt 83 | curl https://api.openproxylist.xyz/socks4.txt >> socks4.txt 84 | curl https://api.openproxylist.xyz/socks5.txt >> socks5.txt 85 | 86 | 87 | curl 
'https://fineproxy.org/wp-admin/admin-ajax.php?action=proxylister_download&nonce=286663de32&format=txt&filter=%7B%22protocols%22%3A%22HTTP%22%2C%22latency%22%3A0%2C%22uptime%22%3A0%2C%22last_checked%22%3A%220%22%2C%22trp-form-language%22%3A%22cn%22%2C%22page_size%22%3A20%2C%22page%22%3A1%7D' >>http.txt 88 | curl 'https://fineproxy.org/wp-admin/admin-ajax.php?action=proxylister_download&nonce=286663de32&format=txt&filter=%7B%22protocols%22%3A%22HTTPS%22%2C%22latency%22%3A0%2C%22uptime%22%3A0%2C%22last_checked%22%3A%220%22%2C%22trp-form-language%22%3A%22cn%22%2C%22page_size%22%3A20%2C%22page%22%3A1%7D' >>https.txt 89 | curl 'https://fineproxy.org/wp-admin/admin-ajax.php?action=proxylister_download&nonce=286663de32&format=txt&filter=%7B%22protocols%22%3A%22SOCKS4%22%2C%22latency%22%3A0%2C%22uptime%22%3A0%2C%22last_checked%22%3A%220%22%2C%22trp-form-language%22%3A%22cn%22%2C%22page_size%22%3A20%2C%22page%22%3A1%7D' >>socks4.txt 90 | curl 'https://fineproxy.org/wp-admin/admin-ajax.php?action=proxylister_download&nonce=286663de32&format=txt&filter=%7B%22protocols%22%3A%22SOCKS5%22%2C%22latency%22%3A0%2C%22uptime%22%3A0%2C%22last_checked%22%3A%220%22%2C%22trp-form-language%22%3A%22cn%22%2C%22page_size%22%3A20%2C%22page%22%3A1%7D' >>socks5.txt 91 | 92 | 93 | curl 'https://api.proxyscrape.com/v4/free-proxy-list/get?request=get_proxies&skip=0&proxy_format=protocolipport&format=text&limit=1500' -H 'accept: application/json, text/plain, */*' -H 'accept-language: zh-CN,zh;q=0.9' -H 'cache-control: no-cache' -H 'origin: https://proxyscrape.com' -H 'pragma: no-cache' -H 'priority: u=1, i' -H 'referer: https://proxyscrape.com/' -H 'sec-ch-ua: "Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"' -H 'sec-ch-ua-mobile: ?0' -H 'sec-ch-ua-platform: "Windows"' -H 'sec-fetch-dest: empty' -H 'sec-fetch-mode: cors' -H 'sec-fetch-site: same-site' -H 'user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36' -o mix.txt 94 | 95 | 96 | 97 | #filter available 98 | python3 proxiesCheckPool.py http.txt 99 | python3 proxiesCheckPool.py socks4.txt 100 | python3 proxiesCheckPool.py socks5.txt 101 | python3 proxiesCheckPool.py mix.txt 102 | 103 | #remove duplicate 104 | sort -k 1 ../proxyList/http.txt | uniq > ../proxyList/http.txt.new 105 | mv ../proxyList/http.txt.new ../proxyList/http.txt 106 | sort -k 1 ../proxyList/https.txt | uniq > ../proxyList/https.txt.new 107 | mv ../proxyList/https.txt.new ../proxyList/https.txt 108 | sort -k 1 ../proxyList/socks4.txt | uniq > ../proxyList/socks4.txt.new 109 | mv ../proxyList/socks4.txt.new ../proxyList/socks4.txt 110 | sort -k 1 ../proxyList/socks5.txt | uniq > ../proxyList/socks5.txt.new 111 | mv ../proxyList/socks5.txt.new ../proxyList/socks5.txt 112 | 113 | 114 | wc -l ../proxyList/*.txt 115 | -------------------------------------------------------------------------------- /tmp/socks4.txt: -------------------------------------------------------------------------------- 1 | 3.124.133.93:80 2 | 20.187.77.5:80 3 | -------------------------------------------------------------------------------- /tmp/socks5.txt: -------------------------------------------------------------------------------- 1 | 3.124.133.93:80 2 | 20.187.77.5:80 3 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 
| File Name: __init__
5 | Description :
6 | Author : 007x
7 | date: 2020/7/6
8 | -------------------------------------------------
9 | Change Activity:
10 | 2020/7/6:
11 | -------------------------------------------------
12 | """
13 | __author__ = '007x'
14 |
--------------------------------------------------------------------------------
/util/lazyProperty.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: lazyProperty
5 | Description :
6 | Author : 007x
7 | date: 2016/12/3
8 | -------------------------------------------------
9 | Change Activity:
10 | 2016/12/3:
11 | -------------------------------------------------
12 | """
13 | __author__ = '007x'
14 |
15 |
16 | class LazyProperty(object):
17 |     """
18 |     LazyProperty
19 |     explain: http://www.spiderpy.cn/blog/5/
20 |     """
21 |
22 |     def __init__(self, func):
23 |         self.func = func
24 |
25 |     def __get__(self, instance, owner):
26 |         if instance is None:
27 |             return self
28 |         else:
29 |             # compute once, then cache the value on the instance so the
30 |             # descriptor is bypassed on every later access
31 |             value = self.func(instance)
32 |             setattr(instance, self.func.__name__, value)
33 |             return value
34 |
--------------------------------------------------------------------------------
/util/singleton.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: singleton
5 | Description :
6 | Author : 007x
7 | date: 2016/12/3
8 | -------------------------------------------------
9 | Change Activity:
10 | 2016/12/3:
11 | -------------------------------------------------
12 | """
13 | __author__ = '007x'
14 |
15 |
16 | class Singleton(type):
17 |     """
18 |     Singleton Metaclass
19 |     """
20 |
21 |     _inst = {}
22 |
23 |     def __call__(cls, *args, **kwargs):
24 |         if cls not in cls._inst:
25 |             # forward **kwargs as well; the original silently dropped them
26 |             cls._inst[cls] = super(Singleton, cls).__call__(*args, **kwargs)
27 |         return cls._inst[cls]
28 |
--------------------------------------------------------------------------------
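The two utilities above (plus withMetaclass from util/six.py, the next file) combine like this; the Config class in the sketch is illustrative, not part of the repo:

# minimal sketch: a singleton class with a lazily computed, cached attribute
from util.lazyProperty import LazyProperty
from util.singleton import Singleton
from util.six import withMetaclass


class Config(withMetaclass(Singleton)):

    @LazyProperty
    def banner(self):
        print("computed once")
        return "ProxyPool"


a, b = Config(), Config()
assert a is b          # Singleton: every call returns the same instance
print(a.banner)        # prints "computed once", then the value
print(b.banner)        # cached on the instance; the function does not rerun

Because LazyProperty only defines __get__, the attribute cached on the instance at first access shadows the descriptor afterwards.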
/util/six.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: six
5 | Description :
6 | Author : 007x
7 | date: 2020/6/22
8 | -------------------------------------------------
9 | Change Activity:
10 | 2020/6/22:
11 | -------------------------------------------------
12 | """
13 | __author__ = '007x'
14 |
15 | import sys
16 |
17 | PY2 = sys.version_info[0] == 2
18 | PY3 = sys.version_info[0] == 3
19 |
20 | if PY3:
21 |     def iteritems(d, **kw):
22 |         return iter(d.items(**kw))
23 | else:
24 |     def iteritems(d, **kw):
25 |         return d.iteritems(**kw)
26 |
27 | if PY3:
28 |     from urllib.parse import urlparse
29 | else:
30 |     from urlparse import urlparse
31 |
32 | if PY3:
33 |     # importlib.reload replaces imp.reload; the imp module was removed in Python 3.12
34 |     from importlib import reload as reload_six
35 | else:
36 |     reload_six = reload
37 |
38 | if PY3:
39 |     from queue import Empty, Queue
40 | else:
41 |     from Queue import Empty, Queue
42 |
43 |
44 | def withMetaclass(meta, *bases):
45 |     """Create a base class with a metaclass."""
46 |
47 |     # This requires a bit of explanation: the basic idea is to make a dummy
48 |     # metaclass for one level of class instantiation that replaces itself with
49 |     # the actual metaclass.
50 |     class MetaClass(meta):
51 |
52 |         def __new__(cls, name, this_bases, d):
53 |             return meta(name, bases, d)
54 |
55 |     return type.__new__(MetaClass, 'temporary_class', (), {})
56 |
--------------------------------------------------------------------------------
/util/webRequest.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: WebRequest
5 | Description : Network Requests Class
6 | Author : J_hao
7 | date: 2017/7/31
8 | -------------------------------------------------
9 | Change Activity:
10 | 2017/7/31:
11 | -------------------------------------------------
12 | """
13 | __author__ = 'J_hao'
14 |
15 | from requests.models import Response
16 | from lxml import etree
17 | import requests
18 | import random
19 | import time
20 |
21 | from handler.logHandler import LogHandler
22 |
23 | requests.packages.urllib3.disable_warnings()
24 |
25 |
26 | class WebRequest(object):
27 |     name = "web_request"
28 |
29 |     def __init__(self, *args, **kwargs):
30 |         self.log = LogHandler(self.name, file=False)
31 |         self.response = Response()
32 |
33 |     @property
34 |     def user_agent(self):
35 |         """
36 |         return a User-Agent at random
37 |         :return:
38 |         """
39 |         ua_list = [
40 |             'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101',
41 |             'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122',
42 |             'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71',
43 |             'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95',
44 |             'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71',
45 |             'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
46 |             'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
47 |             'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
48 |         ]
49 |         return random.choice(ua_list)
50 |
51 |     @property
52 |     def header(self):
53 |         """
54 |         basic header
55 |         :return:
56 |         """
57 |         return {'User-Agent': self.user_agent,
58 |                 'Accept': '*/*',
59 |                 'Connection': 'keep-alive',
60 |                 'Accept-Language': 'zh-CN,zh;q=0.8'}
61 |
62 |     def get(self, url, proxies=None, header=None, retry_time=3, retry_interval=5, timeout=5, *args, **kwargs):
63 |         """
64 |         get method
65 |         :param url: target url
66 |         :param header: headers
67 |         :param retry_time: retry time
68 |         :param retry_interval: retry interval
69 |         :param timeout: network timeout
70 |         :return:
71 |         """
72 |         headers = self.header
73 |         if header and isinstance(header, dict):
74 |             headers.update(header)
75 |         while True:
76 |             try:
77 |                 self.response = requests.get(url, proxies=proxies, headers=headers, timeout=timeout, *args, **kwargs)
78 |                 return self
79 |             except Exception as e:
80 |                 self.log.error("requests: %s error: %s" % (url, str(e)))
81 |                 retry_time -= 1
82 |                 if retry_time <= 0:
83 |                     # hand back an empty 200 response; the original built it
84 |                     # but forgot to assign it to self.response
85 |                     resp = Response()
86 |                     resp.status_code = 200
87 |                     self.response = resp
88 |                     return self
89 |                 self.log.info("retry after %s seconds" % retry_interval)
90 |                 time.sleep(retry_interval)
91 |
92 |     def post(self, url, proxies=None, header=None, data=None, retry_time=3, retry_interval=5, timeout=5, *args, **kwargs):
93 |         """
94 |         post method
95 |         :param url: target url
96 |         :param header: headers
97 |         :param data: body data
98 |         :param retry_time: retry time
99 |         :param retry_interval: retry interval
100 |         :param timeout: network timeout
101 |         :return:
102 |         """
103 |         headers = self.header
104 |         if header and isinstance(header, dict):
105 |             headers.update(header)
106 |         while True:
107 |             try:
108 |                 self.response = requests.post(url, proxies=proxies, headers=headers, data=data, timeout=timeout, *args, **kwargs)
109 |                 return self
110 |             except Exception as e:
111 |                 self.log.error("requests: %s error: %s" % (url, str(e)))
112 |                 retry_time -= 1
113 |                 if retry_time <= 0:
114 |                     # same fallback as get(): assign the empty response
115 |                     resp = Response()
116 |                     resp.status_code = 200
117 |                     self.response = resp
118 |                     return self
119 |                 self.log.info("retry after %s seconds" % retry_interval)
120 |                 time.sleep(retry_interval)
121 |
122 |     @property
123 |     def tree(self):
124 |         return etree.HTML(self.response.content)
125 |
126 |     @property
127 |     def text(self):
128 |         return self.response.text
129 |
130 |     @property
131 |     def json(self):
132 |         try:
133 |             return self.response.json()
134 |         except Exception as e:
135 |             self.log.error(str(e))
136 |             return {}
137 |
--------------------------------------------------------------------------------
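To close, a short usage sketch for WebRequest; the target URL is illustrative:

# WebRequest usage sketch: .get() returns the wrapper itself, so the parsed
# views (.tree / .text / .json) are read off the same object
from util.webRequest import WebRequest

r = WebRequest().get("http://www.example.com", retry_time=2, retry_interval=1)
print(r.response.status_code)             # underlying requests.Response
print(r.text[:80])                        # raw body text
print(r.tree.xpath("//title/text()"))     # lxml tree built from the body

Note the failure mode fixed above: once retries are exhausted, callers receive an empty response with status_code 200, so check the body before trusting a result.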