├── .github └── workflows │ ├── docker-image-latest.yml │ └── docker-image-tags.yml ├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── README.md ├── _config.yml ├── api ├── __init__.py └── proxyApi.py ├── db ├── __init__.py ├── dbClient.py ├── redisClient.py └── ssdbClient.py ├── docker-compose.yml ├── docs ├── Makefile ├── changelog.rst ├── conf.py ├── dev │ ├── ext_fetcher.rst │ ├── ext_validator.rst │ └── index.rst ├── index.rst ├── make.bat └── user │ ├── how_to_config.rst │ ├── how_to_run.rst │ ├── how_to_use.rst │ └── index.rst ├── fetcher ├── __init__.py └── proxyFetcher.py ├── handler ├── __init__.py ├── configHandler.py ├── logHandler.py └── proxyHandler.py ├── helper ├── __init__.py ├── check.py ├── fetch.py ├── launcher.py ├── proxy.py ├── scheduler.py └── validator.py ├── proxyPool.py ├── requirements.txt ├── setting.py ├── start.sh ├── test.py ├── test ├── __init__.py ├── testConfigHandler.py ├── testDbClient.py ├── testLogHandler.py ├── testProxyClass.py ├── testProxyFetcher.py ├── testProxyValidator.py ├── testRedisClient.py └── testSsdbClient.py └── util ├── __init__.py ├── lazyProperty.py ├── singleton.py ├── six.py └── webRequest.py /.github/workflows/docker-image-latest.yml: -------------------------------------------------------------------------------- 1 | name: Publish Docker image latest 2 | 3 | on: 4 | push: 5 | branches: 6 | - 'master' 7 | 8 | jobs: 9 | 10 | push_to_registry: 11 | name: Push Docker image to Docker Hub 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - name: Check out the repo 16 | uses: actions/checkout@v2 17 | 18 | - name: Log in to Docker Hub 19 | uses: docker/login-action@v1 20 | with: 21 | username: ${{ secrets.DOCKERHUB_USERNAME }} 22 | password: ${{ secrets.DOCKERHUB_TOKEN }} 23 | 24 | - name: Extract metadata (tags, labels) for Docker 25 | id: meta 26 | uses: docker/metadata-action@v3 27 | with: 28 | images: jhao104/proxy_pool 29 | 30 | - name: Build and push Docker image 31 | uses: 
docker/build-push-action@v2 32 | with: 33 | context: . 34 | push: true 35 | tags: jhao104/proxy_pool:latest 36 | -------------------------------------------------------------------------------- /.github/workflows/docker-image-tags.yml: -------------------------------------------------------------------------------- 1 | name: Publish Docker image tags 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | 10 | push_to_registry: 11 | name: Push Docker image to Docker Hub 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - name: Check out the repo 16 | uses: actions/checkout@v2 17 | 18 | - name: Log in to Docker Hub 19 | uses: docker/login-action@v1 20 | with: 21 | username: ${{ secrets.DOCKERHUB_USERNAME }} 22 | password: ${{ secrets.DOCKERHUB_TOKEN }} 23 | 24 | - name: Extract metadata (tags, labels) for Docker 25 | id: meta 26 | uses: docker/metadata-action@v3 27 | with: 28 | images: jhao104/proxy_pool 29 | 30 | - name: Build and push Docker image 31 | uses: docker/build-push-action@v2 32 | with: 33 | context: . 
34 | push: true 35 | tags: ${{ steps.meta.outputs.tags }} 36 | labels: ${{ steps.meta.outputs.labels }} 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | docs/_build 3 | *.pyc 4 | *.log 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.5" 5 | - "3.6" 6 | - "3.7" 7 | - "3.8" 8 | - "3.9" 9 | - "3.10" 10 | - "3.11" 11 | os: 12 | - linux 13 | install: 14 | - pip install -r requirements.txt 15 | 16 | script: python test.py 17 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6-alpine 2 | 3 | MAINTAINER jhao104 4 | 5 | WORKDIR /app 6 | 7 | COPY ./requirements.txt . 8 | 9 | # apk repository 10 | RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.ustc.edu.cn/g' /etc/apk/repositories 11 | 12 | # timezone 13 | RUN apk add -U tzdata && cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && apk del tzdata 14 | 15 | # runtime environment 16 | RUN apk add musl-dev gcc libxml2-dev libxslt-dev && \ 17 | pip install --no-cache-dir -r requirements.txt && \ 18 | apk del gcc musl-dev 19 | 20 | COPY . . 
21 | 22 | EXPOSE 5010 23 | 24 | ENTRYPOINT [ "sh", "start.sh" ] 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 J_hao104 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ProxyPool 爬虫代理IP池 3 | ======= 4 | [![Build Status](https://travis-ci.org/jhao104/proxy_pool.svg?branch=master)](https://travis-ci.org/jhao104/proxy_pool) 5 | [![](https://img.shields.io/badge/Powered%20by-@j_hao104-green.svg)](http://www.spiderpy.cn/blog/) 6 | [![Packagist](https://img.shields.io/packagist/l/doctrine/orm.svg)](https://github.com/jhao104/proxy_pool/blob/master/LICENSE) 7 | [![GitHub contributors](https://img.shields.io/github/contributors/jhao104/proxy_pool.svg)](https://github.com/jhao104/proxy_pool/graphs/contributors) 8 | [![](https://img.shields.io/badge/language-Python-green.svg)](https://github.com/jhao104/proxy_pool) 9 | 10 | ______ ______ _ 11 | | ___ \_ | ___ \ | | 12 | | |_/ / \__ __ __ _ __ _ | |_/ /___ ___ | | 13 | | __/| _// _ \ \ \/ /| | | || __// _ \ / _ \ | | 14 | | | | | | (_) | > < \ |_| || | | (_) | (_) || |___ 15 | \_| |_| \___/ /_/\_\ \__ |\_| \___/ \___/ \_____\ 16 | __ / / 17 | /___ / 18 | 19 | ### ProxyPool 20 | 21 | 爬虫代理IP池项目,主要功能为定时采集网上发布的免费代理验证入库,定时验证入库的代理保证代理的可用性,提供API和CLI两种使用方式。同时你也可以扩展代理源以增加代理池IP的质量和数量。 22 | 23 | * 文档: [document](https://proxy-pool.readthedocs.io/zh/latest/) [![Documentation Status](https://readthedocs.org/projects/proxy-pool/badge/?version=latest)](https://proxy-pool.readthedocs.io/zh/latest/?badge=latest) 24 | 25 | * 支持版本: [![](https://img.shields.io/badge/Python-2.7-green.svg)](https://docs.python.org/2.7/) 26 | [![](https://img.shields.io/badge/Python-3.5-blue.svg)](https://docs.python.org/3.5/) 27 | [![](https://img.shields.io/badge/Python-3.6-blue.svg)](https://docs.python.org/3.6/) 28 | [![](https://img.shields.io/badge/Python-3.7-blue.svg)](https://docs.python.org/3.7/) 29 | [![](https://img.shields.io/badge/Python-3.8-blue.svg)](https://docs.python.org/3.8/) 30 | 
[![](https://img.shields.io/badge/Python-3.9-blue.svg)](https://docs.python.org/3.9/) 31 | [![](https://img.shields.io/badge/Python-3.10-blue.svg)](https://docs.python.org/3.10/) 32 | [![](https://img.shields.io/badge/Python-3.11-blue.svg)](https://docs.python.org/3.11/) 33 | 34 | * 测试地址: http://demo.spiderpy.cn (勿压谢谢) 35 | 36 | * 付费代理推荐: [luminati-china](https://get.brightdata.com/github_jh). 国外的亮数据BrightData(以前叫luminati)被认为是代理市场领导者,覆盖全球的7200万IP,大部分是真人住宅IP,成功率扛扛的。付费套餐多种,需要高质量代理IP的可以注册后联系中文客服。[申请免费试用](https://get.brightdata.com/github_jh) 目前有50%折扣优惠活动。(PS:用不明白的同学可以参考这个[使用教程](https://www.cnblogs.com/jhao/p/15611785.html))。 37 | 38 | 39 | ### 运行项目 40 | 41 | ##### 下载代码: 42 | 43 | * git clone 44 | 45 | ```bash 46 | git clone git@github.com:jhao104/proxy_pool.git 47 | ``` 48 | 49 | * releases 50 | 51 | ```bash 52 | https://github.com/jhao104/proxy_pool/releases 下载对应zip文件 53 | ``` 54 | 55 | ##### 安装依赖: 56 | 57 | ```bash 58 | pip install -r requirements.txt 59 | ``` 60 | 61 | ##### 更新配置: 62 | 63 | 64 | ```python 65 | # setting.py 为项目配置文件 66 | 67 | # 配置API服务 68 | 69 | HOST = "0.0.0.0" # IP 70 | PORT = 5000 # 监听端口 71 | 72 | 73 | # 配置数据库 74 | 75 | DB_CONN = 'redis://:pwd@127.0.0.1:8888/0' 76 | 77 | 78 | # 配置 ProxyFetcher 79 | 80 | PROXY_FETCHER = [ 81 | "freeProxy01", # 这里是启用的代理抓取方法名,所有fetch方法位于fetcher/proxyFetcher.py 82 | "freeProxy02", 83 | # .... 
84 | ] 85 | ``` 86 | 87 | #### 启动项目: 88 | 89 | ```bash 90 | # 如果已经具备运行条件, 可用通过proxyPool.py启动。 91 | # 程序分为: schedule 调度程序 和 server Api服务 92 | 93 | # 启动调度程序 94 | python proxyPool.py schedule 95 | 96 | # 启动webApi服务 97 | python proxyPool.py server 98 | 99 | ``` 100 | 101 | ### Docker Image 102 | 103 | ```bash 104 | docker pull jhao104/proxy_pool 105 | 106 | docker run --env DB_CONN=redis://:password@ip:port/0 -p 5010:5010 jhao104/proxy_pool:latest 107 | ``` 108 | ### docker-compose 109 | 110 | 项目目录下运行: 111 | ``` bash 112 | docker-compose up -d 113 | ``` 114 | 115 | ### 使用 116 | 117 | * Api 118 | 119 | 启动web服务后, 默认配置下会开启 http://127.0.0.1:5010 的api接口服务: 120 | 121 | | api | method | Description | params| 122 | | ----| ---- | ---- | ----| 123 | | / | GET | api介绍 | None | 124 | | /get | GET | 随机获取一个代理| 可选参数: `?type=https` 过滤支持https的代理| 125 | | /pop | GET | 获取并删除一个代理| 可选参数: `?type=https` 过滤支持https的代理| 126 | | /all | GET | 获取所有代理 |可选参数: `?type=https` 过滤支持https的代理| 127 | | /count | GET | 查看代理数量 |None| 128 | | /delete | GET | 删除代理 |`?proxy=host:ip`| 129 | 130 | 131 | * 爬虫使用 132 | 133 |   如果要在爬虫代码中使用的话, 可以将此api封装成函数直接使用,例如: 134 | 135 | ```python 136 | import requests 137 | 138 | def get_proxy(): 139 | return requests.get("http://127.0.0.1:5010/get/").json() 140 | 141 | def delete_proxy(proxy): 142 | requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) 143 | 144 | # your spider code 145 | 146 | def getHtml(): 147 | # .... 
148 | retry_count = 5 149 | proxy = get_proxy().get("proxy") 150 | while retry_count > 0: 151 | try: 152 | html = requests.get('http://www.example.com', proxies={"http": "http://{}".format(proxy)}) 153 | # 使用代理访问 154 | return html 155 | except Exception: 156 | retry_count -= 1 157 | # 删除代理池中代理 158 | delete_proxy(proxy) 159 | return None 160 | ``` 161 | 162 | ### 扩展代理 163 | 164 |   项目默认包含几个免费的代理获取源,但是免费的毕竟质量有限,所以如果直接运行可能拿到的代理质量不理想。所以,提供了代理获取的扩展方法。 165 | 166 |   添加一个新的代理源方法如下: 167 | 168 | * 1、首先在[ProxyFetcher](https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/fetcher/proxyFetcher.py#L21)类中添加自定义的获取代理的静态方法, 169 | 该方法需要以生成器(yield)形式返回`host:ip`格式的代理,例如: 170 | 171 | ```python 172 | 173 | class ProxyFetcher(object): 174 | # .... 175 | 176 | # 自定义代理源获取方法 177 | @staticmethod 178 | def freeProxyCustom1(): # 命名不和已有重复即可 179 | 180 | # 通过某网站或者某接口或某数据库获取代理 181 | # 假设你已经拿到了一个代理列表 182 | proxies = ["x.x.x.x:3128", "x.x.x.x:80"] 183 | for proxy in proxies: 184 | yield proxy 185 | # 确保每个proxy都是 host:ip正确的格式返回 186 | ``` 187 | 188 | * 2、添加好方法后,修改[setting.py](https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/setting.py#L47)文件中的`PROXY_FETCHER`项: 189 | 190 |   在`PROXY_FETCHER`下添加自定义方法的名字: 191 | 192 | ```python 193 | PROXY_FETCHER = [ 194 | "freeProxy01", 195 | "freeProxy02", 196 | # .... 
197 | "freeProxyCustom1" # # 确保名字和你添加方法名字一致 198 | ] 199 | ``` 200 | 201 | 202 |   `schedule` 进程会每隔一段时间抓取一次代理,下次抓取时会自动识别调用你定义的方法。 203 | 204 | ### 免费代理源 205 | 206 | 目前实现的采集免费代理网站有(排名不分先后, 下面仅是对其发布的免费代理情况, 付费代理测评可以参考[这里](https://zhuanlan.zhihu.com/p/33576641)): 207 | 208 | | 代理名称 | 状态 | 更新速度 | 可用率 | 地址 | 代码 | 209 | |---------------| ---- | -------- | ------ | ----- |------------------------------------------------| 210 | | 站大爷 | ✔ | ★ | ** | [地址](https://www.zdaye.com/) | [`freeProxy01`](/fetcher/proxyFetcher.py#L28) | 211 | | 66代理 | ✔ | ★ | * | [地址](http://www.66ip.cn/) | [`freeProxy02`](/fetcher/proxyFetcher.py#L50) | 212 | | 开心代理 | ✔ | ★ | * | [地址](http://www.kxdaili.com/) | [`freeProxy03`](/fetcher/proxyFetcher.py#L63) | 213 | | FreeProxyList | ✔ | ★ | * | [地址](https://www.freeproxylists.net/zh/) | [`freeProxy04`](/fetcher/proxyFetcher.py#L74) | 214 | | 快代理 | ✔ | ★ | * | [地址](https://www.kuaidaili.com/) | [`freeProxy05`](/fetcher/proxyFetcher.py#L92) | 215 | | 冰凌代理 | ✔ | ★★★ | * | [地址](https://www.binglx.cn/) | [`freeProxy06`](/fetcher/proxyFetcher.py#L111) | 216 | | 云代理 | ✔ | ★ | * | [地址](http://www.ip3366.net/) | [`freeProxy07`](/fetcher/proxyFetcher.py#L123) | 217 | | 小幻代理 | ✔ | ★★ | * | [地址](https://ip.ihuan.me/) | [`freeProxy08`](/fetcher/proxyFetcher.py#L133) | 218 | | 免费代理库 | ✔ | ☆ | * | [地址](http://ip.jiangxianli.com/) | [`freeProxy09`](/fetcher/proxyFetcher.py#L143) | 219 | | 89代理 | ✔ | ☆ | * | [地址](https://www.89ip.cn/) | [`freeProxy10`](/fetcher/proxyFetcher.py#L154) | 220 | | 稻壳代理 | ✔ | ★★ | *** | [地址](https://www.docip.ne) | [`freeProxy11`](/fetcher/proxyFetcher.py#L164) | 221 | 222 | 223 | 如果还有其他好的免费代理网站, 可以在提交在[issues](https://github.com/jhao104/proxy_pool/issues/71), 下次更新时会考虑在项目中支持。 224 | 225 | ### 问题反馈 226 | 227 |   任何问题欢迎在[Issues](https://github.com/jhao104/proxy_pool/issues) 中反馈,同时也可以到我的[博客](http://www.spiderpy.cn/blog/message)中留言。 228 | 229 |   你的反馈会让此项目变得更加完美。 230 | 231 | ### 贡献代码 232 | 233 |   本项目仅作为基本的通用的代理池架构,不接收特有功能(当然,不限于特别好的idea)。 234 | 
235 |   本项目依然不够完善,如果发现bug或有新的功能添加,请在[Issues](https://github.com/jhao104/proxy_pool/issues)中提交bug(或新功能)描述,我会尽力改进,使她更加完美。 236 | 237 |   这里感谢以下contributor的无私奉献: 238 | 239 |   [@kangnwh](https://github.com/kangnwh) | [@bobobo80](https://github.com/bobobo80) | [@halleywj](https://github.com/halleywj) | [@newlyedward](https://github.com/newlyedward) | [@wang-ye](https://github.com/wang-ye) | [@gladmo](https://github.com/gladmo) | [@bernieyangmh](https://github.com/bernieyangmh) | [@PythonYXY](https://github.com/PythonYXY) | [@zuijiawoniu](https://github.com/zuijiawoniu) | [@netAir](https://github.com/netAir) | [@scil](https://github.com/scil) | [@tangrela](https://github.com/tangrela) | [@highroom](https://github.com/highroom) | [@luocaodan](https://github.com/luocaodan) | [@vc5](https://github.com/vc5) | [@1again](https://github.com/1again) | [@obaiyan](https://github.com/obaiyan) | [@zsbh](https://github.com/zsbh) | [@jiannanya](https://github.com/jiannanya) | [@Jerry12228](https://github.com/Jerry12228) 240 | 241 | 242 | ### Release Notes 243 | 244 | [changelog](https://github.com/jhao104/proxy_pool/blob/master/docs/changelog.rst) 245 | 246 | Featured|HelloGitHub 247 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /api/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py 5 | Description : 6 | Author : JHao 7 | date: 2016/12/3 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/3: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | 
-------------------------------------------------------------------------------- /api/proxyApi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | """ 4 | ------------------------------------------------- 5 | File Name: ProxyApi.py 6 | Description : WebApi 7 | Author : JHao 8 | date: 2016/12/4 9 | ------------------------------------------------- 10 | Change Activity: 11 | 2016/12/04: WebApi 12 | 2019/08/14: 集成Gunicorn启动方式 13 | 2020/06/23: 新增pop接口 14 | 2022/07/21: 更新count接口 15 | ------------------------------------------------- 16 | """ 17 | __author__ = 'JHao' 18 | 19 | import platform 20 | from werkzeug.wrappers import Response 21 | from flask import Flask, jsonify, request 22 | 23 | from util.six import iteritems 24 | from helper.proxy import Proxy 25 | from handler.proxyHandler import ProxyHandler 26 | from handler.configHandler import ConfigHandler 27 | 28 | app = Flask(__name__) 29 | conf = ConfigHandler() 30 | proxy_handler = ProxyHandler() 31 | 32 | 33 | class JsonResponse(Response): 34 | @classmethod 35 | def force_type(cls, response, environ=None): 36 | if isinstance(response, (dict, list)): 37 | response = jsonify(response) 38 | 39 | return super(JsonResponse, cls).force_type(response, environ) 40 | 41 | 42 | app.response_class = JsonResponse 43 | 44 | api_list = [ 45 | {"url": "/get", "params": "type: ''https'|''", "desc": "get a proxy"}, 46 | {"url": "/pop", "params": "", "desc": "get and delete a proxy"}, 47 | {"url": "/delete", "params": "proxy: 'e.g. 
127.0.0.1:8080'", "desc": "delete an unable proxy"}, 48 | {"url": "/all", "params": "type: ''https'|''", "desc": "get all proxy from proxy pool"}, 49 | {"url": "/count", "params": "", "desc": "return proxy count"} 50 | # 'refresh': 'refresh proxy pool', 51 | ] 52 | 53 | 54 | @app.route('/') 55 | def index(): 56 | return {'url': api_list} 57 | 58 | 59 | @app.route('/get/') 60 | def get(): 61 | https = request.args.get("type", "").lower() == 'https' 62 | proxy = proxy_handler.get(https) 63 | return proxy.to_dict if proxy else {"code": 0, "src": "no proxy"} 64 | 65 | 66 | @app.route('/pop/') 67 | def pop(): 68 | https = request.args.get("type", "").lower() == 'https' 69 | proxy = proxy_handler.pop(https) 70 | return proxy.to_dict if proxy else {"code": 0, "src": "no proxy"} 71 | 72 | 73 | @app.route('/refresh/') 74 | def refresh(): 75 | # TODO refresh会有守护程序定时执行,由api直接调用性能较差,暂不使用 76 | return 'success' 77 | 78 | 79 | @app.route('/all/') 80 | def getAll(): 81 | https = request.args.get("type", "").lower() == 'https' 82 | proxies = proxy_handler.getAll(https) 83 | return jsonify([_.to_dict for _ in proxies]) 84 | 85 | 86 | @app.route('/delete/', methods=['GET']) 87 | def delete(): 88 | proxy = request.args.get('proxy') 89 | status = proxy_handler.delete(Proxy(proxy)) 90 | return {"code": 0, "src": status} 91 | 92 | 93 | @app.route('/count/') 94 | def getCount(): 95 | proxies = proxy_handler.getAll() 96 | http_type_dict = {} 97 | source_dict = {} 98 | for proxy in proxies: 99 | http_type = 'https' if proxy.https else 'http' 100 | http_type_dict[http_type] = http_type_dict.get(http_type, 0) + 1 101 | for source in proxy.source.split('/'): 102 | source_dict[source] = source_dict.get(source, 0) + 1 103 | return {"http_type": http_type_dict, "source": source_dict, "count": len(proxies)} 104 | 105 | 106 | def runFlask(): 107 | if platform.system() == "Windows": 108 | app.run(host=conf.serverHost, port=conf.serverPort) 109 | else: 110 | import gunicorn.app.base 111 | 112 | class 
StandaloneApplication(gunicorn.app.base.BaseApplication): 113 | 114 | def __init__(self, app, options=None): 115 | self.options = options or {} 116 | self.application = app 117 | super(StandaloneApplication, self).__init__() 118 | 119 | def load_config(self): 120 | _config = dict([(key, value) for key, value in iteritems(self.options) 121 | if key in self.cfg.settings and value is not None]) 122 | for key, value in iteritems(_config): 123 | self.cfg.set(key.lower(), value) 124 | 125 | def load(self): 126 | return self.application 127 | 128 | _options = { 129 | 'bind': '%s:%s' % (conf.serverHost, conf.serverPort), 130 | 'workers': 4, 131 | 'accesslog': '-', # log to stdout 132 | 'access_log_format': '%(h)s %(l)s %(t)s "%(r)s" %(s)s "%(a)s"' 133 | } 134 | StandaloneApplication(app, _options).run() 135 | 136 | 137 | if __name__ == '__main__': 138 | runFlask() 139 | -------------------------------------------------------------------------------- /db/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py.py 5 | Description : 6 | Author : JHao 7 | date: 2016/12/2 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/2: 11 | ------------------------------------------------- 12 | """ -------------------------------------------------------------------------------- /db/dbClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | """ 4 | ------------------------------------------------- 5 | File Name: DbClient.py 6 | Description : DB工厂类 7 | Author : JHao 8 | date: 2016/12/2 9 | ------------------------------------------------- 10 | Change Activity: 11 | 2016/12/02: DB工厂类 12 | 2020/07/03: 取消raw_proxy储存 13 | ------------------------------------------------- 14 | """ 15 | __author__ = 'JHao' 16 | 17 | 
import os 18 | import sys 19 | 20 | from util.six import urlparse, withMetaclass 21 | from util.singleton import Singleton 22 | 23 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 24 | 25 | 26 | class DbClient(withMetaclass(Singleton)): 27 | """ 28 | DbClient DB工厂类 提供get/put/update/pop/delete/exists/getAll/clean/getCount/changeTable方法 29 | 30 | 31 | 抽象方法定义: 32 | get(): 随机返回一个proxy; 33 | put(proxy): 存入一个proxy; 34 | pop(): 顺序返回并删除一个proxy; 35 | update(proxy): 更新指定proxy信息; 36 | delete(proxy): 删除指定proxy; 37 | exists(proxy): 判断指定proxy是否存在; 38 | getAll(): 返回所有代理; 39 | clean(): 清除所有proxy信息; 40 | getCount(): 返回proxy统计信息; 41 | changeTable(name): 切换操作对象 42 | 43 | 44 | 所有方法需要相应类去具体实现: 45 | ssdb: ssdbClient.py 46 | redis: redisClient.py 47 | mongodb: mongodbClient.py 48 | 49 | """ 50 | 51 | def __init__(self, db_conn): 52 | """ 53 | init 54 | :return: 55 | """ 56 | self.parseDbConn(db_conn) 57 | self.__initDbClient() 58 | 59 | @classmethod 60 | def parseDbConn(cls, db_conn): 61 | db_conf = urlparse(db_conn) 62 | cls.db_type = db_conf.scheme.upper().strip() 63 | cls.db_host = db_conf.hostname 64 | cls.db_port = db_conf.port 65 | cls.db_user = db_conf.username 66 | cls.db_pwd = db_conf.password 67 | cls.db_name = db_conf.path[1:] 68 | return cls 69 | 70 | def __initDbClient(self): 71 | """ 72 | init DB Client 73 | :return: 74 | """ 75 | __type = None 76 | if "SSDB" == self.db_type: 77 | __type = "ssdbClient" 78 | elif "REDIS" == self.db_type: 79 | __type = "redisClient" 80 | else: 81 | pass 82 | assert __type, 'type error, Not support DB type: {}'.format(self.db_type) 83 | self.client = getattr(__import__(__type), "%sClient" % self.db_type.title())(host=self.db_host, 84 | port=self.db_port, 85 | username=self.db_user, 86 | password=self.db_pwd, 87 | db=self.db_name) 88 | 89 | def get(self, https, **kwargs): 90 | return self.client.get(https, **kwargs) 91 | 92 | def put(self, key, **kwargs): 93 | return self.client.put(key, **kwargs) 94 | 95 | def update(self, key, 
value, **kwargs): 96 | return self.client.update(key, value, **kwargs) 97 | 98 | def delete(self, key, **kwargs): 99 | return self.client.delete(key, **kwargs) 100 | 101 | def exists(self, key, **kwargs): 102 | return self.client.exists(key, **kwargs) 103 | 104 | def pop(self, https, **kwargs): 105 | return self.client.pop(https, **kwargs) 106 | 107 | def getAll(self, https): 108 | return self.client.getAll(https) 109 | 110 | def clear(self): 111 | return self.client.clear() 112 | 113 | def changeTable(self, name): 114 | self.client.changeTable(name) 115 | 116 | def getCount(self): 117 | return self.client.getCount() 118 | 119 | def test(self): 120 | return self.client.test() 121 | -------------------------------------------------------------------------------- /db/redisClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ----------------------------------------------------- 4 | File Name: redisClient.py 5 | Description : 封装Redis相关操作 6 | Author : JHao 7 | date: 2019/8/9 8 | ------------------------------------------------------ 9 | Change Activity: 10 | 2019/08/09: 封装Redis相关操作 11 | 2020/06/23: 优化pop方法, 改用hscan命令 12 | 2021/05/26: 区别http/https代理 13 | ------------------------------------------------------ 14 | """ 15 | __author__ = 'JHao' 16 | 17 | from redis.exceptions import TimeoutError, ConnectionError, ResponseError 18 | from redis.connection import BlockingConnectionPool 19 | from handler.logHandler import LogHandler 20 | from random import choice 21 | from redis import Redis 22 | import json 23 | 24 | 25 | class RedisClient(object): 26 | """ 27 | Redis client 28 | 29 | Redis中代理存放的结构为hash: 30 | key为ip:port, value为代理属性的字典; 31 | 32 | """ 33 | 34 | def __init__(self, **kwargs): 35 | """ 36 | init 37 | :param host: host 38 | :param port: port 39 | :param password: password 40 | :param db: db 41 | :return: 42 | """ 43 | self.name = "" 44 | kwargs.pop("username") 45 | self.__conn = 
Redis(connection_pool=BlockingConnectionPool(decode_responses=True, 46 | timeout=5, 47 | socket_timeout=5, 48 | **kwargs)) 49 | 50 | def get(self, https): 51 | """ 52 | 返回一个代理 53 | :return: 54 | """ 55 | if https: 56 | items = self.__conn.hvals(self.name) 57 | proxies = list(filter(lambda x: json.loads(x).get("https"), items)) 58 | return choice(proxies) if proxies else None 59 | else: 60 | proxies = self.__conn.hkeys(self.name) 61 | proxy = choice(proxies) if proxies else None 62 | return self.__conn.hget(self.name, proxy) if proxy else None 63 | 64 | def put(self, proxy_obj): 65 | """ 66 | 将代理放入hash, 使用changeTable指定hash name 67 | :param proxy_obj: Proxy obj 68 | :return: 69 | """ 70 | data = self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) 71 | return data 72 | 73 | def pop(self, https): 74 | """ 75 | 弹出一个代理 76 | :return: dict {proxy: value} 77 | """ 78 | proxy = self.get(https) 79 | if proxy: 80 | self.__conn.hdel(self.name, json.loads(proxy).get("proxy", "")) 81 | return proxy if proxy else None 82 | 83 | def delete(self, proxy_str): 84 | """ 85 | 移除指定代理, 使用changeTable指定hash name 86 | :param proxy_str: proxy str 87 | :return: 88 | """ 89 | return self.__conn.hdel(self.name, proxy_str) 90 | 91 | def exists(self, proxy_str): 92 | """ 93 | 判断指定代理是否存在, 使用changeTable指定hash name 94 | :param proxy_str: proxy str 95 | :return: 96 | """ 97 | return self.__conn.hexists(self.name, proxy_str) 98 | 99 | def update(self, proxy_obj): 100 | """ 101 | 更新 proxy 属性 102 | :param proxy_obj: 103 | :return: 104 | """ 105 | return self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) 106 | 107 | def getAll(self, https): 108 | """ 109 | 字典形式返回所有代理, 使用changeTable指定hash name 110 | :return: 111 | """ 112 | items = self.__conn.hvals(self.name) 113 | if https: 114 | return list(filter(lambda x: json.loads(x).get("https"), items)) 115 | else: 116 | return items 117 | 118 | def clear(self): 119 | """ 120 | 清空所有代理, 使用changeTable指定hash name 121 | :return: 122 | """ 123 
| return self.__conn.delete(self.name) 124 | 125 | def getCount(self): 126 | """ 127 | 返回代理数量 128 | :return: 129 | """ 130 | proxies = self.getAll(https=False) 131 | return {'total': len(proxies), 'https': len(list(filter(lambda x: json.loads(x).get("https"), proxies)))} 132 | 133 | def changeTable(self, name): 134 | """ 135 | 切换操作对象 136 | :param name: 137 | :return: 138 | """ 139 | self.name = name 140 | 141 | def test(self): 142 | log = LogHandler('redis_client') 143 | try: 144 | self.getCount() 145 | except TimeoutError as e: 146 | log.error('redis connection time out: %s' % str(e), exc_info=True) 147 | return e 148 | except ConnectionError as e: 149 | log.error('redis connection error: %s' % str(e), exc_info=True) 150 | return e 151 | except ResponseError as e: 152 | log.error('redis connection error: %s' % str(e), exc_info=True) 153 | return e 154 | 155 | 156 | -------------------------------------------------------------------------------- /db/ssdbClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | """ 4 | ------------------------------------------------- 5 | File Name: ssdbClient.py 6 | Description : 封装SSDB操作 7 | Author : JHao 8 | date: 2016/12/2 9 | ------------------------------------------------- 10 | Change Activity: 11 | 2016/12/2: 12 | 2017/09/22: PY3中 redis-py返回的数据是bytes型 13 | 2017/09/27: 修改pop()方法 返回{proxy:value}字典 14 | 2020/07/03: 2.1.0 优化代码结构 15 | 2021/05/26: 区分http和https代理 16 | ------------------------------------------------- 17 | """ 18 | __author__ = 'JHao' 19 | from redis.exceptions import TimeoutError, ConnectionError, ResponseError 20 | from redis.connection import BlockingConnectionPool 21 | from handler.logHandler import LogHandler 22 | from random import choice 23 | from redis import Redis 24 | import json 25 | 26 | 27 | class SsdbClient(object): 28 | """ 29 | SSDB client 30 | 31 | SSDB中代理存放的结构为hash: 32 | key为代理的ip:por, value为代理属性的字典; 33 | 
""" 34 | 35 | def __init__(self, **kwargs): 36 | """ 37 | init 38 | :param host: host 39 | :param port: port 40 | :param password: password 41 | :return: 42 | """ 43 | self.name = "" 44 | kwargs.pop("username") 45 | self.__conn = Redis(connection_pool=BlockingConnectionPool(decode_responses=True, 46 | timeout=5, 47 | socket_timeout=5, 48 | **kwargs)) 49 | 50 | def get(self, https): 51 | """ 52 | 从hash中随机返回一个代理 53 | :return: 54 | """ 55 | if https: 56 | items_dict = self.__conn.hgetall(self.name) 57 | proxies = list(filter(lambda x: json.loads(x).get("https"), items_dict.values())) 58 | return choice(proxies) if proxies else None 59 | else: 60 | proxies = self.__conn.hkeys(self.name) 61 | proxy = choice(proxies) if proxies else None 62 | return self.__conn.hget(self.name, proxy) if proxy else None 63 | 64 | def put(self, proxy_obj): 65 | """ 66 | 将代理放入hash 67 | :param proxy_obj: Proxy obj 68 | :return: 69 | """ 70 | result = self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) 71 | return result 72 | 73 | def pop(self, https): 74 | """ 75 | 顺序弹出一个代理 76 | :return: proxy 77 | """ 78 | proxy = self.get(https) 79 | if proxy: 80 | self.__conn.hdel(self.name, json.loads(proxy).get("proxy", "")) 81 | return proxy if proxy else None 82 | 83 | def delete(self, proxy_str): 84 | """ 85 | 移除指定代理, 使用changeTable指定hash name 86 | :param proxy_str: proxy str 87 | :return: 88 | """ 89 | self.__conn.hdel(self.name, proxy_str) 90 | 91 | def exists(self, proxy_str): 92 | """ 93 | 判断指定代理是否存在, 使用changeTable指定hash name 94 | :param proxy_str: proxy str 95 | :return: 96 | """ 97 | return self.__conn.hexists(self.name, proxy_str) 98 | 99 | def update(self, proxy_obj): 100 | """ 101 | 更新 proxy 属性 102 | :param proxy_obj: 103 | :return: 104 | """ 105 | self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) 106 | 107 | def getAll(self, https): 108 | """ 109 | 字典形式返回所有代理, 使用changeTable指定hash name 110 | :return: 111 | """ 112 | item_dict = self.__conn.hgetall(self.name) 113 | if 
https: 114 | return list(filter(lambda x: json.loads(x).get("https"), item_dict.values())) 115 | else: 116 | return item_dict.values() 117 | 118 | def clear(self): 119 | """ 120 | 清空所有代理, 使用changeTable指定hash name 121 | :return: 122 | """ 123 | return self.__conn.delete(self.name) 124 | 125 | def getCount(self): 126 | """ 127 | 返回代理数量 128 | :return: 129 | """ 130 | proxies = self.getAll(https=False) 131 | return {'total': len(proxies), 'https': len(list(filter(lambda x: json.loads(x).get("https"), proxies)))} 132 | 133 | def changeTable(self, name): 134 | """ 135 | 切换操作对象 136 | :param name: 137 | :return: 138 | """ 139 | self.name = name 140 | 141 | def test(self): 142 | log = LogHandler('ssdb_client') 143 | try: 144 | self.getCount() 145 | except TimeoutError as e: 146 | log.error('ssdb connection time out: %s' % str(e), exc_info=True) 147 | return e 148 | except ConnectionError as e: 149 | log.error('ssdb connection error: %s' % str(e), exc_info=True) 150 | return e 151 | except ResponseError as e: 152 | log.error('ssdb connection error: %s' % str(e), exc_info=True) 153 | return e 154 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | proxy_pool: 4 | build: . 5 | container_name: proxy_pool 6 | ports: 7 | - "5010:5010" 8 | links: 9 | - proxy_redis 10 | environment: 11 | DB_CONN: "redis://@proxy_redis:6379/0" 12 | proxy_redis: 13 | image: "redis" 14 | container_name: proxy_redis -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 
9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | .. _changelog: 2 | 3 | ChangeLog 4 | ========== 5 | 6 | 2.4.2 (2024-01-18) 7 | ------------------ 8 | 9 | 1. 代理格式检查支持需认证的代理格式 `username:password@ip:port` ; (2023-03-10) 10 | 2. 新增代理源 **稻壳代理**; (2023-05-15) 11 | 3. 新增代理源 **冰凌代理**; (2023-01-18) 12 | 13 | 2.4.1 (2022-07-17) 14 | ------------------ 15 | 16 | 1. 新增代理源 **FreeProxyList**; (2022-07-21) 17 | 2. 新增代理源 **FateZero**; (2022-08-01) 18 | 3. 新增代理属性 ``region``; (2022-08-16) 19 | 20 | 2.4.0 (2021-11-17) 21 | ------------------ 22 | 23 | 1. 移除无效代理源 **神鸡代理**; (2021-11-16) 24 | 2. 移除无效代理源 **极速代理**; (2021-11-16) 25 | 3. 移除代理源 **西拉代理**; (2021-11-16) 26 | 4. 新增代理源 **蝶鸟IP**; (2021-11-16) 27 | 5. 新增代理源 **PROXY11**; (2021-11-16) 28 | 6. 多线程采集代理; (2021-11-17) 29 | 30 | 2.3.0 (2021-05-27) 31 | ------------------ 32 | 33 | 1. 修复Dockerfile时区问题; (2021-04-12) 34 | 2. 新增Proxy属性 ``source``, 标记代理来源; (2021-04-13) 35 | 3. 新增Proxy属性 ``https``, 标记支持https的代理; (2021-05-27) 36 | 37 | 2.2.0 (2021-04-08) 38 | ------------------ 39 | 40 | 1. 启动时检查数据库连通性; 41 | 2. 新增免费代理源 **米扑代理**; 42 | 3. 新增免费代理源 **Pzzqz**; 43 | 4. 新增免费代理源 **神鸡代理**; 44 | 5. 新增免费代理源 **极速代理**; 45 | 6. 新增免费代理源 **小幻代理**; 46 | 47 | 2.1.1 (2021-02-23) 48 | ------------------ 49 | 50 | 1. Fix Bug `#493`_, 新增时区配置; (2020-08-12) 51 | 2. 修复 **66代理** 采集; (2020-11-04) 52 | 3. 修复 **全网代理** 采集, 解决HTML端口加密问题; (2020-11-04) 53 | 4. 
新增 **代理盒子** 免费源; (2020-11-04) 54 | 5. 新增 ``POOL_SIZE_MIN`` 配置项, runProxyCheck时, 剩余代理少于POOL_SIZE_MIN触发抓取; (2021-02-23) 55 | 56 | .. _#493: https://github.com/jhao104/proxy_pool/issues/493 57 | 58 | 2.1.0 (2020.07) 59 | ------------------ 60 | 61 | 1. 新增免费代理源 **西拉代理** (2020-03-30) 62 | 2. Fix Bug `#356`_ `#401`_ 63 | 3. 优化Docker镜像体积; (2020-06-19) 64 | 4. 优化配置方式; 65 | 5. 优化代码结构; 66 | 6. 不再储存raw_proxy, 抓取后直接验证入库; 67 | 68 | .. _#401: https://github.com/jhao104/proxy_pool/issues/401 69 | .. _#356: https://github.com/jhao104/proxy_pool/issues/356 70 | 71 | 2.0.1 (2019.10) 72 | ----------------- 73 | 74 | 1. 新增免费代理源 **89免费代理**; 75 | #. 新增免费代理源 **齐云代理** 76 | 77 | 2.0.0 (2019.08) 78 | ------------------ 79 | 80 | 1. WebApi集成Gunicorn方式启动, Windows平台暂不支持; 81 | #. 优化Proxy调度程序; 82 | #. 扩展Proxy属性; 83 | #. 新增cli工具, 更加方便启动proxyPool 84 | 85 | 1.14 (2019.07) 86 | ----------------- 87 | 88 | 1. 修复 Queue阻塞导致的 ``ProxyValidSchedule`` 假死bug; 89 | #. 修改代理源 **云代理** 抓取; 90 | #. 修改代理源 **码农代理** 抓取; 91 | #. 修改代理源 **代理66** 抓取, 引入 ``PyExecJS`` 模块破解加速乐动态Cookies加密; 92 | 93 | 1.13 (2019.02) 94 | ----------------- 95 | 96 | 1. 使用.py文件替换.ini作为配置文件; 97 | 98 | #. 优化代理采集部分; 99 | 100 | 1.12 (2018.04) 101 | ----------------- 102 | 103 | 1. 优化代理格式检查; 104 | 105 | #. 增加代理源; 106 | 107 | #. fix bug `#122`_ `#126`_ 108 | 109 | .. _#122: https://github.com/jhao104/proxy_pool/issues/122 110 | .. _#126: https://github.com/jhao104/proxy_pool/issues/126 111 | 112 | 1.11 (2017.08) 113 | ----------------- 114 | 115 | 1. 使用多线程验证useful_pool; 116 | 117 | 1.10 (2016.11) 118 | ----------------- 119 | 120 | 1. 第一版; 121 | 122 | #. 支持PY2/PY3; 123 | 124 | #. 代理池基本功能; 125 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | import sphinx_rtd_theme 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'ProxyPool' 21 | copyright = '2020, jhao104' 22 | author = 'jhao104' 23 | 24 | master_doc = 'index' 25 | 26 | # The full version, including alpha/beta/rc tags 27 | release = '2.1.0' 28 | 29 | # -- General configuration --------------------------------------------------- 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | ] 36 | 37 | # If true, sectionauthor and moduleauthor directives will be shown in the 38 | # output. They are ignored by default. 39 | show_authors = False 40 | 41 | # The name of the Pygments (syntax highlighting) style to use. 42 | pygments_style = "sphinx" 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ['_templates'] 46 | 47 | # The language for content autogenerated by Sphinx. Refer to documentation 48 | # for a list of supported languages. 49 | # 50 | # This is also used if you do content translation via gettext catalogs. 51 | # Usually you set "language" from the command line for these cases. 52 | language = 'zh_CN' 53 | 54 | # List of patterns, relative to source directory, that match files and 55 | # directories to ignore when looking for source files. 
56 | # This pattern also affects html_static_path and html_extra_path. 57 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 58 | 59 | # -- Options for HTML output ------------------------------------------------- 60 | 61 | # The theme to use for HTML and HTML Help pages. See the documentation for 62 | # a list of builtin themes. 63 | # 64 | html_theme = 'sphinx_rtd_theme' 65 | 66 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 67 | 68 | # Add any paths that contain custom static files (such as style sheets) here, 69 | # relative to this directory. They are copied after the builtin static files, 70 | # so a file named "default.css" will overwrite the builtin "default.css". 71 | html_static_path = ['_static'] 72 | -------------------------------------------------------------------------------- /docs/dev/ext_fetcher.rst: -------------------------------------------------------------------------------- 1 | .. ext_fetcher 2 | 3 | 扩展代理源 4 | ----------- 5 | 6 | 项目默认包含几个免费的代理获取源,但是免费的毕竟质量有限,如果直接运行可能拿到的代理质量不理想。因此提供了用户自定义扩展代理获取的方法。 7 | 8 | 如果要添加一个新的代理获取方法, 过程如下: 9 | 10 | 1. 首先在 `ProxyFetcher`_ 类中添加自定义的获取代理的静态方法,该方法需要以生成器(yield)形式返回 ``host:ip`` 格式的代理字符串, 例如: 11 | 12 | .. code-block:: python 13 | 14 | class ProxyFetcher(object): 15 | # .... 16 | # 自定义代理源获取方法 17 | @staticmethod 18 | def freeProxyCustom01(): # 命名不和已有重复即可 19 | # 通过某网站或者某接口或某数据库获取代理 20 | # 假设你已经拿到了一个代理列表 21 | proxies = ["x.x.x.x:3128", "x.x.x.x:80"] 22 | for proxy in proxies: 23 | yield proxy 24 | # 确保每个proxy都是 host:ip正确的格式返回 25 | 26 | 2. 添加好方法后,修改配置文件 `setting.py`_ 中的 ``PROXY_FETCHER`` 项, 加入刚才添加的自定义方法的名字: 27 | 28 | .. code-block:: python 29 | 30 | PROXY_FETCHER = [ 31 | # .... 32 | "freeProxyCustom01" # # 确保名字和你添加方法名字一致 33 | ] 34 | 35 | .. _ProxyFetcher: https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/fetcher/proxyFetcher.py#L20 36 | .. 
_setting.py: https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/setting.py#L47 -------------------------------------------------------------------------------- /docs/dev/ext_validator.rst: -------------------------------------------------------------------------------- 1 | .. ext_validator 2 | 3 | 代理校验 4 | ----------- 5 | 6 | 内置校验 7 | >>>>>>>>> 8 | 9 | 项目中使用的代理校验方法全部定义在 `validator.py`_ 中, 通过 `ProxyValidator`_ 类中提供的装饰器来区分。校验方法返回 ``True`` 表示 10 | 校验通过, 返回 ``False`` 表示校验不通过。 11 | 12 | * 代理校验方法分为三类: ``preValidator`` 、 ``httpValidator`` 、 ``httpsValidator``: 13 | 14 | * **preValidator**: 预校验,在代理抓取后验证前调用,目前实现了 `formatValidator`_ 校验代理IP格式是否合法; 15 | * **httpValidator**: 代理可用性校验,通过则认为代理可用, 目前实现了 `httpTimeOutValidator`_ 校验; 16 | * **httpsValidator**: 校验代理是否支持https,目前实现了 `httpsTimeOutValidator`_ 校验。 17 | 18 | 19 | .. _validator.py: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py 20 | .. _ProxyValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L29 21 | .. _formatValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L51 22 | .. _httpTimeOutValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L58 23 | .. _httpsTimeOutValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L71 24 | 25 | 每种校验可以定义多个方法,只有 **所有** 方法都返回 ``True`` 的情况下才视为该校验通过,校验方法执行顺序为: 先执行 **httpValidator** , 前者通过后再执行 **httpsValidator** 。 26 | 只有 `preValidator` 校验通过的代理才会进入可用性校验, `httpValidator` 校验通过后认为代理可用准备更新入代理池, `httpValidator` 校验通过后视为代理支持https更新代理的 `https` 属性为 `True` 。 27 | 28 | 扩展校验 29 | >>>>>>>>> 30 | 31 | 在 `validator.py`_ 已有自定义校验的示例,自定义函数需返回True或者False,使用 `ProxyValidator`_ 中提供的装饰器来区分校验类型。 下面是两个例子: 32 | 33 | * 1. 自定义一个代理可用性的校验(``addHttpValidator``): 34 | 35 | .. 
code-block:: python 36 | 37 | @ProxyValidator.addHttpValidator 38 | def customValidatorExample01(proxy): 39 | """自定义代理可用性校验函数""" 40 | proxies = {"http": "http://{proxy}".format(proxy=proxy)} 41 | try: 42 | r = requests.get("http://www.baidu.com/", headers=HEADER, proxies=proxies, timeout=5) 43 | return True if r.status_code == 200 and len(r.content) > 200 else False 44 | except Exception as e: 45 | return False 46 | 47 | * 2. 自定义一个代理是否支持https的校验(``addHttpsValidator``): 48 | 49 | .. code-block:: python 50 | 51 | @ProxyValidator.addHttpsValidator 52 | def customValidatorExample02(proxy): 53 | """自定义代理是否支持https校验函数""" 54 | proxies = {"https": "https://{proxy}".format(proxy=proxy)} 55 | try: 56 | r = requests.get("https://www.baidu.com/", headers=HEADER, proxies=proxies, timeout=5, verify=False) 57 | return True if r.status_code == 200 and len(r.content) > 200 else False 58 | except Exception as e: 59 | return False 60 | 61 | 注意,比如在运行代理可用性校验时,所有被 ``ProxyValidator.addHttpValidator`` 装饰的函数会被依次按定义顺序执行,只有当所有函数都返回True时才会判断代理可用。 ``HttpsValidator`` 运行机制也是如此。 62 | -------------------------------------------------------------------------------- /docs/dev/index.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | 开发指南 3 | ========= 4 | 5 | .. module:: dev 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | 10 | ext_fetcher 11 | ext_validator 12 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. ProxyPool documentation master file, created by 2 | sphinx-quickstart on Wed Jul 8 16:13:42 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 
5 | 6 | ProxyPool 7 | ===================================== 8 | 9 | :: 10 | 11 | **************************************************************** 12 | *** ______ ********************* ______ *********** _ ******** 13 | *** | ___ \_ ******************** | ___ \ ********* | | ******** 14 | *** | |_/ / \__ __ __ _ __ _ | |_/ /___ * ___ | | ******** 15 | *** | __/| _// _ \ \ \/ /| | | || __// _ \ / _ \ | | ******** 16 | *** | | | | | (_) | > < \ |_| || | | (_) | (_) || |___ **** 17 | *** \_| |_| \___/ /_/\_\ \__ |\_| \___/ \___/ \_____/ **** 18 | **** __ / / ***** 19 | ************************* /___ / ******************************* 20 | ************************* ******************************** 21 | **************************************************************** 22 | 23 | Python爬虫代理IP池 24 | 25 | 安装 26 | ----- 27 | 28 | * 下载代码 29 | 30 | .. code-block:: console 31 | 32 | $ git clone git@github.com:jhao104/proxy_pool.git 33 | 34 | * 安装依赖 35 | 36 | .. code-block:: console 37 | 38 | $ pip install -r requirements.txt 39 | 40 | * 更新配置 41 | 42 | .. code-block:: python 43 | 44 | HOST = "0.0.0.0" 45 | PORT = 5000 46 | 47 | DB_CONN = 'redis://@127.0.0.1:8888' 48 | 49 | PROXY_FETCHER = [ 50 | "freeProxy01", 51 | "freeProxy02", 52 | # .... 53 | ] 54 | 55 | * 启动项目 56 | 57 | .. code-block:: console 58 | 59 | $ python proxyPool.py schedule 60 | $ python proxyPool.py server 61 | 62 | 使用 63 | ______ 64 | 65 | * API 66 | 67 | ============ ======== ================ ============== 68 | Api Method Description Params 69 | ============ ======== ================ ============== 70 | / GET API介绍 无 71 | /get GET 返回一个代理 可选参数: `?type=https` 过滤支持https的代理 72 | /pop GET 返回并删除一个代理 可选参数: `?type=https` 过滤支持https的代理 73 | /all GET 返回所有代理 可选参数: `?type=https` 过滤支持https的代理 74 | /count GET 返回代理数量 无 75 | /delete GET 删除指定代理 `?proxy=host:ip` 76 | ============ ======== ================ ============== 77 | 78 | 79 | * 爬虫 80 | 81 | .. 
code-block:: python 82 | 83 | import requests 84 | 85 | def get_proxy(): 86 | return requests.get("http://127.0.0.1:5010/get?type=https").json() 87 | 88 | def delete_proxy(proxy): 89 | requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) 90 | 91 | # your spider code 92 | 93 | def getHtml(): 94 | # .... 95 | retry_count = 5 96 | proxy = get_proxy().get("proxy") 97 | while retry_count > 0: 98 | try: 99 | html = requests.get('https://www.example.com', proxies={"http": "http://{}".format(proxy), "https": "https://{}".format(proxy)}) 100 | # 使用代理访问 101 | return html 102 | except Exception: 103 | retry_count -= 1 104 | # 删除代理池中代理 105 | delete_proxy(proxy) 106 | return None 107 | 108 | Contents 109 | -------- 110 | 111 | .. toctree:: 112 | :maxdepth: 2 113 | 114 | user/index 115 | dev/index 116 | changelog 117 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/user/how_to_config.rst: -------------------------------------------------------------------------------- 1 | .. how_to_config 2 | 3 | 配置参考 4 | --------- 5 | 6 | 配置文件 ``setting.py`` 位于项目的主目录下, 配置主要分为四类: **服务配置** 、 **数据库配置** 、 **采集配置** 、 **校验配置**. 7 | 8 | 服务配置 9 | >>>>>>>>> 10 | 11 | * ``HOST`` 12 | 13 | API服务监听的IP, 本机访问设置为 ``127.0.0.1``, 开启远程访问设置为: ``0.0.0.0``. 14 | 15 | * ``PORT`` 16 | 17 | API服务监听的端口. 18 | 19 | 数据库配置 20 | >>>>>>>>>>> 21 | 22 | * ``DB_CONN`` 23 | 24 | 用户存放代理IP的数据库URI, 配置格式为: ``db_type://[[user]:[pwd]]@ip:port/[db]``. 25 | 26 | 目前支持的db_type有: ``ssdb`` 、 ``redis``. 27 | 28 | 配置示例: 29 | 30 | .. code-block:: python 31 | 32 | # SSDB IP: 127.0.0.1 Port: 8888 33 | DB_CONN = 'ssdb://@127.0.0.1:8888' 34 | # SSDB IP: 127.0.0.1 Port: 8899 Password: 123456 35 | DB_CONN = 'ssdb://:123456@127.0.0.1:8888' 36 | 37 | # Redis IP: 127.0.0.1 Port: 6379 38 | DB_CONN = 'redis://@127.0.0.1:6379' 39 | # Redis IP: 127.0.0.1 Port: 6379 Password: 123456 40 | DB_CONN = 'redis://:123456@127.0.0.1:6379' 41 | # Redis IP: 127.0.0.1 Port: 6379 Password: 123456 DB: 15 42 | DB_CONN = 'redis://:123456@127.0.0.1:6379/15' 43 | 44 | 45 | * ``TABLE_NAME`` 46 | 47 | 存放代理的数据载体名称, ssdb和redis的存放结构为hash. 48 | 49 | 采集配置 50 | >>>>>>>>> 51 | 52 | * ``PROXY_FETCHER`` 53 | 54 | 启用的代理采集方法名, 代理采集方法位于 ``fetcher/proxyFetcher.py`` 类中. 55 | 56 | 由于各个代理源的稳定性不容易掌握, 当某个代理采集方法失效时, 可以该配置中注释掉其名称. 57 | 58 | 如果有增加某些代理采集方法, 也请在该配置中添加其方法名, 具体请参考 :doc:`/dev/extend_fetcher`. 59 | 60 | 调度程序每次执行采集任务时都会再次加载该配置, 保证每次运行的采集方法都是有效的. 
61 | 62 | 校验配置 63 | >>>>>>>>> 64 | 65 | * ``HTTP_URL`` 66 | 67 | 用于检验代理是否可用的地址, 默认为 ``http://httpbin.org``, 可根据使用场景修改为其他地址. 68 | 69 | * ``HTTPS_URL`` 70 | 71 | 用于检验代理是否支持HTTPS的地址, 默认为 ``https://www.qq.com``, 可根据使用场景修改为其他地址. 72 | 73 | * ``VERIFY_TIMEOUT`` 74 | 75 | 检验代理的超时时间, 默认为 ``10`` , 单位秒. 使用代理访问 ``HTTP(S)_URL`` 耗时超过 ``VERIFY_TIMEOUT`` 时, 视为代理不可用. 76 | 77 | * ``MAX_FAIL_COUNT`` 78 | 79 | 检验代理允许最大失败次数, 默认为 ``0``, 即出错一次即删除. 80 | 81 | * ``POOL_SIZE_MIN`` 82 | 83 | 代理检测定时任务运行前若代理数量小于 `POOL_SIZE_MIN`, 则先运行抓取程序. -------------------------------------------------------------------------------- /docs/user/how_to_run.rst: -------------------------------------------------------------------------------- 1 | .. how_to_run 2 | 3 | 4 | 如何运行 5 | --------- 6 | 7 | 下载代码 8 | >>>>>>>>> 9 | 10 | 本项目需要下载代码到本地运行, 通过 ``git`` 下载: 11 | 12 | .. code-block:: console 13 | 14 | $ git clone git@github.com:jhao104/proxy_pool.git 15 | 16 | 或者下载特定的 ``release`` 版本: 17 | 18 | .. code-block:: console 19 | 20 | https://github.com/jhao104/proxy_pool/releases 21 | 22 | 安装依赖 23 | >>>>>>>>> 24 | 25 | 到项目目录下使用 ``pip`` 安装依赖库: 26 | 27 | .. code-block:: console 28 | 29 | $ pip install -r requirements.txt 30 | 31 | 32 | 更新配置 33 | >>>>>>>>> 34 | 35 | 配置文件 ``setting.py`` 位于项目的主目录下: 36 | 37 | .. code-block:: python 38 | 39 | # 配置API服务 40 | 41 | HOST = "0.0.0.0" # IP 42 | PORT = 5000 # 监听端口 43 | 44 | # 配置数据库 45 | 46 | DB_CONN = 'redis://@127.0.0.1:8888/0' 47 | 48 | # 配置 ProxyFetcher 49 | 50 | PROXY_FETCHER = [ 51 | "freeProxy01", # 这里是启用的代理抓取方法,所有fetch方法位于fetcher/proxyFetcher.py 52 | "freeProxy02", 53 | # .... 54 | ] 55 | 56 | 更多配置请参考 :doc:`/user/how_to_config` 57 | 58 | 启动项目 59 | >>>>>>>>> 60 | 61 | 如果已配置好运行环境, 具备运行条件, 可以通过 ``proxyPool.py`` 启动. ``proxyPool.py`` 是项目的CLI入口. 62 | 完整程序包含两部份: ``schedule`` 调度程序和 ``server`` API服务, 调度程序负责采集和验证代理, API服务提供代理服务HTTP接口. 63 | 64 | 通过命令行程序分别启动调度程序和API服务: 65 | 66 | .. 
code-block:: console 67 | 68 | # 启动调度程序 69 | $ python proxyPool.py schedule 70 | 71 | # 启动webApi服务 72 | $ python proxyPool.py server 73 | 74 | -------------------------------------------------------------------------------- /docs/user/how_to_use.rst: -------------------------------------------------------------------------------- 1 | .. how_to_use 2 | 3 | 如何使用 4 | ---------- 5 | 6 | 爬虫代码要对接代理池目前有两种方式: 一是通过调用API接口使用, 二是直接读取数据库. 7 | 8 | 调用API 9 | >>>>>>>>> 10 | 11 | 启动ProxyPool的 ``server`` 后会提供如下几个http接口: 12 | 13 | ============ ======== ================ ============== 14 | Api Method Description Arg 15 | ============ ======== ================ ============== 16 | / GET API介绍 无 17 | /get GET 随机返回一个代理 无 18 | /get_all GET 返回所有代理 无 19 | /get_status GET 返回代理数量 无 20 | /delete GET 删除指定代理 proxy=host:ip 21 | ============ ======== ================ ============== 22 | 23 | 在代码中可以通过封装上面的API接口来使用代理, 例子: 24 | 25 | .. code-block:: python 26 | 27 | import requests 28 | 29 | def get_proxy(): 30 | return requests.get("http://127.0.0.1:5010/get/").json() 31 | 32 | def delete_proxy(proxy): 33 | requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) 34 | 35 | # your spider code 36 | 37 | def getHtml(): 38 | # .... 39 | retry_count = 5 40 | proxy = get_proxy().get("proxy") 41 | while retry_count > 0: 42 | try: 43 | # 使用代理访问 44 | html = requests.get('http://www.example.com', proxies={"http": "http://{}".format(proxy)}) 45 | return html 46 | except Exception: 47 | retry_count -= 1 48 | # 删除代理池中代理 49 | delete_proxy(proxy) 50 | return None 51 | 52 | 本例中我们在本地 ``127.0.0.1`` 启动端口为 ``5010`` 的 ``server``, 使用 ``/get`` 接口获取代理, ``/delete`` 删除代理. 53 | 54 | 读数据库 55 | >>>>>>>>> 56 | 57 | 目前支持配置两种数据库: ``REDIS`` 、 ``SSDB``. 58 | 59 | * **REDIS** 储存结构为 ``hash``, hash name为配置项中的 **TABLE_NAME** 60 | 61 | * **SSDB** 储存结构为 ``hash``, hash name为配置项中的 **TABLE_NAME** 62 | 63 | 可以在代码中自行读取. 
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File Name:   proxyFetcher
   Description: collection of free proxy sources; each fetcher yields
                "host:port" proxy strings
   Author :     JHao
   date:        2016/11/25
-------------------------------------------------
   Change Activity:
                2016/11/25: proxyFetcher
-------------------------------------------------
"""
__author__ = 'JHao'

import re
import json
from time import sleep

from util.webRequest import WebRequest


class ProxyFetcher(object):
    """
    proxy getter: each freeProxyXX staticmethod is a generator that scrapes
    one public source and yields "host:port" strings.
    """

    @staticmethod
    def freeProxy01():
        """
        Zdaye (站大爷) https://www.zdaye.com/dayProxy.html
        """
        start_url = "https://www.zdaye.com/dayProxy.html"
        html_tree = WebRequest().get(start_url, verify=False).tree
        latest_page_time = html_tree.xpath("//span[@class='thread_time_info']/text()")[0].strip()
        from datetime import datetime
        interval = datetime.now() - datetime.strptime(latest_page_time, "%Y/%m/%d %H:%M:%S")
        # BUG FIX: was `interval.seconds < 300` -- timedelta.seconds ignores the
        # .days component, so a post from days ago could look "fresh".
        # total_seconds() is the real elapsed time.
        if interval.total_seconds() < 300:  # only scrape posts updated within 5 minutes
            target_url = "https://www.zdaye.com/" + html_tree.xpath("//h3[@class='thread_title']/a/@href")[0].strip()
            while target_url:
                _tree = WebRequest().get(target_url, verify=False).tree
                for tr in _tree.xpath("//table//tr"):
                    ip = "".join(tr.xpath("./td[1]/text()")).strip()
                    port = "".join(tr.xpath("./td[2]/text()")).strip()
                    yield "%s:%s" % (ip, port)
                # follow the "next page" link until there is none
                next_page = _tree.xpath("//div[@class='page']/a[@title='下一页']/@href")
                target_url = "https://www.zdaye.com/" + next_page[0].strip() if next_page else False
                sleep(5)

    @staticmethod
    def freeProxy02():
        """
        Proxy 66 http://www.66ip.cn/
        """
        url = "http://www.66ip.cn/"
        resp = WebRequest().get(url, timeout=10).tree
        for i, tr in enumerate(resp.xpath("(//table)[3]//tr")):
            if i > 0:  # skip the header row
                ip = "".join(tr.xpath("./td[1]/text()")).strip()
                port = "".join(tr.xpath("./td[2]/text()")).strip()
                yield "%s:%s" % (ip, port)

    @staticmethod
    def freeProxy03():
        """ Kxdaili (开心代理) http://www.kxdaili.com """
        target_urls = ["http://www.kxdaili.com/dailiip.html", "http://www.kxdaili.com/dailiip/2/1.html"]
        for url in target_urls:
            tree = WebRequest().get(url).tree
            for tr in tree.xpath("//table[@class='active']//tr")[1:]:
                ip = "".join(tr.xpath('./td[1]/text()')).strip()
                port = "".join(tr.xpath('./td[2]/text()')).strip()
                yield "%s:%s" % (ip, port)

    @staticmethod
    def freeProxy04():
        """ FreeProxyList https://www.freeproxylists.net/zh/ """
        url = "https://www.freeproxylists.net/zh/?c=CN&pt=&pr=&a%5B%5D=0&a%5B%5D=1&a%5B%5D=2&u=50"
        tree = WebRequest().get(url, verify=False).tree
        from urllib import parse

        def parse_ip(input_str):
            # the page URL-encodes the IP inside an inline <script>; decode and
            # pull out the first dotted quad
            html_str = parse.unquote(input_str)
            ips = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', html_str)
            return ips[0] if ips else None

        for tr in tree.xpath("//tr[@class='Odd']") + tree.xpath("//tr[@class='Even']"):
            ip = parse_ip("".join(tr.xpath('./td[1]/script/text()')).strip())
            port = "".join(tr.xpath('./td[2]/text()')).strip()
            if ip:
                yield "%s:%s" % (ip, port)

    @staticmethod
    def freeProxy05(page_count=1):
        """ Kuaidaili (快代理) https://www.kuaidaili.com

        :param page_count: number of list pages to scrape per category
        """
        url_pattern = [
            'https://www.kuaidaili.com/free/inha/{}/',
            'https://www.kuaidaili.com/free/intr/{}/'
        ]
        url_list = []
        for page_index in range(1, page_count + 1):
            for pattern in url_pattern:
                url_list.append(pattern.format(page_index))

        for url in url_list:
            tree = WebRequest().get(url).tree
            proxy_list = tree.xpath('.//table//tr')
            sleep(1)  # must sleep, otherwise the second request returns no data
            for tr in proxy_list[1:]:
                yield ':'.join(tr.xpath('./td/text()')[0:2])

    @staticmethod
    def freeProxy06():
        """ Binglx (冰凌代理) https://www.binglx.cn """
        url = "https://www.binglx.cn/?page=1"
        try:
            tree = WebRequest().get(url).tree
            proxy_list = tree.xpath('.//table//tr')
            for tr in proxy_list[1:]:
                yield ':'.join(tr.xpath('./td/text()')[0:2])
        except Exception as e:
            # best-effort source: log and move on rather than abort the fetch run
            print(e)

    @staticmethod
    def freeProxy07():
        """ ip3366 (云代理) http://www.ip3366.net """
        urls = ['http://www.ip3366.net/free/?stype=1', "http://www.ip3366.net/free/?stype=2"]
        for url in urls:
            r = WebRequest().get(url, timeout=10)
            proxies = re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>', r.text)
            for proxy in proxies:
                yield ":".join(proxy)

    @staticmethod
    def freeProxy08():
        """ ihuan (小幻代理) https://ip.ihuan.me """
        urls = ['https://ip.ihuan.me/address/5Lit5Zu9.html']
        for url in urls:
            r = WebRequest().get(url, timeout=10)
            proxies = re.findall(r'>\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?</a></td><td>(\d+)</td>', r.text)
            for proxy in proxies:
                yield ":".join(proxy)

    @staticmethod
    def freeProxy09(page_count=1):
        """ jiangxianli (免费代理库) http://ip.jiangxianli.com

        :param page_count: number of list pages to scrape
        """
        for i in range(1, page_count + 1):
            url = 'http://ip.jiangxianli.com/?country=中国&page={}'.format(i)
            html_tree = WebRequest().get(url, verify=False).tree
            for index, tr in enumerate(html_tree.xpath("//table//tr")):
                if index == 0:  # skip the header row
                    continue
                yield ":".join(tr.xpath("./td/text()")[0:2]).strip()

    @staticmethod
    def freeProxy10():
        """ 89ip (89免费代理) https://www.89ip.cn """
        r = WebRequest().get("https://www.89ip.cn/index_1.html", timeout=10)
        proxies = re.findall(
            r'<td.*?>[\s\S]*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?</td>[\s\S]*?<td.*?>[\s\S]*?(\d+)[\s\S]*?</td>',
            r.text)
        for proxy in proxies:
            yield ':'.join(proxy)

    @staticmethod
    def freeProxy11():
        """ docip (稻壳代理) https://www.docip.net/ """
        r = WebRequest().get("https://www.docip.net/data/free.json", timeout=10)
        try:
            for each in r.json['data']:
                yield each['ip']
        except Exception as e:
            # best-effort source: swallow malformed responses
            print(e)

    # The sources below are intentionally disabled (unstable / behind walls);
    # kept for reference should they come back.

    # @staticmethod
    # def wallProxy01():
    #     """
    #     PzzQz https://pzzqz.com/
    #     """
    #     from requests import Session
    #     from lxml import etree
    #     session = Session()
    #     try:
    #         index_resp = session.get("https://pzzqz.com/", timeout=20, verify=False).text
    #         x_csrf_token = re.findall('X-CSRFToken": "(.*?)"', index_resp)
    #         if x_csrf_token:
    #             data = {"http": "on", "ping": "3000", "country": "cn", "ports": ""}
    #             proxy_resp = session.post("https://pzzqz.com/", verify=False,
    #                                       headers={"X-CSRFToken": x_csrf_token[0]}, json=data).json()
    #             tree = etree.HTML(proxy_resp["proxy_html"])
    #             for tr in tree.xpath("//tr"):
    #                 ip = "".join(tr.xpath("./td[1]/text()"))
    #                 port = "".join(tr.xpath("./td[2]/text()"))
    #                 yield "%s:%s" % (ip, port)
    #     except Exception as e:
    #         print(e)

    # @staticmethod
    # def freeProxy10():
    #     """
    #     cn-proxy (blocked site)
    #     :return:
    #     """
    #     urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218']
    #     request = WebRequest()
    #     for url in urls:
    #         r = request.get(url, timeout=10)
    #         proxies = re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W]<td>(\d+)</td>', r.text)
    #         for proxy in proxies:
    #             yield ':'.join(proxy)

    # @staticmethod
    # def freeProxy11():
    #     """
    #     https://proxy-list.org/english/index.php
    #     :return:
    #     """
    #     urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)]
    #     request = WebRequest()
    #     import base64
    #     for url in urls:
    #         r = request.get(url, timeout=10)
    #         proxies = re.findall(r"Proxy\('(.*?)'\)", r.text)
    #         for proxy in proxies:
    #             yield base64.b64decode(proxy).decode()

    # @staticmethod
    # def freeProxy12():
    #     urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1']
    #     request = WebRequest()
    #     for url in urls:
    #         r = request.get(url, timeout=10)
    #         proxies = re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>', r.text)
    #         for proxy in proxies:
    #             yield ':'.join(proxy)


if __name__ == '__main__':
    p = ProxyFetcher()
    for _ in p.freeProxy06():
        print(_)

    # http://nntime.com/proxy-list-01.htm
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File Name:  configHandler
   Description :  runtime configuration access; every property prefers an
                  environment variable and falls back to the matching
                  constant in ``setting``.
   Author :  JHao
   date:  2020/6/22
-------------------------------------------------
"""
__author__ = 'JHao'

import os
import setting
from util.singleton import Singleton
from util.lazyProperty import LazyProperty
from util.six import reload_six, withMetaclass


class ConfigHandler(withMetaclass(Singleton)):
    """Singleton facade over ``setting`` plus environment overrides."""

    def __init__(self):
        pass

    @LazyProperty
    def serverHost(self):
        # HOST env var wins over setting.HOST
        return os.environ.get("HOST", setting.HOST)

    @LazyProperty
    def serverPort(self):
        # NOTE(review): returns a str when the PORT env var is set, an int
        # otherwise — callers appear to tolerate both; confirm before changing.
        return os.environ.get("PORT", setting.PORT)

    @LazyProperty
    def dbConn(self):
        return os.getenv("DB_CONN", setting.DB_CONN)

    @LazyProperty
    def tableName(self):
        return os.getenv("TABLE_NAME", setting.TABLE_NAME)

    @property
    def fetchers(self):
        # re-read setting on every access so edits to PROXY_FETCHER are
        # picked up without restarting the process
        reload_six(setting)
        return setting.PROXY_FETCHER

    @LazyProperty
    def httpUrl(self):
        return os.getenv("HTTP_URL", setting.HTTP_URL)

    @LazyProperty
    def httpsUrl(self):
        return os.getenv("HTTPS_URL", setting.HTTPS_URL)

    @LazyProperty
    def verifyTimeout(self):
        return int(os.getenv("VERIFY_TIMEOUT", setting.VERIFY_TIMEOUT))

    @LazyProperty
    def maxFailCount(self):
        return int(os.getenv("MAX_FAIL_COUNT", setting.MAX_FAIL_COUNT))

    @LazyProperty
    def poolSizeMin(self):
        return int(os.getenv("POOL_SIZE_MIN", setting.POOL_SIZE_MIN))

    @LazyProperty
    def proxyRegion(self):
        # BUGFIX: the old ``bool(os.getenv(...))`` was True for ANY non-empty
        # env string, so ``PROXY_REGION=false`` could not disable the feature.
        # Interpret the common falsy spellings explicitly.
        value = os.getenv("PROXY_REGION", setting.PROXY_REGION)
        if isinstance(value, str):
            return value.strip().lower() not in ("", "0", "false", "no", "off")
        return bool(value)

    @LazyProperty
    def timezone(self):
        return os.getenv("TIMEZONE", setting.TIMEZONE)
57 | self.__setFileHandler__() 58 | 59 | def __setFileHandler__(self, level=None): 60 | """ 61 | set file handler 62 | :param level: 63 | :return: 64 | """ 65 | file_name = os.path.join(LOG_PATH, '{name}.log'.format(name=self.name)) 66 | # 设置日志回滚, 保存在log目录, 一天保存一个文件, 保留15天 67 | file_handler = TimedRotatingFileHandler(filename=file_name, when='D', interval=1, backupCount=15) 68 | file_handler.suffix = '%Y%m%d.log' 69 | if not level: 70 | file_handler.setLevel(self.level) 71 | else: 72 | file_handler.setLevel(level) 73 | formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s') 74 | 75 | file_handler.setFormatter(formatter) 76 | self.file_handler = file_handler 77 | self.addHandler(file_handler) 78 | 79 | def __setStreamHandler__(self, level=None): 80 | """ 81 | set stream handler 82 | :param level: 83 | :return: 84 | """ 85 | stream_handler = logging.StreamHandler() 86 | formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s') 87 | stream_handler.setFormatter(formatter) 88 | if not level: 89 | stream_handler.setLevel(self.level) 90 | else: 91 | stream_handler.setLevel(level) 92 | self.addHandler(stream_handler) 93 | 94 | 95 | if __name__ == '__main__': 96 | log = LogHandler('test') 97 | log.info('this is a test msg') 98 | -------------------------------------------------------------------------------- /handler/proxyHandler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: ProxyHandler.py 5 | Description : 6 | Author : JHao 7 | date: 2016/12/3 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/03: 11 | 2020/05/26: 区分http和https 12 | ------------------------------------------------- 13 | """ 14 | __author__ = 'JHao' 15 | 16 | from helper.proxy import Proxy 17 | from db.dbClient import DbClient 18 | from 
handler.configHandler import ConfigHandler 19 | 20 | 21 | class ProxyHandler(object): 22 | """ Proxy CRUD operator""" 23 | 24 | def __init__(self): 25 | self.conf = ConfigHandler() 26 | self.db = DbClient(self.conf.dbConn) 27 | self.db.changeTable(self.conf.tableName) 28 | 29 | def get(self, https=False): 30 | """ 31 | return a proxy 32 | Args: 33 | https: True/False 34 | Returns: 35 | """ 36 | proxy = self.db.get(https) 37 | return Proxy.createFromJson(proxy) if proxy else None 38 | 39 | def pop(self, https): 40 | """ 41 | return and delete a useful proxy 42 | :return: 43 | """ 44 | proxy = self.db.pop(https) 45 | if proxy: 46 | return Proxy.createFromJson(proxy) 47 | return None 48 | 49 | def put(self, proxy): 50 | """ 51 | put proxy into use proxy 52 | :return: 53 | """ 54 | self.db.put(proxy) 55 | 56 | def delete(self, proxy): 57 | """ 58 | delete useful proxy 59 | :param proxy: 60 | :return: 61 | """ 62 | return self.db.delete(proxy.proxy) 63 | 64 | def getAll(self, https=False): 65 | """ 66 | get all proxy from pool as Proxy list 67 | :return: 68 | """ 69 | proxies = self.db.getAll(https) 70 | return [Proxy.createFromJson(_) for _ in proxies] 71 | 72 | def exists(self, proxy): 73 | """ 74 | check proxy exists 75 | :param proxy: 76 | :return: 77 | """ 78 | return self.db.exists(proxy.proxy) 79 | 80 | def getCount(self): 81 | """ 82 | return raw_proxy and use_proxy count 83 | :return: 84 | """ 85 | total_use_proxy = self.db.getCount() 86 | return {'count': total_use_proxy} 87 | -------------------------------------------------------------------------------- /helper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jhao104/proxy_pool/e0fa4de1f59aaf3e287e6d166cee7b020bbe1578/helper/__init__.py -------------------------------------------------------------------------------- /helper/check.py: -------------------------------------------------------------------------------- 1 | # -*- 
class DoValidator(object):
    """Run the registered validator chains against a single proxy."""

    conf = ConfigHandler()

    @classmethod
    def validator(cls, proxy, work_type):
        """
        Validation entry point: updates the proxy's counters and flags in place.
        Args:
            proxy: Proxy Object
            work_type: raw/use
        Returns:
            Proxy Object
        """
        http_r = cls.httpValidator(proxy)
        # https is only worth probing when plain http already works
        https_r = False if not http_r else cls.httpsValidator(proxy)

        proxy.check_count += 1
        proxy.last_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        proxy.last_status = True if http_r else False
        if http_r:
            if proxy.fail_count > 0:
                proxy.fail_count -= 1
            proxy.https = True if https_r else False
            if work_type == "raw":
                # region lookup only on first entry into the pool
                proxy.region = cls.regionGetter(proxy) if cls.conf.proxyRegion else ""
        else:
            proxy.fail_count += 1
        return proxy

    @classmethod
    def httpValidator(cls, proxy):
        """All registered http validators must pass."""
        for func in ProxyValidator.http_validator:
            if not func(proxy.proxy):
                return False
        return True

    @classmethod
    def httpsValidator(cls, proxy):
        """All registered https validators must pass."""
        for func in ProxyValidator.https_validator:
            if not func(proxy.proxy):
                return False
        return True

    @classmethod
    def preValidator(cls, proxy):
        """All registered pre-validators (format checks) must pass."""
        for func in ProxyValidator.pre_validator:
            if not func(proxy):
                return False
        return True

    @classmethod
    def regionGetter(cls, proxy):
        """Look up the proxy IP's region string; best-effort, 'error' on failure."""
        try:
            url = 'https://searchplugin.csdn.net/api/v1/ip/get?ip=%s' % proxy.proxy.split(':')[0]
            r = WebRequest().get(url=url, retry_time=1, timeout=2).json
            return r['data']['address']
        except Exception:
            # narrowed from a bare except: region info is cosmetic, but a bare
            # except also swallowed KeyboardInterrupt/SystemExit
            return 'error'


class _ThreadChecker(Thread):
    """Worker thread: drains the queue and validates each proxy."""

    def __init__(self, work_type, target_queue, thread_name):
        Thread.__init__(self, name=thread_name)
        self.work_type = work_type
        self.log = LogHandler("checker")
        self.proxy_handler = ProxyHandler()
        self.target_queue = target_queue
        self.conf = ConfigHandler()

    def run(self):
        self.log.info("{}ProxyCheck - {}: start".format(self.work_type.title(), self.name))
        while True:
            try:
                proxy = self.target_queue.get(block=False)
            except Empty:
                self.log.info("{}ProxyCheck - {}: complete".format(self.work_type.title(), self.name))
                break
            proxy = DoValidator.validator(proxy, self.work_type)
            if self.work_type == "raw":
                self.__ifRaw(proxy)
            else:
                self.__ifUse(proxy)
            self.target_queue.task_done()

    def __ifRaw(self, proxy):
        """Newly fetched proxy: insert into the pool when valid and not present."""
        if proxy.last_status:
            if self.proxy_handler.exists(proxy):
                self.log.info('RawProxyCheck - {}: {} exist'.format(self.name, proxy.proxy.ljust(23)))
            else:
                self.log.info('RawProxyCheck - {}: {} pass'.format(self.name, proxy.proxy.ljust(23)))
                self.proxy_handler.put(proxy)
        else:
            self.log.info('RawProxyCheck - {}: {} fail'.format(self.name, proxy.proxy.ljust(23)))

    def __ifUse(self, proxy):
        """Pooled proxy: keep on success, delete once fail_count exceeds the limit."""
        if proxy.last_status:
            self.log.info('UseProxyCheck - {}: {} pass'.format(self.name, proxy.proxy.ljust(23)))
            self.proxy_handler.put(proxy)
        else:
            if proxy.fail_count > self.conf.maxFailCount:
                self.log.info('UseProxyCheck - {}: {} fail, count {} delete'.format(self.name,
                                                                                    proxy.proxy.ljust(23),
                                                                                    proxy.fail_count))
                self.proxy_handler.delete(proxy)
            else:
                self.log.info('UseProxyCheck - {}: {} fail, count {} keep'.format(self.name,
                                                                                  proxy.proxy.ljust(23),
                                                                                  proxy.fail_count))
                self.proxy_handler.put(proxy)


def Checker(tp, queue):
    """
    Run a pool of 20 _ThreadChecker workers until the queue is drained.
    :param tp: raw/use
    :param queue: Proxy Queue
    :return:
    """
    thread_list = list()
    for index in range(20):
        thread_list.append(_ThreadChecker(tp, queue, "thread_%s" % str(index).zfill(2)))

    for thread in thread_list:
        # FIX: Thread.setDaemon() is deprecated since Python 3.10 (removed in
        # 3.13); assign the daemon attribute instead.
        thread.daemon = True
        thread.start()

    for thread in thread_list:
        thread.join()
self.proxy_handler = ProxyHandler() 34 | 35 | def run(self): 36 | self.log.info("ProxyFetch - {func}: start".format(func=self.fetch_source)) 37 | try: 38 | for proxy in self.fetcher(): 39 | self.log.info('ProxyFetch - %s: %s ok' % (self.fetch_source, proxy.ljust(23))) 40 | proxy = proxy.strip() 41 | if proxy in self.proxy_dict: 42 | self.proxy_dict[proxy].add_source(self.fetch_source) 43 | else: 44 | self.proxy_dict[proxy] = Proxy( 45 | proxy, source=self.fetch_source) 46 | except Exception as e: 47 | self.log.error("ProxyFetch - {func}: error".format(func=self.fetch_source)) 48 | self.log.error(str(e)) 49 | 50 | 51 | class Fetcher(object): 52 | name = "fetcher" 53 | 54 | def __init__(self): 55 | self.log = LogHandler(self.name) 56 | self.conf = ConfigHandler() 57 | 58 | def run(self): 59 | """ 60 | fetch proxy with proxyFetcher 61 | :return: 62 | """ 63 | proxy_dict = dict() 64 | thread_list = list() 65 | self.log.info("ProxyFetch : start") 66 | 67 | for fetch_source in self.conf.fetchers: 68 | self.log.info("ProxyFetch - {func}: start".format(func=fetch_source)) 69 | fetcher = getattr(ProxyFetcher, fetch_source, None) 70 | if not fetcher: 71 | self.log.error("ProxyFetch - {func}: class method not exists!".format(func=fetch_source)) 72 | continue 73 | if not callable(fetcher): 74 | self.log.error("ProxyFetch - {func}: must be class method".format(func=fetch_source)) 75 | continue 76 | thread_list.append(_ThreadFetcher(fetch_source, proxy_dict)) 77 | 78 | for thread in thread_list: 79 | thread.setDaemon(True) 80 | thread.start() 81 | 82 | for thread in thread_list: 83 | thread.join() 84 | 85 | self.log.info("ProxyFetch - all complete!") 86 | for _ in proxy_dict.values(): 87 | if DoValidator.preValidator(_.proxy): 88 | yield _ 89 | -------------------------------------------------------------------------------- /helper/launcher.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 
------------------------------------------------- 4 | File Name: launcher 5 | Description : 启动器 6 | Author : JHao 7 | date: 2021/3/26 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2021/3/26: 启动器 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | import sys 16 | from db.dbClient import DbClient 17 | from handler.logHandler import LogHandler 18 | from handler.configHandler import ConfigHandler 19 | 20 | log = LogHandler('launcher') 21 | 22 | 23 | def startServer(): 24 | __beforeStart() 25 | from api.proxyApi import runFlask 26 | runFlask() 27 | 28 | 29 | def startScheduler(): 30 | __beforeStart() 31 | from helper.scheduler import runScheduler 32 | runScheduler() 33 | 34 | 35 | def __beforeStart(): 36 | __showVersion() 37 | __showConfigure() 38 | if __checkDBConfig(): 39 | log.info('exit!') 40 | sys.exit() 41 | 42 | 43 | def __showVersion(): 44 | from setting import VERSION 45 | log.info("ProxyPool Version: %s" % VERSION) 46 | 47 | 48 | def __showConfigure(): 49 | conf = ConfigHandler() 50 | log.info("ProxyPool configure HOST: %s" % conf.serverHost) 51 | log.info("ProxyPool configure PORT: %s" % conf.serverPort) 52 | log.info("ProxyPool configure PROXY_FETCHER: %s" % conf.fetchers) 53 | 54 | 55 | def __checkDBConfig(): 56 | conf = ConfigHandler() 57 | db = DbClient(conf.dbConn) 58 | log.info("============ DATABASE CONFIGURE ================") 59 | log.info("DB_TYPE: %s" % db.db_type) 60 | log.info("DB_HOST: %s" % db.db_host) 61 | log.info("DB_PORT: %s" % db.db_port) 62 | log.info("DB_NAME: %s" % db.db_name) 63 | log.info("DB_USER: %s" % db.db_user) 64 | log.info("=================================================") 65 | return db.test() 66 | -------------------------------------------------------------------------------- /helper/proxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 
class Proxy(object):
    """Value object for one proxy and its health/bookkeeping metadata."""

    def __init__(self, proxy, fail_count=0, region="", anonymous="",
                 source="", check_count=0, last_status="", last_time="", https=False):
        self._proxy = proxy
        self._fail_count = fail_count
        self._region = region
        self._anonymous = anonymous
        # sources are stored as a list, serialized as a '/'-joined string
        self._source = source.split('/')
        self._check_count = check_count
        self._last_status = last_status
        self._last_time = last_time
        self._https = https

    @classmethod
    def createFromJson(cls, proxy_json):
        """Alternate constructor: rebuild a Proxy from its to_json output."""
        _dict = json.loads(proxy_json)
        return cls(proxy=_dict.get("proxy", ""),
                   fail_count=_dict.get("fail_count", 0),
                   region=_dict.get("region", ""),
                   anonymous=_dict.get("anonymous", ""),
                   source=_dict.get("source", ""),
                   check_count=_dict.get("check_count", 0),
                   last_status=_dict.get("last_status", ""),
                   last_time=_dict.get("last_time", ""),
                   https=_dict.get("https", False)
                   )

    @property
    def proxy(self):
        """Proxy address, ip:port."""
        return self._proxy

    @property
    def fail_count(self):
        """Number of failed checks."""
        return self._fail_count

    @property
    def region(self):
        """Geographic location (country/city)."""
        return self._region

    @property
    def anonymous(self):
        """Anonymity level."""
        return self._anonymous

    @property
    def source(self):
        """Fetch source(s), joined with '/'."""
        return '/'.join(self._source)

    @property
    def check_count(self):
        """Total number of checks performed."""
        return self._check_count

    @property
    def last_status(self):
        """Result of the last check: True -> usable; False -> unusable."""
        return self._last_status

    @property
    def last_time(self):
        """Timestamp of the last check."""
        return self._last_time

    @property
    def https(self):
        """Whether the proxy supports https."""
        return self._https

    @property
    def to_dict(self):
        """Attribute dict (plain serializable values)."""
        return {"proxy": self.proxy,
                "https": self.https,
                "fail_count": self.fail_count,
                "region": self.region,
                "anonymous": self.anonymous,
                "source": self.source,
                "check_count": self.check_count,
                "last_status": self.last_status,
                "last_time": self.last_time}

    @property
    def to_json(self):
        """Attributes as a JSON string."""
        return json.dumps(self.to_dict, ensure_ascii=False)

    @fail_count.setter
    def fail_count(self, value):
        self._fail_count = value

    @check_count.setter
    def check_count(self, value):
        self._check_count = value

    @last_status.setter
    def last_status(self, value):
        self._last_status = value

    @last_time.setter
    def last_time(self, value):
        self._last_time = value

    @https.setter
    def https(self, value):
        self._https = value

    @region.setter
    def region(self, value):
        self._region = value

    @source.setter
    def source(self, value):
        # BUGFIX: `source` had a getter but no setter, so assignments such as
        # `proxy.source = "test"` (done in test/testProxyClass.py) raised
        # AttributeError. Mirror __init__: store as a '/'-split list.
        self._source = value.split('/')

    def add_source(self, source_str):
        """Record an additional fetch source, deduplicated."""
        if source_str:
            self._source.append(source_str)
            self._source = list(set(self._source))
15 | 16 | from apscheduler.schedulers.blocking import BlockingScheduler 17 | from apscheduler.executors.pool import ProcessPoolExecutor 18 | 19 | from util.six import Queue 20 | from helper.fetch import Fetcher 21 | from helper.check import Checker 22 | from handler.logHandler import LogHandler 23 | from handler.proxyHandler import ProxyHandler 24 | from handler.configHandler import ConfigHandler 25 | 26 | 27 | def __runProxyFetch(): 28 | proxy_queue = Queue() 29 | proxy_fetcher = Fetcher() 30 | 31 | for proxy in proxy_fetcher.run(): 32 | proxy_queue.put(proxy) 33 | 34 | Checker("raw", proxy_queue) 35 | 36 | 37 | def __runProxyCheck(): 38 | proxy_handler = ProxyHandler() 39 | proxy_queue = Queue() 40 | if proxy_handler.db.getCount().get("total", 0) < proxy_handler.conf.poolSizeMin: 41 | __runProxyFetch() 42 | for proxy in proxy_handler.getAll(): 43 | proxy_queue.put(proxy) 44 | Checker("use", proxy_queue) 45 | 46 | 47 | def runScheduler(): 48 | __runProxyFetch() 49 | 50 | timezone = ConfigHandler().timezone 51 | scheduler_log = LogHandler("scheduler") 52 | scheduler = BlockingScheduler(logger=scheduler_log, timezone=timezone) 53 | 54 | scheduler.add_job(__runProxyFetch, 'interval', minutes=4, id="proxy_fetch", name="proxy采集") 55 | scheduler.add_job(__runProxyCheck, 'interval', minutes=2, id="proxy_check", name="proxy检查") 56 | executors = { 57 | 'default': {'type': 'threadpool', 'max_workers': 20}, 58 | 'processpool': ProcessPoolExecutor(max_workers=5) 59 | } 60 | job_defaults = { 61 | 'coalesce': False, 62 | 'max_instances': 10 63 | } 64 | 65 | scheduler.configure(executors=executors, job_defaults=job_defaults, timezone=timezone) 66 | 67 | scheduler.start() 68 | 69 | 70 | if __name__ == '__main__': 71 | runScheduler() 72 | -------------------------------------------------------------------------------- /helper/validator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 
conf = ConfigHandler()

# Default request headers used by the timeout validators below.
HEADER = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
          'Accept': '*/*',
          'Connection': 'keep-alive',
          'Accept-Language': 'zh-CN,zh;q=0.8'}

# Accepts plain ip:port and authenticated username:password@ip:port forms.
IP_REGEX = re.compile(r"(.*:.*@)?\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}")


class ProxyValidator(withMetaclass(Singleton)):
    """Registry of proxy validation callbacks, grouped by stage."""

    pre_validator = []
    http_validator = []
    https_validator = []

    @classmethod
    def addPreValidator(cls, func):
        """Register a format/pre-check validator (decorator-friendly)."""
        cls.pre_validator.append(func)
        return func

    @classmethod
    def addHttpValidator(cls, func):
        """Register an http reachability validator (decorator-friendly)."""
        cls.http_validator.append(func)
        return func

    @classmethod
    def addHttpsValidator(cls, func):
        """Register an https reachability validator (decorator-friendly)."""
        cls.https_validator.append(func)
        return func


@ProxyValidator.addPreValidator
def formatValidator(proxy):
    """Check that the proxy string looks like [user:pass@]ip:port."""
    return bool(IP_REGEX.fullmatch(proxy))


@ProxyValidator.addHttpValidator
def httpTimeOutValidator(proxy):
    """Probe the configured http url through the proxy within the timeout."""
    proxies = {"http": "http://{proxy}".format(proxy=proxy),
               "https": "https://{proxy}".format(proxy=proxy)}
    try:
        resp = head(conf.httpUrl, headers=HEADER, proxies=proxies, timeout=conf.verifyTimeout)
    except Exception:
        return False
    return resp.status_code == 200


@ProxyValidator.addHttpsValidator
def httpsTimeOutValidator(proxy):
    """Probe the configured https url through the proxy within the timeout."""
    proxies = {"http": "http://{proxy}".format(proxy=proxy),
               "https": "https://{proxy}".format(proxy=proxy)}
    try:
        resp = head(conf.httpsUrl, headers=HEADER, proxies=proxies,
                    timeout=conf.verifyTimeout, verify=False)
    except Exception:
        return False
    return resp.status_code == 200


@ProxyValidator.addHttpValidator
def customValidatorExample(proxy):
    """Example slot for a user-defined validator; must return True/False."""
    return True
5 | APScheduler==3.10.0;python_version>="3.10" 6 | APScheduler==3.2.0;python_version<"3.10" 7 | click==8.0.1;python_version>"3.6" 8 | click==7.0;python_version<="3.6" 9 | Flask==2.1.1;python_version>"3.6" 10 | Flask==1.0;python_version<="3.6" 11 | werkzeug==2.1.0;python_version>"3.6" 12 | werkzeug==0.15.5;python_version<="3.6" 13 | -------------------------------------------------------------------------------- /setting.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: setting.py 5 | Description : 配置文件 6 | Author : JHao 7 | date: 2019/2/15 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/2/15: 11 | ------------------------------------------------- 12 | """ 13 | 14 | BANNER = r""" 15 | **************************************************************** 16 | *** ______ ********************* ______ *********** _ ******** 17 | *** | ___ \_ ******************** | ___ \ ********* | | ******** 18 | *** | |_/ / \__ __ __ _ __ _ | |_/ /___ * ___ | | ******** 19 | *** | __/| _// _ \ \ \/ /| | | || __// _ \ / _ \ | | ******** 20 | *** | | | | | (_) | > < \ |_| || | | (_) | (_) || |___ **** 21 | *** \_| |_| \___/ /_/\_\ \__ |\_| \___/ \___/ \_____/ **** 22 | **** __ / / ***** 23 | ************************* /___ / ******************************* 24 | ************************* ******************************** 25 | **************************************************************** 26 | """ 27 | 28 | VERSION = "2.4.0" 29 | 30 | # ############### server config ############### 31 | HOST = "0.0.0.0" 32 | 33 | PORT = 5010 34 | 35 | # ############### database config ################### 36 | # db connection uri 37 | # example: 38 | # Redis: redis://:password@ip:port/db 39 | # Ssdb: ssdb://:password@ip:port 40 | DB_CONN = 'redis://:pwd@127.0.0.1:6379/0' 41 | 42 | # proxy table name 43 | TABLE_NAME = 'use_proxy' 44 | 45 
| 46 | # ###### config the proxy fetch function ###### 47 | PROXY_FETCHER = [ 48 | "freeProxy01", 49 | "freeProxy02", 50 | "freeProxy03", 51 | "freeProxy04", 52 | "freeProxy05", 53 | "freeProxy06", 54 | "freeProxy07", 55 | "freeProxy08", 56 | "freeProxy09", 57 | "freeProxy10", 58 | "freeProxy11" 59 | ] 60 | 61 | # ############# proxy validator ################# 62 | # 代理验证目标网站 63 | HTTP_URL = "http://httpbin.org" 64 | 65 | HTTPS_URL = "https://www.qq.com" 66 | 67 | # 代理验证时超时时间 68 | VERIFY_TIMEOUT = 10 69 | 70 | # 近PROXY_CHECK_COUNT次校验中允许的最大失败次数,超过则剔除代理 71 | MAX_FAIL_COUNT = 0 72 | 73 | # 近PROXY_CHECK_COUNT次校验中允许的最大失败率,超过则剔除代理 74 | # MAX_FAIL_RATE = 0.1 75 | 76 | # proxyCheck时代理数量少于POOL_SIZE_MIN触发抓取 77 | POOL_SIZE_MIN = 20 78 | 79 | # ############# proxy attributes ################# 80 | # 是否启用代理地域属性 81 | PROXY_REGION = True 82 | 83 | # ############# scheduler config ################# 84 | 85 | # Set the timezone for the scheduler forcely (optional) 86 | # If it is running on a VM, and 87 | # "ValueError: Timezone offset does not match system offset" 88 | # was raised during scheduling. 89 | # Please uncomment the following line and set a timezone for the scheduler. 90 | # Otherwise it will detect the timezone from the system automatically. 
91 | 92 | TIMEZONE = "Asia/Shanghai" 93 | -------------------------------------------------------------------------------- /start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python proxyPool.py server & 3 | python proxyPool.py schedule -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: test.py 5 | Description : 6 | Author : JHao 7 | date: 2017/3/7 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2017/3/7: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | from test import testProxyValidator 16 | from test import testConfigHandler 17 | from test import testLogHandler 18 | from test import testDbClient 19 | 20 | if __name__ == '__main__': 21 | print("ConfigHandler:") 22 | testConfigHandler.testConfig() 23 | 24 | print("LogHandler:") 25 | testLogHandler.testLogHandler() 26 | 27 | print("DbClient:") 28 | testDbClient.testDbClient() 29 | 30 | print("ProxyValidator:") 31 | testProxyValidator.testProxyValidator() 32 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__ 5 | Description : 6 | Author : JHao 7 | date: 2019/2/15 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/2/15: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | -------------------------------------------------------------------------------- /test/testConfigHandler.py: 
# --------------------------------------------------------------------------------
# test/testConfigHandler.py
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File Name:   testConfigHandler
   Description: smoke-test for ConfigHandler properties
   Author:      J_hao
   date:        2017/7/31
-------------------------------------------------
"""
__author__ = 'J_hao'

from handler.configHandler import ConfigHandler
from time import sleep


def testConfig():
    """Print every ConfigHandler property and check that ``fetchers`` is a list.

    The trailing loop re-reads ``conf.fetchers`` twice with a pause in between,
    so a live edit of the configuration can be observed being picked up.
    """
    conf = ConfigHandler()
    for attr in ("dbConn", "serverPort", "serverHost", "tableName"):
        print(getattr(conf, attr))
    assert isinstance(conf.fetchers, list)
    print(conf.fetchers)

    for _ in range(2):
        print(conf.fetchers)
        sleep(5)


if __name__ == '__main__':
    testConfig()

# --------------------------------------------------------------------------------
# test/testDbClient.py
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File Name:   testDbClient
   Description: check DbClient URI parsing for SSDB and Redis
   Author:      JHao
   date:        2020/6/23
-------------------------------------------------
"""
__author__ = 'JHao'

from db.dbClient import DbClient


def testDbClient():
    """Parse an SSDB and a Redis connection URI and verify each parsed field."""
    # ############### ssdb ###############
    ssdb_conn = DbClient.parseDbConn("ssdb://:password@127.0.0.1:8888")
    ssdb_expected = {
        "db_type": "SSDB",
        "db_pwd": "password",
        "db_host": "127.0.0.1",
        "db_port": 8888,
    }
    for attr, want in ssdb_expected.items():
        assert getattr(ssdb_conn, attr) == want

    # ############### redis ###############
    redis_conn = DbClient.parseDbConn("redis://:password@127.0.0.1:6379/1")
    redis_expected = {
        "db_type": "REDIS",
        "db_pwd": "password",
        "db_host": "127.0.0.1",
        "db_port": 6379,
        "db_name": "1",
    }
    for attr, want in redis_expected.items():
        assert getattr(redis_conn, attr) == want
    print("DbClient ok!")


if __name__ == '__main__':
    testDbClient()

# --------------------------------------------------------------------------------
# test/testLogHandler.py
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File Name:   testLogHandler
   Description: emit one message per log level through LogHandler
   Author:      J_hao
   date:        2017/8/2
-------------------------------------------------
"""
__author__ = 'J_hao'

from handler.logHandler import LogHandler


def testLogHandler():
    """Log one info and one error message through a named LogHandler."""
    log = LogHandler('test')
    for level, message in (("info", "this is info"), ("error", "this is error")):
        getattr(log, level)(message)


if __name__ == '__main__':
    testLogHandler()

# --------------------------------------------------------------------------------
# test/testProxyClass.py
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File Name:   testProxyClass
   Description: round-trip a Proxy object through its JSON representation
   Author:      JHao
   date:        2019/8/8
-------------------------------------------------
"""
__author__ = 'JHao'

import json
from helper.proxy import Proxy


def testProxyClass():
    """Serialize a Proxy to JSON, deserialize it, and print each stage."""
    p = Proxy("127.0.0.1:8080")

    print(p.to_json)

    p.source = "test"

    serialized = json.dumps(p.to_dict, ensure_ascii=False)

    print(serialized)

    print(Proxy.createFromJson(serialized).to_dict)


if __name__ == '__main__':
    testProxyClass()
# --------------------------------------------------------------------------------
# test/testProxyFetcher.py
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File Name:   testProxyFetcher
   Description: run every configured fetcher and count the proxies it yields
   Author:      JHao
   date:        2020/6/23
-------------------------------------------------
"""
__author__ = 'JHao'

from fetcher.proxyFetcher import ProxyFetcher
from handler.configHandler import ConfigHandler


def testProxyFetcher():
    """Invoke each fetcher listed in the config and print a per-fetcher total.

    Fetcher names come from the configuration; each one is resolved to a
    method on ProxyFetcher and expected to yield proxy strings.
    """
    conf = ConfigHandler()
    proxy_getter_functions = conf.fetchers
    proxy_counter = {name: 0 for name in proxy_getter_functions}
    for proxyGetter in proxy_getter_functions:
        for proxy in getattr(ProxyFetcher, proxyGetter.strip())():
            if proxy:
                print('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy))
                # fix: keys are pre-initialized above, so a plain += is both
                # correct and clearer than dict.get(...) + 1
                proxy_counter[proxyGetter] += 1
    for key, value in proxy_counter.items():
        print(key, value)


if __name__ == '__main__':
    testProxyFetcher()

# --------------------------------------------------------------------------------
# test/testProxyValidator.py
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File Name:   testProxyValidator
   Description: list every registered proxy validator
   Author:      JHao
   date:        2021/5/25
-------------------------------------------------
"""
__author__ = 'JHao'

from helper.validator import ProxyValidator


def testProxyValidator():
    """Print every validator registered on ProxyValidator, grouped by stage."""
    for validator in ProxyValidator.pre_validator:
        print(validator)
    for validator in ProxyValidator.http_validator:
        print(validator)
    for validator in ProxyValidator.https_validator:
        print(validator)


if __name__ == '__main__':
    testProxyValidator()

# --------------------------------------------------------------------------------
# test/testRedisClient.py
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File Name:   testRedisClient
   Description: exercise the Redis-backed DbClient operations
   Author:      JHao
   date:        2020/6/23
-------------------------------------------------
"""
__author__ = 'JHao'


def testRedisClient():
    """Run put/get/exists/pop/getAll/getCount against a local Redis instance.

    Imports are function-local so this module can be loaded even when the
    database backend is unavailable. Requires a reachable Redis at the URI
    below.
    """
    from db.dbClient import DbClient
    from helper.proxy import Proxy

    uri = "redis://:pwd@127.0.0.1:6379"
    db = DbClient(uri)
    db.changeTable("use_proxy")
    proxy = Proxy.createFromJson('{"proxy": "118.190.79.36:8090", "https": false, "fail_count": 0, "region": "", "anonymous": "", "source": "freeProxy14", "check_count": 4, "last_status": true, "last_time": "2021-05-26 10:58:04"}')

    print("put: ", db.put(proxy))

    print("get: ", db.get(https=None))

    print("exists: ", db.exists("27.38.96.101:9797"))

    print("exists: ", db.exists("27.38.96.101:8888"))

    print("pop: ", db.pop(https=None))

    print("getAll: ", db.getAll(https=None))

    print("getCount", db.getCount())


if __name__ == '__main__':
    testRedisClient()

# --------------------------------------------------------------------------------
# test/testSsdbClient.py
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File Name:   testSsdbClient
   Description: exercise the SSDB-backed DbClient operations
   Author:      JHao
   date:        2020/7/3
-------------------------------------------------
"""
__author__ = 'JHao'


def testSsdbClient():
    """Run put/get/exists/getAll/clear/getCount against a local SSDB instance.

    Imports are function-local so this module can be loaded even when the
    database backend is unavailable. Requires a reachable SSDB at the URI
    below.
    """
    from db.dbClient import DbClient
    from helper.proxy import Proxy

    uri = "ssdb://@127.0.0.1:8888"
    db = DbClient(uri)
    db.changeTable("use_proxy")
    proxy = Proxy.createFromJson('{"proxy": "118.190.79.36:8090", "https": false, "fail_count": 0, "region": "", "anonymous": "", "source": "freeProxy14", "check_count": 4, "last_status": true, "last_time": "2021-05-26 10:58:04"}')

    print("put: ", db.put(proxy))

    print("get: ", db.get(https=None))

    print("exists: ", db.exists("27.38.96.101:9797"))

    print("exists: ", db.exists("27.38.96.101:8888"))

    print("getAll: ", db.getAll(https=None))

    # pop is destructive on SSDB and intentionally skipped here
    # print("pop: ", db.pop(https=None))

    print("clear: ", db.clear())

    print("getCount", db.getCount())


if __name__ == '__main__':
    testSsdbClient()

# --------------------------------------------------------------------------------
# util/__init__.py
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File Name:   __init__
   Description:
   Author:      JHao
   date:        2020/7/6
-------------------------------------------------
"""
__author__ = 'JHao'

# --------------------------------------------------------------------------------
# util/lazyProperty.py
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File Name:   lazyProperty
   Description: compute-once property descriptor
   Author:      JHao
   date:        2016/12/3
-------------------------------------------------
"""
__author__ = 'JHao'


class LazyProperty(object):
    """
    Non-data descriptor that caches the wrapped method's result.

    explain: http://www.spiderpy.cn/blog/5/
    """

    def __init__(self, func):
        self.func = func

    def __get__(self, instance, owner):
        if instance is None:
            # class-level access returns the descriptor itself
            return self
        else:
            # First access computes the value and stores it on the instance;
            # the instance attribute then shadows this (non-data) descriptor,
            # so later reads never call func again.
            value = self.func(instance)
            setattr(instance, self.func.__name__, value)
            return value

# --------------------------------------------------------------------------------
# util/singleton.py
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File Name:   singleton
   Description: singleton metaclass
   Author:      JHao
   date:        2016/12/3
-------------------------------------------------
"""
__author__ = 'JHao'


class Singleton(type):
    """
    Singleton Metaclass: the first instantiation of a class is cached and
    returned for every later call, regardless of arguments.
    """

    _inst = {}

    def __call__(cls, *args, **kwargs):
        if cls not in cls._inst:
            # bug fix: keyword arguments were previously dropped
            # (super().__call__(*args) ignored **kwargs), so a class first
            # constructed with kwargs silently lost them.
            cls._inst[cls] = super(Singleton, cls).__call__(*args, **kwargs)
        return cls._inst[cls]

# --------------------------------------------------------------------------------
# util/six.py
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File Name:   six
   Description: minimal Python 2/3 compatibility shims
   Author:      JHao
   date:        2020/6/22
-------------------------------------------------
"""
__author__ = 'JHao'

import sys

PY2 = sys.version_info[0] == 2
PY3 = sys.version_info[0] == 3

if PY3:
    def iteritems(d, **kw):
        return iter(d.items(**kw))
else:
    def iteritems(d, **kw):
        return d.iteritems(**kw)

if PY3:
    from urllib.parse import urlparse
else:
    from urlparse import urlparse

if PY3:
    # fix: the ``imp`` module is deprecated since 3.4 and removed in 3.12;
    # importlib.reload is the supported replacement with the same signature.
    from importlib import reload as reload_six
else:
    reload_six = reload

if PY3:
    from queue import Empty, Queue
else:
    from Queue import Empty, Queue


def withMetaclass(meta, *bases):
    """Create a base class with a metaclass."""

    # This requires a bit of explanation: the basic idea is to make a dummy
    # metaclass for one level of class instantiation that replaces itself with
    # the actual metaclass.
    class MetaClass(meta):

        def __new__(cls, name, this_bases, d):
            return meta(name, bases, d)

    return type.__new__(MetaClass, 'temporary_class', (), {})

# --------------------------------------------------------------------------------
# util/webRequest.py
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
   File Name:   WebRequest
   Description: Network Requests Class
   Author:      J_hao
   date:        2017/7/31
-------------------------------------------------
"""
__author__ = 'J_hao'

from requests.models import Response
from lxml import etree
import requests
import random
import time

from handler.logHandler import LogHandler

requests.packages.urllib3.disable_warnings()


class WebRequest(object):
    """Thin wrapper around requests with UA rotation, retries and lxml parsing."""

    name = "web_request"

    def __init__(self, *args, **kwargs):
        self.log = LogHandler(self.name, file=False)
        # last response seen; starts as an empty placeholder Response
        self.response = Response()

    @property
    def user_agent(self):
        """
        return an User-Agent at random
        :return:
        """
        ua_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
            'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
        ]
        return random.choice(ua_list)

    @property
    def header(self):
        """
        basic header
        :return:
        """
        return {'User-Agent': self.user_agent,
                'Accept': '*/*',
                'Connection': 'keep-alive',
                'Accept-Language': 'zh-CN,zh;q=0.8'}

    def get(self, url, header=None, retry_time=3, retry_interval=5, timeout=5, *args, **kwargs):
        """
        get method
        :param url: target url
        :param header: headers
        :param retry_time: retry time
        :param retry_interval: retry interval
        :param timeout: network timeout
        :return: self (inspect ``self.response`` / ``tree`` / ``text`` / ``json``)
        """
        headers = self.header
        if header and isinstance(header, dict):
            headers.update(header)
        while True:
            try:
                self.response = requests.get(url, headers=headers, timeout=timeout, *args, **kwargs)
                return self
            except Exception as e:
                self.log.error("requests: %s error: %s" % (url, str(e)))
                retry_time -= 1
                if retry_time <= 0:
                    # bug fix: the placeholder response was created but never
                    # assigned, leaving a stale/empty self.response behind on
                    # retry exhaustion.
                    resp = Response()
                    # NOTE(review): forcing 200 on a failed request may mask
                    # errors from callers that check status_code -- confirm
                    # against call sites before changing this convention.
                    resp.status_code = 200
                    self.response = resp
                    return self
                self.log.info("retry %s second after" % retry_interval)
                time.sleep(retry_interval)

    @property
    def tree(self):
        # parsed lxml document of the last response body
        return etree.HTML(self.response.content)

    @property
    def text(self):
        # decoded text of the last response body
        return self.response.text

    @property
    def json(self):
        # parsed JSON of the last response body; {} when parsing fails
        try:
            return self.response.json()
        except Exception as e:
            self.log.error(str(e))
            return {}