├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── README.md ├── api ├── README.md ├── __init__.py └── api.py ├── config.py ├── db ├── Fetcher.py ├── Proxy.py ├── README.md ├── __init__.py ├── conn.py └── init.py ├── docs ├── screenshot1.png ├── screenshot2.png ├── term.png └── workflow.png ├── fetchers ├── BaseFetcher.py ├── GoubanjiaFetcher.py ├── IHuanFetcher.py ├── IP3366Fetcher.py ├── IP66Fetcher.py ├── IP89Fetcher.py ├── JiangxianliFetcher.py ├── KaiXinFetcher.py ├── KuaidailiFetcher.py ├── ProxyListFetcher.py ├── ProxyScrapeFetcher.py ├── ProxyscanFetcher.py ├── README.md ├── UUFetcher.py ├── XiLaFetcher.py ├── XiaoShuFetcher.py └── __init__.py ├── frontend ├── README.md ├── deployment │ ├── .nojekyll │ ├── 200.html │ ├── _nuxt │ │ ├── 473a16e.js │ │ ├── 4e6036a.js │ │ ├── 810b53a.js │ │ ├── 89e3175.js │ │ ├── LICENSES │ │ ├── c6103f9.js │ │ ├── fda1702.js │ │ └── static │ │ │ └── 1630852693 │ │ │ ├── fetchers │ │ │ └── payload.js │ │ │ ├── manifest.js │ │ │ └── payload.js │ ├── fetchers │ │ └── index.html │ └── index.html └── src │ ├── .editorconfig │ ├── .eslintrc.js │ ├── .gitignore │ ├── README.md │ ├── build.sh │ ├── jsconfig.json │ ├── layouts │ └── default.vue │ ├── nuxt.config.js │ ├── package-lock.json │ ├── package.json │ ├── pages │ ├── fetchers.vue │ └── index.vue │ └── plugins │ ├── antd-ui.js │ └── axios.js ├── main.py ├── proc ├── README.md ├── __init__.py ├── run_fetcher.py └── run_validator.py ├── requirements.txt └── test ├── README.md ├── testDB.py └── testFetcher.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | data.db 132 | 133 | # Pycharm settings 134 | .idea 135 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | jobs: 4 | include: 5 | - name: "Python 3.6 on Linux" 6 | python: "3.6" 7 | os: "linux" 8 | env: 9 | - PYTHON=python3 10 | - PIP=pip3 11 | - name: "Python 3.7 on Linux" 12 | python: "3.7" 13 | os: "linux" 14 | env: 15 | - PYTHON=python3 16 | - PIP=pip3 17 | - name: "Python 3.8 on Linux" 18 | python: "3.8" 19 | os: "linux" 20 | env: 21 | - PYTHON=python3 22 | - PIP=pip3 23 | - name: "Python 3.7 on macOS" 24 | os: osx 25 | osx_image: xcode11.2 26 | language: shell 27 | env: 28 | - PYTHON=python3 29 | - PIP=pip3 30 | - name: "Python 3.6 on Windows" 31 | os: windows 32 | language: shell 33 | before_install: 34 | - choco install python --version 3.6.8 35 | env: 36 | - PATH=/c/Python36:/c/Python36/Scripts:$PATH 37 | - PYTHON=python 38 | - PIP=pip 39 | - name: "Python 3.7 on Windows" 40 | os: windows 41 | language: shell 42 | before_install: 43 | - choco install python --version 3.7.4 44 | env: 45 | - PATH=/c/Python37:/c/Python37/Scripts:$PATH 46 | - PYTHON=python 47 | - PIP=pip 48 | 49 | install: 50 | - $PYTHON --version 51 | - $PIP install -r requirements.txt 52 | 53 | script: 54 | - $PYTHON --version 55 | - $PYTHON main.py citest 56 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7.0 2 | 3 | WORKDIR /proxy 4 | 5 | ADD requirements.txt /proxy 6 | RUN pip3 config set global.index-url https://mirrors.aliyun.com/pypi/simple/ 7 | RUN pip3 install --upgrade pip 8 | RUN pip3 install -r requirements.txt 9 | 10 | CMD ["python", "main.py"] 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Yu Chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 简易好用的免费代理池 2 | 3 | [![](https://img.shields.io/badge/python-3.6+-brightgreen)](https://github.com/OxOOo/ProxyPoolWithUI) 4 | 5 | 兼容系统: 6 | ![Windows](https://img.shields.io/badge/Windows-o-brightgreen) 7 | ![Linux](https://img.shields.io/badge/Linux-o-brightgreen) 8 | ![MacOS](https://img.shields.io/badge/MacOS-o-brightgreen) 9 | 10 | * 定时自动爬取网络上的免费代理 11 | * 定时对代理进行验证,集成API随时返回可用代理 12 | * 不需要第三方数据库支持,一键启动,简单易用 13 | * 集成WEB管理界面,方便查看代理状态并对代理池进行配置 14 | * 拥有详细的注释,可以非常方便地学习或修改 15 | 16 | 推荐: 17 | * [HTTP代理原理](https://zhuanlan.zhihu.com/p/349028243) 18 | 19 | 项目Demo:[http://chenyu0x00.com:8888/](http://chenyu0x00.com:8888/) 20 | 21 | **2021年3月8日测试,项目运行半小时后,支持访问HTTPS的代理有40+,支持访问HTTP的代理有100+。** 22 | 23 | 如果你知道有好用的代理源,或者是发现本项目存在一些问题,欢迎通过Issues和我们讨论。 24 | 25 | ## WEB管理界面截图 26 | 27 | ![screenshot1](docs/screenshot1.png) 28 | ![screenshot2](docs/screenshot2.png) 29 | 30 | ## 已经集成的免费代理源 31 | 32 | | 名称 | 地址 |备注 | 33 | |--------------|-------------------------------|-------------| 34 | | 悠悠网络代理 | https://uu-proxy.com/ | | 35 | | 快代理 | https://www.kuaidaili.com/ | | 36 | | 全网代理 | http://www.goubanjia.com/ | | 37 | | 66代理 | http://www.66ip.cn/ | | 38 | | 云代理 | http://www.ip3366.net/ | | 39 | | 免费代理库 | https://ip.jiangxianli.com/ | | 40 | | 小幻HTTP代理 | https://ip.ihuan.me/ | | 41 | | 89免费代理 | https://www.89ip.cn/ | | 42 | | ProxyScan | https://www.proxyscan.io/ | | 43 | | 开心代理 | http://www.kxdaili.com/ | | 44 | | 西拉代理 | http://www.xiladaili.com/ | | 45 | | 小舒代理 | http://www.xsdaili.cn/ | | 46 | | ProxyList | https://www.proxy-list.download/| | 47 | | ProxyScrape | https://proxyscrape.com/ |国内无法直接访问 | 48 | 49 | ## 运行本项目 50 | 51 | 本项目目前只适配了Python3,请确保你的电脑上安装了3.6或更高版本的Python软件。 52 | 53 | 1. 下载代码 54 | 55 | ```bash 56 | git clone https://github.com/OxOOo/ProxyPoolWithUI.git 57 | ``` 58 | 59 | 2. 安装Python依赖(在`ProxyPoolWithUI`目录下执行) 60 | 61 | ```bash 62 | pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt 63 | ``` 64 | 65 | 3. 启动(在`ProxyPoolWithUI`目录下执行) 66 | 67 | ```bash 68 | python3 main.py 69 | ``` 70 | 71 | 如果你在运行了上述命令之后,在命令行中看到了类似如下截图,那么说明项目成功启动了: 72 | 73 | ![term](docs/term.png) 74 | 75 | 4. 使用浏览器打开`http://localhost:5000`,可以看到WEB管理界面。 76 | 77 | ## Docker构建项目 78 | 79 | 1. 下载项目文件 80 | 81 | ```bash 82 | git clone https://github.com/OxOOo/ProxyPoolWithUI.git 83 | cd ProxyPoolWithUI 84 | ``` 85 | 86 | 2. 构建docker镜像 87 | 88 | ```bash 89 | docker build --tag proxy_pool . 90 | ``` 91 | 92 | 3. 运行镜像 93 | 94 | ```bash 95 | docker run -p 5000:5000 -v /root/ProxyPoolWithUI:/proxy -d proxy_pool 96 | ``` 97 | `/root/ProxyPoolWithUI`为clone下来的项目目录路径,请自行更改 98 | 99 | 100 | ## 使用代理 101 | 102 | 1. API接口 103 | 104 | 项目启动之后,会自动爬取并检测代理是否可用,因此我们只需要关注如何使用代理即可。 105 | 106 | * `http://localhost:5000/fetch_random` : 随机获取一个可用代理,如果没有可用代理则返回空白 107 | 108 | 返回示例 : `http://127.0.0.1:8080` 109 | 110 | * `http://localhost:5000/fetch_all` : 获取所有可用代理,如果没有可用代理则返回空白 111 | 112 | 返回示例 : `http://127.0.0.1:8080,http://127.0.0.1:8081` 113 | 114 | 1. 
使用代理 115 | 116 | 不同语言使用代理的方式各不相同,这里提供一个Python集成本项目并使用代理的示例代码: 117 | 118 | ```python 119 | # encoding : utf-8 120 | 121 | import requests 122 | 123 | def main(): 124 | proxy_uri = requests.get('http://localhost:5000/fetch_random').text 125 | if len(proxy_uri) == 0: 126 | print(u'暂时没有可用代理') 127 | return 128 | print(u'获取到的代理是:' + proxy_uri) 129 | 130 | proxies = { 'http': proxy_uri } 131 | html = requests.get('http://www.baidu.com', proxies=proxies).text 132 | if u'百度一下,你就知道' in html: 133 | print('代理可用') 134 | else: 135 | print('代理不可用') 136 | 137 | if __name__ == '__main__': 138 | main() 139 | ``` 140 | 141 | ## 配置 142 | 143 | 如果是需要禁用或者启用某些代理,可直接在WEB管理界面进行操作。 144 | 145 | 本项目的大部分配置均可在`config.py`中找到,默认配置已经可以适应绝大部分情况,一般来说不需要进行修改。 146 | 147 | ## 添加新的代理源 148 | 149 | 本项目的爬取器均在`fetchers`目录下,你也可以根据自己的需求对其中的爬取器进行修改或者扩展。 150 | 151 | 编写本项目的爬取器并不复杂,详细的操作步骤可见[此处](fetchers/),可以参考`fetchers`目录下已有的爬取器。 152 | 153 | ## 项目工作流程图 154 | 155 | 本项目主要包含三部分: 156 | 157 | 1. 爬取进程:主要包括`fetchers`目录和`proc/run_fetcher.py`文件 158 | 2. 验证进程:主要在`proc/run_validator.py`文件中 159 | 3. WEB与API:在`api`目录下 160 | 161 | 本项目的大致逻辑图如下: 162 | 163 | 注:为了便于理解与画图,下图的逻辑是经过简化之后的逻辑,详细过程可查看代码以及相应的注释。 164 | 165 | ![workflow](docs/workflow.png) 166 | 167 | ## 验证算法相关 168 | 169 | 1. 如何验证代理可用 170 | 171 | 目前验证代理可用的算法较为简单,核心思想是使用`requests`库访问一个指定网页,查看是否访问成功。 172 | 173 | 相关配置参数(包括`超时时间`,`尝试次数`等)可在`config.py`中找到,具体代码逻辑在`proc/run_validator.py`中。 174 | 175 | 2. 什么时候该验证哪个代理 176 | 177 | 这个问题比较复杂,很难有一个完美的解决方案,因此目前的算法较为简单,勉强可用,可在[db](db)目录下找到对于目前算法的说明。 178 | 179 | 如果你有更好的算法,欢迎通过Issues和我们讨论,也可以根据[db](db)目录下的[README](db/README.md)文件对代码进行修改。 180 | -------------------------------------------------------------------------------- /api/README.md: -------------------------------------------------------------------------------- 1 | # API目录 2 | 3 | 使用Flask搭建了一个简单的API服务器,主要包含两部分: 4 | 5 | 1. 获取代理的API,使用方法详见[项目主页](https://github.com/OxOOo/ProxyPoolWithUI)。 6 | 2. 
托管网页端的静态文件,并提供若干API给网页端使用。 7 | -------------------------------------------------------------------------------- /api/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 -------------------------------------------------------------------------------- /api/api.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import os 4 | import logging 5 | from flask import Flask 6 | from flask import jsonify, request, redirect, send_from_directory 7 | 8 | log = logging.getLogger('werkzeug') 9 | log.disabled = True 10 | 11 | try: 12 | from db import conn 13 | except: 14 | import sys 15 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 16 | from db import conn 17 | 18 | STATIC_FOLDER = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'frontend', 'deployment') 19 | 20 | app = Flask( 21 | __name__, 22 | static_url_path='/web', 23 | static_folder=STATIC_FOLDER 24 | ) 25 | 26 | ############# 以下API可用于获取代理 ################ 27 | 28 | # 可用于测试API状态 29 | @app.route('/ping', methods=['GET']) 30 | def ping(): 31 | return 'API OK' 32 | 33 | # 随机获取一个可用代理,如果没有可用代理则返回空白 34 | @app.route('/fetch_random', methods=['GET']) 35 | def fetch_random(): 36 | proxies = conn.getValidatedRandom(1) 37 | if len(proxies) > 0: 38 | p = proxies[0] 39 | return f'{p.protocol}://{p.ip}:{p.port}' 40 | else: 41 | return '' 42 | 43 | ############# 新增加接口int ################ 44 | 45 | #api 获取协议为http的一条结果 46 | @app.route('/fetch_http', methods=['GET']) 47 | def fetch_http(): 48 | proxies =conn.get_by_protocol('http', 1) 49 | if len(proxies) > 0: 50 | p = proxies[0] 51 | return f'{p.protocol}://{p.ip}:{p.port}' 52 | else: 53 | return '' 54 | 55 | #api 获取协议为http的全部结果 56 | @app.route('/fetch_http_all', methods=['GET']) 57 | def fetch_http_all(): 58 | proxies = conn.get_by_protocol('http', -1) 59 | if len(proxies) == 1: 60 | p = proxies[0] 61 | return f'{p.protocol}://{p.ip}:{p.port}' 62 | elif len(proxies) > 1: 63 | proxy_list = [] 64 | for p in proxies: 65 | proxy_list.append(f'{p.protocol}://{p.ip}:{p.port}') 66 | return ','.join(proxy_list) 67 | else: 68 | return '' 69 | 70 | #api 获取协议为https的一条结果 71 | @app.route('/fetch_https', methods=['GET']) 72 | def fetch_https(): 73 | proxies =conn.get_by_protocol('https', 1) 74 | if len(proxies) > 0: 75 | p = proxies[0] 76 | return f'{p.protocol}://{p.ip}:{p.port}' 77 | else: 78 | return '' 79 | 80 | #api 获取协议为https的全部结果 81 | @app.route('/fetch_https_all', methods=['GET']) 82 | def fetch_https_all(): 83 | proxies = conn.get_by_protocol('https', -1) 84 | if len(proxies) == 1: 85 | p = proxies[0] 86 | return f'{p.protocol}://{p.ip}:{p.port}' 87 | elif len(proxies) > 1: 88 | proxy_list = [] 89 | for p in proxies: 90 | proxy_list.append(f'{p.protocol}://{p.ip}:{p.port}') 91 | return ','.join(proxy_list) 92 | else: 93 | return '' 94 | 95 | #api 获取协议为http的一条结果 96 | @app.route('/fetch_socks4', methods=['GET']) 97 | def fetch_socks4(): 98 | proxies =conn.get_by_protocol('socks4', 1) 99 | if len(proxies) > 0: 100 | p = proxies[0] 101 | return f'{p.protocol}://{p.ip}:{p.port}' 102 | else: 103 | return '' 104 | 105 | #api 获取协议为http的全部结果 106 | @app.route('/fetch_socks4_all', methods=['GET']) 107 | def fetch_socks4_all(): 108 | proxies = conn.get_by_protocol('socks4', -1) 109 | if len(proxies) == 1: 110 | p = proxies[0] 111 | return f'{p.protocol}://{p.ip}:{p.port}' 112 | elif len(proxies) > 1: 113 | proxy_list = [] 114 | for p in proxies: 115 | 
proxy_list.append(f'{p.protocol}://{p.ip}:{p.port}') 116 | return ','.join(proxy_list) 117 | else: 118 | return '' 119 | 120 | #api 获取协议为https的一条结果 121 | @app.route('/fetch_socks5', methods=['GET']) 122 | def fetch_socks5(): 123 | proxies =conn.get_by_protocol('socks5', 1) 124 | if len(proxies) > 0: 125 | p = proxies[0] 126 | return f'{p.protocol}://{p.ip}:{p.port}' 127 | else: 128 | return '' 129 | 130 | #api 获取协议为https的全部结果 131 | @app.route('/fetch_socks5_all', methods=['GET']) 132 | def fetch_socks5_all(): 133 | proxies = conn.get_by_protocol('socks5', -1) 134 | if len(proxies) == 1: 135 | p = proxies[0] 136 | return f'{p.protocol}://{p.ip}:{p.port}' 137 | elif len(proxies) > 1: 138 | proxy_list = [] 139 | for p in proxies: 140 | proxy_list.append(f'{p.protocol}://{p.ip}:{p.port}') 141 | return ','.join(proxy_list) 142 | else: 143 | return '' 144 | 145 | ############# 新增加接口end ################ 146 | 147 | # 获取所有可用代理,如果没有可用代理则返回空白 148 | @app.route('/fetch_all', methods=['GET']) 149 | def fetch_all(): 150 | proxies = conn.getValidatedRandom(-1) 151 | proxies = [f'{p.protocol}://{p.ip}:{p.port}' for p in proxies] 152 | return ','.join(proxies) 153 | 154 | ############# 以下API主要给网页使用 ################ 155 | 156 | @app.route('/') 157 | def index(): 158 | return redirect('/web') 159 | 160 | # 网页:首页 161 | @app.route('/web', methods=['GET']) 162 | @app.route('/web/', methods=['GET']) 163 | def page_index(): 164 | return send_from_directory(STATIC_FOLDER, 'index.html') 165 | 166 | # 网页:爬取器状态 167 | @app.route('/web/fetchers', methods=['GET']) 168 | @app.route('/web/fetchers/', methods=['GET']) 169 | def page_fetchers(): 170 | return send_from_directory(STATIC_FOLDER, 'fetchers/index.html') 171 | 172 | # 获取代理状态 173 | @app.route('/proxies_status', methods=['GET']) 174 | def proxies_status(): 175 | proxies = conn.getValidatedRandom(-1) 176 | proxies = sorted(proxies, key=lambda p: f'{p.protocol}://{p.ip}:{p.port}', reverse=True) 177 | proxies = [p.to_dict() for p in proxies] 178 | 179 | status = conn.getProxiesStatus() 180 | 181 | return jsonify(dict( 182 | success=True, 183 | proxies=proxies, 184 | **status 185 | )) 186 | 187 | # 获取爬取器状态 188 | @app.route('/fetchers_status', methods=['GET']) 189 | def fetchers_status(): 190 | proxies = conn.getValidatedRandom(-1) # 获取所有可用代理 191 | fetchers = conn.getAllFetchers() 192 | fetchers = [f.to_dict() for f in fetchers] 193 | 194 | for f in fetchers: 195 | f['validated_cnt'] = len([_ for _ in proxies if _.fetcher_name == f['name']]) 196 | f['in_db_cnt'] = conn.getProxyCount(f['name']) 197 | 198 | return jsonify(dict( 199 | success=True, 200 | fetchers=fetchers 201 | )) 202 | 203 | # 清空爬取器状态 204 | @app.route('/clear_fetchers_status', methods=['GET']) 205 | def clear_fetchers_status(): 206 | conn.pushClearFetchersStatus() 207 | return jsonify(dict(success=True)) 208 | 209 | # 设置是否启用特定爬取器,?name=str,enable=0/1 210 | @app.route('/fetcher_enable', methods=['GET']) 211 | def fetcher_enable(): 212 | name = request.args.get('name') 213 | enable = request.args.get('enable') 214 | if enable == '1': 215 | conn.pushFetcherEnable(name, True) 216 | else: 217 | conn.pushFetcherEnable(name, False) 218 | return jsonify(dict(success=True)) 219 | 220 | ############# 其他 ################ 221 | 222 | # 跨域支持,主要是在开发网页端的时候需要使用 223 | def after_request(resp): 224 | ALLOWED_ORIGIN = ['0.0.0.0', '127.0.0.1', 'localhost'] 225 | origin = request.headers.get('origin', None) 226 | if origin is not None: 227 | for item in ALLOWED_ORIGIN: 228 | if item in origin: 229 | 
resp.headers['Access-Control-Allow-Origin'] = origin 230 | resp.headers['Access-Control-Allow-Credentials'] = 'true' 231 | return resp 232 | app.after_request(after_request) 233 | 234 | def main(proc_lock): 235 | if proc_lock is not None: 236 | conn.set_proc_lock(proc_lock) 237 | # 因为默认sqlite3中,同一个数据库连接不能在多线程环境下使用,所以这里需要禁用flask的多线程 238 | app.run(host='0.0.0.0', port=5000, threaded=False) 239 | 240 | if __name__ == '__main__': 241 | main(None) 242 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | """ 4 | 配置文件,一般来说不需要修改 5 | 如果需要启用或者禁用某些网站的爬取器,可在网页上进行配置 6 | """ 7 | 8 | import os 9 | 10 | # 数据库文件路径 11 | DATABASE_PATH = os.path.join(os.path.dirname(__file__), 'data.db') 12 | 13 | # 每次运行所有爬取器之后,睡眠多少时间,单位秒 14 | PROC_FETCHER_SLEEP = 5 * 60 15 | 16 | # 验证器每次睡眠的时间,单位秒 17 | PROC_VALIDATOR_SLEEP = 5 18 | 19 | # 验证器的配置参数 20 | VALIDATE_THREAD_NUM = 200 # 验证线程数量 21 | # 验证器的逻辑是: 22 | # 使用代理访问 VALIDATE_URL 网站,超时时间设置为 VALIDATE_TIMEOUT 23 | # 如果没有超时: 24 | # 1、若选择的验证方式为GET: 返回的网页中包含 VALIDATE_KEYWORD 文字,那么就认为本次验证成功 25 | # 2、若选择的验证方式为HEAD: 返回的响应头中,对于的 VALIDATE_HEADER 响应字段内容包含 VALIDATE_KEYWORD 内容,那么就认为本次验证成功 26 | # 上述过程最多进行 VALIDATE_MAX_FAILS 次,只要有一次成功,就认为代理可用 27 | VALIDATE_URL = 'https://qq.com' 28 | VALIDATE_METHOD = 'HEAD' # 验证方式,可选:GET、HEAD 29 | VALIDATE_HEADER = 'location' # 仅用于HEAD验证方式,百度响应头Server字段KEYWORD可填:bfe 30 | VALIDATE_KEYWORD = 'www.qq.com' 31 | VALIDATE_TIMEOUT = 5 # 超时时间,单位s 32 | VALIDATE_MAX_FAILS = 3 33 | -------------------------------------------------------------------------------- /db/Fetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import datetime 4 | 5 | class Fetcher(object): 6 | """ 7 | 爬取器的状态储存在数据库中,包括是否启用爬取器,爬取到的代理数量等 8 | """ 9 | 10 | ddls = [""" 11 | CREATE TABLE IF NOT EXISTS fetchers 12 | ( 13 | name VARCHAR(255) NOT NULL, 14 | enable BOOLEAN NOT NULL, 15 | sum_proxies_cnt INTEGER NOT NULL, 16 | last_proxies_cnt INTEGER NOT NULL, 17 | last_fetch_date TIMESTAMP, 18 | PRIMARY KEY (name) 19 | ) 20 | """] 21 | 22 | def __init__(self): 23 | self.name = None 24 | self.enable = True 25 | self.sum_proxies_cnt = 0 26 | self.last_proxies_cnt = 0 27 | self.last_fetch_date = None 28 | 29 | def params(self): 30 | """ 31 | 返回一个元组,包含自身的全部属性 32 | """ 33 | return ( 34 | self.name, self.enable, 35 | self.sum_proxies_cnt, self.last_proxies_cnt, self.last_fetch_date 36 | ) 37 | 38 | def to_dict(self): 39 | """ 40 | 返回一个dict,包含自身的全部属性 41 | """ 42 | return { 43 | 'name': self.name, 44 | 'enable': self.enable, 45 | 'sum_proxies_cnt': self.sum_proxies_cnt, 46 | 'last_proxies_cnt': self.last_proxies_cnt, 47 | 'last_fetch_date': str(self.last_fetch_date) if self.last_fetch_date is not None else None 48 | } 49 | 50 | @staticmethod 51 | def decode(row): 52 | """ 53 | 将sqlite返回的一行解析为Fetcher 54 | row : sqlite返回的一行 55 | """ 56 | assert len(row) == 5 57 | f = Fetcher() 58 | f.name = row[0] 59 | f.enable = bool(row[1]) 60 | f.sum_proxies_cnt = row[2] 61 | f.last_proxies_cnt = row[3] 62 | f.last_fetch_date = row[4] 63 | return f 64 | -------------------------------------------------------------------------------- /db/Proxy.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import datetime 4 | import random 5 | class Proxy(object): 6 | """ 7 | 代理,用于表示数据库中的一个记录 8 | """ 9 | 10 | ddls = [""" 11 | CREATE TABLE IF NOT EXISTS proxies 
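-- one row per proxy endpoint: the primary key is (protocol, ip, port), so a proxy reported by
-- several fetchers is stored only once and fetcher_name records the fetcher that last reported it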
12 | ( 13 | fetcher_name VARCHAR(255) NOT NULL, 14 | protocol VARCHAR(32) NOT NULL, 15 | ip VARCHAR(255) NOT NULL, 16 | port INTEGER NOT NULL, 17 | validated BOOLEAN NOT NULL, 18 | latency INTEGER, 19 | validate_date TIMESTAMP, 20 | to_validate_date TIMESTAMP NOT NULL, 21 | validate_failed_cnt INTEGER NOT NULL, 22 | PRIMARY KEY (protocol, ip, port) 23 | ) 24 | """, 25 | """ 26 | CREATE INDEX IF NOT EXISTS proxies_fetcher_name_index 27 | ON proxies(fetcher_name) 28 | """, 29 | """ 30 | CREATE INDEX IF NOT EXISTS proxies_to_validate_date_index 31 | ON proxies(to_validate_date ASC) 32 | """] 33 | 34 | def __init__(self): 35 | self.fetcher_name = None 36 | self.protocol = None 37 | self.ip = None 38 | self.port = None 39 | self.validated = False 40 | self.latency = None 41 | self.validate_date = None 42 | self.to_validate_date = datetime.datetime.now() 43 | self.validate_failed_cnt = 0 44 | 45 | def params(self): 46 | """ 47 | 返回一个元组,包含自身的全部属性 48 | """ 49 | return ( 50 | self.fetcher_name, 51 | self.protocol, self.ip, self.port, 52 | self.validated, self.latency, 53 | self.validate_date, self.to_validate_date, self.validate_failed_cnt 54 | ) 55 | 56 | def to_dict(self): 57 | """ 58 | 返回一个dict,包含自身的全部属性 59 | """ 60 | return { 61 | 'fetcher_name': self.fetcher_name, 62 | 'protocol': self.protocol, 63 | 'ip': self.ip, 64 | 'port': self.port, 65 | 'validated': self.validated, 66 | 'latency': self.latency, 67 | 'validate_date': str(self.validate_date) if self.validate_date is not None else None, 68 | 'to_validate_date': str(self.to_validate_date) if self.to_validate_date is not None else None, 69 | 'validate_failed_cnt': self.validate_failed_cnt 70 | } 71 | 72 | @staticmethod 73 | def decode(row): 74 | """ 75 | 将sqlite返回的一行解析为Proxy 76 | row : sqlite返回的一行 77 | """ 78 | assert len(row) == 9 79 | p = Proxy() 80 | p.fetcher_name = row[0] 81 | p.protocol = row[1] 82 | p.ip = row[2] 83 | p.port = row[3] 84 | p.validated = bool(row[4]) 85 | p.latency = row[5] 86 | p.validate_date = row[6] 87 | p.to_validate_date = row[7] 88 | p.validate_failed_cnt = row[8] 89 | return p 90 | 91 | def validate(self, success, latency): 92 | """ 93 | 传入一次验证结果,根据验证结果调整自身属性,并返回是否删除这个代理 94 | success : True/False,表示本次验证是否成功 95 | 返回 : True/False,True表示这个代理太差了,应该从数据库中删除 96 | """ 97 | self.latency = latency 98 | if success: # 验证成功 99 | self.validated = True 100 | self.validate_date = datetime.datetime.now() 101 | self.validate_failed_cnt = 0 102 | #self.to_validate_date = datetime.datetime.now() + datetime.timedelta(minutes=30) # 30分钟之后继续验证 103 | self.to_validate_date = datetime.datetime.now() + datetime.timedelta(minutes=random.randint(10, 60)) # 10·60分钟之后继续验证 104 | return False 105 | else: 106 | self.validated = False 107 | self.validate_date = datetime.datetime.now() 108 | self.validate_failed_cnt = self.validate_failed_cnt + 1 109 | 110 | # 验证失败的次数越多,距离下次验证的时间越长 111 | delay_minutes = self.validate_failed_cnt * 10 112 | self.to_validate_date = datetime.datetime.now() + datetime.timedelta(minutes=delay_minutes) 113 | 114 | if self.validate_failed_cnt >= 6: 115 | return True 116 | else: 117 | return False 118 | -------------------------------------------------------------------------------- /db/README.md: -------------------------------------------------------------------------------- 1 | # 数据库封装 2 | 3 | 这个目录下封装了操作数据库的一些接口。 4 | 为了通用性,本项目使用SQLite作为底层的数据库,使用`sqlite3`提供的接口对数据库进行操作。 5 | 6 | ## 数据表 7 | 8 | 主要包含两个表,分别用于储存代理和爬取器: 9 | 10 | 1. 
代理 11 | 12 | | 字段名称 | 数据类型 | 说明 | 13 | |---------------------|----------|--------------------------------------------------------------------------| 14 | | fetcher_name | 字符串 | 这个代理来自哪个爬取器 | 15 | | protocol | 字符串 | 代理协议名称,一般为HTTP | 16 | | ip | 字符串 | 代理的IP地址 | 17 | | port | 整数 | 代理的端口号 | 18 | | validated | 布尔值 | 这个代理是否通过了验证,通过了验证表示当前代理可用 | 19 | | latency | 整数 | 延迟(单位毫秒),表示上次验证所用的时间,越小则代理质量越好 | 20 | | validate_date | 时间戳 | 上一次进行验证的时间 | 21 | | to_validate_date | 时间戳 | 下一次进行验证的时间,如何调整下一次验证的时间可见后文或者代码`Proxy.py` | 22 | | validate_failed_cnt | 整数 | 已经连续验证失败了多少次,会影响下一次验证的时间 | 23 | 24 | 2. 爬取器 25 | 26 | | 字段名称 | 数据类型 | 说明 | 27 | |------------------|----------|----------------------------------------------------------------------------------| 28 | | name | 字符串 | 爬取器的名称 | 29 | | enable | 布尔值 | 是否启用这个爬取器,被禁用的爬取器不会在之后被运行,但是其之前爬取的代理依然存在 | 30 | | sum_proxies_cnt | 整数 | 至今为止总共爬取到了多少个代理 | 31 | | last_proxies_cnt | 整数 | 上次爬取到了多少个代理 | 32 | | last_fetch_date | 时间戳 | 上次爬取的时间 | 33 | 34 | ## 下次验证时间调整算法 35 | 36 | 由于不同代理网站公开的免费代理质量差距较大,因此对于多次验证都失败的代理,我们需要降低对他们进行验证的频率,甚至将他们从数据库中删除。 37 | 而对于现在可用的代理,则需要频繁对其进行验证,以保证其可用性。 38 | 39 | 目前的算法较为简单,可见`Proxy.py`文件中的`validate`函数,核心思想如下: 40 | 41 | 1. 优先验证之前验证通过并且到了验证时间的代理(`conn.py`中的`getToValidate`函数) 42 | 2. 对于爬取器新爬取到的代理,我们需要尽快对其进行验证(设置`to_validate_date`为当前时间) 43 | 3. 如果某个代理验证成功,那么设置它下一次进行验证的时间为5分钟之后 44 | 4. 如果某个代理验证失败,那么设置它下一次进行验证的时间为 5 * 连续失败次数 分钟之后,如果连续3次失败,那么将其从数据库中删除 45 | 46 | 你可以修改为自己的算法,主要代码涉及`Proxy.py`文件以及`conn.py`文件的`pushNewFetch`和`getToValidate`函数。 47 | -------------------------------------------------------------------------------- /db/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from .init import init 4 | 5 | init() 6 | -------------------------------------------------------------------------------- /db/conn.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | """ 4 | 封装的数据库接口 5 | """ 6 | 7 | from config import DATABASE_PATH 8 | from .Proxy import Proxy 9 | from .Fetcher import Fetcher 10 | import sqlite3 11 | import datetime 12 | import threading 13 | 14 | conn = sqlite3.connect(DATABASE_PATH, detect_types=sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES) 15 | # 线程锁 16 | conn_lock = threading.Lock() 17 | # 进程锁 18 | proc_lock = None 19 | 20 | def set_proc_lock(proc_lock_sub): 21 | """ 22 | 设置进程锁 23 | proc_lock_sub : main中的进程锁 24 | """ 25 | global proc_lock 26 | proc_lock = proc_lock_sub 27 | 28 | def pushNewFetch(fetcher_name, protocol, ip, port): 29 | """ 30 | 爬取器新抓到了一个代理,调用本函数将代理放入数据库 31 | fetcher_name : 爬取器名称 32 | protocol : 代理协议 33 | ip : 代理IP地址 34 | port : 代理端口 35 | """ 36 | p = Proxy() 37 | p.fetcher_name = fetcher_name 38 | p.protocol = protocol 39 | p.ip = ip 40 | p.port = port 41 | conn_lock.acquire() 42 | proc_lock.acquire() 43 | 44 | c = conn.cursor() 45 | c.execute('BEGIN EXCLUSIVE TRANSACTION;') 46 | # 更新proxies表 47 | c.execute('SELECT * FROM proxies WHERE protocol=? AND ip=? AND port=?', (p.protocol, p.ip, p.port)) 48 | row = c.fetchone() 49 | if row is not None: # 已经存在(protocol, ip, port) 50 | old_p = Proxy.decode(row) 51 | c.execute(""" 52 | UPDATE proxies SET fetcher_name=?,to_validate_date=? WHERE protocol=? AND ip=? AND port=? 
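-- to_validate_date is set to min(now, old value) below, so a re-fetched proxy is re-validated promptly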
53 | """, (p.fetcher_name, min(datetime.datetime.now(), old_p.to_validate_date), p.protocol, p.ip, p.port)) 54 | else: 55 | c.execute('INSERT INTO proxies VALUES (?,?,?,?,?,?,?,?,?)', p.params()) 56 | c.close() 57 | conn.commit() 58 | conn_lock.release() 59 | proc_lock.release() 60 | 61 | def getToValidate(max_count=1): 62 | """ 63 | 从数据库中获取待验证的代理,根据to_validate_date字段 64 | 优先选取已经通过了验证的代理,其次是没有通过验证的代理 65 | max_count : 返回数量限制 66 | 返回 : list[Proxy] 67 | """ 68 | conn_lock.acquire() 69 | proc_lock.acquire() 70 | c = conn.cursor() 71 | c.execute('BEGIN EXCLUSIVE TRANSACTION;') 72 | c.execute('SELECT * FROM proxies WHERE to_validate_date<=? AND validated=? ORDER BY to_validate_date LIMIT ?', ( 73 | datetime.datetime.now(), 74 | True, 75 | max_count 76 | )) 77 | proxies = [Proxy.decode(row) for row in c] 78 | c.execute('SELECT * FROM proxies WHERE to_validate_date<=? AND validated=? ORDER BY to_validate_date LIMIT ?', ( 79 | datetime.datetime.now(), 80 | False, 81 | max_count - len(proxies) 82 | )) 83 | proxies = proxies + [Proxy.decode(row) for row in c] 84 | c.close() 85 | conn.commit() 86 | conn_lock.release() 87 | proc_lock.release() 88 | return proxies 89 | 90 | def pushValidateResult(proxy, success, latency): 91 | """ 92 | 将验证器的一个结果添加进数据库中 93 | proxy : 代理 94 | success : True/False,验证是否成功 95 | latency : 本次验证所用的时间(单位毫秒) 96 | """ 97 | p = proxy 98 | should_remove = p.validate(success, latency) 99 | conn_lock.acquire() 100 | proc_lock.acquire() 101 | if should_remove: 102 | conn.execute('DELETE FROM proxies WHERE protocol=? AND ip=? AND port=?', (p.protocol, p.ip, p.port)) 103 | else: 104 | conn.execute(""" 105 | UPDATE proxies 106 | SET fetcher_name=?,validated=?,latency=?,validate_date=?,to_validate_date=?,validate_failed_cnt=? 107 | WHERE protocol=? AND ip=? AND port=? 108 | """, ( 109 | p.fetcher_name, p.validated, p.latency, p.validate_date, p.to_validate_date, p.validate_failed_cnt, 110 | p.protocol, p.ip, p.port 111 | )) 112 | conn.commit() 113 | conn_lock.release() 114 | proc_lock.release() 115 | 116 | def getValidatedRandom(max_count): 117 | """ 118 | 从通过了验证的代理中,随机选择max_count个代理返回 119 | max_count<=0表示不做数量限制 120 | 返回 : list[Proxy] 121 | """ 122 | conn_lock.acquire() 123 | proc_lock.acquire() 124 | if max_count > 0: 125 | r = conn.execute('SELECT * FROM proxies WHERE validated=? ORDER BY RANDOM() LIMIT ?', (True, max_count)) 126 | else: 127 | r = conn.execute('SELECT * FROM proxies WHERE validated=? ORDER BY RANDOM()', (True,)) 128 | proxies = [Proxy.decode(row) for row in r] 129 | r.close() 130 | conn_lock.release() 131 | proc_lock.release() 132 | return proxies 133 | 134 | #新增方法 135 | def get_by_protocol(protocol, max_count): 136 | """ 137 | 查询 protocol 字段为指定值的代理服务器记录 138 | max_count 表示返回记录的最大数量,如果为 0 或负数则返回所有记录 139 | 返回 : list[Proxy] 140 | """ 141 | conn_lock.acquire() 142 | proc_lock.acquire() 143 | if max_count > 0: 144 | r = conn.execute('SELECT * FROM proxies WHERE protocol=? AND validated=? ORDER BY RANDOM() LIMIT ?', (protocol, True, max_count)) 145 | else: 146 | r = conn.execute('SELECT * FROM proxies WHERE protocol=? AND validated=? 
ORDER BY RANDOM()', (protocol, True)) 147 | proxies = [Proxy.decode(row) for row in r] 148 | r.close() 149 | conn_lock.release() 150 | proc_lock.release() 151 | return proxies 152 | 153 | def pushFetcherResult(name, proxies_cnt): 154 | """ 155 | 更新爬取器的状态,每次在完成一个网站的爬取之后,调用本函数 156 | name : 爬取器的名称 157 | proxies_cnt : 本次爬取到的代理数量 158 | """ 159 | conn_lock.acquire() 160 | proc_lock.acquire() 161 | c = conn.cursor() 162 | c.execute('BEGIN EXCLUSIVE TRANSACTION;') 163 | c.execute('SELECT * FROM fetchers WHERE name=?', (name,)) 164 | row = c.fetchone() 165 | if row is None: 166 | raise ValueError(f'ERRROR: can not find fetcher {name}') 167 | else: 168 | f = Fetcher.decode(row) 169 | f.last_proxies_cnt = proxies_cnt 170 | f.sum_proxies_cnt = f.sum_proxies_cnt + proxies_cnt 171 | f.last_fetch_date = datetime.datetime.now() 172 | c.execute('UPDATE fetchers SET sum_proxies_cnt=?,last_proxies_cnt=?,last_fetch_date=? WHERE name=?', ( 173 | f.sum_proxies_cnt, f.last_proxies_cnt, f.last_fetch_date, f.name 174 | )) 175 | c.close() 176 | conn.commit() 177 | conn_lock.release() 178 | proc_lock.release() 179 | 180 | def pushFetcherEnable(name, enable): 181 | """ 182 | 设置是否起用对应爬取器,被禁用的爬取器将不会被运行 183 | name : 爬取器的名称 184 | enable : True/False, 是否启用 185 | """ 186 | conn_lock.acquire() 187 | proc_lock.acquire() 188 | c = conn.cursor() 189 | c.execute('BEGIN EXCLUSIVE TRANSACTION;') 190 | c.execute('SELECT * FROM fetchers WHERE name=?', (name,)) 191 | row = c.fetchone() 192 | if row is None: 193 | raise ValueError(f'ERRROR: can not find fetcher {name}') 194 | else: 195 | f = Fetcher.decode(row) 196 | f.enable = enable 197 | c.execute('UPDATE fetchers SET enable=? WHERE name=?', ( 198 | f.enable, f.name 199 | )) 200 | c.close() 201 | conn.commit() 202 | conn_lock.release() 203 | proc_lock.release() 204 | 205 | def getAllFetchers(): 206 | """ 207 | 获取所有的爬取器以及状态 208 | 返回 : list[Fetcher] 209 | """ 210 | conn_lock.acquire() 211 | proc_lock.acquire() 212 | r = conn.execute('SELECT * FROM fetchers') 213 | fetchers = [Fetcher.decode(row) for row in r] 214 | r.close() 215 | conn_lock.release() 216 | proc_lock.release() 217 | return fetchers 218 | 219 | def getFetcher(name): 220 | """ 221 | 获取指定爬取器以及状态 222 | 返回 : Fetcher 223 | """ 224 | conn_lock.acquire() 225 | proc_lock.acquire() 226 | r = conn.execute('SELECT * FROM fetchers WHERE name=?', (name,)) 227 | row = r.fetchone() 228 | r.close() 229 | conn_lock.release() 230 | proc_lock.release() 231 | if row is None: 232 | return None 233 | else: 234 | return Fetcher.decode(row) 235 | 236 | def getProxyCount(fetcher_name): 237 | """ 238 | 查询在数据库中有多少个由指定爬取器爬取到的代理 239 | fetcher_name : 爬取器名称 240 | 返回 : int 241 | """ 242 | conn_lock.acquire() 243 | proc_lock.acquire() 244 | r = conn.execute('SELECT count(*) FROM proxies WHERE fetcher_name=?', (fetcher_name,)) 245 | cnt = r.fetchone()[0] 246 | r.close() 247 | conn_lock.release() 248 | proc_lock.release() 249 | return cnt 250 | 251 | def getProxiesStatus(): 252 | """ 253 | 获取代理状态,包括`全部代理数量`,`当前可用代理数量`,`等待验证代理数量` 254 | 返回 : dict 255 | """ 256 | conn_lock.acquire() 257 | proc_lock.acquire() 258 | r = conn.execute('SELECT count(*) FROM proxies') 259 | sum_proxies_cnt = r.fetchone()[0] 260 | r.close() 261 | 262 | r = conn.execute('SELECT count(*) FROM proxies WHERE validated=?', (True,)) 263 | validated_proxies_cnt = r.fetchone()[0] 264 | r.close() 265 | 266 | r = conn.execute('SELECT count(*) FROM proxies WHERE to_validate_date<=?', (datetime.datetime.now(),)) 267 | pending_proxies_cnt = r.fetchone()[0] 268 | r.close() 269 | 
conn_lock.release() 270 | proc_lock.release() 271 | return dict( 272 | sum_proxies_cnt=sum_proxies_cnt, 273 | validated_proxies_cnt=validated_proxies_cnt, 274 | pending_proxies_cnt=pending_proxies_cnt 275 | ) 276 | 277 | def pushClearFetchersStatus(): 278 | """ 279 | 清空爬取器的统计信息,包括sum_proxies_cnt,last_proxies_cnt,last_fetch_date 280 | """ 281 | conn_lock.acquire() 282 | proc_lock.acquire() 283 | c = conn.cursor() 284 | c.execute('BEGIN EXCLUSIVE TRANSACTION;') 285 | c.execute('UPDATE fetchers SET sum_proxies_cnt=?, last_proxies_cnt=?, last_fetch_date=?', (0, 0, None)) 286 | c.close() 287 | conn.commit() 288 | conn_lock.release() 289 | proc_lock.release() 290 | -------------------------------------------------------------------------------- /db/init.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from config import DATABASE_PATH 4 | from .Proxy import Proxy 5 | from .Fetcher import Fetcher 6 | from fetchers import fetchers 7 | import sqlite3 8 | 9 | def init(): 10 | """ 11 | 初始化数据库 12 | """ 13 | 14 | conn = sqlite3.connect(DATABASE_PATH) 15 | 16 | create_tables = Proxy.ddls + Fetcher.ddls 17 | for sql in create_tables: 18 | conn.execute(sql) 19 | conn.commit() 20 | 21 | # 注册所有的爬取器 22 | c = conn.cursor() 23 | c.execute('BEGIN EXCLUSIVE TRANSACTION;') 24 | for item in fetchers: 25 | c.execute('SELECT * FROM fetchers WHERE name=?', (item.name,)) 26 | if c.fetchone() is None: 27 | f = Fetcher() 28 | f.name = item.name 29 | c.execute('INSERT INTO fetchers VALUES(?,?,?,?,?)', f.params()) 30 | c.close() 31 | conn.commit() 32 | 33 | conn.close() 34 | -------------------------------------------------------------------------------- /docs/screenshot1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OxOOo/ProxyPoolWithUI/0e9aa3cf1c38f50aeafd53da71e8cd464c2723eb/docs/screenshot1.png -------------------------------------------------------------------------------- /docs/screenshot2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OxOOo/ProxyPoolWithUI/0e9aa3cf1c38f50aeafd53da71e8cd464c2723eb/docs/screenshot2.png -------------------------------------------------------------------------------- /docs/term.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OxOOo/ProxyPoolWithUI/0e9aa3cf1c38f50aeafd53da71e8cd464c2723eb/docs/term.png -------------------------------------------------------------------------------- /docs/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OxOOo/ProxyPoolWithUI/0e9aa3cf1c38f50aeafd53da71e8cd464c2723eb/docs/workflow.png -------------------------------------------------------------------------------- /fetchers/BaseFetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | class BaseFetcher(object): 4 | """ 5 | 所有爬取器的基类 6 | """ 7 | 8 | def fetch(self): 9 | """ 10 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 11 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 12 | """ 13 | raise NotImplementedError() 14 | -------------------------------------------------------------------------------- /fetchers/GoubanjiaFetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from 
.BaseFetcher import BaseFetcher 4 | import requests 5 | from pyquery import PyQuery as pq 6 | import re 7 | 8 | class GoubanjiaFetcher(BaseFetcher): 9 | """ 10 | http://www.goubanjia.com/ 11 | """ 12 | 13 | def fetch(self): 14 | """ 15 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 16 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 17 | """ 18 | 19 | proxies = [] 20 | 21 | headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'} 22 | html = requests.get('http://www.goubanjia.com/', headers=headers, timeout=10).text 23 | doc = pq(html) 24 | for item in doc('table tbody tr').items(): 25 | ipport = item.find('td.ip').html() 26 | # 以下对ipport进行整理 27 | hide_reg = re.compile(r']*style="display:[^<>]*none;"[^<>]*>[^<>]*
</p>
') 28 | ipport = re.sub(hide_reg, '', ipport) 29 | tag_reg = re.compile(r'<[^<>]*>') 30 | ipport = re.sub(tag_reg, '', ipport) 31 | 32 | ip = ipport.split(':')[0] 33 | port = self.pde(item.find('td.ip').find('span.port').attr('class').split(' ')[1]) 34 | proxies.append(('http', ip, int(port))) 35 | 36 | return list(set(proxies)) 37 | 38 | def pde(self, class_key): # 解密函数,端口是加密过的 39 | """ 40 | key是class内容 41 | """ 42 | class_key = str(class_key) 43 | f = [] 44 | for i in range(len(class_key)): 45 | f.append(str('ABCDEFGHIZ'.index(class_key[i]))) 46 | return str(int(''.join(f)) >> 0x3) 47 | -------------------------------------------------------------------------------- /fetchers/IHuanFetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from .BaseFetcher import BaseFetcher 4 | import requests 5 | from pyquery import PyQuery as pq 6 | import re 7 | 8 | class IHuanFetcher(BaseFetcher): 9 | """ 10 | https://ip.ihuan.me/ 11 | 爬这个网站要温柔点,站长表示可能会永久关站 12 | """ 13 | 14 | def fetch(self): 15 | """ 16 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 17 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 18 | """ 19 | 20 | proxies = [] 21 | ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$') 22 | port_regex = re.compile(r'^\d+$') 23 | 24 | pending_urls = ['https://ip.ihuan.me/'] 25 | while len(pending_urls) > 0: 26 | url = pending_urls[0] 27 | pending_urls = pending_urls[1:] 28 | 29 | headers = { 30 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 31 | 'Accept-Encoding': 'gzip, deflate', 32 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 33 | 'Cache-Control': 'no-cache', 34 | 'Connection': 'keep-alive', 35 | 'Pragma': 'no-cache', 36 | 'Upgrade-Insecure-Requests': '1', 37 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.130 Chrome/79.0.3945.130 Safari/537.36' 38 | } 39 | try: 40 | html = requests.get(url, headers=headers, timeout=10).text 41 | except Exception as e: 42 | print('ERROR in ip.ihuan.me:' + str(e)) 43 | continue 44 | doc = pq(html) 45 | for line in doc('tbody tr').items(): 46 | tds = list(line('td').items()) 47 | if len(tds) == 10: 48 | ip = tds[0].text().strip() 49 | port = tds[1].text().strip() 50 | if re.match(ip_regex, ip) is not None and re.match(port_regex, port) is not None: 51 | proxies.append(('http', ip, int(port))) 52 | 53 | if url.endswith('/'): # 当前是第一页,解析后面几页的链接 54 | for item in list(doc('.pagination a').items())[1:-1]: 55 | href = item.attr('href') 56 | if href is not None and href.startswith('?page='): 57 | pending_urls.append('https://ip.ihuan.me/' + href) 58 | 59 | return list(set(proxies)) 60 | -------------------------------------------------------------------------------- /fetchers/IP3366Fetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from .BaseFetcher import BaseFetcher 4 | import requests 5 | from pyquery import PyQuery as pq 6 | import re 7 | 8 | class IP3366Fetcher(BaseFetcher): 9 | """ 10 | http://www.ip3366.net/free/?stype=1 11 | """ 12 | 13 | def fetch(self): 14 | """ 15 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 16 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 17 | """ 18 | 19 | urls = [] 20 | for stype in ['1', '2']: 21 | for page in range(1, 6): 22 | url = 
f'http://www.ip3366.net/free/?stype={stype}&page={page}' 23 | urls.append(url) 24 | 25 | proxies = [] 26 | ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$') 27 | port_regex = re.compile(r'^\d+$') 28 | 29 | for url in urls: 30 | headers = { 31 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 32 | 'Accept-Encoding': 'gzip, deflate', 33 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 34 | 'Cache-Control': 'no-cache', 35 | 'Connection': 'keep-alive', 36 | 'Pragma': 'no-cache', 37 | 'Upgrade-Insecure-Requests': '1', 38 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.130 Chrome/79.0.3945.130 Safari/537.36' 39 | } 40 | html = requests.get(url, headers=headers, timeout=10).text 41 | doc = pq(html) 42 | for line in doc('tr').items(): 43 | tds = list(line('td').items()) 44 | if len(tds) == 7: 45 | ip = tds[0].text().strip() 46 | port = tds[1].text().strip() 47 | if re.match(ip_regex, ip) is not None and re.match(port_regex, port) is not None: 48 | proxies.append(('http', ip, int(port))) 49 | 50 | return list(set(proxies)) 51 | -------------------------------------------------------------------------------- /fetchers/IP66Fetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from .BaseFetcher import BaseFetcher 4 | import requests 5 | from pyquery import PyQuery as pq 6 | import re 7 | 8 | class IP66Fetcher(BaseFetcher): 9 | """ 10 | http://www.66ip.cn/ 11 | """ 12 | 13 | def fetch(self): 14 | """ 15 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 16 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 17 | """ 18 | 19 | urls = [] 20 | for areaindex in range(10): 21 | for page in range(1, 6): 22 | if areaindex == 0: 23 | url = f'http://www.66ip.cn/{page}.html' 24 | else: 25 | url = f'http://www.66ip.cn/areaindex_{areaindex}/{page}.html' 26 | urls.append(url) 27 | 28 | proxies = [] 29 | ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$') 30 | port_regex = re.compile(r'^\d+$') 31 | 32 | for url in urls: 33 | headers = { 34 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 35 | 'Accept-Encoding': 'gzip, deflate', 36 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 37 | 'Cache-Control': 'no-cache', 38 | 'Connection': 'keep-alive', 39 | 'Pragma': 'no-cache', 40 | 'Upgrade-Insecure-Requests': '1', 41 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.130 Chrome/79.0.3945.130 Safari/537.36' 42 | } 43 | html = requests.get(url, headers=headers, timeout=10).text 44 | doc = pq(html) 45 | for line in doc('table tr').items(): 46 | tds = list(line('td').items()) 47 | if len(tds) == 5: 48 | ip = tds[0].text().strip() 49 | port = tds[1].text().strip() 50 | if re.match(ip_regex, ip) is not None and re.match(port_regex, port) is not None: 51 | proxies.append(('http', ip, int(port))) 52 | 53 | return list(set(proxies)) 54 | -------------------------------------------------------------------------------- /fetchers/IP89Fetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from .BaseFetcher import BaseFetcher 4 | import requests 5 | from pyquery import PyQuery as pq 6 | import re 7 | 8 | class IP89Fetcher(BaseFetcher): 9 | """ 10 | https://www.89ip.cn/ 11 | """ 
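    # fetch() below follows the same pattern as most fetchers in this directory:
    # build the paginated list URLs (index_1.html ... index_5.html), request each page with
    # browser-like headers, parse the <tr> rows with pyquery, keep rows whose ip/port pass the
    # regex checks, and return the de-duplicated ('http', ip, port) tuples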
12 | 13 | def fetch(self): 14 | """ 15 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 16 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 17 | """ 18 | 19 | urls = [] 20 | for page in range(1, 6): 21 | url = f'https://www.89ip.cn/index_{page}.html' 22 | urls.append(url) 23 | 24 | proxies = [] 25 | ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$') 26 | port_regex = re.compile(r'^\d+$') 27 | 28 | for url in urls: 29 | headers = { 30 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 31 | 'Accept-Encoding': 'gzip, deflate', 32 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 33 | 'Cache-Control': 'no-cache', 34 | 'Connection': 'keep-alive', 35 | 'Pragma': 'no-cache', 36 | 'Upgrade-Insecure-Requests': '1', 37 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.130 Chrome/79.0.3945.130 Safari/537.36' 38 | } 39 | html = requests.get(url, headers=headers, timeout=10).text 40 | doc = pq(html) 41 | for line in doc('tr').items(): 42 | tds = list(line('td').items()) 43 | if len(tds) == 5: 44 | ip = tds[0].text().strip() 45 | port = tds[1].text().strip() 46 | if re.match(ip_regex, ip) is not None and re.match(port_regex, port) is not None: 47 | proxies.append(('http', ip, int(port))) 48 | 49 | return list(set(proxies)) 50 | -------------------------------------------------------------------------------- /fetchers/JiangxianliFetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from .BaseFetcher import BaseFetcher 4 | import requests 5 | from pyquery import PyQuery as pq 6 | import re 7 | 8 | class JiangxianliFetcher(BaseFetcher): 9 | """ 10 | https://ip.jiangxianli.com/?page=1 11 | """ 12 | 13 | def fetch(self): 14 | """ 15 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 16 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 17 | """ 18 | 19 | urls = [] 20 | for page in range(1, 5): 21 | url = f'https://ip.jiangxianli.com/?page={page}' 22 | urls.append(url) 23 | 24 | proxies = [] 25 | ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$') 26 | port_regex = re.compile(r'^\d+$') 27 | 28 | for url in urls: 29 | headers = { 30 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 31 | 'Accept-Encoding': 'gzip, deflate', 32 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 33 | 'Cache-Control': 'no-cache', 34 | 'Connection': 'keep-alive', 35 | 'Pragma': 'no-cache', 36 | 'Upgrade-Insecure-Requests': '1', 37 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.130 Chrome/79.0.3945.130 Safari/537.36' 38 | } 39 | html = requests.get(url, headers=headers, timeout=10).text 40 | doc = pq(html) 41 | for line in doc('tr').items(): 42 | tds = list(line('td').items()) 43 | if len(tds) >= 2: 44 | ip = tds[0].text().strip() 45 | port = tds[1].text().strip() 46 | if re.match(ip_regex, ip) is not None and re.match(port_regex, port) is not None: 47 | proxies.append(('http', ip, int(port))) 48 | 49 | return list(set(proxies)) 50 | -------------------------------------------------------------------------------- /fetchers/KaiXinFetcher.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | 4 | import requests 5 | from pyquery import PyQuery as pq 6 
| 7 | from .BaseFetcher import BaseFetcher 8 | 9 | class KaiXinFetcher(BaseFetcher): 10 | """ 11 | http://www.kxdaili.com/dailiip.html 12 | 代码由 [Zealot666](https://github.com/Zealot666) 提供 13 | """ 14 | 15 | def fetch(self): 16 | """ 17 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocol是协议名称,目前主要为http 18 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 19 | """ 20 | 21 | urls = [] 22 | urls = urls + [f'http://www.kxdaili.com/dailiip/1/{page}.html' for page in range(1, 11)] 23 | urls = urls + [f'http://www.kxdaili.com/dailiip/2/{page}.html' for page in range(1, 11)] 24 | 25 | proxies = [] 26 | ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$') 27 | port_regex = re.compile(r'^\d+$') 28 | 29 | for url in urls: 30 | html = requests.get(url, timeout=10).text 31 | doc = pq(html) 32 | for line in doc('tr').items(): 33 | tds = list(line('td').items()) 34 | if len(tds) >= 2: 35 | ip = tds[0].text().strip() 36 | port = tds[1].text().strip() 37 | if re.match(ip_regex, ip) is not None and re.match(port_regex, port) is not None: 38 | proxies.append(('http', ip, int(port))) 39 | 40 | return list(set(proxies)) 41 | -------------------------------------------------------------------------------- /fetchers/KuaidailiFetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from .BaseFetcher import BaseFetcher 4 | import requests 5 | from pyquery import PyQuery as pq 6 | 7 | class KuaidailiFetcher(BaseFetcher): 8 | """ 9 | https://www.kuaidaili.com/free 10 | """ 11 | 12 | def fetch(self): 13 | """ 14 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 15 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 16 | """ 17 | 18 | urls = [] 19 | urls = urls + [f'https://www.kuaidaili.com/free/inha/{page}/' for page in range(1, 11)] 20 | urls = urls + [f'https://www.kuaidaili.com/free/intr/{page}/' for page in range(1, 11)] 21 | 22 | proxies = [] 23 | 24 | for url in urls: 25 | html = requests.get(url, timeout=10).text 26 | doc = pq(html) 27 | for item in doc('table tbody tr').items(): 28 | ip = item.find('td[data-title="IP"]').text() 29 | port = int(item.find('td[data-title="PORT"]').text()) 30 | proxies.append(('http', ip, port)) 31 | 32 | return list(set(proxies)) 33 | -------------------------------------------------------------------------------- /fetchers/ProxyListFetcher.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import requests 4 | 5 | from .BaseFetcher import BaseFetcher 6 | 7 | 8 | class ProxyListFetcher(BaseFetcher): 9 | """ 10 | https://www.proxy-list.download/api/v1/get?type={{ protocol }}&_t={{ timestamp }} 11 | """ 12 | 13 | def fetch(self): 14 | proxies = [] 15 | type_list = ['socks4', 'socks5', 'http', 'https'] 16 | for protocol in type_list: 17 | url = "https://www.proxy-list.download/api/v1/get?type=" + protocol + "&_t=" + str(time.time()) 18 | proxies_list = requests.get(url).text.split("\n") 19 | for data in proxies_list: 20 | flag_idx = data.find(":") 21 | ip = data[:flag_idx] 22 | port = data[flag_idx + 1:-1] 23 | proxies.append((protocol, ip, port)) 24 | 25 | return list(set(proxies)) 26 | -------------------------------------------------------------------------------- /fetchers/ProxyScrapeFetcher.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import requests 4 | 5 | from .BaseFetcher import BaseFetcher 6 | 7 | 8 | class ProxyScrapeFetcher(BaseFetcher): 9 | """ 
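    Note: according to the project README, this source is not directly reachable from mainland China.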
10 | https://api.proxyscrape.com/?request=displayproxies&proxytype={{ protocol }}&_t={{ timestamp }} 11 | """ 12 | 13 | def fetch(self): 14 | proxies = [] 15 | type_list = ['socks4', 'socks5', 'http', 'https'] 16 | for protocol in type_list: 17 | url = "https://api.proxyscrape.com/?request=displayproxies&proxytype=" + protocol + "&_t=" + str( 18 | time.time()) 19 | resp = requests.get(url).text 20 | for data in resp.split("\n"): 21 | flag_idx = data.find(":") 22 | ip = data[:flag_idx] 23 | port = data[flag_idx + 1:-1] 24 | proxies.append((protocol, ip, port)) 25 | 26 | return list(set(proxies)) 27 | -------------------------------------------------------------------------------- /fetchers/ProxyscanFetcher.py: -------------------------------------------------------------------------------- 1 | from .BaseFetcher import BaseFetcher 2 | import requests 3 | import time 4 | 5 | class ProxyscanFetcher(BaseFetcher): 6 | """ 7 | https://www.proxyscan.io/api/proxy?last_check=9800&uptime=50&limit=20&_t={{ timestamp }} 8 | """ 9 | 10 | def fetch(self): 11 | """ 12 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocol是协议名称,目前主要为http 13 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 14 | """ 15 | proxies = [] 16 | # 此API为随机获取接口,获取策略为:重复取十次后去重 17 | for _ in range(10): 18 | url = "https://www.proxyscan.io/api/proxy?last_check=9800&uptime=50&limit=20&_t=" + str(time.time()) 19 | resp = requests.get(url).json() 20 | for data in resp: 21 | protocol = str.lower(data['Type'][0]) 22 | proxies.append((protocol, data['Ip'], data['Port'])) 23 | 24 | return list(set(proxies)) -------------------------------------------------------------------------------- /fetchers/README.md: -------------------------------------------------------------------------------- 1 | # 爬取器 2 | 3 | 所有的爬取器都在这个目录中,并且在`__init__.py`中进行了注册。 4 | 5 | ## 添加新的爬取器 6 | 7 | 本项目默认包含了数量不少的免费公开代理源,并且会持续更新,如果你发现有不错的免费代理源,欢迎通过Issues反馈给我们。 8 | 9 | 1. 编写爬取器代码 10 | 11 | 爬取器需要继承基类`BaseFetcher`,然后实现`fetch`函数。 12 | 13 | `fetch`函数没有输入参数,每次运行都返回一个列表,列表中包含本次爬取到的代理。返回的格式为(代理协议类型,代理IP,端口)。 14 | 15 | 示例: 16 | 17 | ```python 18 | class CustomFetcher(BaseFetcher): 19 | def fetch(self): 20 | return [('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 21 | ``` 22 | 23 | 2. 注册爬取器 24 | 25 | 编写好爬取器之后,还需要在`__init__.py`文件中进行注册,添加如下代码: 26 | 27 | **注意:爬取器的名称(name)一定不能重复。** 28 | 29 | ```python 30 | from .CustomFetcher import CustomFetcher 31 | 32 | fetchers = [ 33 | ... 34 | Fetcher(name='www.custom.com', fetcher=CustomFetcher), 35 | ... 36 | ] 37 | ``` 38 | 39 | 3. 
重启 40 | 41 | 完成上述步骤之后,重启进程即可。代码会自动将新爬取器添加到数据库中,爬取进程也会自动运行新爬取器。 42 | -------------------------------------------------------------------------------- /fetchers/UUFetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from .BaseFetcher import BaseFetcher 4 | import requests 5 | import json 6 | 7 | class UUFetcher(BaseFetcher): 8 | """ 9 | https://uu-proxy.com/ 10 | """ 11 | 12 | def fetch(self): 13 | """ 14 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 15 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 16 | """ 17 | 18 | headers = { 19 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 20 | 'Accept-Encoding': 'gzip, deflate', 21 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 22 | 'Cache-Control': 'no-cache', 23 | 'Connection': 'keep-alive', 24 | 'Pragma': 'no-cache', 25 | 'Upgrade-Insecure-Requests': '1', 26 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.130 Chrome/79.0.3945.130 Safari/537.36' 27 | } 28 | data = requests.get('https://uu-proxy.com/api/free', headers=headers, timeout=10).text 29 | free = json.loads(data)['free'] 30 | proxies = [(item['scheme'], item['ip'], item['port']) for item in free['proxies']] 31 | 32 | return list(set(proxies)) 33 | -------------------------------------------------------------------------------- /fetchers/XiLaFetcher.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | import random 4 | 5 | import requests 6 | from pyquery import PyQuery as pq 7 | 8 | from .BaseFetcher import BaseFetcher 9 | 10 | class XiLaFetcher(BaseFetcher): 11 | """ 12 | http://www.xiladaili.com/gaoni/ 13 | 代码由 [Zealot666](https://github.com/Zealot666) 提供 14 | """ 15 | def __init__(self): 16 | super().__init__() 17 | self.index = 0 18 | 19 | def fetch(self): 20 | """ 21 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocol是协议名称,目前主要为http 22 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 23 | """ 24 | self.index += 1 25 | new_index = self.index % 30 26 | 27 | urls = [] 28 | urls = urls + [f'http://www.xiladaili.com/gaoni/{page}/' for page in range(new_index, new_index + 11)] 29 | urls = urls + [f'http://www.xiladaili.com/http/{page}/' for page in range(new_index, new_index + 11)] 30 | 31 | proxies = [] 32 | ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$') 33 | port_regex = re.compile(r'^\d+$') 34 | 35 | for url in urls: 36 | time.sleep(1) 37 | html = requests.get(url, timeout=10).text 38 | doc = pq(html) 39 | for line in doc('tr').items(): 40 | tds = list(line('td').items()) 41 | if len(tds) >= 2: 42 | ip = tds[0].text().strip().split(":")[0] 43 | port = tds[0].text().strip().split(":")[1] 44 | if re.match(ip_regex, ip) is not None and re.match(port_regex, port) is not None: 45 | proxies.append(('http', ip, int(port))) 46 | 47 | proxies = list(set(proxies)) 48 | 49 | # 这个代理源数据太多了,验证器跑不过来 50 | # 所以只取一部分,一般来说也够用了 51 | if len(proxies) > 200: 52 | proxies = random.sample(proxies, 200) 53 | 54 | return proxies 55 | -------------------------------------------------------------------------------- /fetchers/XiaoShuFetcher.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | import random 4 | 5 | import requests 6 | from pyquery import PyQuery as pq 7 | 8 | from .BaseFetcher import 
BaseFetcher 9 | 10 | class XiaoShuFetcher(BaseFetcher): 11 | """ 12 | http://www.xsdaili.cn/ 13 | 代码由 [Zealot666](https://github.com/Zealot666) 提供 14 | """ 15 | def __init__(self): 16 | super().__init__() 17 | self.index = 0 18 | 19 | def fetch(self): 20 | """ 21 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocol是协议名称,目前主要为http 22 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 23 | """ 24 | self.index += 1 25 | new_index = self.index % 10 26 | 27 | urls = set() 28 | proxies = [] 29 | headers = { 30 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36" 31 | } 32 | for page in range(new_index, new_index + 1): 33 | response = requests.get("http://www.xsdaili.cn/dayProxy/" + str(page) + ".html", headers=headers, timeout=10) 34 | for item in pq(response.text)('a').items(): 35 | try: 36 | if "/dayProxy/ip" in item.attr("href"): 37 | urls.add("http://www.xsdaili.cn" + item.attr("href")) 38 | except Exception: 39 | continue 40 | for url in urls: 41 | response = requests.get(url, headers=headers, timeout=8) 42 | doc = pq(response.text) 43 | for item in doc(".cont").items(): 44 | for line in item.text().split("\n"): 45 | ip = line.split('@')[0].split(':')[0] 46 | port = line.split('@')[0].split(':')[1] 47 | proxies.append(("http", ip, port)) 48 | 49 | proxies = list(set(proxies)) 50 | 51 | # 这个代理源数据太多了,验证器跑不过来 52 | # 所以只取一部分,一般来说也够用了 53 | if len(proxies) > 200: 54 | proxies = random.sample(proxies, 200) 55 | 56 | return proxies 57 | -------------------------------------------------------------------------------- /fetchers/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from collections import namedtuple 4 | 5 | Fetcher = namedtuple('Fetcher', ['name', 'fetcher']) 6 | 7 | from .UUFetcher import UUFetcher 8 | from .KuaidailiFetcher import KuaidailiFetcher 9 | from .GoubanjiaFetcher import GoubanjiaFetcher 10 | from .IP66Fetcher import IP66Fetcher 11 | from .IP3366Fetcher import IP3366Fetcher 12 | from .JiangxianliFetcher import JiangxianliFetcher 13 | from .IHuanFetcher import IHuanFetcher 14 | from .IP89Fetcher import IP89Fetcher 15 | from .ProxyscanFetcher import ProxyscanFetcher 16 | from .KaiXinFetcher import KaiXinFetcher 17 | from .XiLaFetcher import XiLaFetcher 18 | from .XiaoShuFetcher import XiaoShuFetcher 19 | from .ProxyListFetcher import ProxyListFetcher 20 | from .ProxyScrapeFetcher import ProxyScrapeFetcher 21 | 22 | fetchers = [ 23 | Fetcher(name='uu-proxy.com', fetcher=UUFetcher), 24 | Fetcher(name='www.kuaidaili.com', fetcher=KuaidailiFetcher), 25 | Fetcher(name='www.goubanjia.com', fetcher=GoubanjiaFetcher), 26 | Fetcher(name='www.66ip.cn', fetcher=IP66Fetcher), 27 | Fetcher(name='www.ip3366.net', fetcher=IP3366Fetcher), 28 | Fetcher(name='ip.jiangxianli.com', fetcher=JiangxianliFetcher), 29 | Fetcher(name='ip.ihuan.me', fetcher=IHuanFetcher), 30 | Fetcher(name='www.proxyscan.io', fetcher=ProxyscanFetcher), 31 | Fetcher(name='www.89ip.cn', fetcher=IP89Fetcher), 32 | Fetcher(name='www.kxdaili.com', fetcher=KaiXinFetcher), 33 | Fetcher(name='www.xiladaili.com', fetcher=XiLaFetcher), 34 | Fetcher(name='www.xsdaili.cn', fetcher=XiaoShuFetcher), 35 | Fetcher(name='www.proxy-list.download', fetcher=ProxyListFetcher), 36 | Fetcher(name='proxyscrape.com', fetcher=ProxyScrapeFetcher) 37 | ] 38 | -------------------------------------------------------------------------------- /frontend/README.md: 
-------------------------------------------------------------------------------- 1 | # Frontend directory 2 | 3 | This directory holds the code for the web management UI. The code has already been compiled into static files (in the `deployment` directory), so it normally does not need to be modified. 4 | 5 | `api/api.py` automatically serves the contents of `deployment` as a static web site. 6 | -------------------------------------------------------------------------------- /frontend/deployment/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OxOOo/ProxyPoolWithUI/0e9aa3cf1c38f50aeafd53da71e8cd464c2723eb/frontend/deployment/.nojekyll -------------------------------------------------------------------------------- /frontend/deployment/200.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 代理池网页管理界面 5 | 6 | 7 |
Loading...
8 | 9 | 10 | -------------------------------------------------------------------------------- /frontend/deployment/_nuxt/4e6036a.js: -------------------------------------------------------------------------------- 1 | !function(e){function r(data){for(var r,n,l=data[0],f=data[1],d=data[2],i=0,h=[];i 28 | * Released under the MIT License. 29 | */ 30 | 31 | /*! 32 | * vue-no-ssr v1.1.1 33 | * (c) 2018-present egoist <0x142857@gmail.com> 34 | * Released under the MIT License. 35 | */ 36 | 37 | //! moment.js 38 | 39 | //! moment.js locale configuration 40 | -------------------------------------------------------------------------------- /frontend/deployment/_nuxt/c6103f9.js: -------------------------------------------------------------------------------- 1 | (window.webpackJsonp=window.webpackJsonp||[]).push([[2],{1198:function(t,e,n){"use strict";n.r(e);n(133),n(79),n(80);var r=n(20),c=n(3),o=n.n(c),l=[{title:"名称",dataIndex:"name"},{title:"当前可用代理数量",dataIndex:"validated_cnt"},{dataIndex:"in_db_cnt",slots:{title:"inDbCntTitle"}},{title:"总共爬取代理数量",dataIndex:"sum_proxies_cnt"},{title:"上次爬取代理数量",dataIndex:"last_proxies_cnt"},{title:"上次爬取时间",dataIndex:"last_fetch_date",customRender:function(t){return t?o()(t).format("YYYY-MM-DD HH:mm:ss"):""}},{dataIndex:"enable",slots:{title:"enableTitle"},scopedSlots:{customRender:"enable"}}],d={data:function(){return{fetchers:[],columns:l,autoupdate:!0,lastupdate:"",handle:null}},mounted:function(){var t=this;this.handle=setInterval((function(){t.autoupdate&&t.update()}),2e3),this.update()},destroyed:function(){this.handle&&clearInterval(this.handle),this.handle=null},methods:{update:function(){var t=this;return Object(r.a)(regeneratorRuntime.mark((function e(){var data;return regeneratorRuntime.wrap((function(e){for(;;)switch(e.prev=e.next){case 0:return e.next=2,t.$http.get("/fetchers_status");case 2:data=e.sent,t.fetchers=data.fetchers,t.lastupdate=o()().format("HH:mm:ss");case 5:case"end":return e.stop()}}),e)})))()},clearStatus:function(){var t=this;return Object(r.a)(regeneratorRuntime.mark((function e(){return regeneratorRuntime.wrap((function(e){for(;;)switch(e.prev=e.next){case 0:return e.next=2,t.$http.get("/clear_fetchers_status");case 2:return t.$message.success("清空成功"),e.next=5,t.update();case 5:case"end":return e.stop()}}),e)})))()},enableChange:function(t){var e=this;return Object(r.a)(regeneratorRuntime.mark((function n(){return regeneratorRuntime.wrap((function(n){for(;;)switch(n.prev=n.next){case 0:if(!t.enable){n.next=5;break}return n.next=3,e.$http.get("/fetcher_enable",{name:t.name,enable:"0"});case 3:n.next=7;break;case 5:return n.next=7,e.$http.get("/fetcher_enable",{name:t.name,enable:"1"});case 7:return e.$message.success("修改成功"),n.next=10,e.update();case 10:case"end":return n.stop()}}),n)})))()}}},f=n(100),component=Object(f.a)(d,(function(){var t=this,e=t.$createElement,n=t._self._c||e;return n("div",[n("a-row",{attrs:{gutter:16}},[n("a-col",{attrs:{span:4}},[n("a-card",{attrs:{"body-style":{padding:"20px 24px 4px"}}},[n("p",[t._v("\n 自动刷新:\n "),n("a-switch",{model:{value:t.autoupdate,callback:function(e){t.autoupdate=e},expression:"autoupdate"}})],1),t._v(" "),n("p",[t._v("刷新时间:"+t._s(t.lastupdate))])])],1),t._v(" "),n("a-col",{attrs:{span:4}},[n("a-card",{attrs:{"body-style":{padding:"20px 24px 43px"}}},[n("div",{staticStyle:{"text-align":"center"}},[n("a-button",{attrs:{type:"primary"},on:{click:t.clearStatus}},[t._v("\n 清空统计信息\n ")]),t._v(" 
"),n("a-tooltip",{attrs:{title:"清空`总共爬取代理数量`等,已经爬取到的代理不会删除"}},[n("a-icon",{attrs:{type:"question-circle"}})],1)],1)])],1)],1),t._v(" "),n("br"),t._v(" "),n("a-table",{attrs:{columns:t.columns,"data-source":t.fetchers,"row-key":"name",pagination:!1,bordered:!0},scopedSlots:t._u([{key:"enable",fn:function(e,r){return[n("a-switch",{attrs:{"default-checked":e},on:{change:function(e){return t.enableChange(r)}}})]}}])},[n("span",{attrs:{slot:"inDbCntTitle"},slot:"inDbCntTitle"},[t._v("\n 数据库中的代理数量\n "),n("a-tooltip",{scopedSlots:t._u([{key:"title",fn:function(){return[n("span",[t._v("\n 当前数据库中,有多少代理是这个爬取器爬到的。\n 和`总共爬取代理数量`不同的地方在于,这个去掉了重复的和已经删除的代理。\n ")])]},proxy:!0}])},[t._v(" "),n("a-icon",{attrs:{type:"question-circle"}})],1)],1),t._v(" "),n("span",{attrs:{slot:"enableTitle"},slot:"enableTitle"},[t._v("\n 是否启用\n "),n("a-tooltip",{scopedSlots:t._u([{key:"title",fn:function(){return[n("span",[t._v("\n 在禁用之后,将不会再运行该爬取器。\n ")])]},proxy:!0}])},[t._v(" "),n("a-icon",{attrs:{type:"question-circle"}})],1)],1)])],1)}),[],!1,null,null,null);e.default=component.exports}}]); -------------------------------------------------------------------------------- /frontend/deployment/_nuxt/fda1702.js: -------------------------------------------------------------------------------- 1 | (window.webpackJsonp=window.webpackJsonp||[]).push([[0],{290:function(t,e,n){"use strict";var r=n(9),o=n(634),j=n.n(o);r.default.use(j.a)},291:function(t,e,n){"use strict";n(39),n(79),n(80);var r=n(186),o=n(187),j=n(20),c=n(636),l=n.n(c),f=n(639),m=l.a.create({baseURL:"/",timeout:1e4,withCredentials:!0});function h(t,title,content){return new Promise((function(e){setTimeout((function(){f.a.destroyAll(),f.a[t]({title:title,content:content,onOk:function(){e()}})}),500)}))}function d(){return new Promise((function(){}))}function v(t){return k.apply(this,arguments)}function k(){return(k=Object(j.a)(regeneratorRuntime.mark((function t(e){var data,n;return regeneratorRuntime.wrap((function(t){for(;;)switch(t.prev=t.next){case 0:return data={},t.prev=1,t.next=4,e;case 4:n=t.sent,data=n.data,t.next=14;break;case 8:return t.prev=8,t.t0=t.catch(1),t.next=12,h("error","网络错误",t.t0.message);case 12:return t.next=14,d();case 14:if(data.success){t.next=19;break}return t.next=17,h("info","错误",data.message);case 17:return t.next=19,d();case 19:return t.abrupt("return",data);case 20:case"end":return t.stop()}}),t,null,[[1,8]])})))).apply(this,arguments)}var y=function(){function t(){Object(r.a)(this,t),this.baseURL="/"}var e,n;return Object(o.a)(t,[{key:"get",value:(n=Object(j.a)(regeneratorRuntime.mark((function t(e,n){return regeneratorRuntime.wrap((function(t){for(;;)switch(t.prev=t.next){case 0:return n=n||{},t.next=3,v(m.get(e,{params:n}));case 3:return t.abrupt("return",t.sent);case 4:case"end":return t.stop()}}),t)}))),function(t,e){return n.apply(this,arguments)})},{key:"post",value:(e=Object(j.a)(regeneratorRuntime.mark((function t(e,n,data){return regeneratorRuntime.wrap((function(t){for(;;)switch(t.prev=t.next){case 0:return n=n||{},data=data||{},t.next=4,v(m.post(e,data,{params:n}));case 4:return t.abrupt("return",t.sent);case 5:case"end":return t.stop()}}),t)}))),function(t,n,r){return e.apply(this,arguments)})}]),t}();e.a=function(t,e){t.req;e("http",new y({}))}},472:function(t,e,n){var content=n(700);"string"==typeof content&&(content=[[t.i,content,""]]),content.locals&&(t.exports=content.locals);(0,n(128).default)("41b8fd4d",content,!0,{sourceMap:!1})},473:function(t,e,n){var content=n(702);"string"==typeof 
content&&(content=[[t.i,content,""]]),content.locals&&(t.exports=content.locals);(0,n(128).default)("6bdae85e",content,!0,{sourceMap:!1})},638:function(t,e,n){"use strict";n(71);var r=n(633),o=n.n(r),j=n(3);n.n(j).a.locale("zh-cn");var c={data:function(){return{locale:o.a,url_path:[]}},watch:{$route:function(){this.updateNav()}},mounted:function(){this.updateNav()},methods:{updateNav:function(){var data=/^\/[^/]*/.exec(this.$route.path||"");this.url_path=data?[data[0]]:[]}}},l=(n(699),n(701),n(100)),component=Object(l.a)(c,(function(){var t=this,e=t.$createElement,n=t._self._c||e;return n("div",[n("a-config-provider",{attrs:{locale:t.locale}},[n("a-layout",{staticClass:"layout-main"},[n("a-layout-sider",{attrs:{collapsible:""}},[n("div",{staticClass:"logo"}),t._v(" "),n("a-menu",{attrs:{theme:"dark",mode:"inline"},model:{value:t.url_path,callback:function(e){t.url_path=e},expression:"url_path"}},[n("a-menu-item",{key:"/"},[n("NuxtLink",{attrs:{to:"/"}},[n("a-icon",{attrs:{type:"home"}}),t._v(" "),n("span",[t._v("可用代理")])],1)],1),t._v(" "),n("a-menu-item",{key:"/fetchers"},[n("NuxtLink",{attrs:{to:"/fetchers"}},[n("a-icon",{attrs:{type:"retweet"}}),t._v(" "),n("span",[t._v("爬取器状态")])],1)],1),t._v(" "),n("a-menu-item",{key:"github"},[n("a",{attrs:{href:"https://github.com/OxOOo/ProxyPoolWithUI",target:"_blank"}},[n("a-icon",{attrs:{type:"github"}}),t._v(" "),n("span",[t._v("Github主页")])],1)])],1)],1),t._v(" "),n("a-layout",[n("a-layout-header",{staticStyle:{background:"#fff",padding:"0"}}),t._v(" "),n("a-layout-content",{style:{margin:"24px 16px",padding:"24px",background:"#fff"}},[n("Nuxt")],1)],1)],1)],1)],1)}),[],!1,null,"9df1b954",null);e.a=component.exports},640:function(t,e,n){t.exports=n(641)},687:function(t,e,n){var map={"./af":328,"./af.js":328,"./ar":329,"./ar-dz":330,"./ar-dz.js":330,"./ar-kw":331,"./ar-kw.js":331,"./ar-ly":332,"./ar-ly.js":332,"./ar-ma":333,"./ar-ma.js":333,"./ar-sa":334,"./ar-sa.js":334,"./ar-tn":335,"./ar-tn.js":335,"./ar.js":329,"./az":336,"./az.js":336,"./be":337,"./be.js":337,"./bg":338,"./bg.js":338,"./bm":339,"./bm.js":339,"./bn":340,"./bn-bd":341,"./bn-bd.js":341,"./bn.js":340,"./bo":342,"./bo.js":342,"./br":343,"./br.js":343,"./bs":344,"./bs.js":344,"./ca":345,"./ca.js":345,"./cs":346,"./cs.js":346,"./cv":347,"./cv.js":347,"./cy":348,"./cy.js":348,"./da":349,"./da.js":349,"./de":350,"./de-at":351,"./de-at.js":351,"./de-ch":352,"./de-ch.js":352,"./de.js":350,"./dv":353,"./dv.js":353,"./el":354,"./el.js":354,"./en-au":355,"./en-au.js":355,"./en-ca":356,"./en-ca.js":356,"./en-gb":357,"./en-gb.js":357,"./en-ie":358,"./en-ie.js":358,"./en-il":359,"./en-il.js":359,"./en-in":360,"./en-in.js":360,"./en-nz":361,"./en-nz.js":361,"./en-sg":362,"./en-sg.js":362,"./eo":363,"./eo.js":363,"./es":364,"./es-do":365,"./es-do.js":365,"./es-mx":366,"./es-mx.js":366,"./es-us":367,"./es-us.js":367,"./es.js":364,"./et":368,"./et.js":368,"./eu":369,"./eu.js":369,"./fa":370,"./fa.js":370,"./fi":371,"./fi.js":371,"./fil":372,"./fil.js":372,"./fo":373,"./fo.js":373,"./fr":374,"./fr-ca":375,"./fr-ca.js":375,"./fr-ch":376,"./fr-ch.js":376,"./fr.js":374,"./fy":377,"./fy.js":377,"./ga":378,"./ga.js":378,"./gd":379,"./gd.js":379,"./gl":380,"./gl.js":380,"./gom-deva":381,"./gom-deva.js":381,"./gom-latn":382,"./gom-latn.js":382,"./gu":383,"./gu.js":383,"./he":384,"./he.js":384,"./hi":385,"./hi.js":385,"./hr":386,"./hr.js":386,"./hu":387,"./hu.js":387,"./hy-am":388,"./hy-am.js":388,"./id":389,"./id.js":389,"./is":390,"./is.js":390,"./it":391,"./it-ch":392,"./it-ch.js":392,"./it.js":391,"./
ja":393,"./ja.js":393,"./jv":394,"./jv.js":394,"./ka":395,"./ka.js":395,"./kk":396,"./kk.js":396,"./km":397,"./km.js":397,"./kn":398,"./kn.js":398,"./ko":399,"./ko.js":399,"./ku":400,"./ku.js":400,"./ky":401,"./ky.js":401,"./lb":402,"./lb.js":402,"./lo":403,"./lo.js":403,"./lt":404,"./lt.js":404,"./lv":405,"./lv.js":405,"./me":406,"./me.js":406,"./mi":407,"./mi.js":407,"./mk":408,"./mk.js":408,"./ml":409,"./ml.js":409,"./mn":410,"./mn.js":410,"./mr":411,"./mr.js":411,"./ms":412,"./ms-my":413,"./ms-my.js":413,"./ms.js":412,"./mt":414,"./mt.js":414,"./my":415,"./my.js":415,"./nb":416,"./nb.js":416,"./ne":417,"./ne.js":417,"./nl":418,"./nl-be":419,"./nl-be.js":419,"./nl.js":418,"./nn":420,"./nn.js":420,"./oc-lnc":421,"./oc-lnc.js":421,"./pa-in":422,"./pa-in.js":422,"./pl":423,"./pl.js":423,"./pt":424,"./pt-br":425,"./pt-br.js":425,"./pt.js":424,"./ro":426,"./ro.js":426,"./ru":427,"./ru.js":427,"./sd":428,"./sd.js":428,"./se":429,"./se.js":429,"./si":430,"./si.js":430,"./sk":431,"./sk.js":431,"./sl":432,"./sl.js":432,"./sq":433,"./sq.js":433,"./sr":434,"./sr-cyrl":435,"./sr-cyrl.js":435,"./sr.js":434,"./ss":436,"./ss.js":436,"./sv":437,"./sv.js":437,"./sw":438,"./sw.js":438,"./ta":439,"./ta.js":439,"./te":440,"./te.js":440,"./tet":441,"./tet.js":441,"./tg":442,"./tg.js":442,"./th":443,"./th.js":443,"./tk":444,"./tk.js":444,"./tl-ph":445,"./tl-ph.js":445,"./tlh":446,"./tlh.js":446,"./tr":447,"./tr.js":447,"./tzl":448,"./tzl.js":448,"./tzm":449,"./tzm-latn":450,"./tzm-latn.js":450,"./tzm.js":449,"./ug-cn":451,"./ug-cn.js":451,"./uk":452,"./uk.js":452,"./ur":453,"./ur.js":453,"./uz":454,"./uz-latn":455,"./uz-latn.js":455,"./uz.js":454,"./vi":456,"./vi.js":456,"./x-pseudo":457,"./x-pseudo.js":457,"./yo":458,"./yo.js":458,"./zh-cn":459,"./zh-cn.js":459,"./zh-hk":460,"./zh-hk.js":460,"./zh-mo":461,"./zh-mo.js":461,"./zh-tw":462,"./zh-tw.js":462};function r(t){var e=o(t);return n(e)}function o(t){if(!n.o(map,t)){var e=new Error("Cannot find module '"+t+"'");throw e.code="MODULE_NOT_FOUND",e}return map[t]}r.keys=function(){return Object.keys(map)},r.resolve=o,t.exports=r,r.id=687},699:function(t,e,n){"use strict";n(472)},700:function(t,e,n){(e=n(113)(!1)).push([t.i,".layout-main[data-v-9df1b954]{min-height:100vh}.logo[data-v-9df1b954]{height:32px;background:hsla(0,0%,100%,.2);margin:16px}",""]),t.exports=e},701:function(t,e,n){"use strict";n(473)},702:function(t,e,n){(e=n(113)(!1)).push([t.i,'html{font-family:"Source Sans Pro",-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,sans-serif;font-size:16px;word-spacing:1px;-ms-text-size-adjust:100%;-webkit-text-size-adjust:100%;-moz-osx-font-smoothing:grayscale;-webkit-font-smoothing:antialiased;box-sizing:border-box}',""]),t.exports=e}},[[640,4,1,5]]]); -------------------------------------------------------------------------------- /frontend/deployment/_nuxt/static/1630852693/fetchers/payload.js: -------------------------------------------------------------------------------- 1 | __NUXT_JSONP__("/fetchers", {data:[{}],fetch:[],mutations:void 0}); -------------------------------------------------------------------------------- /frontend/deployment/_nuxt/static/1630852693/manifest.js: -------------------------------------------------------------------------------- 1 | __NUXT_JSONP__("manifest.js", {routes:["\u002F","\u002Ffetchers"]}) -------------------------------------------------------------------------------- /frontend/deployment/_nuxt/static/1630852693/payload.js: 
-------------------------------------------------------------------------------- 1 | __NUXT_JSONP__("/", {data:[{}],fetch:[],mutations:void 0}); -------------------------------------------------------------------------------- /frontend/src/.editorconfig: -------------------------------------------------------------------------------- 1 | # editorconfig.org 2 | root = true 3 | 4 | [*] 5 | indent_style = space 6 | indent_size = 4 7 | end_of_line = lf 8 | charset = utf-8 9 | trim_trailing_whitespace = true 10 | insert_final_newline = true 11 | 12 | [*.md] 13 | trim_trailing_whitespace = false 14 | -------------------------------------------------------------------------------- /frontend/src/.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | root: true, 3 | env: { 4 | browser: true, 5 | node: true 6 | }, 7 | parserOptions: { 8 | parser: 'babel-eslint' 9 | }, 10 | extends: [ 11 | '@nuxtjs', 12 | 'plugin:nuxt/recommended' 13 | ], 14 | plugins: [ 15 | ], 16 | // add your custom rules here 17 | rules: { 18 | semi: ['error', 'always'], 19 | indent: ['error', 4], 20 | 'vue/html-indent': ['error', 4], 21 | camelcase: 'off', 22 | 'no-return-await': 'off', 23 | 'vue/no-parsing-error': 'off', 24 | 'no-unused-vars': 'warn', 25 | 'vue/html-self-closing': 'off', 26 | 'prefer-const': 'warn', 27 | 'vue/singleline-html-element-content-newline': 'off', 28 | 'vue/no-unused-components': 'warn', 29 | 'import/no-named-as-default': 'off', 30 | 'vue/no-unused-vars': 'warn' 31 | } 32 | }; 33 | -------------------------------------------------------------------------------- /frontend/src/.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Node template 3 | # Logs 4 | /logs 5 | *.log 6 | npm-debug.log* 7 | yarn-debug.log* 8 | yarn-error.log* 9 | 10 | # Runtime data 11 | pids 12 | *.pid 13 | *.seed 14 | *.pid.lock 15 | 16 | # Directory for instrumented libs generated by jscoverage/JSCover 17 | lib-cov 18 | 19 | # Coverage directory used by tools like istanbul 20 | coverage 21 | 22 | # nyc test coverage 23 | .nyc_output 24 | 25 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 26 | .grunt 27 | 28 | # Bower dependency directory (https://bower.io/) 29 | bower_components 30 | 31 | # node-waf configuration 32 | .lock-wscript 33 | 34 | # Compiled binary addons (https://nodejs.org/api/addons.html) 35 | build/Release 36 | 37 | # Dependency directories 38 | node_modules/ 39 | jspm_packages/ 40 | 41 | # TypeScript v1 declaration files 42 | typings/ 43 | 44 | # Optional npm cache directory 45 | .npm 46 | 47 | # Optional eslint cache 48 | .eslintcache 49 | 50 | # Optional REPL history 51 | .node_repl_history 52 | 53 | # Output of 'npm pack' 54 | *.tgz 55 | 56 | # Yarn Integrity file 57 | .yarn-integrity 58 | 59 | # dotenv environment variables file 60 | .env 61 | 62 | # parcel-bundler cache (https://parceljs.org/) 63 | .cache 64 | 65 | # next.js build output 66 | .next 67 | 68 | # nuxt.js build output 69 | .nuxt 70 | 71 | # Nuxt generate 72 | dist 73 | 74 | # vuepress build output 75 | .vuepress/dist 76 | 77 | # Serverless directories 78 | .serverless 79 | 80 | # IDE / Editor 81 | .idea 82 | 83 | # Service worker 84 | sw.* 85 | 86 | # macOS 87 | .DS_Store 88 | 89 | # Vim swap files 90 | *.swp 91 | -------------------------------------------------------------------------------- /frontend/src/README.md: 
-------------------------------------------------------------------------------- 1 | # 前端目录 2 | 3 | 如果不修改网页,则不需要修改本目录下的文件,以下内容也可以忽略。 4 | 5 | ## 软件需求 6 | 7 | 1. node 14.15 : 不一定需要这么高版本,但是最好使用LTS版本 8 | 2. npm 9 | 10 | ## 安装依赖 11 | 12 | ```bash 13 | $ npm i 14 | ``` 15 | 16 | ## 命令说明 17 | 18 | 1. 运行开发服务器 19 | 20 | ```bash 21 | $ npm run dev 22 | ``` 23 | 24 | 2. 生成静态代码 25 | 26 | ```bash 27 | $ ./build.sh 28 | ``` 29 | 30 | 3. 进行代码格式化与检查 31 | 32 | ```bash 33 | $ npm run lint 34 | ``` 35 | -------------------------------------------------------------------------------- /frontend/src/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | cd `dirname $0` 6 | 7 | rm -rf dist # 删除已经存在的目录 8 | npm run generate # 生成静态文件 9 | 10 | rm -rf ../deployment 11 | mv dist ../deployment 12 | 13 | echo 'Done.' 14 | -------------------------------------------------------------------------------- /frontend/src/jsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "baseUrl": ".", 4 | "paths": { 5 | "~/*": ["./*"], 6 | "@/*": ["./*"], 7 | "~~/*": ["./*"], 8 | "@@/*": ["./*"] 9 | } 10 | }, 11 | "exclude": ["node_modules", ".nuxt", "dist"] 12 | } 13 | -------------------------------------------------------------------------------- /frontend/src/layouts/default.vue: -------------------------------------------------------------------------------- 1 | 41 | 42 | 75 | 76 | 86 | 87 | 107 | -------------------------------------------------------------------------------- /frontend/src/nuxt.config.js: -------------------------------------------------------------------------------- 1 | export default { 2 | // Target: https://go.nuxtjs.dev/config-target 3 | target: 'static', 4 | 5 | // Global page headers: https://go.nuxtjs.dev/config-head 6 | head: { 7 | title: '代理池网页管理界面', 8 | htmlAttrs: { 9 | lang: 'zh-CN' 10 | }, 11 | meta: [ 12 | { charset: 'utf-8' }, 13 | { name: 'viewport', content: 'width=device-width, initial-scale=1' }, 14 | { hid: 'description', name: 'description', content: '' } 15 | ], 16 | link: [ 17 | { rel: 'icon', type: 'image/x-icon', href: '/favicon.ico' } 18 | ] 19 | }, 20 | 21 | // Global CSS: https://go.nuxtjs.dev/config-css 22 | css: [ 23 | 'ant-design-vue/dist/antd.css' 24 | ], 25 | 26 | // Plugins to run before rendering page: https://go.nuxtjs.dev/config-plugins 27 | plugins: [ 28 | '@/plugins/antd-ui', 29 | '@/plugins/axios' 30 | ], 31 | 32 | // Auto import components: https://go.nuxtjs.dev/config-components 33 | components: true, 34 | 35 | // Modules for dev and build (recommended): https://go.nuxtjs.dev/config-modules 36 | buildModules: [ 37 | // https://go.nuxtjs.dev/eslint 38 | '@nuxtjs/eslint-module' 39 | ], 40 | 41 | // Modules: https://go.nuxtjs.dev/config-modules 42 | modules: [], 43 | 44 | // Build Configuration: https://go.nuxtjs.dev/config-build 45 | build: {}, 46 | 47 | router: { 48 | base: '/web/' 49 | }, 50 | 51 | env: { 52 | AXIOS_BASE_URL: // 浏览器访问后端的地址 53 | process.env.NODE_ENV === 'production' ? 
'/' : 'http://localhost:5000' 54 | } 55 | }; 56 | -------------------------------------------------------------------------------- /frontend/src/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "frontend", 3 | "version": "1.0.0", 4 | "private": true, 5 | "scripts": { 6 | "dev": "nuxt", 7 | "generate": "NODE_ENV=production nuxt generate", 8 | "lint:js": "eslint --ext \".js,.vue\" --ignore-path .gitignore --fix .", 9 | "lint": "npm run lint:js" 10 | }, 11 | "dependencies": { 12 | "ant-design-vue": "^1.7.2", 13 | "axios": "^0.21.1", 14 | "core-js": "^3.8.3", 15 | "moment": "^2.29.1", 16 | "nuxt": "^2.14.12" 17 | }, 18 | "devDependencies": { 19 | "@nuxtjs/eslint-config": "^5.0.0", 20 | "@nuxtjs/eslint-module": "^3.0.2", 21 | "babel-eslint": "^10.1.0", 22 | "eslint": "^7.18.0", 23 | "eslint-plugin-nuxt": "^2.0.0", 24 | "eslint-plugin-vue": "^7.5.0" 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /frontend/src/pages/fetchers.vue: -------------------------------------------------------------------------------- 1 | 63 | 64 | 155 | -------------------------------------------------------------------------------- /frontend/src/pages/index.vue: -------------------------------------------------------------------------------- 1 | 96 | 97 | 186 | -------------------------------------------------------------------------------- /frontend/src/plugins/antd-ui.js: -------------------------------------------------------------------------------- 1 | import Vue from 'vue'; 2 | import Antd from 'ant-design-vue/lib'; 3 | 4 | Vue.use(Antd); 5 | -------------------------------------------------------------------------------- /frontend/src/plugins/axios.js: -------------------------------------------------------------------------------- 1 | import axios from 'axios'; 2 | import { Modal } from 'ant-design-vue'; 3 | 4 | const baseURL = process.env.AXIOS_BASE_URL; 5 | 6 | const instance = axios.create({ 7 | baseURL, 8 | timeout: 10000, 9 | withCredentials: true 10 | }); 11 | 12 | function showModel (type, title, content) { 13 | return new Promise((resolve) => { 14 | setTimeout(() => { 15 | Modal.destroyAll(); 16 | Modal[type]({ 17 | title, 18 | content, 19 | onOk: () => { 20 | resolve(); 21 | } 22 | }); 23 | }, 500); 24 | }); 25 | } 26 | 27 | function never () { 28 | return new Promise(() => {}); 29 | } 30 | 31 | async function handle (req) { 32 | let data = {}; 33 | try { 34 | const res = await req; 35 | data = res.data; 36 | } catch (e) { 37 | await showModel('error', '网络错误', e.message); 38 | // throw e; // IE上会弹出错误提示 39 | await never(); 40 | } 41 | if (!data.success) { 42 | await showModel('info', '错误', data.message); 43 | // throw new Error(data.message); 44 | await never(); 45 | } 46 | return data; 47 | } 48 | 49 | class Http { 50 | constructor () { 51 | this.baseURL = baseURL; 52 | } 53 | 54 | async get (url, params) { 55 | params = params || {}; 56 | return await handle(instance.get(url, { params })); 57 | } 58 | 59 | async post (url, params, data) { 60 | params = params || {}; 61 | data = data || {}; 62 | return await handle(instance.post(url, data, { params })); 63 | } 64 | } 65 | 66 | export default ({ req }, inject) => { 67 | inject('http', new Http({ })); 68 | }; 69 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import sys, os, signal 4 | 
sys.path.append(os.path.dirname(__file__) + os.sep + '../') 5 | from multiprocessing import Process 6 | import time 7 | from proc import run_fetcher, run_validator 8 | from api import api 9 | import multiprocessing 10 | 11 | # 进程锁 12 | proc_lock = multiprocessing.Lock() 13 | 14 | class Item: 15 | def __init__(self, target, name): 16 | self.target = target 17 | self.name = name 18 | self.process = None 19 | self.start_time = 0 20 | 21 | def main(): 22 | processes = [] 23 | processes.append(Item(target=run_fetcher.main, name='fetcher')) 24 | processes.append(Item(target=run_validator.main, name='validator')) 25 | processes.append(Item(target=api.main, name='api')) 26 | 27 | while True: 28 | for p in processes: 29 | if p.process is None: 30 | p.process = Process(target=p.target, name=p.name, daemon=False, args=(proc_lock, )) 31 | p.process.start() 32 | print(f'启动{p.name}进程,pid={p.process.pid}') 33 | p.start_time = time.time() 34 | 35 | for p in processes: 36 | if p.process is not None: 37 | if not p.process.is_alive(): 38 | print(f'进程{p.name}异常退出, exitcode={p.process.exitcode}') 39 | p.process.terminate() 40 | p.process = None 41 | # 解除进程锁 42 | try: 43 | proc_lock.release() 44 | except ValueError: 45 | pass 46 | elif p.start_time + 60 * 60 < time.time(): # 最长运行1小时就重启 47 | print(f'进程{p.name}运行太久,重启') 48 | p.process.terminate() 49 | p.process = None 50 | # 解除进程锁 51 | try: 52 | proc_lock.release() 53 | except ValueError: 54 | pass 55 | 56 | time.sleep(0.2) 57 | 58 | def citest(): 59 | """ 60 | 此函数仅用于检查程序是否可运行,一般情况下使用本项目可忽略 61 | """ 62 | processes = [] 63 | processes.append(Item(target=run_fetcher.main, name='fetcher')) 64 | processes.append(Item(target=run_validator.main, name='validator')) 65 | processes.append(Item(target=api.main, name='api')) 66 | 67 | for p in processes: 68 | assert p.process is None 69 | p.process = Process(target=p.target, name=p.name, daemon=False) 70 | p.process.start() 71 | print(f'running {p.name}, pid={p.process.pid}') 72 | p.start_time = time.time() 73 | 74 | time.sleep(10) 75 | 76 | for p in processes: 77 | assert p.process is not None 78 | assert p.process.is_alive() 79 | p.process.terminate() 80 | 81 | if __name__ == '__main__': 82 | try: 83 | if len(sys.argv) >= 2 and sys.argv[1] == 'citest': 84 | citest() 85 | else: 86 | main() 87 | sys.exit(0) 88 | except Exception as e: 89 | print('========FATAL ERROR=========') 90 | print(e) 91 | sys.exit(1) 92 | -------------------------------------------------------------------------------- /proc/README.md: -------------------------------------------------------------------------------- 1 | 2 | 包含爬取器和验证器的代码。 3 | 4 | 爬取器会定时运行注册的爬取器,并将爬取到的代理放入数据库中,详见代码`run_fetcher.py`。 5 | 6 | 验证器会不断从数据库中获取待验证的代理(代理的`下次待验证时间`小于当前时间),并进行验证,详见代码`run_validator.py`。 7 | -------------------------------------------------------------------------------- /proc/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | -------------------------------------------------------------------------------- /proc/run_fetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | """ 3 | 定时运行爬取器 4 | """ 5 | 6 | import sys 7 | import threading 8 | from queue import Queue 9 | import logging 10 | import time 11 | from db import conn 12 | from fetchers import fetchers 13 | from config import PROC_FETCHER_SLEEP 14 | from func_timeout import func_set_timeout 15 | from func_timeout.exceptions import FunctionTimedOut 16 | 17 | 
logging.basicConfig(stream=sys.stdout, format="%(asctime)s-%(levelname)s:%(name)s:%(message)s", level='INFO') 18 | 19 | def main(proc_lock): 20 | """ 21 | 定时运行爬取器 22 | 主要逻辑: 23 | While True: 24 | for 爬取器 in 所有爬取器: 25 | 查询数据库,判断当前爬取器是否需要运行 26 | 如果需要运行,那么启动线程运行该爬取器 27 | 等待所有线程结束 28 | 将爬取到的代理放入数据库中 29 | 睡眠一段时间 30 | """ 31 | logger = logging.getLogger('fetcher') 32 | conn.set_proc_lock(proc_lock) 33 | 34 | while True: 35 | logger.info('开始运行一轮爬取器') 36 | status = conn.getProxiesStatus() 37 | if status['pending_proxies_cnt'] > 2000: 38 | logger.info(f"还有{status['pending_proxies_cnt']}个代理等待验证,数量过多,跳过本次爬取") 39 | time.sleep(PROC_FETCHER_SLEEP) 40 | continue 41 | 42 | @func_set_timeout(30) 43 | def fetch_worker(fetcher): 44 | f = fetcher() 45 | proxies = f.fetch() 46 | return proxies 47 | 48 | def run_thread(name, fetcher, que): 49 | """ 50 | name: 爬取器名称 51 | fetcher: 爬取器class 52 | que: 队列,用于返回数据 53 | """ 54 | try: 55 | proxies = fetch_worker(fetcher) 56 | que.put((name, proxies)) 57 | except Exception as e: 58 | logger.error(f'运行爬取器{name}出错:' + str(e)) 59 | que.put((name, [])) 60 | except FunctionTimedOut: 61 | pass 62 | 63 | threads = [] 64 | que = Queue() 65 | for item in fetchers: 66 | data = conn.getFetcher(item.name) 67 | if data is None: 68 | logger.error(f'没有在数据库中找到对应的信息:{item.name}') 69 | raise ValueError('不可恢复错误') 70 | if not data.enable: 71 | logger.info(f'跳过爬取器{item.name}') 72 | continue 73 | threads.append(threading.Thread(target=run_thread, args=(item.name, item.fetcher, que))) 74 | [t.start() for t in threads] 75 | [t.join() for t in threads] 76 | while not que.empty(): 77 | fetcher_name, proxies = que.get() 78 | for proxy in proxies: 79 | conn.pushNewFetch(fetcher_name, proxy[0], proxy[1], proxy[2]) 80 | conn.pushFetcherResult(fetcher_name, len(proxies)) 81 | 82 | logger.info(f'完成运行{len(threads)}个爬取器,睡眠{PROC_FETCHER_SLEEP}秒') 83 | time.sleep(PROC_FETCHER_SLEEP) 84 | -------------------------------------------------------------------------------- /proc/run_validator.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | """ 3 | 验证器逻辑 4 | """ 5 | 6 | import sys 7 | import socket 8 | import threading 9 | from queue import Queue 10 | import logging 11 | import time 12 | import requests 13 | from func_timeout import func_set_timeout 14 | from func_timeout.exceptions import FunctionTimedOut 15 | from db import conn 16 | from config import PROC_VALIDATOR_SLEEP, VALIDATE_THREAD_NUM 17 | from config import VALIDATE_METHOD, VALIDATE_KEYWORD, VALIDATE_HEADER, VALIDATE_URL, VALIDATE_TIMEOUT, VALIDATE_MAX_FAILS 18 | 19 | logging.basicConfig(stream=sys.stdout, format="%(asctime)s-%(levelname)s:%(name)s:%(message)s", level='INFO') 20 | 21 | def main(proc_lock): 22 | """ 23 | 验证器 24 | 主要逻辑: 25 | 创建VALIDATE_THREAD_NUM个验证线程,这些线程会不断运行 26 | While True: 27 | 检查验证线程是否返回了代理的验证结果 28 | 从数据库中获取若干当前待验证的代理 29 | 将代理发送给前面创建的线程 30 | """ 31 | logger = logging.getLogger('validator') 32 | conn.set_proc_lock(proc_lock) 33 | 34 | in_que = Queue() 35 | out_que = Queue() 36 | running_proxies = set() # 储存哪些代理正在运行,以字符串的形式储存 37 | 38 | threads = [] 39 | for _ in range(VALIDATE_THREAD_NUM): 40 | threads.append(threading.Thread(target=validate_thread, args=(in_que, out_que))) 41 | [_.start() for _ in threads] 42 | 43 | while True: 44 | out_cnt = 0 45 | while not out_que.empty(): 46 | proxy, success, latency = out_que.get() 47 | conn.pushValidateResult(proxy, success, latency) 48 | uri = f'{proxy.protocol}://{proxy.ip}:{proxy.port}' 49 | running_proxies.remove(uri) 50 | 
out_cnt = out_cnt + 1 51 | if out_cnt > 0: 52 | logger.info(f'完成了{out_cnt}个代理的验证') 53 | 54 | # 如果正在进行验证的代理足够多,那么就不着急添加新代理 55 | if len(running_proxies) >= VALIDATE_THREAD_NUM * 2: 56 | time.sleep(PROC_VALIDATOR_SLEEP) 57 | continue 58 | 59 | # 找一些新的待验证的代理放入队列中 60 | added_cnt = 0 61 | for proxy in conn.getToValidate(VALIDATE_THREAD_NUM * 4): 62 | uri = f'{proxy.protocol}://{proxy.ip}:{proxy.port}' 63 | # 这里找出的代理有可能是正在进行验证的代理,要避免重复加入 64 | if uri not in running_proxies: 65 | running_proxies.add(uri) 66 | in_que.put(proxy) 67 | added_cnt += 1 68 | 69 | if added_cnt == 0: 70 | time.sleep(PROC_VALIDATOR_SLEEP) 71 | 72 | @func_set_timeout(VALIDATE_TIMEOUT * 2) 73 | def validate_once(proxy): 74 | """ 75 | 进行一次验证,如果验证成功则返回True,否则返回False或者是异常 76 | """ 77 | proxies = { 78 | 'http': f'{proxy.protocol}://{proxy.ip}:{proxy.port}', 79 | 'https': f'{proxy.protocol}://{proxy.ip}:{proxy.port}' 80 | } 81 | if VALIDATE_METHOD == "GET": 82 | r = requests.get(VALIDATE_URL, timeout=VALIDATE_TIMEOUT, proxies=proxies) 83 | r.encoding = "utf-8" 84 | html = r.text 85 | if VALIDATE_KEYWORD in html: 86 | return True 87 | return False 88 | else: 89 | r = requests.get(VALIDATE_URL, timeout=VALIDATE_TIMEOUT, proxies=proxies, allow_redirects=False) 90 | resp_headers = r.headers 91 | if VALIDATE_HEADER in resp_headers.keys() and VALIDATE_KEYWORD in resp_headers[VALIDATE_HEADER]: 92 | return True 93 | return False 94 | 95 | def validate_thread(in_que, out_que): 96 | """ 97 | 验证函数,这个函数会在一个线程中被调用 98 | in_que: 输入队列,用于接收验证任务 99 | out_que: 输出队列,用于返回验证结果 100 | in_que和out_que都是线程安全队列,并且如果队列为空,调用in_que.get()会阻塞线程 101 | """ 102 | 103 | while True: 104 | proxy = in_que.get() 105 | 106 | success = False 107 | latency = None 108 | for _ in range(VALIDATE_MAX_FAILS): 109 | try: 110 | start_time = time.time() 111 | if validate_once(proxy): 112 | end_time = time.time() 113 | latency = int((end_time-start_time)*1000) 114 | success = True 115 | break 116 | except Exception as e: 117 | pass 118 | except FunctionTimedOut: 119 | pass 120 | 121 | out_que.put((proxy, success, latency)) 122 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | certifi==2020.12.5 2 | chardet==4.0.0 3 | click==7.1.2 4 | cssselect==1.1.0 5 | Flask==1.1.2 6 | idna==2.10 7 | itsdangerous==1.1.0 8 | Jinja2==2.11.2 9 | lxml==4.6.2 10 | MarkupSafe==1.1.1 11 | pyquery==1.4.3 12 | requests==2.25.1 13 | urllib3==1.26.3 14 | Werkzeug==1.0.1 15 | PySocks==1.7.1 16 | func-timeout==4.3.5 17 | -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | 2 | 一些测试脚本。 3 | -------------------------------------------------------------------------------- /test/testDB.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import sys,os 4 | sys.path.append(os.path.dirname(__file__) + os.sep + '../') 5 | from db import conn 6 | 7 | def run(): 8 | assert len(conn.getToValidate(10)) == 0 9 | conn.pushNewFetch('test', 'http', '127.0.0.1', 8080) 10 | assert len(conn.getToValidate(10)) == 1 11 | 12 | conn.pushNewFetch('test', 'http', '127.0.0.2', 8080) 13 | conn.pushNewFetch('test', 'http', '127.0.0.3', 8080) 14 | conn.pushNewFetch('test', 'http', '127.0.0.4', 8080) 15 | assert len(conn.getToValidate(2)) == 2 16 | proxies = conn.getToValidate(10) 17 | assert len(proxies) == 4 18 | 
assert proxies[0].ip == '127.0.0.1' 19 | assert proxies[1].ip == '127.0.0.2' 20 | assert proxies[2].ip == '127.0.0.3' 21 | assert proxies[3].ip == '127.0.0.4' 22 | 23 | p = conn.getToValidate(1)[0] # 设置一个代理通过验证 24 | conn.pushValidateResult(p, True) 25 | assert len(conn.getToValidate(10)) == 3 26 | p = conn.getToValidate(1)[0] # 设置一个代理没有通过验证 27 | conn.pushValidateResult(p, False) 28 | assert len(conn.getToValidate(10)) == 2 29 | assert len(conn.getValidatedRandom(1)) == 1 30 | assert len(conn.getValidatedRandom(-1)) == 1 31 | p = conn.getValidatedRandom(1)[0] 32 | assert p.ip == '127.0.0.1' 33 | p = conn.getToValidate(1)[0] # 设置一个代理通过验证 34 | conn.pushValidateResult(p, True) 35 | assert len(conn.getValidatedRandom(1)) == 1 36 | assert len(conn.getValidatedRandom(-1)) == 2 37 | 38 | proxies_status = conn.getProxiesStatus() 39 | assert proxies_status['sum_proxies_cnt'] == 4 40 | assert proxies_status['validated_proxies_cnt'] == 2 41 | assert proxies_status['pending_proxies_cnt'] == 1 42 | 43 | fetchers = conn.getAllFetchers() 44 | for item in fetchers: 45 | # 所有爬取器都应该是默认参数 46 | assert item.enable == True 47 | assert item.sum_proxies_cnt == 0 48 | assert item.last_proxies_cnt == 0 49 | assert item.last_fetch_date is None 50 | conn.pushFetcherResult('www.kuaidaili.com', 10) 51 | conn.pushFetcherResult('www.kuaidaili.com', 20) 52 | conn.pushFetcherEnable('www.kuaidaili.com', False) 53 | f = conn.getFetcher('www.kuaidaili.com') 54 | assert f is not None 55 | # www.kuaidaili.com的参数应该被修改了 56 | assert f.enable == False 57 | assert f.sum_proxies_cnt == 30 58 | assert f.last_proxies_cnt == 20 59 | assert f.last_fetch_date is not None 60 | 61 | conn.pushClearFetchersStatus() 62 | f = conn.getFetcher('www.kuaidaili.com') 63 | assert f is not None 64 | # www.kuaidaili.com的参数应该被修改了 65 | assert f.sum_proxies_cnt == 0 66 | assert f.last_proxies_cnt == 0 67 | assert f.last_fetch_date is None 68 | 69 | if __name__ == '__main__': 70 | print(u'请确保运行本脚本之前删除或备份`data.db`文件') 71 | run() 72 | print(u'测试通过') 73 | -------------------------------------------------------------------------------- /test/testFetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import sys,os 4 | sys.path.append(os.path.dirname(__file__) + os.sep + '../') 5 | from fetchers import fetchers 6 | 7 | def run(): 8 | proxies_cnt = dict() 9 | for item in fetchers: 10 | if item.name != 'www.xsdaili.cn': continue # 这行表示只测试特定的爬取器 11 | 12 | print('='*10, 'RUNNING ' + item.name, '='*10) 13 | fetcher = item.fetcher() # 实例化爬取器 14 | try: 15 | proxies = fetcher.fetch() 16 | except Exception as e: 17 | print(e) 18 | proxies = [] 19 | print(proxies) 20 | proxies_cnt[item.name] = len(proxies) 21 | 22 | print('='*10, 'PROXIES CNT', '='*10) 23 | print(proxies_cnt) 24 | 25 | if __name__ == '__main__': 26 | run() 27 | --------------------------------------------------------------------------------
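As a worked example of the fetcher interface documented in `fetchers/README.md` and exercised by `test/testFetcher.py`, the sketch below shows what a minimal custom fetcher might look like. It is an illustration only: the source URL, the `ExampleFetcher` class name, and the registered name `example.com` are hypothetical placeholders, not part of the project; the `BaseFetcher` import and the (protocol, ip, port) return format follow the existing fetchers.

```python
# fetchers/ExampleFetcher.py -- hypothetical sketch, not a fetcher shipped with this repository
import requests

from .BaseFetcher import BaseFetcher


class ExampleFetcher(BaseFetcher):
    """
    Example only: fetches an "ip:port"-per-line plain-text list from a placeholder URL.
    """

    def fetch(self):
        # Must return a list of (protocol, ip, port) tuples, e.g. [('http', '127.0.0.1', 8080)]
        url = 'https://example.com/http-proxies.txt'  # placeholder URL (assumption)
        proxies = []
        text = requests.get(url, timeout=10).text
        for line in text.split('\n'):
            ip, sep, port = line.strip().partition(':')
            if sep and port.isdigit():  # skip blank or malformed lines
                proxies.append(('http', ip, int(port)))
        return list(set(proxies))
```

Registration would then follow the pattern already used in `fetchers/__init__.py`, for example `Fetcher(name='example.com', fetcher=ExampleFetcher)`, after which the filter line in `test/testFetcher.py` can be pointed at the new name to try the fetcher in isolation before running the full pool.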