├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── README.md ├── api ├── README.md ├── __init__.py └── api.py ├── config.py ├── db ├── Fetcher.py ├── Proxy.py ├── README.md ├── __init__.py ├── conn.py └── init.py ├── docs ├── screenshot1.png ├── screenshot2.png ├── term.png └── workflow.png ├── fetchers ├── BaseFetcher.py ├── GoubanjiaFetcher.py ├── IHuanFetcher.py ├── IP3366Fetcher.py ├── IP66Fetcher.py ├── IP89Fetcher.py ├── JiangxianliFetcher.py ├── KaiXinFetcher.py ├── KuaidailiFetcher.py ├── ProxyListFetcher.py ├── ProxyScrapeFetcher.py ├── ProxyscanFetcher.py ├── README.md ├── UUFetcher.py ├── XiLaFetcher.py ├── XiaoShuFetcher.py └── __init__.py ├── frontend ├── README.md ├── deployment │ ├── .nojekyll │ ├── 200.html │ ├── _nuxt │ │ ├── 473a16e.js │ │ ├── 4e6036a.js │ │ ├── 810b53a.js │ │ ├── 89e3175.js │ │ ├── LICENSES │ │ ├── c6103f9.js │ │ ├── fda1702.js │ │ └── static │ │ │ └── 1630852693 │ │ │ ├── fetchers │ │ │ └── payload.js │ │ │ ├── manifest.js │ │ │ └── payload.js │ ├── fetchers │ │ └── index.html │ └── index.html └── src │ ├── .editorconfig │ ├── .eslintrc.js │ ├── .gitignore │ ├── README.md │ ├── build.sh │ ├── jsconfig.json │ ├── layouts │ └── default.vue │ ├── nuxt.config.js │ ├── package-lock.json │ ├── package.json │ ├── pages │ ├── fetchers.vue │ └── index.vue │ └── plugins │ ├── antd-ui.js │ └── axios.js ├── main.py ├── proc ├── README.md ├── __init__.py ├── run_fetcher.py └── run_validator.py ├── requirements.txt └── test ├── README.md ├── testDB.py └── testFetcher.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | data.db 132 | 133 | # Pycharm settings 134 | .idea 135 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | jobs: 4 | include: 5 | - name: "Python 3.6 on Linux" 6 | python: "3.6" 7 | os: "linux" 8 | env: 9 | - PYTHON=python3 10 | - PIP=pip3 11 | - name: "Python 3.7 on Linux" 12 | python: "3.7" 13 | os: "linux" 14 | env: 15 | - PYTHON=python3 16 | - PIP=pip3 17 | - name: "Python 3.8 on Linux" 18 | python: "3.8" 19 | os: "linux" 20 | env: 21 | - PYTHON=python3 22 | - PIP=pip3 23 | - name: "Python 3.7 on macOS" 24 | os: osx 25 | osx_image: xcode11.2 26 | language: shell 27 | env: 28 | - PYTHON=python3 29 | - PIP=pip3 30 | - name: "Python 3.6 on Windows" 31 | os: windows 32 | language: shell 33 | before_install: 34 | - choco install python --version 3.6.8 35 | env: 36 | - PATH=/c/Python36:/c/Python36/Scripts:$PATH 37 | - PYTHON=python 38 | - PIP=pip 39 | - name: "Python 3.7 on Windows" 40 | os: windows 41 | language: shell 42 | before_install: 43 | - choco install python --version 3.7.4 44 | env: 45 | - PATH=/c/Python37:/c/Python37/Scripts:$PATH 46 | - PYTHON=python 47 | - PIP=pip 48 | 49 | install: 50 | - $PYTHON --version 51 | - $PIP install -r requirements.txt 52 | 53 | script: 54 | - $PYTHON --version 55 | - $PYTHON main.py citest 56 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7.0 2 | 3 | WORKDIR /proxy 4 | 5 | ADD requirements.txt /proxy 6 | RUN pip3 config set global.index-url https://mirrors.aliyun.com/pypi/simple/ 7 | RUN pip3 install --upgrade pip 8 | RUN pip3 install -r requirements.txt 9 | 10 | CMD ["python", "main.py"] 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Yu Chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 简易好用的免费代理池 2 | 3 | [](https://github.com/OxOOo/ProxyPoolWithUI) 4 | 5 | 兼容系统: 6 |  7 |  8 |  9 | 10 | * 定时自动爬取网络上的免费代理 11 | * 定时对代理进行验证,集成API随时返回可用代理 12 | * 不需要第三方数据库支持,一键启动,简单易用 13 | * 集成WEB管理界面,方便查看代理状态并对代理池进行配置 14 | * 拥有详细的注释,可以非常方便地学习或修改 15 | 16 | 推荐: 17 | * [HTTP代理原理](https://zhuanlan.zhihu.com/p/349028243) 18 | 19 | 项目Demo:[http://chenyu0x00.com:8888/](http://chenyu0x00.com:8888/) 20 | 21 | **2021年3月8日测试,项目运行半小时后,支持访问HTTPS的代理有40+,支持访问HTTP的代理有100+。** 22 | 23 | 如果你知道有好用的代理源,或者是发现本项目存在一些问题,欢迎通过Issues和我们讨论。 24 | 25 | ## WEB管理界面截图 26 | 27 |  28 |  29 | 30 | ## 已经集成的免费代理源 31 | 32 | | 名称 | 地址 |备注 | 33 | |--------------|-------------------------------|-------------| 34 | | 悠悠网络代理 | https://uu-proxy.com/ | | 35 | | 快代理 | https://www.kuaidaili.com/ | | 36 | | 全网代理 | http://www.goubanjia.com/ | | 37 | | 66代理 | http://www.66ip.cn/ | | 38 | | 云代理 | http://www.ip3366.net/ | | 39 | | 免费代理库 | https://ip.jiangxianli.com/ | | 40 | | 小幻HTTP代理 | https://ip.ihuan.me/ | | 41 | | 89免费代理 | https://www.89ip.cn/ | | 42 | | ProxyScan | https://www.proxyscan.io/ | | 43 | | 开心代理 | http://www.kxdaili.com/ | | 44 | | 西拉代理 | http://www.xiladaili.com/ | | 45 | | 小舒代理 | http://www.xsdaili.cn/ | | 46 | | ProxyList | https://www.proxy-list.download/| | 47 | | ProxyScrape | https://proxyscrape.com/ |国内无法直接访问 | 48 | 49 | ## 运行本项目 50 | 51 | 本项目目前只适配了Python3,请确保你的电脑上安装了3.6或更高版本的Python软件。 52 | 53 | 1. 下载代码 54 | 55 | ```bash 56 | git clone https://github.com/OxOOo/ProxyPoolWithUI.git 57 | ``` 58 | 59 | 2. 安装Python依赖(在`ProxyPoolWithUI`目录下执行) 60 | 61 | ```bash 62 | pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt 63 | ``` 64 | 65 | 3. 启动(在`ProxyPoolWithUI`目录下执行) 66 | 67 | ```bash 68 | python3 main.py 69 | ``` 70 | 71 | 如果你在运行了上述命令之后,在命令行中看到了类似如下截图,那么说明项目成功启动了: 72 | 73 |  74 | 75 | 4. 使用浏览器打开`http://localhost:5000`,可以看到WEB管理界面。 76 | 77 | ## Docker构建项目 78 | 79 | 1. 下载项目文件 80 | 81 | ```bash 82 | git clone https://github.com/OxOOo/ProxyPoolWithUI.git 83 | cd ProxyPoolWithUI 84 | ``` 85 | 86 | 2. 构建docker镜像 87 | 88 | ```bash 89 | docker build --tag proxy_pool . 90 | ``` 91 | 92 | 3. 运行镜像 93 | 94 | ```bash 95 | docker run -p 5000:5000 -v /root/ProxyPoolWithUI:/proxy -d proxy_pool 96 | ``` 97 | `/root/ProxyPoolWithUI`为clone下来的项目目录路径,请自行更改 98 | 99 | 100 | ## 使用代理 101 | 102 | 1. API接口 103 | 104 | 项目启动之后,会自动爬取并检测代理是否可用,因此我们只需要关注如何使用代理即可。 105 | 106 | * `http://localhost:5000/fetch_random` : 随机获取一个可用代理,如果没有可用代理则返回空白 107 | 108 | 返回示例 : `http://127.0.0.1:8080` 109 | 110 | * `http://localhost:5000/fetch_all` : 获取所有可用代理,如果没有可用代理则返回空白 111 | 112 | 返回示例 : `http://127.0.0.1:8080,http://127.0.0.1:8081` 113 | 114 | 1. 
使用代理 115 | 116 | 不同语言使用代理的方式各不相同,这里提供一个Python集成本项目并使用代理的示例代码: 117 | 118 | ```python 119 | # encoding : utf-8 120 | 121 | import requests 122 | 123 | def main(): 124 | proxy_uri = requests.get('http://localhost:5000/fetch_random').text 125 | if len(proxy_uri) == 0: 126 | print(u'暂时没有可用代理') 127 | return 128 | print(u'获取到的代理是:' + proxy_uri) 129 | 130 | proxies = { 'http': proxy_uri } 131 | html = requests.get('http://www.baidu.com', proxies=proxies).text 132 | if u'百度一下,你就知道' in html: 133 | print('代理可用') 134 | else: 135 | print('代理不可用') 136 | 137 | if __name__ == '__main__': 138 | main() 139 | ``` 140 | 141 | ## 配置 142 | 143 | 如果是需要禁用或者启用某些代理,可直接在WEB管理界面进行操作。 144 | 145 | 本项目的大部分配置均可在`config.py`中找到,默认配置已经可以适应绝大部分情况,一般来说不需要进行修改。 146 | 147 | ## 添加新的代理源 148 | 149 | 本项目的爬取器均在`fetchers`目录下,你也可以根据自己的需求对其中的爬取器进行修改或者扩展。 150 | 151 | 编写本项目的爬取器并不复杂,详细的操作步骤可见[此处](fetchers/),可以参考`fetchers`目录下已有的爬取器。 152 | 153 | ## 项目工作流程图 154 | 155 | 本项目主要包含三部分: 156 | 157 | 1. 爬取进程:主要包括`fetchers`目录和`proc/run_fetcher.py`文件 158 | 2. 验证进程:主要在`proc/run_validator.py`文件中 159 | 3. WEB与API:在`api`目录下 160 | 161 | 本项目的大致逻辑图如下: 162 | 163 | 注:为了便于理解与画图,下图的逻辑是经过简化之后的逻辑,详细过程可查看代码以及相应的注释。 164 | 165 |  166 | 167 | ## 验证算法相关 168 | 169 | 1. 如何验证代理可用 170 | 171 | 目前验证代理可用的算法较为简单,核心思想是使用`requests`库访问一个指定网页,查看是否访问成功。 172 | 173 | 相关配置参数(包括`超时时间`,`尝试次数`等)可在`config.py`中找到,具体代码逻辑在`proc/run_validator.py`中。 174 | 175 | 2. 什么时候该验证哪个代理 176 | 177 | 这个问题比较复杂,很难有一个完美的解决方案,因此目前的算法较为简单,勉强可用,可在[db](db)目录下找到对于目前算法的说明。 178 | 179 | 如果你有更好的算法,欢迎通过Issues和我们讨论,也可以根据[db](db)目录下的[README](db/README.md)文件对代码进行修改。 180 | -------------------------------------------------------------------------------- /api/README.md: -------------------------------------------------------------------------------- 1 | # API目录 2 | 3 | 使用Flask搭建了一个简单的API服务器,主要包含两部分: 4 | 5 | 1. 获取代理的API,使用方法详见[项目主页](https://github.com/OxOOo/ProxyPoolWithUI)。 6 | 2. 
托管网页端的静态文件,并提供若干API给网页端使用。 7 | -------------------------------------------------------------------------------- /api/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 -------------------------------------------------------------------------------- /api/api.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import os 4 | import logging 5 | from flask import Flask 6 | from flask import jsonify, request, redirect, send_from_directory 7 | 8 | log = logging.getLogger('werkzeug') 9 | log.disabled = True 10 | 11 | try: 12 | from db import conn 13 | except: 14 | import sys 15 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 16 | from db import conn 17 | 18 | STATIC_FOLDER = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'frontend', 'deployment') 19 | 20 | app = Flask( 21 | __name__, 22 | static_url_path='/web', 23 | static_folder=STATIC_FOLDER 24 | ) 25 | 26 | ############# 以下API可用于获取代理 ################ 27 | 28 | # 可用于测试API状态 29 | @app.route('/ping', methods=['GET']) 30 | def ping(): 31 | return 'API OK' 32 | 33 | # 随机获取一个可用代理,如果没有可用代理则返回空白 34 | @app.route('/fetch_random', methods=['GET']) 35 | def fetch_random(): 36 | proxies = conn.getValidatedRandom(1) 37 | if len(proxies) > 0: 38 | p = proxies[0] 39 | return f'{p.protocol}://{p.ip}:{p.port}' 40 | else: 41 | return '' 42 | 43 | ############# 新增加接口int ################ 44 | 45 | #api 获取协议为http的一条结果 46 | @app.route('/fetch_http', methods=['GET']) 47 | def fetch_http(): 48 | proxies =conn.get_by_protocol('http', 1) 49 | if len(proxies) > 0: 50 | p = proxies[0] 51 | return f'{p.protocol}://{p.ip}:{p.port}' 52 | else: 53 | return '' 54 | 55 | #api 获取协议为http的全部结果 56 | @app.route('/fetch_http_all', methods=['GET']) 57 | def fetch_http_all(): 58 | proxies = conn.get_by_protocol('http', -1) 59 | if len(proxies) == 1: 60 | p = proxies[0] 61 | return f'{p.protocol}://{p.ip}:{p.port}' 62 | elif len(proxies) > 1: 63 | proxy_list = [] 64 | for p in proxies: 65 | proxy_list.append(f'{p.protocol}://{p.ip}:{p.port}') 66 | return ','.join(proxy_list) 67 | else: 68 | return '' 69 | 70 | #api 获取协议为https的一条结果 71 | @app.route('/fetch_https', methods=['GET']) 72 | def fetch_https(): 73 | proxies =conn.get_by_protocol('https', 1) 74 | if len(proxies) > 0: 75 | p = proxies[0] 76 | return f'{p.protocol}://{p.ip}:{p.port}' 77 | else: 78 | return '' 79 | 80 | #api 获取协议为https的全部结果 81 | @app.route('/fetch_https_all', methods=['GET']) 82 | def fetch_https_all(): 83 | proxies = conn.get_by_protocol('https', -1) 84 | if len(proxies) == 1: 85 | p = proxies[0] 86 | return f'{p.protocol}://{p.ip}:{p.port}' 87 | elif len(proxies) > 1: 88 | proxy_list = [] 89 | for p in proxies: 90 | proxy_list.append(f'{p.protocol}://{p.ip}:{p.port}') 91 | return ','.join(proxy_list) 92 | else: 93 | return '' 94 | 95 | #api 获取协议为http的一条结果 96 | @app.route('/fetch_socks4', methods=['GET']) 97 | def fetch_socks4(): 98 | proxies =conn.get_by_protocol('socks4', 1) 99 | if len(proxies) > 0: 100 | p = proxies[0] 101 | return f'{p.protocol}://{p.ip}:{p.port}' 102 | else: 103 | return '' 104 | 105 | #api 获取协议为http的全部结果 106 | @app.route('/fetch_socks4_all', methods=['GET']) 107 | def fetch_socks4_all(): 108 | proxies = conn.get_by_protocol('socks4', -1) 109 | if len(proxies) == 1: 110 | p = proxies[0] 111 | return f'{p.protocol}://{p.ip}:{p.port}' 112 | elif len(proxies) > 1: 113 | proxy_list = [] 114 | for p in proxies: 115 | 
proxy_list.append(f'{p.protocol}://{p.ip}:{p.port}') 116 | return ','.join(proxy_list) 117 | else: 118 | return '' 119 | 120 | #api 获取协议为https的一条结果 121 | @app.route('/fetch_socks5', methods=['GET']) 122 | def fetch_socks5(): 123 | proxies =conn.get_by_protocol('socks5', 1) 124 | if len(proxies) > 0: 125 | p = proxies[0] 126 | return f'{p.protocol}://{p.ip}:{p.port}' 127 | else: 128 | return '' 129 | 130 | #api 获取协议为https的全部结果 131 | @app.route('/fetch_socks5_all', methods=['GET']) 132 | def fetch_socks5_all(): 133 | proxies = conn.get_by_protocol('socks5', -1) 134 | if len(proxies) == 1: 135 | p = proxies[0] 136 | return f'{p.protocol}://{p.ip}:{p.port}' 137 | elif len(proxies) > 1: 138 | proxy_list = [] 139 | for p in proxies: 140 | proxy_list.append(f'{p.protocol}://{p.ip}:{p.port}') 141 | return ','.join(proxy_list) 142 | else: 143 | return '' 144 | 145 | ############# 新增加接口end ################ 146 | 147 | # 获取所有可用代理,如果没有可用代理则返回空白 148 | @app.route('/fetch_all', methods=['GET']) 149 | def fetch_all(): 150 | proxies = conn.getValidatedRandom(-1) 151 | proxies = [f'{p.protocol}://{p.ip}:{p.port}' for p in proxies] 152 | return ','.join(proxies) 153 | 154 | ############# 以下API主要给网页使用 ################ 155 | 156 | @app.route('/') 157 | def index(): 158 | return redirect('/web') 159 | 160 | # 网页:首页 161 | @app.route('/web', methods=['GET']) 162 | @app.route('/web/', methods=['GET']) 163 | def page_index(): 164 | return send_from_directory(STATIC_FOLDER, 'index.html') 165 | 166 | # 网页:爬取器状态 167 | @app.route('/web/fetchers', methods=['GET']) 168 | @app.route('/web/fetchers/', methods=['GET']) 169 | def page_fetchers(): 170 | return send_from_directory(STATIC_FOLDER, 'fetchers/index.html') 171 | 172 | # 获取代理状态 173 | @app.route('/proxies_status', methods=['GET']) 174 | def proxies_status(): 175 | proxies = conn.getValidatedRandom(-1) 176 | proxies = sorted(proxies, key=lambda p: f'{p.protocol}://{p.ip}:{p.port}', reverse=True) 177 | proxies = [p.to_dict() for p in proxies] 178 | 179 | status = conn.getProxiesStatus() 180 | 181 | return jsonify(dict( 182 | success=True, 183 | proxies=proxies, 184 | **status 185 | )) 186 | 187 | # 获取爬取器状态 188 | @app.route('/fetchers_status', methods=['GET']) 189 | def fetchers_status(): 190 | proxies = conn.getValidatedRandom(-1) # 获取所有可用代理 191 | fetchers = conn.getAllFetchers() 192 | fetchers = [f.to_dict() for f in fetchers] 193 | 194 | for f in fetchers: 195 | f['validated_cnt'] = len([_ for _ in proxies if _.fetcher_name == f['name']]) 196 | f['in_db_cnt'] = conn.getProxyCount(f['name']) 197 | 198 | return jsonify(dict( 199 | success=True, 200 | fetchers=fetchers 201 | )) 202 | 203 | # 清空爬取器状态 204 | @app.route('/clear_fetchers_status', methods=['GET']) 205 | def clear_fetchers_status(): 206 | conn.pushClearFetchersStatus() 207 | return jsonify(dict(success=True)) 208 | 209 | # 设置是否启用特定爬取器,?name=str,enable=0/1 210 | @app.route('/fetcher_enable', methods=['GET']) 211 | def fetcher_enable(): 212 | name = request.args.get('name') 213 | enable = request.args.get('enable') 214 | if enable == '1': 215 | conn.pushFetcherEnable(name, True) 216 | else: 217 | conn.pushFetcherEnable(name, False) 218 | return jsonify(dict(success=True)) 219 | 220 | ############# 其他 ################ 221 | 222 | # 跨域支持,主要是在开发网页端的时候需要使用 223 | def after_request(resp): 224 | ALLOWED_ORIGIN = ['0.0.0.0', '127.0.0.1', 'localhost'] 225 | origin = request.headers.get('origin', None) 226 | if origin is not None: 227 | for item in ALLOWED_ORIGIN: 228 | if item in origin: 229 | 
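                # 请求来源命中本地白名单时,才回显该来源并允许携带凭证,便于本地开发网页端时跨域调用API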
resp.headers['Access-Control-Allow-Origin'] = origin 230 | resp.headers['Access-Control-Allow-Credentials'] = 'true' 231 | return resp 232 | app.after_request(after_request) 233 | 234 | def main(proc_lock): 235 | if proc_lock is not None: 236 | conn.set_proc_lock(proc_lock) 237 | # 因为默认sqlite3中,同一个数据库连接不能在多线程环境下使用,所以这里需要禁用flask的多线程 238 | app.run(host='0.0.0.0', port=5000, threaded=False) 239 | 240 | if __name__ == '__main__': 241 | main(None) 242 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | """ 4 | 配置文件,一般来说不需要修改 5 | 如果需要启用或者禁用某些网站的爬取器,可在网页上进行配置 6 | """ 7 | 8 | import os 9 | 10 | # 数据库文件路径 11 | DATABASE_PATH = os.path.join(os.path.dirname(__file__), 'data.db') 12 | 13 | # 每次运行所有爬取器之后,睡眠多少时间,单位秒 14 | PROC_FETCHER_SLEEP = 5 * 60 15 | 16 | # 验证器每次睡眠的时间,单位秒 17 | PROC_VALIDATOR_SLEEP = 5 18 | 19 | # 验证器的配置参数 20 | VALIDATE_THREAD_NUM = 200 # 验证线程数量 21 | # 验证器的逻辑是: 22 | # 使用代理访问 VALIDATE_URL 网站,超时时间设置为 VALIDATE_TIMEOUT 23 | # 如果没有超时: 24 | # 1、若选择的验证方式为GET: 返回的网页中包含 VALIDATE_KEYWORD 文字,那么就认为本次验证成功 25 | # 2、若选择的验证方式为HEAD: 返回的响应头中,对于的 VALIDATE_HEADER 响应字段内容包含 VALIDATE_KEYWORD 内容,那么就认为本次验证成功 26 | # 上述过程最多进行 VALIDATE_MAX_FAILS 次,只要有一次成功,就认为代理可用 27 | VALIDATE_URL = 'https://qq.com' 28 | VALIDATE_METHOD = 'HEAD' # 验证方式,可选:GET、HEAD 29 | VALIDATE_HEADER = 'location' # 仅用于HEAD验证方式,百度响应头Server字段KEYWORD可填:bfe 30 | VALIDATE_KEYWORD = 'www.qq.com' 31 | VALIDATE_TIMEOUT = 5 # 超时时间,单位s 32 | VALIDATE_MAX_FAILS = 3 33 | -------------------------------------------------------------------------------- /db/Fetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import datetime 4 | 5 | class Fetcher(object): 6 | """ 7 | 爬取器的状态储存在数据库中,包括是否启用爬取器,爬取到的代理数量等 8 | """ 9 | 10 | ddls = [""" 11 | CREATE TABLE IF NOT EXISTS fetchers 12 | ( 13 | name VARCHAR(255) NOT NULL, 14 | enable BOOLEAN NOT NULL, 15 | sum_proxies_cnt INTEGER NOT NULL, 16 | last_proxies_cnt INTEGER NOT NULL, 17 | last_fetch_date TIMESTAMP, 18 | PRIMARY KEY (name) 19 | ) 20 | """] 21 | 22 | def __init__(self): 23 | self.name = None 24 | self.enable = True 25 | self.sum_proxies_cnt = 0 26 | self.last_proxies_cnt = 0 27 | self.last_fetch_date = None 28 | 29 | def params(self): 30 | """ 31 | 返回一个元组,包含自身的全部属性 32 | """ 33 | return ( 34 | self.name, self.enable, 35 | self.sum_proxies_cnt, self.last_proxies_cnt, self.last_fetch_date 36 | ) 37 | 38 | def to_dict(self): 39 | """ 40 | 返回一个dict,包含自身的全部属性 41 | """ 42 | return { 43 | 'name': self.name, 44 | 'enable': self.enable, 45 | 'sum_proxies_cnt': self.sum_proxies_cnt, 46 | 'last_proxies_cnt': self.last_proxies_cnt, 47 | 'last_fetch_date': str(self.last_fetch_date) if self.last_fetch_date is not None else None 48 | } 49 | 50 | @staticmethod 51 | def decode(row): 52 | """ 53 | 将sqlite返回的一行解析为Fetcher 54 | row : sqlite返回的一行 55 | """ 56 | assert len(row) == 5 57 | f = Fetcher() 58 | f.name = row[0] 59 | f.enable = bool(row[1]) 60 | f.sum_proxies_cnt = row[2] 61 | f.last_proxies_cnt = row[3] 62 | f.last_fetch_date = row[4] 63 | return f 64 | -------------------------------------------------------------------------------- /db/Proxy.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import datetime 4 | import random 5 | class Proxy(object): 6 | """ 7 | 代理,用于表示数据库中的一个记录 8 | """ 9 | 10 | ddls = [""" 11 | CREATE TABLE IF NOT EXISTS proxies 
12 | ( 13 | fetcher_name VARCHAR(255) NOT NULL, 14 | protocol VARCHAR(32) NOT NULL, 15 | ip VARCHAR(255) NOT NULL, 16 | port INTEGER NOT NULL, 17 | validated BOOLEAN NOT NULL, 18 | latency INTEGER, 19 | validate_date TIMESTAMP, 20 | to_validate_date TIMESTAMP NOT NULL, 21 | validate_failed_cnt INTEGER NOT NULL, 22 | PRIMARY KEY (protocol, ip, port) 23 | ) 24 | """, 25 | """ 26 | CREATE INDEX IF NOT EXISTS proxies_fetcher_name_index 27 | ON proxies(fetcher_name) 28 | """, 29 | """ 30 | CREATE INDEX IF NOT EXISTS proxies_to_validate_date_index 31 | ON proxies(to_validate_date ASC) 32 | """] 33 | 34 | def __init__(self): 35 | self.fetcher_name = None 36 | self.protocol = None 37 | self.ip = None 38 | self.port = None 39 | self.validated = False 40 | self.latency = None 41 | self.validate_date = None 42 | self.to_validate_date = datetime.datetime.now() 43 | self.validate_failed_cnt = 0 44 | 45 | def params(self): 46 | """ 47 | 返回一个元组,包含自身的全部属性 48 | """ 49 | return ( 50 | self.fetcher_name, 51 | self.protocol, self.ip, self.port, 52 | self.validated, self.latency, 53 | self.validate_date, self.to_validate_date, self.validate_failed_cnt 54 | ) 55 | 56 | def to_dict(self): 57 | """ 58 | 返回一个dict,包含自身的全部属性 59 | """ 60 | return { 61 | 'fetcher_name': self.fetcher_name, 62 | 'protocol': self.protocol, 63 | 'ip': self.ip, 64 | 'port': self.port, 65 | 'validated': self.validated, 66 | 'latency': self.latency, 67 | 'validate_date': str(self.validate_date) if self.validate_date is not None else None, 68 | 'to_validate_date': str(self.to_validate_date) if self.to_validate_date is not None else None, 69 | 'validate_failed_cnt': self.validate_failed_cnt 70 | } 71 | 72 | @staticmethod 73 | def decode(row): 74 | """ 75 | 将sqlite返回的一行解析为Proxy 76 | row : sqlite返回的一行 77 | """ 78 | assert len(row) == 9 79 | p = Proxy() 80 | p.fetcher_name = row[0] 81 | p.protocol = row[1] 82 | p.ip = row[2] 83 | p.port = row[3] 84 | p.validated = bool(row[4]) 85 | p.latency = row[5] 86 | p.validate_date = row[6] 87 | p.to_validate_date = row[7] 88 | p.validate_failed_cnt = row[8] 89 | return p 90 | 91 | def validate(self, success, latency): 92 | """ 93 | 传入一次验证结果,根据验证结果调整自身属性,并返回是否删除这个代理 94 | success : True/False,表示本次验证是否成功 95 | 返回 : True/False,True表示这个代理太差了,应该从数据库中删除 96 | """ 97 | self.latency = latency 98 | if success: # 验证成功 99 | self.validated = True 100 | self.validate_date = datetime.datetime.now() 101 | self.validate_failed_cnt = 0 102 | #self.to_validate_date = datetime.datetime.now() + datetime.timedelta(minutes=30) # 30分钟之后继续验证 103 | self.to_validate_date = datetime.datetime.now() + datetime.timedelta(minutes=random.randint(10, 60)) # 10·60分钟之后继续验证 104 | return False 105 | else: 106 | self.validated = False 107 | self.validate_date = datetime.datetime.now() 108 | self.validate_failed_cnt = self.validate_failed_cnt + 1 109 | 110 | # 验证失败的次数越多,距离下次验证的时间越长 111 | delay_minutes = self.validate_failed_cnt * 10 112 | self.to_validate_date = datetime.datetime.now() + datetime.timedelta(minutes=delay_minutes) 113 | 114 | if self.validate_failed_cnt >= 6: 115 | return True 116 | else: 117 | return False 118 | -------------------------------------------------------------------------------- /db/README.md: -------------------------------------------------------------------------------- 1 | # 数据库封装 2 | 3 | 这个目录下封装了操作数据库的一些接口。 4 | 为了通用性,本项目使用SQLite作为底层的数据库,使用`sqlite3`提供的接口对数据库进行操作。 5 | 6 | ## 数据表 7 | 8 | 主要包含两个表,分别用于储存代理和爬取器: 9 | 10 | 1. 
代理 11 | 12 | | 字段名称 | 数据类型 | 说明 | 13 | |---------------------|----------|--------------------------------------------------------------------------| 14 | | fetcher_name | 字符串 | 这个代理来自哪个爬取器 | 15 | | protocol | 字符串 | 代理协议名称,一般为HTTP | 16 | | ip | 字符串 | 代理的IP地址 | 17 | | port | 整数 | 代理的端口号 | 18 | | validated | 布尔值 | 这个代理是否通过了验证,通过了验证表示当前代理可用 | 19 | | latency | 整数 | 延迟(单位毫秒),表示上次验证所用的时间,越小则代理质量越好 | 20 | | validate_date | 时间戳 | 上一次进行验证的时间 | 21 | | to_validate_date | 时间戳 | 下一次进行验证的时间,如何调整下一次验证的时间可见后文或者代码`Proxy.py` | 22 | | validate_failed_cnt | 整数 | 已经连续验证失败了多少次,会影响下一次验证的时间 | 23 | 24 | 2. 爬取器 25 | 26 | | 字段名称 | 数据类型 | 说明 | 27 | |------------------|----------|----------------------------------------------------------------------------------| 28 | | name | 字符串 | 爬取器的名称 | 29 | | enable | 布尔值 | 是否启用这个爬取器,被禁用的爬取器不会在之后被运行,但是其之前爬取的代理依然存在 | 30 | | sum_proxies_cnt | 整数 | 至今为止总共爬取到了多少个代理 | 31 | | last_proxies_cnt | 整数 | 上次爬取到了多少个代理 | 32 | | last_fetch_date | 时间戳 | 上次爬取的时间 | 33 | 34 | ## 下次验证时间调整算法 35 | 36 | 由于不同代理网站公开的免费代理质量差距较大,因此对于多次验证都失败的代理,我们需要降低对他们进行验证的频率,甚至将他们从数据库中删除。 37 | 而对于现在可用的代理,则需要频繁对其进行验证,以保证其可用性。 38 | 39 | 目前的算法较为简单,可见`Proxy.py`文件中的`validate`函数,核心思想如下: 40 | 41 | 1. 优先验证之前验证通过并且到了验证时间的代理(`conn.py`中的`getToValidate`函数) 42 | 2. 对于爬取器新爬取到的代理,我们需要尽快对其进行验证(设置`to_validate_date`为当前时间) 43 | 3. 如果某个代理验证成功,那么设置它下一次进行验证的时间为5分钟之后 44 | 4. 如果某个代理验证失败,那么设置它下一次进行验证的时间为 5 * 连续失败次数 分钟之后,如果连续3次失败,那么将其从数据库中删除 45 | 46 | 你可以修改为自己的算法,主要代码涉及`Proxy.py`文件以及`conn.py`文件的`pushNewFetch`和`getToValidate`函数。 47 | -------------------------------------------------------------------------------- /db/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from .init import init 4 | 5 | init() 6 | -------------------------------------------------------------------------------- /db/conn.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | """ 4 | 封装的数据库接口 5 | """ 6 | 7 | from config import DATABASE_PATH 8 | from .Proxy import Proxy 9 | from .Fetcher import Fetcher 10 | import sqlite3 11 | import datetime 12 | import threading 13 | 14 | conn = sqlite3.connect(DATABASE_PATH, detect_types=sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES) 15 | # 线程锁 16 | conn_lock = threading.Lock() 17 | # 进程锁 18 | proc_lock = None 19 | 20 | def set_proc_lock(proc_lock_sub): 21 | """ 22 | 设置进程锁 23 | proc_lock_sub : main中的进程锁 24 | """ 25 | global proc_lock 26 | proc_lock = proc_lock_sub 27 | 28 | def pushNewFetch(fetcher_name, protocol, ip, port): 29 | """ 30 | 爬取器新抓到了一个代理,调用本函数将代理放入数据库 31 | fetcher_name : 爬取器名称 32 | protocol : 代理协议 33 | ip : 代理IP地址 34 | port : 代理端口 35 | """ 36 | p = Proxy() 37 | p.fetcher_name = fetcher_name 38 | p.protocol = protocol 39 | p.ip = ip 40 | p.port = port 41 | conn_lock.acquire() 42 | proc_lock.acquire() 43 | 44 | c = conn.cursor() 45 | c.execute('BEGIN EXCLUSIVE TRANSACTION;') 46 | # 更新proxies表 47 | c.execute('SELECT * FROM proxies WHERE protocol=? AND ip=? AND port=?', (p.protocol, p.ip, p.port)) 48 | row = c.fetchone() 49 | if row is not None: # 已经存在(protocol, ip, port) 50 | old_p = Proxy.decode(row) 51 | c.execute(""" 52 | UPDATE proxies SET fetcher_name=?,to_validate_date=? WHERE protocol=? AND ip=? AND port=? 
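            -- 代理已存在时仅更新来源,并将下次验证时间设为当前时间与原计划时间中较早者,使其尽快被重新验证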
53 | """, (p.fetcher_name, min(datetime.datetime.now(), old_p.to_validate_date), p.protocol, p.ip, p.port)) 54 | else: 55 | c.execute('INSERT INTO proxies VALUES (?,?,?,?,?,?,?,?,?)', p.params()) 56 | c.close() 57 | conn.commit() 58 | conn_lock.release() 59 | proc_lock.release() 60 | 61 | def getToValidate(max_count=1): 62 | """ 63 | 从数据库中获取待验证的代理,根据to_validate_date字段 64 | 优先选取已经通过了验证的代理,其次是没有通过验证的代理 65 | max_count : 返回数量限制 66 | 返回 : list[Proxy] 67 | """ 68 | conn_lock.acquire() 69 | proc_lock.acquire() 70 | c = conn.cursor() 71 | c.execute('BEGIN EXCLUSIVE TRANSACTION;') 72 | c.execute('SELECT * FROM proxies WHERE to_validate_date<=? AND validated=? ORDER BY to_validate_date LIMIT ?', ( 73 | datetime.datetime.now(), 74 | True, 75 | max_count 76 | )) 77 | proxies = [Proxy.decode(row) for row in c] 78 | c.execute('SELECT * FROM proxies WHERE to_validate_date<=? AND validated=? ORDER BY to_validate_date LIMIT ?', ( 79 | datetime.datetime.now(), 80 | False, 81 | max_count - len(proxies) 82 | )) 83 | proxies = proxies + [Proxy.decode(row) for row in c] 84 | c.close() 85 | conn.commit() 86 | conn_lock.release() 87 | proc_lock.release() 88 | return proxies 89 | 90 | def pushValidateResult(proxy, success, latency): 91 | """ 92 | 将验证器的一个结果添加进数据库中 93 | proxy : 代理 94 | success : True/False,验证是否成功 95 | latency : 本次验证所用的时间(单位毫秒) 96 | """ 97 | p = proxy 98 | should_remove = p.validate(success, latency) 99 | conn_lock.acquire() 100 | proc_lock.acquire() 101 | if should_remove: 102 | conn.execute('DELETE FROM proxies WHERE protocol=? AND ip=? AND port=?', (p.protocol, p.ip, p.port)) 103 | else: 104 | conn.execute(""" 105 | UPDATE proxies 106 | SET fetcher_name=?,validated=?,latency=?,validate_date=?,to_validate_date=?,validate_failed_cnt=? 107 | WHERE protocol=? AND ip=? AND port=? 108 | """, ( 109 | p.fetcher_name, p.validated, p.latency, p.validate_date, p.to_validate_date, p.validate_failed_cnt, 110 | p.protocol, p.ip, p.port 111 | )) 112 | conn.commit() 113 | conn_lock.release() 114 | proc_lock.release() 115 | 116 | def getValidatedRandom(max_count): 117 | """ 118 | 从通过了验证的代理中,随机选择max_count个代理返回 119 | max_count<=0表示不做数量限制 120 | 返回 : list[Proxy] 121 | """ 122 | conn_lock.acquire() 123 | proc_lock.acquire() 124 | if max_count > 0: 125 | r = conn.execute('SELECT * FROM proxies WHERE validated=? ORDER BY RANDOM() LIMIT ?', (True, max_count)) 126 | else: 127 | r = conn.execute('SELECT * FROM proxies WHERE validated=? ORDER BY RANDOM()', (True,)) 128 | proxies = [Proxy.decode(row) for row in r] 129 | r.close() 130 | conn_lock.release() 131 | proc_lock.release() 132 | return proxies 133 | 134 | #新增方法 135 | def get_by_protocol(protocol, max_count): 136 | """ 137 | 查询 protocol 字段为指定值的代理服务器记录 138 | max_count 表示返回记录的最大数量,如果为 0 或负数则返回所有记录 139 | 返回 : list[Proxy] 140 | """ 141 | conn_lock.acquire() 142 | proc_lock.acquire() 143 | if max_count > 0: 144 | r = conn.execute('SELECT * FROM proxies WHERE protocol=? AND validated=? ORDER BY RANDOM() LIMIT ?', (protocol, True, max_count)) 145 | else: 146 | r = conn.execute('SELECT * FROM proxies WHERE protocol=? AND validated=? 
ORDER BY RANDOM()', (protocol, True)) 147 | proxies = [Proxy.decode(row) for row in r] 148 | r.close() 149 | conn_lock.release() 150 | proc_lock.release() 151 | return proxies 152 | 153 | def pushFetcherResult(name, proxies_cnt): 154 | """ 155 | 更新爬取器的状态,每次在完成一个网站的爬取之后,调用本函数 156 | name : 爬取器的名称 157 | proxies_cnt : 本次爬取到的代理数量 158 | """ 159 | conn_lock.acquire() 160 | proc_lock.acquire() 161 | c = conn.cursor() 162 | c.execute('BEGIN EXCLUSIVE TRANSACTION;') 163 | c.execute('SELECT * FROM fetchers WHERE name=?', (name,)) 164 | row = c.fetchone() 165 | if row is None: 166 | raise ValueError(f'ERRROR: can not find fetcher {name}') 167 | else: 168 | f = Fetcher.decode(row) 169 | f.last_proxies_cnt = proxies_cnt 170 | f.sum_proxies_cnt = f.sum_proxies_cnt + proxies_cnt 171 | f.last_fetch_date = datetime.datetime.now() 172 | c.execute('UPDATE fetchers SET sum_proxies_cnt=?,last_proxies_cnt=?,last_fetch_date=? WHERE name=?', ( 173 | f.sum_proxies_cnt, f.last_proxies_cnt, f.last_fetch_date, f.name 174 | )) 175 | c.close() 176 | conn.commit() 177 | conn_lock.release() 178 | proc_lock.release() 179 | 180 | def pushFetcherEnable(name, enable): 181 | """ 182 | 设置是否起用对应爬取器,被禁用的爬取器将不会被运行 183 | name : 爬取器的名称 184 | enable : True/False, 是否启用 185 | """ 186 | conn_lock.acquire() 187 | proc_lock.acquire() 188 | c = conn.cursor() 189 | c.execute('BEGIN EXCLUSIVE TRANSACTION;') 190 | c.execute('SELECT * FROM fetchers WHERE name=?', (name,)) 191 | row = c.fetchone() 192 | if row is None: 193 | raise ValueError(f'ERRROR: can not find fetcher {name}') 194 | else: 195 | f = Fetcher.decode(row) 196 | f.enable = enable 197 | c.execute('UPDATE fetchers SET enable=? WHERE name=?', ( 198 | f.enable, f.name 199 | )) 200 | c.close() 201 | conn.commit() 202 | conn_lock.release() 203 | proc_lock.release() 204 | 205 | def getAllFetchers(): 206 | """ 207 | 获取所有的爬取器以及状态 208 | 返回 : list[Fetcher] 209 | """ 210 | conn_lock.acquire() 211 | proc_lock.acquire() 212 | r = conn.execute('SELECT * FROM fetchers') 213 | fetchers = [Fetcher.decode(row) for row in r] 214 | r.close() 215 | conn_lock.release() 216 | proc_lock.release() 217 | return fetchers 218 | 219 | def getFetcher(name): 220 | """ 221 | 获取指定爬取器以及状态 222 | 返回 : Fetcher 223 | """ 224 | conn_lock.acquire() 225 | proc_lock.acquire() 226 | r = conn.execute('SELECT * FROM fetchers WHERE name=?', (name,)) 227 | row = r.fetchone() 228 | r.close() 229 | conn_lock.release() 230 | proc_lock.release() 231 | if row is None: 232 | return None 233 | else: 234 | return Fetcher.decode(row) 235 | 236 | def getProxyCount(fetcher_name): 237 | """ 238 | 查询在数据库中有多少个由指定爬取器爬取到的代理 239 | fetcher_name : 爬取器名称 240 | 返回 : int 241 | """ 242 | conn_lock.acquire() 243 | proc_lock.acquire() 244 | r = conn.execute('SELECT count(*) FROM proxies WHERE fetcher_name=?', (fetcher_name,)) 245 | cnt = r.fetchone()[0] 246 | r.close() 247 | conn_lock.release() 248 | proc_lock.release() 249 | return cnt 250 | 251 | def getProxiesStatus(): 252 | """ 253 | 获取代理状态,包括`全部代理数量`,`当前可用代理数量`,`等待验证代理数量` 254 | 返回 : dict 255 | """ 256 | conn_lock.acquire() 257 | proc_lock.acquire() 258 | r = conn.execute('SELECT count(*) FROM proxies') 259 | sum_proxies_cnt = r.fetchone()[0] 260 | r.close() 261 | 262 | r = conn.execute('SELECT count(*) FROM proxies WHERE validated=?', (True,)) 263 | validated_proxies_cnt = r.fetchone()[0] 264 | r.close() 265 | 266 | r = conn.execute('SELECT count(*) FROM proxies WHERE to_validate_date<=?', (datetime.datetime.now(),)) 267 | pending_proxies_cnt = r.fetchone()[0] 268 | r.close() 269 | 
conn_lock.release() 270 | proc_lock.release() 271 | return dict( 272 | sum_proxies_cnt=sum_proxies_cnt, 273 | validated_proxies_cnt=validated_proxies_cnt, 274 | pending_proxies_cnt=pending_proxies_cnt 275 | ) 276 | 277 | def pushClearFetchersStatus(): 278 | """ 279 | 清空爬取器的统计信息,包括sum_proxies_cnt,last_proxies_cnt,last_fetch_date 280 | """ 281 | conn_lock.acquire() 282 | proc_lock.acquire() 283 | c = conn.cursor() 284 | c.execute('BEGIN EXCLUSIVE TRANSACTION;') 285 | c.execute('UPDATE fetchers SET sum_proxies_cnt=?, last_proxies_cnt=?, last_fetch_date=?', (0, 0, None)) 286 | c.close() 287 | conn.commit() 288 | conn_lock.release() 289 | proc_lock.release() 290 | -------------------------------------------------------------------------------- /db/init.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from config import DATABASE_PATH 4 | from .Proxy import Proxy 5 | from .Fetcher import Fetcher 6 | from fetchers import fetchers 7 | import sqlite3 8 | 9 | def init(): 10 | """ 11 | 初始化数据库 12 | """ 13 | 14 | conn = sqlite3.connect(DATABASE_PATH) 15 | 16 | create_tables = Proxy.ddls + Fetcher.ddls 17 | for sql in create_tables: 18 | conn.execute(sql) 19 | conn.commit() 20 | 21 | # 注册所有的爬取器 22 | c = conn.cursor() 23 | c.execute('BEGIN EXCLUSIVE TRANSACTION;') 24 | for item in fetchers: 25 | c.execute('SELECT * FROM fetchers WHERE name=?', (item.name,)) 26 | if c.fetchone() is None: 27 | f = Fetcher() 28 | f.name = item.name 29 | c.execute('INSERT INTO fetchers VALUES(?,?,?,?,?)', f.params()) 30 | c.close() 31 | conn.commit() 32 | 33 | conn.close() 34 | -------------------------------------------------------------------------------- /docs/screenshot1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OxOOo/ProxyPoolWithUI/0e9aa3cf1c38f50aeafd53da71e8cd464c2723eb/docs/screenshot1.png -------------------------------------------------------------------------------- /docs/screenshot2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OxOOo/ProxyPoolWithUI/0e9aa3cf1c38f50aeafd53da71e8cd464c2723eb/docs/screenshot2.png -------------------------------------------------------------------------------- /docs/term.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OxOOo/ProxyPoolWithUI/0e9aa3cf1c38f50aeafd53da71e8cd464c2723eb/docs/term.png -------------------------------------------------------------------------------- /docs/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OxOOo/ProxyPoolWithUI/0e9aa3cf1c38f50aeafd53da71e8cd464c2723eb/docs/workflow.png -------------------------------------------------------------------------------- /fetchers/BaseFetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | class BaseFetcher(object): 4 | """ 5 | 所有爬取器的基类 6 | """ 7 | 8 | def fetch(self): 9 | """ 10 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 11 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 12 | """ 13 | raise NotImplementedError() 14 | -------------------------------------------------------------------------------- /fetchers/GoubanjiaFetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from 
.BaseFetcher import BaseFetcher 4 | import requests 5 | from pyquery import PyQuery as pq 6 | import re 7 | 8 | class GoubanjiaFetcher(BaseFetcher): 9 | """ 10 | http://www.goubanjia.com/ 11 | """ 12 | 13 | def fetch(self): 14 | """ 15 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 16 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 17 | """ 18 | 19 | proxies = [] 20 | 21 | headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'} 22 | html = requests.get('http://www.goubanjia.com/', headers=headers, timeout=10).text 23 | doc = pq(html) 24 | for item in doc('table tbody tr').items(): 25 | ipport = item.find('td.ip').html() 26 | # 以下对ipport进行整理 27 | hide_reg = re.compile(r'
]*style="display:[^<>]*none;"[^<>]*>[^<>]*
') 28 | ipport = re.sub(hide_reg, '', ipport) 29 | tag_reg = re.compile(r'<[^<>]*>') 30 | ipport = re.sub(tag_reg, '', ipport) 31 | 32 | ip = ipport.split(':')[0] 33 | port = self.pde(item.find('td.ip').find('span.port').attr('class').split(' ')[1]) 34 | proxies.append(('http', ip, int(port))) 35 | 36 | return list(set(proxies)) 37 | 38 | def pde(self, class_key): # 解密函数,端口是加密过的 39 | """ 40 | key是class内容 41 | """ 42 | class_key = str(class_key) 43 | f = [] 44 | for i in range(len(class_key)): 45 | f.append(str('ABCDEFGHIZ'.index(class_key[i]))) 46 | return str(int(''.join(f)) >> 0x3) 47 | -------------------------------------------------------------------------------- /fetchers/IHuanFetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from .BaseFetcher import BaseFetcher 4 | import requests 5 | from pyquery import PyQuery as pq 6 | import re 7 | 8 | class IHuanFetcher(BaseFetcher): 9 | """ 10 | https://ip.ihuan.me/ 11 | 爬这个网站要温柔点,站长表示可能会永久关站 12 | """ 13 | 14 | def fetch(self): 15 | """ 16 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 17 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 18 | """ 19 | 20 | proxies = [] 21 | ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$') 22 | port_regex = re.compile(r'^\d+$') 23 | 24 | pending_urls = ['https://ip.ihuan.me/'] 25 | while len(pending_urls) > 0: 26 | url = pending_urls[0] 27 | pending_urls = pending_urls[1:] 28 | 29 | headers = { 30 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 31 | 'Accept-Encoding': 'gzip, deflate', 32 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 33 | 'Cache-Control': 'no-cache', 34 | 'Connection': 'keep-alive', 35 | 'Pragma': 'no-cache', 36 | 'Upgrade-Insecure-Requests': '1', 37 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.130 Chrome/79.0.3945.130 Safari/537.36' 38 | } 39 | try: 40 | html = requests.get(url, headers=headers, timeout=10).text 41 | except Exception as e: 42 | print('ERROR in ip.ihuan.me:' + str(e)) 43 | continue 44 | doc = pq(html) 45 | for line in doc('tbody tr').items(): 46 | tds = list(line('td').items()) 47 | if len(tds) == 10: 48 | ip = tds[0].text().strip() 49 | port = tds[1].text().strip() 50 | if re.match(ip_regex, ip) is not None and re.match(port_regex, port) is not None: 51 | proxies.append(('http', ip, int(port))) 52 | 53 | if url.endswith('/'): # 当前是第一页,解析后面几页的链接 54 | for item in list(doc('.pagination a').items())[1:-1]: 55 | href = item.attr('href') 56 | if href is not None and href.startswith('?page='): 57 | pending_urls.append('https://ip.ihuan.me/' + href) 58 | 59 | return list(set(proxies)) 60 | -------------------------------------------------------------------------------- /fetchers/IP3366Fetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from .BaseFetcher import BaseFetcher 4 | import requests 5 | from pyquery import PyQuery as pq 6 | import re 7 | 8 | class IP3366Fetcher(BaseFetcher): 9 | """ 10 | http://www.ip3366.net/free/?stype=1 11 | """ 12 | 13 | def fetch(self): 14 | """ 15 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 16 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 17 | """ 18 | 19 | urls = [] 20 | for stype in ['1', '2']: 21 | for page in range(1, 6): 22 | url = 
f'http://www.ip3366.net/free/?stype={stype}&page={page}' 23 | urls.append(url) 24 | 25 | proxies = [] 26 | ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$') 27 | port_regex = re.compile(r'^\d+$') 28 | 29 | for url in urls: 30 | headers = { 31 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 32 | 'Accept-Encoding': 'gzip, deflate', 33 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 34 | 'Cache-Control': 'no-cache', 35 | 'Connection': 'keep-alive', 36 | 'Pragma': 'no-cache', 37 | 'Upgrade-Insecure-Requests': '1', 38 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.130 Chrome/79.0.3945.130 Safari/537.36' 39 | } 40 | html = requests.get(url, headers=headers, timeout=10).text 41 | doc = pq(html) 42 | for line in doc('tr').items(): 43 | tds = list(line('td').items()) 44 | if len(tds) == 7: 45 | ip = tds[0].text().strip() 46 | port = tds[1].text().strip() 47 | if re.match(ip_regex, ip) is not None and re.match(port_regex, port) is not None: 48 | proxies.append(('http', ip, int(port))) 49 | 50 | return list(set(proxies)) 51 | -------------------------------------------------------------------------------- /fetchers/IP66Fetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from .BaseFetcher import BaseFetcher 4 | import requests 5 | from pyquery import PyQuery as pq 6 | import re 7 | 8 | class IP66Fetcher(BaseFetcher): 9 | """ 10 | http://www.66ip.cn/ 11 | """ 12 | 13 | def fetch(self): 14 | """ 15 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 16 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 17 | """ 18 | 19 | urls = [] 20 | for areaindex in range(10): 21 | for page in range(1, 6): 22 | if areaindex == 0: 23 | url = f'http://www.66ip.cn/{page}.html' 24 | else: 25 | url = f'http://www.66ip.cn/areaindex_{areaindex}/{page}.html' 26 | urls.append(url) 27 | 28 | proxies = [] 29 | ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$') 30 | port_regex = re.compile(r'^\d+$') 31 | 32 | for url in urls: 33 | headers = { 34 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 35 | 'Accept-Encoding': 'gzip, deflate', 36 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 37 | 'Cache-Control': 'no-cache', 38 | 'Connection': 'keep-alive', 39 | 'Pragma': 'no-cache', 40 | 'Upgrade-Insecure-Requests': '1', 41 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.130 Chrome/79.0.3945.130 Safari/537.36' 42 | } 43 | html = requests.get(url, headers=headers, timeout=10).text 44 | doc = pq(html) 45 | for line in doc('table tr').items(): 46 | tds = list(line('td').items()) 47 | if len(tds) == 5: 48 | ip = tds[0].text().strip() 49 | port = tds[1].text().strip() 50 | if re.match(ip_regex, ip) is not None and re.match(port_regex, port) is not None: 51 | proxies.append(('http', ip, int(port))) 52 | 53 | return list(set(proxies)) 54 | -------------------------------------------------------------------------------- /fetchers/IP89Fetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from .BaseFetcher import BaseFetcher 4 | import requests 5 | from pyquery import PyQuery as pq 6 | import re 7 | 8 | class IP89Fetcher(BaseFetcher): 9 | """ 10 | https://www.89ip.cn/ 11 | """ 
12 | 13 | def fetch(self): 14 | """ 15 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 16 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 17 | """ 18 | 19 | urls = [] 20 | for page in range(1, 6): 21 | url = f'https://www.89ip.cn/index_{page}.html' 22 | urls.append(url) 23 | 24 | proxies = [] 25 | ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$') 26 | port_regex = re.compile(r'^\d+$') 27 | 28 | for url in urls: 29 | headers = { 30 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 31 | 'Accept-Encoding': 'gzip, deflate', 32 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 33 | 'Cache-Control': 'no-cache', 34 | 'Connection': 'keep-alive', 35 | 'Pragma': 'no-cache', 36 | 'Upgrade-Insecure-Requests': '1', 37 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.130 Chrome/79.0.3945.130 Safari/537.36' 38 | } 39 | html = requests.get(url, headers=headers, timeout=10).text 40 | doc = pq(html) 41 | for line in doc('tr').items(): 42 | tds = list(line('td').items()) 43 | if len(tds) == 5: 44 | ip = tds[0].text().strip() 45 | port = tds[1].text().strip() 46 | if re.match(ip_regex, ip) is not None and re.match(port_regex, port) is not None: 47 | proxies.append(('http', ip, int(port))) 48 | 49 | return list(set(proxies)) 50 | -------------------------------------------------------------------------------- /fetchers/JiangxianliFetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from .BaseFetcher import BaseFetcher 4 | import requests 5 | from pyquery import PyQuery as pq 6 | import re 7 | 8 | class JiangxianliFetcher(BaseFetcher): 9 | """ 10 | https://ip.jiangxianli.com/?page=1 11 | """ 12 | 13 | def fetch(self): 14 | """ 15 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 16 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 17 | """ 18 | 19 | urls = [] 20 | for page in range(1, 5): 21 | url = f'https://ip.jiangxianli.com/?page={page}' 22 | urls.append(url) 23 | 24 | proxies = [] 25 | ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$') 26 | port_regex = re.compile(r'^\d+$') 27 | 28 | for url in urls: 29 | headers = { 30 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 31 | 'Accept-Encoding': 'gzip, deflate', 32 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 33 | 'Cache-Control': 'no-cache', 34 | 'Connection': 'keep-alive', 35 | 'Pragma': 'no-cache', 36 | 'Upgrade-Insecure-Requests': '1', 37 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.130 Chrome/79.0.3945.130 Safari/537.36' 38 | } 39 | html = requests.get(url, headers=headers, timeout=10).text 40 | doc = pq(html) 41 | for line in doc('tr').items(): 42 | tds = list(line('td').items()) 43 | if len(tds) >= 2: 44 | ip = tds[0].text().strip() 45 | port = tds[1].text().strip() 46 | if re.match(ip_regex, ip) is not None and re.match(port_regex, port) is not None: 47 | proxies.append(('http', ip, int(port))) 48 | 49 | return list(set(proxies)) 50 | -------------------------------------------------------------------------------- /fetchers/KaiXinFetcher.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | 4 | import requests 5 | from pyquery import PyQuery as pq 6 
| 7 | from .BaseFetcher import BaseFetcher 8 | 9 | class KaiXinFetcher(BaseFetcher): 10 | """ 11 | http://www.kxdaili.com/dailiip.html 12 | 代码由 [Zealot666](https://github.com/Zealot666) 提供 13 | """ 14 | 15 | def fetch(self): 16 | """ 17 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocol是协议名称,目前主要为http 18 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 19 | """ 20 | 21 | urls = [] 22 | urls = urls + [f'http://www.kxdaili.com/dailiip/1/{page}.html' for page in range(1, 11)] 23 | urls = urls + [f'http://www.kxdaili.com/dailiip/2/{page}.html' for page in range(1, 11)] 24 | 25 | proxies = [] 26 | ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$') 27 | port_regex = re.compile(r'^\d+$') 28 | 29 | for url in urls: 30 | html = requests.get(url, timeout=10).text 31 | doc = pq(html) 32 | for line in doc('tr').items(): 33 | tds = list(line('td').items()) 34 | if len(tds) >= 2: 35 | ip = tds[0].text().strip() 36 | port = tds[1].text().strip() 37 | if re.match(ip_regex, ip) is not None and re.match(port_regex, port) is not None: 38 | proxies.append(('http', ip, int(port))) 39 | 40 | return list(set(proxies)) 41 | -------------------------------------------------------------------------------- /fetchers/KuaidailiFetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from .BaseFetcher import BaseFetcher 4 | import requests 5 | from pyquery import PyQuery as pq 6 | 7 | class KuaidailiFetcher(BaseFetcher): 8 | """ 9 | https://www.kuaidaili.com/free 10 | """ 11 | 12 | def fetch(self): 13 | """ 14 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 15 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 16 | """ 17 | 18 | urls = [] 19 | urls = urls + [f'https://www.kuaidaili.com/free/inha/{page}/' for page in range(1, 11)] 20 | urls = urls + [f'https://www.kuaidaili.com/free/intr/{page}/' for page in range(1, 11)] 21 | 22 | proxies = [] 23 | 24 | for url in urls: 25 | html = requests.get(url, timeout=10).text 26 | doc = pq(html) 27 | for item in doc('table tbody tr').items(): 28 | ip = item.find('td[data-title="IP"]').text() 29 | port = int(item.find('td[data-title="PORT"]').text()) 30 | proxies.append(('http', ip, port)) 31 | 32 | return list(set(proxies)) 33 | -------------------------------------------------------------------------------- /fetchers/ProxyListFetcher.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import requests 4 | 5 | from .BaseFetcher import BaseFetcher 6 | 7 | 8 | class ProxyListFetcher(BaseFetcher): 9 | """ 10 | https://www.proxy-list.download/api/v1/get?type={{ protocol }}&_t={{ timestamp }} 11 | """ 12 | 13 | def fetch(self): 14 | proxies = [] 15 | type_list = ['socks4', 'socks5', 'http', 'https'] 16 | for protocol in type_list: 17 | url = "https://www.proxy-list.download/api/v1/get?type=" + protocol + "&_t=" + str(time.time()) 18 | proxies_list = requests.get(url).text.split("\n") 19 | for data in proxies_list: 20 | flag_idx = data.find(":") 21 | ip = data[:flag_idx] 22 | port = data[flag_idx + 1:-1] 23 | proxies.append((protocol, ip, port)) 24 | 25 | return list(set(proxies)) 26 | -------------------------------------------------------------------------------- /fetchers/ProxyScrapeFetcher.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import requests 4 | 5 | from .BaseFetcher import BaseFetcher 6 | 7 | 8 | class ProxyScrapeFetcher(BaseFetcher): 9 | """ 
10 | https://api.proxyscrape.com/?request=displayproxies&proxytype={{ protocol }}&_t={{ timestamp }} 11 | """ 12 | 13 | def fetch(self): 14 | proxies = [] 15 | type_list = ['socks4', 'socks5', 'http', 'https'] 16 | for protocol in type_list: 17 | url = "https://api.proxyscrape.com/?request=displayproxies&proxytype=" + protocol + "&_t=" + str( 18 | time.time()) 19 | resp = requests.get(url).text 20 | for data in resp.split("\n"): 21 | flag_idx = data.find(":") 22 | ip = data[:flag_idx] 23 | port = data[flag_idx + 1:-1] 24 | proxies.append((protocol, ip, port)) 25 | 26 | return list(set(proxies)) 27 | -------------------------------------------------------------------------------- /fetchers/ProxyscanFetcher.py: -------------------------------------------------------------------------------- 1 | from .BaseFetcher import BaseFetcher 2 | import requests 3 | import time 4 | 5 | class ProxyscanFetcher(BaseFetcher): 6 | """ 7 | https://www.proxyscan.io/api/proxy?last_check=9800&uptime=50&limit=20&_t={{ timestamp }} 8 | """ 9 | 10 | def fetch(self): 11 | """ 12 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocol是协议名称,目前主要为http 13 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 14 | """ 15 | proxies = [] 16 | # 此API为随机获取接口,获取策略为:重复取十次后去重 17 | for _ in range(10): 18 | url = "https://www.proxyscan.io/api/proxy?last_check=9800&uptime=50&limit=20&_t=" + str(time.time()) 19 | resp = requests.get(url).json() 20 | for data in resp: 21 | protocol = str.lower(data['Type'][0]) 22 | proxies.append((protocol, data['Ip'], data['Port'])) 23 | 24 | return list(set(proxies)) -------------------------------------------------------------------------------- /fetchers/README.md: -------------------------------------------------------------------------------- 1 | # 爬取器 2 | 3 | 所有的爬取器都在这个目录中,并且在`__init__.py`中进行了注册。 4 | 5 | ## 添加新的爬取器 6 | 7 | 本项目默认包含了数量不少的免费公开代理源,并且会持续更新,如果你发现有不错的免费代理源,欢迎通过Issues反馈给我们。 8 | 9 | 1. 编写爬取器代码 10 | 11 | 爬取器需要继承基类`BaseFetcher`,然后实现`fetch`函数。 12 | 13 | `fetch`函数没有输入参数,每次运行都返回一个列表,列表中包含本次爬取到的代理。返回的格式为(代理协议类型,代理IP,端口)。 14 | 15 | 示例: 16 | 17 | ```python 18 | class CustomFetcher(BaseFetcher): 19 | def fetch(self): 20 | return [('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 21 | ``` 22 | 23 | 2. 注册爬取器 24 | 25 | 编写好爬取器之后,还需要在`__init__.py`文件中进行注册,添加如下代码: 26 | 27 | **注意:爬取器的名称(name)一定不能重复。** 28 | 29 | ```python 30 | from .CustomFetcher import CustomFetcher 31 | 32 | fetchers = [ 33 | ... 34 | Fetcher(name='www.custom.com', fetcher=CustomFetcher), 35 | ... 36 | ] 37 | ``` 38 | 39 | 3. 
--------------------------------------------------------------------------------
/fetchers/UUFetcher.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | 
3 | from .BaseFetcher import BaseFetcher
4 | import requests
5 | import json
6 | 
7 | class UUFetcher(BaseFetcher):
8 |     """
9 |     https://uu-proxy.com/
10 |     """
11 | 
12 |     def fetch(self):
13 |         """
14 |         Run one crawl and return a list in which each element is (protocol, ip, port); protocol is the scheme name, currently mostly http.
15 |         Example return value: [('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)]
16 |         """
17 | 
18 |         headers = {
19 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
20 |             'Accept-Encoding': 'gzip, deflate',
21 |             'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
22 |             'Cache-Control': 'no-cache',
23 |             'Connection': 'keep-alive',
24 |             'Pragma': 'no-cache',
25 |             'Upgrade-Insecure-Requests': '1',
26 |             'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.130 Chrome/79.0.3945.130 Safari/537.36'
27 |         }
28 |         data = requests.get('https://uu-proxy.com/api/free', headers=headers, timeout=10).text
29 |         free = json.loads(data)['free']
30 |         proxies = [(item['scheme'], item['ip'], item['port']) for item in free['proxies']]
31 | 
32 |         return list(set(proxies))
33 | 
--------------------------------------------------------------------------------
/fetchers/XiLaFetcher.py:
--------------------------------------------------------------------------------
1 | import re
2 | import time
3 | import random
4 | 
5 | import requests
6 | from pyquery import PyQuery as pq
7 | 
8 | from .BaseFetcher import BaseFetcher
9 | 
10 | class XiLaFetcher(BaseFetcher):
11 |     """
12 |     http://www.xiladaili.com/gaoni/
13 |     Code contributed by [Zealot666](https://github.com/Zealot666)
14 |     """
15 |     def __init__(self):
16 |         super().__init__()
17 |         self.index = 0
18 | 
19 |     def fetch(self):
20 |         """
21 |         Run one crawl and return a list in which each element is (protocol, ip, port); protocol is the scheme name, currently mostly http.
22 |         Example return value: [('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)]
23 |         """
24 |         self.index += 1
25 |         new_index = self.index % 30
26 | 
27 |         urls = []
28 |         urls = urls + [f'http://www.xiladaili.com/gaoni/{page}/' for page in range(new_index, new_index + 11)]
29 |         urls = urls + [f'http://www.xiladaili.com/http/{page}/' for page in range(new_index, new_index + 11)]
30 | 
31 |         proxies = []
32 |         ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$')
33 |         port_regex = re.compile(r'^\d+$')
34 | 
35 |         for url in urls:
36 |             time.sleep(1)
37 |             html = requests.get(url, timeout=10).text
38 |             doc = pq(html)
39 |             for line in doc('tr').items():
40 |                 tds = list(line('td').items())
41 |                 if len(tds) >= 2 and ":" in tds[0].text():
42 |                     ip = tds[0].text().strip().split(":")[0]
43 |                     port = tds[0].text().strip().split(":")[1]
44 |                     if re.match(ip_regex, ip) is not None and re.match(port_regex, port) is not None:
45 |                         proxies.append(('http', ip, int(port)))
46 | 
47 |         proxies = list(set(proxies))
48 | 
49 |         # This source yields more data than the validator can keep up with,
50 |         # so only a sample is taken, which is usually enough.
51 |         if len(proxies) > 200:
52 |             proxies = random.sample(proxies, 200)
53 | 
54 |         return proxies
55 | 
--------------------------------------------------------------------------------
/fetchers/XiaoShuFetcher.py:
--------------------------------------------------------------------------------
1 | import re
2 | import time
3 | import random
4 | 
5 | import requests
6 | from pyquery import PyQuery as pq
7 | 
8 | from .BaseFetcher import BaseFetcher
9 | 
10 | class XiaoShuFetcher(BaseFetcher):
11 |     """
12 |     http://www.xsdaili.cn/
13 |     Code contributed by [Zealot666](https://github.com/Zealot666)
14 |     """
15 |     def __init__(self):
16 |         super().__init__()
17 |         self.index = 0
18 | 
19 |     def fetch(self):
20 |         """
21 |         Run one crawl and return a list in which each element is (protocol, ip, port); protocol is the scheme name, currently mostly http.
22 |         Example return value: [('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)]
23 |         """
24 |         self.index += 1
25 |         new_index = self.index % 10
26 | 
27 |         urls = set()
28 |         proxies = []
29 |         headers = {
30 |             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
31 |         }
32 |         for page in range(new_index, new_index + 1):
33 |             response = requests.get("http://www.xsdaili.cn/dayProxy/" + str(page) + ".html", headers=headers, timeout=10)
34 |             for item in pq(response.text)('a').items():
35 |                 try:
36 |                     if "/dayProxy/ip" in item.attr("href"):
37 |                         urls.add("http://www.xsdaili.cn" + item.attr("href"))
38 |                 except Exception:
39 |                     continue
40 |         for url in urls:
41 |             response = requests.get(url, headers=headers, timeout=8)
42 |             doc = pq(response.text)
43 |             for item in doc(".cont").items():
44 |                 for line in item.text().split("\n"):
45 |                     addr = line.split('@')[0].strip()
46 |                     parts = addr.split(':')
47 |                     if len(parts) != 2 or not parts[1].isdigit():
48 |                         continue  # skip lines in the block that are not ip:port pairs
49 |                     proxies.append(("http", parts[0], int(parts[1])))
50 | 
51 |         proxies = list(set(proxies))
52 | 
53 |         # This source yields more data than the validator can keep up with,
54 |         # so only a sample is taken, which is usually enough.
55 |         if len(proxies) > 200:
56 |             proxies = random.sample(proxies, 200)
57 | 
58 |         return proxies
59 | 
--------------------------------------------------------------------------------
/fetchers/__init__.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | 
3 | from collections import namedtuple
4 | 
5 | Fetcher = namedtuple('Fetcher', ['name', 'fetcher'])
6 | 
7 | from .UUFetcher import UUFetcher
8 | from .KuaidailiFetcher import KuaidailiFetcher
9 | from .GoubanjiaFetcher import GoubanjiaFetcher
10 | from .IP66Fetcher import IP66Fetcher
11 | from .IP3366Fetcher import IP3366Fetcher
12 | from .JiangxianliFetcher import JiangxianliFetcher
13 | from .IHuanFetcher import IHuanFetcher
14 | from .IP89Fetcher import IP89Fetcher
15 | from .ProxyscanFetcher import ProxyscanFetcher
16 | from .KaiXinFetcher import KaiXinFetcher
17 | from .XiLaFetcher import XiLaFetcher
18 | from .XiaoShuFetcher import XiaoShuFetcher
19 | from .ProxyListFetcher import ProxyListFetcher
20 | from .ProxyScrapeFetcher import ProxyScrapeFetcher
21 | 
22 | fetchers = [
23 |     Fetcher(name='uu-proxy.com', fetcher=UUFetcher),
24 |     Fetcher(name='www.kuaidaili.com', fetcher=KuaidailiFetcher),
25 |     Fetcher(name='www.goubanjia.com', fetcher=GoubanjiaFetcher),
26 |     Fetcher(name='www.66ip.cn', fetcher=IP66Fetcher),
27 |     Fetcher(name='www.ip3366.net', fetcher=IP3366Fetcher),
28 |     Fetcher(name='ip.jiangxianli.com', fetcher=JiangxianliFetcher),
29 |     Fetcher(name='ip.ihuan.me', fetcher=IHuanFetcher),
30 |     Fetcher(name='www.proxyscan.io', fetcher=ProxyscanFetcher),
31 |     Fetcher(name='www.89ip.cn', fetcher=IP89Fetcher),
32 |     Fetcher(name='www.kxdaili.com', fetcher=KaiXinFetcher),
33 |     Fetcher(name='www.xiladaili.com', fetcher=XiLaFetcher),
34 |     Fetcher(name='www.xsdaili.cn', fetcher=XiaoShuFetcher),
35 |     Fetcher(name='www.proxy-list.download', fetcher=ProxyListFetcher),
36 |     Fetcher(name='proxyscrape.com', fetcher=ProxyScrapeFetcher)
37 | ]
38 | 
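For orientation, the `fetchers` registry above is what the fetch process iterates over. The snippet below is a minimal, hedged sketch of such a loop; the actual driver lives in `proc/run_fetcher.py`, which is not reproduced in this section and may differ in its error handling and persistence:

```python
from fetchers import fetchers

def run_all_fetchers():
    """Instantiate every registered fetcher once and collect the proxies it reports."""
    collected = []
    for entry in fetchers:
        try:
            # entry.fetcher is the fetcher class, entry.name the unique source name.
            proxies = entry.fetcher().fetch()
        except Exception as exc:
            print(f'{entry.name} failed: {exc}')
            continue
        print(f'{entry.name}: {len(proxies)} proxies')
        collected.extend(proxies)
    return collected
```

Each registered class is instantiated fresh and its `fetch()` result is collected per source name, which is also how failures can be attributed to a specific proxy site.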
--------------------------------------------------------------------------------
/frontend/README.md:
--------------------------------------------------------------------------------
1 | # Frontend directory
2 | 
3 | This directory holds the code for the web UI. It has already been compiled into static files (in the `deployment` directory), so it normally does not need to be modified.
4 | 
5 | `api/api.py` automatically serves the contents of `deployment` as a static web site.
6 | 
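For illustration only, serving a pre-built directory such as `deployment` usually takes just a few lines. The sketch below assumes a Flask-based API; it is not the actual `api/api.py` (which is not reproduced in this section and may be wired differently), and the route names and paths here are assumptions:

```python
import os

from flask import Flask, send_from_directory

# Hypothetical minimal server; the real api/api.py may configure this differently.
DEPLOYMENT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'deployment')

app = Flask(__name__)

@app.route('/')
@app.route('/<path:path>')
def serve_frontend(path='index.html'):
    # Serve the requested static file; unknown routes fall back to the SPA entry page.
    if not os.path.isfile(os.path.join(DEPLOYMENT_DIR, path)):
        path = 'index.html'
    return send_from_directory(DEPLOYMENT_DIR, path)

if __name__ == '__main__':
    app.run(host='127.0.0.1', port=5000)
```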
--------------------------------------------------------------------------------
/frontend/deployment/.nojekyll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OxOOo/ProxyPoolWithUI/0e9aa3cf1c38f50aeafd53da71e8cd464c2723eb/frontend/deployment/.nojekyll
--------------------------------------------------------------------------------
/frontend/deployment/200.html:
--------------------------------------------------------------------------------
(Compiled Nuxt single-page-app shell; the minified markup is omitted here. The only recoverable text nodes are the UI labels “自动刷新” (auto refresh) and “刷新时间:{{ lastupdate }}” (refresh time).)