├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── README.md ├── api ├── README.md ├── __init__.py └── api.py ├── config.py ├── db ├── Fetcher.py ├── Proxy.py ├── README.md ├── __init__.py ├── conn.py └── init.py ├── docs ├── screenshot1.png ├── screenshot2.png ├── term.png └── workflow.png ├── fetchers ├── BaseFetcher.py ├── GoubanjiaFetcher.py ├── IHuanFetcher.py ├── IP3366Fetcher.py ├── IP66Fetcher.py ├── IP89Fetcher.py ├── JiangxianliFetcher.py ├── KaiXinFetcher.py ├── KuaidailiFetcher.py ├── ProxyListFetcher.py ├── ProxyScrapeFetcher.py ├── ProxyscanFetcher.py ├── README.md ├── UUFetcher.py ├── XiLaFetcher.py ├── XiaoShuFetcher.py └── __init__.py ├── frontend ├── README.md ├── deployment │ ├── .nojekyll │ ├── 200.html │ ├── _nuxt │ │ ├── 473a16e.js │ │ ├── 4e6036a.js │ │ ├── 810b53a.js │ │ ├── 89e3175.js │ │ ├── LICENSES │ │ ├── c6103f9.js │ │ ├── fda1702.js │ │ └── static │ │ │ └── 1630852693 │ │ │ ├── fetchers │ │ │ └── payload.js │ │ │ ├── manifest.js │ │ │ └── payload.js │ ├── fetchers │ │ └── index.html │ └── index.html └── src │ ├── .editorconfig │ ├── .eslintrc.js │ ├── .gitignore │ ├── README.md │ ├── build.sh │ ├── jsconfig.json │ ├── layouts │ └── default.vue │ ├── nuxt.config.js │ ├── package-lock.json │ ├── package.json │ ├── pages │ ├── fetchers.vue │ └── index.vue │ └── plugins │ ├── antd-ui.js │ └── axios.js ├── main.py ├── proc ├── README.md ├── __init__.py ├── run_fetcher.py └── run_validator.py ├── requirements.txt └── test ├── README.md ├── testDB.py └── testFetcher.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | data.db 132 | 133 | # Pycharm settings 134 | .idea 135 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | jobs: 4 | include: 5 | - name: "Python 3.6 on Linux" 6 | python: "3.6" 7 | os: "linux" 8 | env: 9 | - PYTHON=python3 10 | - PIP=pip3 11 | - name: "Python 3.7 on Linux" 12 | python: "3.7" 13 | os: "linux" 14 | env: 15 | - PYTHON=python3 16 | - PIP=pip3 17 | - name: "Python 3.8 on Linux" 18 | python: "3.8" 19 | os: "linux" 20 | env: 21 | - PYTHON=python3 22 | - PIP=pip3 23 | - name: "Python 3.7 on macOS" 24 | os: osx 25 | osx_image: xcode11.2 26 | language: shell 27 | env: 28 | - PYTHON=python3 29 | - PIP=pip3 30 | - name: "Python 3.6 on Windows" 31 | os: windows 32 | language: shell 33 | before_install: 34 | - choco install python --version 3.6.8 35 | env: 36 | - PATH=/c/Python36:/c/Python36/Scripts:$PATH 37 | - PYTHON=python 38 | - PIP=pip 39 | - name: "Python 3.7 on Windows" 40 | os: windows 41 | language: shell 42 | before_install: 43 | - choco install python --version 3.7.4 44 | env: 45 | - PATH=/c/Python37:/c/Python37/Scripts:$PATH 46 | - PYTHON=python 47 | - PIP=pip 48 | 49 | install: 50 | - $PYTHON --version 51 | - $PIP install -r requirements.txt 52 | 53 | script: 54 | - $PYTHON --version 55 | - $PYTHON main.py citest 56 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7.0 2 | 3 | WORKDIR /proxy 4 | 5 | ADD requirements.txt /proxy 6 | RUN pip3 config set global.index-url https://mirrors.aliyun.com/pypi/simple/ 7 | RUN pip3 install --upgrade pip 8 | RUN pip3 install -r requirements.txt 9 | 10 | CMD ["python", "main.py"] 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Yu Chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 简易好用的免费代理池 2 | 3 | [![](https://img.shields.io/badge/python-3.6+-brightgreen)](https://github.com/OxOOo/ProxyPoolWithUI) 4 | 5 | 兼容系统: 6 | ![Windows](https://img.shields.io/badge/Windows-o-brightgreen) 7 | ![Linux](https://img.shields.io/badge/Linux-o-brightgreen) 8 | ![MacOS](https://img.shields.io/badge/MacOS-o-brightgreen) 9 | 10 | * 定时自动爬取网络上的免费代理 11 | * 定时对代理进行验证,集成API随时返回可用代理 12 | * 不需要第三方数据库支持,一键启动,简单易用 13 | * 集成WEB管理界面,方便查看代理状态并对代理池进行配置 14 | * 拥有详细的注释,可以非常方便地学习或修改 15 | 16 | 推荐: 17 | * [HTTP代理原理](https://zhuanlan.zhihu.com/p/349028243) 18 | 19 | 项目Demo:[http://chenyu0x00.com:8888/](http://chenyu0x00.com:8888/) 20 | 21 | **2021年3月8日测试,项目运行半小时后,支持访问HTTPS的代理有40+,支持访问HTTP的代理有100+。** 22 | 23 | 如果你知道有好用的代理源,或者是发现本项目存在一些问题,欢迎通过Issues和我们讨论。 24 | 25 | ## WEB管理界面截图 26 | 27 | ![screenshot1](docs/screenshot1.png) 28 | ![screenshot2](docs/screenshot2.png) 29 | 30 | ## 已经集成的免费代理源 31 | 32 | | 名称 | 地址 |备注 | 33 | |--------------|-------------------------------|-------------| 34 | | 悠悠网络代理 | https://uu-proxy.com/ | | 35 | | 快代理 | https://www.kuaidaili.com/ | | 36 | | 全网代理 | http://www.goubanjia.com/ | | 37 | | 66代理 | http://www.66ip.cn/ | | 38 | | 云代理 | http://www.ip3366.net/ | | 39 | | 免费代理库 | https://ip.jiangxianli.com/ | | 40 | | 小幻HTTP代理 | https://ip.ihuan.me/ | | 41 | | 89免费代理 | https://www.89ip.cn/ | | 42 | | ProxyScan | https://www.proxyscan.io/ | | 43 | | 开心代理 | http://www.kxdaili.com/ | | 44 | | 西拉代理 | http://www.xiladaili.com/ | | 45 | | 小舒代理 | http://www.xsdaili.cn/ | | 46 | | ProxyList | https://www.proxy-list.download/| | 47 | | ProxyScrape | https://proxyscrape.com/ |国内无法直接访问 | 48 | 49 | ## 运行本项目 50 | 51 | 本项目目前只适配了Python3,请确保你的电脑上安装了3.6或更高版本的Python软件。 52 | 53 | 1. 下载代码 54 | 55 | ```bash 56 | git clone https://github.com/OxOOo/ProxyPoolWithUI.git 57 | ``` 58 | 59 | 2. 安装Python依赖(在`ProxyPoolWithUI`目录下执行) 60 | 61 | ```bash 62 | pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt 63 | ``` 64 | 65 | 3. 启动(在`ProxyPoolWithUI`目录下执行) 66 | 67 | ```bash 68 | python3 main.py 69 | ``` 70 | 71 | 如果你在运行了上述命令之后,在命令行中看到了类似如下截图,那么说明项目成功启动了: 72 | 73 | ![term](docs/term.png) 74 | 75 | 4. 使用浏览器打开`http://localhost:5000`,可以看到WEB管理界面。 76 | 77 | ## Docker构建项目 78 | 79 | 1. 下载项目文件 80 | 81 | ```bash 82 | git clone https://github.com/OxOOo/ProxyPoolWithUI.git 83 | cd ProxyPoolWithUI 84 | ``` 85 | 86 | 2. 构建docker镜像 87 | 88 | ```bash 89 | docker build --tag proxy_pool . 90 | ``` 91 | 92 | 3. 运行镜像 93 | 94 | ```bash 95 | docker run -p 5000:5000 -v /root/ProxyPoolWithUI:/proxy -d proxy_pool 96 | ``` 97 | `/root/ProxyPoolWithUI`为clone下来的项目目录路径,请自行更改 98 | 99 | 100 | ## 使用代理 101 | 102 | 1. API接口 103 | 104 | 项目启动之后,会自动爬取并检测代理是否可用,因此我们只需要关注如何使用代理即可。 105 | 106 | * `http://localhost:5000/fetch_random` : 随机获取一个可用代理,如果没有可用代理则返回空白 107 | 108 | 返回示例 : `http://127.0.0.1:8080` 109 | 110 | * `http://localhost:5000/fetch_all` : 获取所有可用代理,如果没有可用代理则返回空白 111 | 112 | 返回示例 : `http://127.0.0.1:8080,http://127.0.0.1:8081` 113 | 114 | 1. 
使用代理 115 | 116 | 不同语言使用代理的方式各不相同,这里提供一个Python集成本项目并使用代理的示例代码: 117 | 118 | ```python 119 | # encoding : utf-8 120 | 121 | import requests 122 | 123 | def main(): 124 | proxy_uri = requests.get('http://localhost:5000/fetch_random').text 125 | if len(proxy_uri) == 0: 126 | print(u'暂时没有可用代理') 127 | return 128 | print(u'获取到的代理是:' + proxy_uri) 129 | 130 | proxies = { 'http': proxy_uri } 131 | html = requests.get('http://www.baidu.com', proxies=proxies).text 132 | if u'百度一下,你就知道' in html: 133 | print('代理可用') 134 | else: 135 | print('代理不可用') 136 | 137 | if __name__ == '__main__': 138 | main() 139 | ``` 140 | 141 | ## 配置 142 | 143 | 如果是需要禁用或者启用某些代理,可直接在WEB管理界面进行操作。 144 | 145 | 本项目的大部分配置均可在`config.py`中找到,默认配置已经可以适应绝大部分情况,一般来说不需要进行修改。 146 | 147 | ## 添加新的代理源 148 | 149 | 本项目的爬取器均在`fetchers`目录下,你也可以根据自己的需求对其中的爬取器进行修改或者扩展。 150 | 151 | 编写本项目的爬取器并不复杂,详细的操作步骤可见[此处](fetchers/),可以参考`fetchers`目录下已有的爬取器。 152 | 153 | ## 项目工作流程图 154 | 155 | 本项目主要包含三部分: 156 | 157 | 1. 爬取进程:主要包括`fetchers`目录和`proc/run_fetcher.py`文件 158 | 2. 验证进程:主要在`proc/run_validator.py`文件中 159 | 3. WEB与API:在`api`目录下 160 | 161 | 本项目的大致逻辑图如下: 162 | 163 | 注:为了便于理解与画图,下图的逻辑是经过简化之后的逻辑,详细过程可查看代码以及相应的注释。 164 | 165 | ![workflow](docs/workflow.png) 166 | 167 | ## 验证算法相关 168 | 169 | 1. 如何验证代理可用 170 | 171 | 目前验证代理可用的算法较为简单,核心思想是使用`requests`库访问一个指定网页,查看是否访问成功。 172 | 173 | 相关配置参数(包括`超时时间`,`尝试次数`等)可在`config.py`中找到,具体代码逻辑在`proc/run_validator.py`中。 174 | 175 | 2. 什么时候该验证哪个代理 176 | 177 | 这个问题比较复杂,很难有一个完美的解决方案,因此目前的算法较为简单,勉强可用,可在[db](db)目录下找到对于目前算法的说明。 178 | 179 | 如果你有更好的算法,欢迎通过Issues和我们讨论,也可以根据[db](db)目录下的[README](db/README.md)文件对代码进行修改。 180 | -------------------------------------------------------------------------------- /api/README.md: -------------------------------------------------------------------------------- 1 | # API目录 2 | 3 | 使用Flask搭建了一个简单的API服务器,主要包含两部分: 4 | 5 | 1. 获取代理的API,使用方法详见[项目主页](https://github.com/OxOOo/ProxyPoolWithUI)。 6 | 2. 
托管网页端的静态文件,并提供若干API给网页端使用。 7 | -------------------------------------------------------------------------------- /api/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 -------------------------------------------------------------------------------- /api/api.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import os 4 | import logging 5 | from flask import Flask 6 | from flask import jsonify, request, redirect, send_from_directory 7 | 8 | log = logging.getLogger('werkzeug') 9 | log.disabled = True 10 | 11 | try: 12 | from db import conn 13 | except: 14 | import sys 15 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 16 | from db import conn 17 | 18 | STATIC_FOLDER = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'frontend', 'deployment') 19 | 20 | app = Flask( 21 | __name__, 22 | static_url_path='/web', 23 | static_folder=STATIC_FOLDER 24 | ) 25 | 26 | ############# 以下API可用于获取代理 ################ 27 | 28 | # 可用于测试API状态 29 | @app.route('/ping', methods=['GET']) 30 | def ping(): 31 | return 'API OK' 32 | 33 | # 随机获取一个可用代理,如果没有可用代理则返回空白 34 | @app.route('/fetch_random', methods=['GET']) 35 | def fetch_random(): 36 | proxies = conn.getValidatedRandom(1) 37 | if len(proxies) > 0: 38 | p = proxies[0] 39 | return f'{p.protocol}://{p.ip}:{p.port}' 40 | else: 41 | return '' 42 | 43 | ############# 新增加接口int ################ 44 | 45 | #api 获取协议为http的一条结果 46 | @app.route('/fetch_http', methods=['GET']) 47 | def fetch_http(): 48 | proxies =conn.get_by_protocol('http', 1) 49 | if len(proxies) > 0: 50 | p = proxies[0] 51 | return f'{p.protocol}://{p.ip}:{p.port}' 52 | else: 53 | return '' 54 | 55 | #api 获取协议为http的全部结果 56 | @app.route('/fetch_http_all', methods=['GET']) 57 | def fetch_http_all(): 58 | proxies = conn.get_by_protocol('http', -1) 59 | if len(proxies) == 1: 60 | p = proxies[0] 61 | return f'{p.protocol}://{p.ip}:{p.port}' 62 | elif len(proxies) > 1: 63 | proxy_list = [] 64 | for p in proxies: 65 | proxy_list.append(f'{p.protocol}://{p.ip}:{p.port}') 66 | return ','.join(proxy_list) 67 | else: 68 | return '' 69 | 70 | #api 获取协议为https的一条结果 71 | @app.route('/fetch_https', methods=['GET']) 72 | def fetch_https(): 73 | proxies =conn.get_by_protocol('https', 1) 74 | if len(proxies) > 0: 75 | p = proxies[0] 76 | return f'{p.protocol}://{p.ip}:{p.port}' 77 | else: 78 | return '' 79 | 80 | #api 获取协议为https的全部结果 81 | @app.route('/fetch_https_all', methods=['GET']) 82 | def fetch_https_all(): 83 | proxies = conn.get_by_protocol('https', -1) 84 | if len(proxies) == 1: 85 | p = proxies[0] 86 | return f'{p.protocol}://{p.ip}:{p.port}' 87 | elif len(proxies) > 1: 88 | proxy_list = [] 89 | for p in proxies: 90 | proxy_list.append(f'{p.protocol}://{p.ip}:{p.port}') 91 | return ','.join(proxy_list) 92 | else: 93 | return '' 94 | 95 | #api 获取协议为http的一条结果 96 | @app.route('/fetch_socks4', methods=['GET']) 97 | def fetch_socks4(): 98 | proxies =conn.get_by_protocol('socks4', 1) 99 | if len(proxies) > 0: 100 | p = proxies[0] 101 | return f'{p.protocol}://{p.ip}:{p.port}' 102 | else: 103 | return '' 104 | 105 | #api 获取协议为http的全部结果 106 | @app.route('/fetch_socks4_all', methods=['GET']) 107 | def fetch_socks4_all(): 108 | proxies = conn.get_by_protocol('socks4', -1) 109 | if len(proxies) == 1: 110 | p = proxies[0] 111 | return f'{p.protocol}://{p.ip}:{p.port}' 112 | elif len(proxies) > 1: 113 | proxy_list = [] 114 | for p in proxies: 115 | 
proxy_list.append(f'{p.protocol}://{p.ip}:{p.port}') 116 | return ','.join(proxy_list) 117 | else: 118 | return '' 119 | 120 | #api 获取协议为https的一条结果 121 | @app.route('/fetch_socks5', methods=['GET']) 122 | def fetch_socks5(): 123 | proxies =conn.get_by_protocol('socks5', 1) 124 | if len(proxies) > 0: 125 | p = proxies[0] 126 | return f'{p.protocol}://{p.ip}:{p.port}' 127 | else: 128 | return '' 129 | 130 | #api 获取协议为https的全部结果 131 | @app.route('/fetch_socks5_all', methods=['GET']) 132 | def fetch_socks5_all(): 133 | proxies = conn.get_by_protocol('socks5', -1) 134 | if len(proxies) == 1: 135 | p = proxies[0] 136 | return f'{p.protocol}://{p.ip}:{p.port}' 137 | elif len(proxies) > 1: 138 | proxy_list = [] 139 | for p in proxies: 140 | proxy_list.append(f'{p.protocol}://{p.ip}:{p.port}') 141 | return ','.join(proxy_list) 142 | else: 143 | return '' 144 | 145 | ############# 新增加接口end ################ 146 | 147 | # 获取所有可用代理,如果没有可用代理则返回空白 148 | @app.route('/fetch_all', methods=['GET']) 149 | def fetch_all(): 150 | proxies = conn.getValidatedRandom(-1) 151 | proxies = [f'{p.protocol}://{p.ip}:{p.port}' for p in proxies] 152 | return ','.join(proxies) 153 | 154 | ############# 以下API主要给网页使用 ################ 155 | 156 | @app.route('/') 157 | def index(): 158 | return redirect('/web') 159 | 160 | # 网页:首页 161 | @app.route('/web', methods=['GET']) 162 | @app.route('/web/', methods=['GET']) 163 | def page_index(): 164 | return send_from_directory(STATIC_FOLDER, 'index.html') 165 | 166 | # 网页:爬取器状态 167 | @app.route('/web/fetchers', methods=['GET']) 168 | @app.route('/web/fetchers/', methods=['GET']) 169 | def page_fetchers(): 170 | return send_from_directory(STATIC_FOLDER, 'fetchers/index.html') 171 | 172 | # 获取代理状态 173 | @app.route('/proxies_status', methods=['GET']) 174 | def proxies_status(): 175 | proxies = conn.getValidatedRandom(-1) 176 | proxies = sorted(proxies, key=lambda p: f'{p.protocol}://{p.ip}:{p.port}', reverse=True) 177 | proxies = [p.to_dict() for p in proxies] 178 | 179 | status = conn.getProxiesStatus() 180 | 181 | return jsonify(dict( 182 | success=True, 183 | proxies=proxies, 184 | **status 185 | )) 186 | 187 | # 获取爬取器状态 188 | @app.route('/fetchers_status', methods=['GET']) 189 | def fetchers_status(): 190 | proxies = conn.getValidatedRandom(-1) # 获取所有可用代理 191 | fetchers = conn.getAllFetchers() 192 | fetchers = [f.to_dict() for f in fetchers] 193 | 194 | for f in fetchers: 195 | f['validated_cnt'] = len([_ for _ in proxies if _.fetcher_name == f['name']]) 196 | f['in_db_cnt'] = conn.getProxyCount(f['name']) 197 | 198 | return jsonify(dict( 199 | success=True, 200 | fetchers=fetchers 201 | )) 202 | 203 | # 清空爬取器状态 204 | @app.route('/clear_fetchers_status', methods=['GET']) 205 | def clear_fetchers_status(): 206 | conn.pushClearFetchersStatus() 207 | return jsonify(dict(success=True)) 208 | 209 | # 设置是否启用特定爬取器,?name=str,enable=0/1 210 | @app.route('/fetcher_enable', methods=['GET']) 211 | def fetcher_enable(): 212 | name = request.args.get('name') 213 | enable = request.args.get('enable') 214 | if enable == '1': 215 | conn.pushFetcherEnable(name, True) 216 | else: 217 | conn.pushFetcherEnable(name, False) 218 | return jsonify(dict(success=True)) 219 | 220 | ############# 其他 ################ 221 | 222 | # 跨域支持,主要是在开发网页端的时候需要使用 223 | def after_request(resp): 224 | ALLOWED_ORIGIN = ['0.0.0.0', '127.0.0.1', 'localhost'] 225 | origin = request.headers.get('origin', None) 226 | if origin is not None: 227 | for item in ALLOWED_ORIGIN: 228 | if item in origin: 229 | 
resp.headers['Access-Control-Allow-Origin'] = origin 230 | resp.headers['Access-Control-Allow-Credentials'] = 'true' 231 | return resp 232 | app.after_request(after_request) 233 | 234 | def main(proc_lock): 235 | if proc_lock is not None: 236 | conn.set_proc_lock(proc_lock) 237 | # 因为默认sqlite3中,同一个数据库连接不能在多线程环境下使用,所以这里需要禁用flask的多线程 238 | app.run(host='0.0.0.0', port=5000, threaded=False) 239 | 240 | if __name__ == '__main__': 241 | main(None) 242 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | """ 4 | 配置文件,一般来说不需要修改 5 | 如果需要启用或者禁用某些网站的爬取器,可在网页上进行配置 6 | """ 7 | 8 | import os 9 | 10 | # 数据库文件路径 11 | DATABASE_PATH = os.path.join(os.path.dirname(__file__), 'data.db') 12 | 13 | # 每次运行所有爬取器之后,睡眠多少时间,单位秒 14 | PROC_FETCHER_SLEEP = 5 * 60 15 | 16 | # 验证器每次睡眠的时间,单位秒 17 | PROC_VALIDATOR_SLEEP = 5 18 | 19 | # 验证器的配置参数 20 | VALIDATE_THREAD_NUM = 200 # 验证线程数量 21 | # 验证器的逻辑是: 22 | # 使用代理访问 VALIDATE_URL 网站,超时时间设置为 VALIDATE_TIMEOUT 23 | # 如果没有超时: 24 | # 1、若选择的验证方式为GET: 返回的网页中包含 VALIDATE_KEYWORD 文字,那么就认为本次验证成功 25 | # 2、若选择的验证方式为HEAD: 返回的响应头中,对于的 VALIDATE_HEADER 响应字段内容包含 VALIDATE_KEYWORD 内容,那么就认为本次验证成功 26 | # 上述过程最多进行 VALIDATE_MAX_FAILS 次,只要有一次成功,就认为代理可用 27 | VALIDATE_URL = 'https://qq.com' 28 | VALIDATE_METHOD = 'HEAD' # 验证方式,可选:GET、HEAD 29 | VALIDATE_HEADER = 'location' # 仅用于HEAD验证方式,百度响应头Server字段KEYWORD可填:bfe 30 | VALIDATE_KEYWORD = 'www.qq.com' 31 | VALIDATE_TIMEOUT = 5 # 超时时间,单位s 32 | VALIDATE_MAX_FAILS = 3 33 | -------------------------------------------------------------------------------- /db/Fetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import datetime 4 | 5 | class Fetcher(object): 6 | """ 7 | 爬取器的状态储存在数据库中,包括是否启用爬取器,爬取到的代理数量等 8 | """ 9 | 10 | ddls = [""" 11 | CREATE TABLE IF NOT EXISTS fetchers 12 | ( 13 | name VARCHAR(255) NOT NULL, 14 | enable BOOLEAN NOT NULL, 15 | sum_proxies_cnt INTEGER NOT NULL, 16 | last_proxies_cnt INTEGER NOT NULL, 17 | last_fetch_date TIMESTAMP, 18 | PRIMARY KEY (name) 19 | ) 20 | """] 21 | 22 | def __init__(self): 23 | self.name = None 24 | self.enable = True 25 | self.sum_proxies_cnt = 0 26 | self.last_proxies_cnt = 0 27 | self.last_fetch_date = None 28 | 29 | def params(self): 30 | """ 31 | 返回一个元组,包含自身的全部属性 32 | """ 33 | return ( 34 | self.name, self.enable, 35 | self.sum_proxies_cnt, self.last_proxies_cnt, self.last_fetch_date 36 | ) 37 | 38 | def to_dict(self): 39 | """ 40 | 返回一个dict,包含自身的全部属性 41 | """ 42 | return { 43 | 'name': self.name, 44 | 'enable': self.enable, 45 | 'sum_proxies_cnt': self.sum_proxies_cnt, 46 | 'last_proxies_cnt': self.last_proxies_cnt, 47 | 'last_fetch_date': str(self.last_fetch_date) if self.last_fetch_date is not None else None 48 | } 49 | 50 | @staticmethod 51 | def decode(row): 52 | """ 53 | 将sqlite返回的一行解析为Fetcher 54 | row : sqlite返回的一行 55 | """ 56 | assert len(row) == 5 57 | f = Fetcher() 58 | f.name = row[0] 59 | f.enable = bool(row[1]) 60 | f.sum_proxies_cnt = row[2] 61 | f.last_proxies_cnt = row[3] 62 | f.last_fetch_date = row[4] 63 | return f 64 | -------------------------------------------------------------------------------- /db/Proxy.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import datetime 4 | import random 5 | class Proxy(object): 6 | """ 7 | 代理,用于表示数据库中的一个记录 8 | """ 9 | 10 | ddls = [""" 11 | CREATE TABLE IF NOT EXISTS proxies 
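-- one row per proxy endpoint: the primary key is (protocol, ip, port), so a proxy reported by
-- several fetchers is stored only once and fetcher_name records the fetcher that last reported it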
12 | ( 13 | fetcher_name VARCHAR(255) NOT NULL, 14 | protocol VARCHAR(32) NOT NULL, 15 | ip VARCHAR(255) NOT NULL, 16 | port INTEGER NOT NULL, 17 | validated BOOLEAN NOT NULL, 18 | latency INTEGER, 19 | validate_date TIMESTAMP, 20 | to_validate_date TIMESTAMP NOT NULL, 21 | validate_failed_cnt INTEGER NOT NULL, 22 | PRIMARY KEY (protocol, ip, port) 23 | ) 24 | """, 25 | """ 26 | CREATE INDEX IF NOT EXISTS proxies_fetcher_name_index 27 | ON proxies(fetcher_name) 28 | """, 29 | """ 30 | CREATE INDEX IF NOT EXISTS proxies_to_validate_date_index 31 | ON proxies(to_validate_date ASC) 32 | """] 33 | 34 | def __init__(self): 35 | self.fetcher_name = None 36 | self.protocol = None 37 | self.ip = None 38 | self.port = None 39 | self.validated = False 40 | self.latency = None 41 | self.validate_date = None 42 | self.to_validate_date = datetime.datetime.now() 43 | self.validate_failed_cnt = 0 44 | 45 | def params(self): 46 | """ 47 | 返回一个元组,包含自身的全部属性 48 | """ 49 | return ( 50 | self.fetcher_name, 51 | self.protocol, self.ip, self.port, 52 | self.validated, self.latency, 53 | self.validate_date, self.to_validate_date, self.validate_failed_cnt 54 | ) 55 | 56 | def to_dict(self): 57 | """ 58 | 返回一个dict,包含自身的全部属性 59 | """ 60 | return { 61 | 'fetcher_name': self.fetcher_name, 62 | 'protocol': self.protocol, 63 | 'ip': self.ip, 64 | 'port': self.port, 65 | 'validated': self.validated, 66 | 'latency': self.latency, 67 | 'validate_date': str(self.validate_date) if self.validate_date is not None else None, 68 | 'to_validate_date': str(self.to_validate_date) if self.to_validate_date is not None else None, 69 | 'validate_failed_cnt': self.validate_failed_cnt 70 | } 71 | 72 | @staticmethod 73 | def decode(row): 74 | """ 75 | 将sqlite返回的一行解析为Proxy 76 | row : sqlite返回的一行 77 | """ 78 | assert len(row) == 9 79 | p = Proxy() 80 | p.fetcher_name = row[0] 81 | p.protocol = row[1] 82 | p.ip = row[2] 83 | p.port = row[3] 84 | p.validated = bool(row[4]) 85 | p.latency = row[5] 86 | p.validate_date = row[6] 87 | p.to_validate_date = row[7] 88 | p.validate_failed_cnt = row[8] 89 | return p 90 | 91 | def validate(self, success, latency): 92 | """ 93 | 传入一次验证结果,根据验证结果调整自身属性,并返回是否删除这个代理 94 | success : True/False,表示本次验证是否成功 95 | 返回 : True/False,True表示这个代理太差了,应该从数据库中删除 96 | """ 97 | self.latency = latency 98 | if success: # 验证成功 99 | self.validated = True 100 | self.validate_date = datetime.datetime.now() 101 | self.validate_failed_cnt = 0 102 | #self.to_validate_date = datetime.datetime.now() + datetime.timedelta(minutes=30) # 30分钟之后继续验证 103 | self.to_validate_date = datetime.datetime.now() + datetime.timedelta(minutes=random.randint(10, 60)) # 10·60分钟之后继续验证 104 | return False 105 | else: 106 | self.validated = False 107 | self.validate_date = datetime.datetime.now() 108 | self.validate_failed_cnt = self.validate_failed_cnt + 1 109 | 110 | # 验证失败的次数越多,距离下次验证的时间越长 111 | delay_minutes = self.validate_failed_cnt * 10 112 | self.to_validate_date = datetime.datetime.now() + datetime.timedelta(minutes=delay_minutes) 113 | 114 | if self.validate_failed_cnt >= 6: 115 | return True 116 | else: 117 | return False 118 | -------------------------------------------------------------------------------- /db/README.md: -------------------------------------------------------------------------------- 1 | # 数据库封装 2 | 3 | 这个目录下封装了操作数据库的一些接口。 4 | 为了通用性,本项目使用SQLite作为底层的数据库,使用`sqlite3`提供的接口对数据库进行操作。 5 | 6 | ## 数据表 7 | 8 | 主要包含两个表,分别用于储存代理和爬取器: 9 | 10 | 1. 
代理 11 | 12 | | 字段名称 | 数据类型 | 说明 | 13 | |---------------------|----------|--------------------------------------------------------------------------| 14 | | fetcher_name | 字符串 | 这个代理来自哪个爬取器 | 15 | | protocol | 字符串 | 代理协议名称,一般为HTTP | 16 | | ip | 字符串 | 代理的IP地址 | 17 | | port | 整数 | 代理的端口号 | 18 | | validated | 布尔值 | 这个代理是否通过了验证,通过了验证表示当前代理可用 | 19 | | latency | 整数 | 延迟(单位毫秒),表示上次验证所用的时间,越小则代理质量越好 | 20 | | validate_date | 时间戳 | 上一次进行验证的时间 | 21 | | to_validate_date | 时间戳 | 下一次进行验证的时间,如何调整下一次验证的时间可见后文或者代码`Proxy.py` | 22 | | validate_failed_cnt | 整数 | 已经连续验证失败了多少次,会影响下一次验证的时间 | 23 | 24 | 2. 爬取器 25 | 26 | | 字段名称 | 数据类型 | 说明 | 27 | |------------------|----------|----------------------------------------------------------------------------------| 28 | | name | 字符串 | 爬取器的名称 | 29 | | enable | 布尔值 | 是否启用这个爬取器,被禁用的爬取器不会在之后被运行,但是其之前爬取的代理依然存在 | 30 | | sum_proxies_cnt | 整数 | 至今为止总共爬取到了多少个代理 | 31 | | last_proxies_cnt | 整数 | 上次爬取到了多少个代理 | 32 | | last_fetch_date | 时间戳 | 上次爬取的时间 | 33 | 34 | ## 下次验证时间调整算法 35 | 36 | 由于不同代理网站公开的免费代理质量差距较大,因此对于多次验证都失败的代理,我们需要降低对他们进行验证的频率,甚至将他们从数据库中删除。 37 | 而对于现在可用的代理,则需要频繁对其进行验证,以保证其可用性。 38 | 39 | 目前的算法较为简单,可见`Proxy.py`文件中的`validate`函数,核心思想如下: 40 | 41 | 1. 优先验证之前验证通过并且到了验证时间的代理(`conn.py`中的`getToValidate`函数) 42 | 2. 对于爬取器新爬取到的代理,我们需要尽快对其进行验证(设置`to_validate_date`为当前时间) 43 | 3. 如果某个代理验证成功,那么设置它下一次进行验证的时间为5分钟之后 44 | 4. 如果某个代理验证失败,那么设置它下一次进行验证的时间为 5 * 连续失败次数 分钟之后,如果连续3次失败,那么将其从数据库中删除 45 | 46 | 你可以修改为自己的算法,主要代码涉及`Proxy.py`文件以及`conn.py`文件的`pushNewFetch`和`getToValidate`函数。 47 | -------------------------------------------------------------------------------- /db/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from .init import init 4 | 5 | init() 6 | -------------------------------------------------------------------------------- /db/conn.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | """ 4 | 封装的数据库接口 5 | """ 6 | 7 | from config import DATABASE_PATH 8 | from .Proxy import Proxy 9 | from .Fetcher import Fetcher 10 | import sqlite3 11 | import datetime 12 | import threading 13 | 14 | conn = sqlite3.connect(DATABASE_PATH, detect_types=sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES) 15 | # 线程锁 16 | conn_lock = threading.Lock() 17 | # 进程锁 18 | proc_lock = None 19 | 20 | def set_proc_lock(proc_lock_sub): 21 | """ 22 | 设置进程锁 23 | proc_lock_sub : main中的进程锁 24 | """ 25 | global proc_lock 26 | proc_lock = proc_lock_sub 27 | 28 | def pushNewFetch(fetcher_name, protocol, ip, port): 29 | """ 30 | 爬取器新抓到了一个代理,调用本函数将代理放入数据库 31 | fetcher_name : 爬取器名称 32 | protocol : 代理协议 33 | ip : 代理IP地址 34 | port : 代理端口 35 | """ 36 | p = Proxy() 37 | p.fetcher_name = fetcher_name 38 | p.protocol = protocol 39 | p.ip = ip 40 | p.port = port 41 | conn_lock.acquire() 42 | proc_lock.acquire() 43 | 44 | c = conn.cursor() 45 | c.execute('BEGIN EXCLUSIVE TRANSACTION;') 46 | # 更新proxies表 47 | c.execute('SELECT * FROM proxies WHERE protocol=? AND ip=? AND port=?', (p.protocol, p.ip, p.port)) 48 | row = c.fetchone() 49 | if row is not None: # 已经存在(protocol, ip, port) 50 | old_p = Proxy.decode(row) 51 | c.execute(""" 52 | UPDATE proxies SET fetcher_name=?,to_validate_date=? WHERE protocol=? AND ip=? AND port=? 
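-- to_validate_date is set to min(now, old value) below, so a re-fetched proxy is re-validated promptly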
53 | """, (p.fetcher_name, min(datetime.datetime.now(), old_p.to_validate_date), p.protocol, p.ip, p.port)) 54 | else: 55 | c.execute('INSERT INTO proxies VALUES (?,?,?,?,?,?,?,?,?)', p.params()) 56 | c.close() 57 | conn.commit() 58 | conn_lock.release() 59 | proc_lock.release() 60 | 61 | def getToValidate(max_count=1): 62 | """ 63 | 从数据库中获取待验证的代理,根据to_validate_date字段 64 | 优先选取已经通过了验证的代理,其次是没有通过验证的代理 65 | max_count : 返回数量限制 66 | 返回 : list[Proxy] 67 | """ 68 | conn_lock.acquire() 69 | proc_lock.acquire() 70 | c = conn.cursor() 71 | c.execute('BEGIN EXCLUSIVE TRANSACTION;') 72 | c.execute('SELECT * FROM proxies WHERE to_validate_date<=? AND validated=? ORDER BY to_validate_date LIMIT ?', ( 73 | datetime.datetime.now(), 74 | True, 75 | max_count 76 | )) 77 | proxies = [Proxy.decode(row) for row in c] 78 | c.execute('SELECT * FROM proxies WHERE to_validate_date<=? AND validated=? ORDER BY to_validate_date LIMIT ?', ( 79 | datetime.datetime.now(), 80 | False, 81 | max_count - len(proxies) 82 | )) 83 | proxies = proxies + [Proxy.decode(row) for row in c] 84 | c.close() 85 | conn.commit() 86 | conn_lock.release() 87 | proc_lock.release() 88 | return proxies 89 | 90 | def pushValidateResult(proxy, success, latency): 91 | """ 92 | 将验证器的一个结果添加进数据库中 93 | proxy : 代理 94 | success : True/False,验证是否成功 95 | latency : 本次验证所用的时间(单位毫秒) 96 | """ 97 | p = proxy 98 | should_remove = p.validate(success, latency) 99 | conn_lock.acquire() 100 | proc_lock.acquire() 101 | if should_remove: 102 | conn.execute('DELETE FROM proxies WHERE protocol=? AND ip=? AND port=?', (p.protocol, p.ip, p.port)) 103 | else: 104 | conn.execute(""" 105 | UPDATE proxies 106 | SET fetcher_name=?,validated=?,latency=?,validate_date=?,to_validate_date=?,validate_failed_cnt=? 107 | WHERE protocol=? AND ip=? AND port=? 108 | """, ( 109 | p.fetcher_name, p.validated, p.latency, p.validate_date, p.to_validate_date, p.validate_failed_cnt, 110 | p.protocol, p.ip, p.port 111 | )) 112 | conn.commit() 113 | conn_lock.release() 114 | proc_lock.release() 115 | 116 | def getValidatedRandom(max_count): 117 | """ 118 | 从通过了验证的代理中,随机选择max_count个代理返回 119 | max_count<=0表示不做数量限制 120 | 返回 : list[Proxy] 121 | """ 122 | conn_lock.acquire() 123 | proc_lock.acquire() 124 | if max_count > 0: 125 | r = conn.execute('SELECT * FROM proxies WHERE validated=? ORDER BY RANDOM() LIMIT ?', (True, max_count)) 126 | else: 127 | r = conn.execute('SELECT * FROM proxies WHERE validated=? ORDER BY RANDOM()', (True,)) 128 | proxies = [Proxy.decode(row) for row in r] 129 | r.close() 130 | conn_lock.release() 131 | proc_lock.release() 132 | return proxies 133 | 134 | #新增方法 135 | def get_by_protocol(protocol, max_count): 136 | """ 137 | 查询 protocol 字段为指定值的代理服务器记录 138 | max_count 表示返回记录的最大数量,如果为 0 或负数则返回所有记录 139 | 返回 : list[Proxy] 140 | """ 141 | conn_lock.acquire() 142 | proc_lock.acquire() 143 | if max_count > 0: 144 | r = conn.execute('SELECT * FROM proxies WHERE protocol=? AND validated=? ORDER BY RANDOM() LIMIT ?', (protocol, True, max_count)) 145 | else: 146 | r = conn.execute('SELECT * FROM proxies WHERE protocol=? AND validated=? 
ORDER BY RANDOM()', (protocol, True)) 147 | proxies = [Proxy.decode(row) for row in r] 148 | r.close() 149 | conn_lock.release() 150 | proc_lock.release() 151 | return proxies 152 | 153 | def pushFetcherResult(name, proxies_cnt): 154 | """ 155 | 更新爬取器的状态,每次在完成一个网站的爬取之后,调用本函数 156 | name : 爬取器的名称 157 | proxies_cnt : 本次爬取到的代理数量 158 | """ 159 | conn_lock.acquire() 160 | proc_lock.acquire() 161 | c = conn.cursor() 162 | c.execute('BEGIN EXCLUSIVE TRANSACTION;') 163 | c.execute('SELECT * FROM fetchers WHERE name=?', (name,)) 164 | row = c.fetchone() 165 | if row is None: 166 | raise ValueError(f'ERRROR: can not find fetcher {name}') 167 | else: 168 | f = Fetcher.decode(row) 169 | f.last_proxies_cnt = proxies_cnt 170 | f.sum_proxies_cnt = f.sum_proxies_cnt + proxies_cnt 171 | f.last_fetch_date = datetime.datetime.now() 172 | c.execute('UPDATE fetchers SET sum_proxies_cnt=?,last_proxies_cnt=?,last_fetch_date=? WHERE name=?', ( 173 | f.sum_proxies_cnt, f.last_proxies_cnt, f.last_fetch_date, f.name 174 | )) 175 | c.close() 176 | conn.commit() 177 | conn_lock.release() 178 | proc_lock.release() 179 | 180 | def pushFetcherEnable(name, enable): 181 | """ 182 | 设置是否起用对应爬取器,被禁用的爬取器将不会被运行 183 | name : 爬取器的名称 184 | enable : True/False, 是否启用 185 | """ 186 | conn_lock.acquire() 187 | proc_lock.acquire() 188 | c = conn.cursor() 189 | c.execute('BEGIN EXCLUSIVE TRANSACTION;') 190 | c.execute('SELECT * FROM fetchers WHERE name=?', (name,)) 191 | row = c.fetchone() 192 | if row is None: 193 | raise ValueError(f'ERRROR: can not find fetcher {name}') 194 | else: 195 | f = Fetcher.decode(row) 196 | f.enable = enable 197 | c.execute('UPDATE fetchers SET enable=? WHERE name=?', ( 198 | f.enable, f.name 199 | )) 200 | c.close() 201 | conn.commit() 202 | conn_lock.release() 203 | proc_lock.release() 204 | 205 | def getAllFetchers(): 206 | """ 207 | 获取所有的爬取器以及状态 208 | 返回 : list[Fetcher] 209 | """ 210 | conn_lock.acquire() 211 | proc_lock.acquire() 212 | r = conn.execute('SELECT * FROM fetchers') 213 | fetchers = [Fetcher.decode(row) for row in r] 214 | r.close() 215 | conn_lock.release() 216 | proc_lock.release() 217 | return fetchers 218 | 219 | def getFetcher(name): 220 | """ 221 | 获取指定爬取器以及状态 222 | 返回 : Fetcher 223 | """ 224 | conn_lock.acquire() 225 | proc_lock.acquire() 226 | r = conn.execute('SELECT * FROM fetchers WHERE name=?', (name,)) 227 | row = r.fetchone() 228 | r.close() 229 | conn_lock.release() 230 | proc_lock.release() 231 | if row is None: 232 | return None 233 | else: 234 | return Fetcher.decode(row) 235 | 236 | def getProxyCount(fetcher_name): 237 | """ 238 | 查询在数据库中有多少个由指定爬取器爬取到的代理 239 | fetcher_name : 爬取器名称 240 | 返回 : int 241 | """ 242 | conn_lock.acquire() 243 | proc_lock.acquire() 244 | r = conn.execute('SELECT count(*) FROM proxies WHERE fetcher_name=?', (fetcher_name,)) 245 | cnt = r.fetchone()[0] 246 | r.close() 247 | conn_lock.release() 248 | proc_lock.release() 249 | return cnt 250 | 251 | def getProxiesStatus(): 252 | """ 253 | 获取代理状态,包括`全部代理数量`,`当前可用代理数量`,`等待验证代理数量` 254 | 返回 : dict 255 | """ 256 | conn_lock.acquire() 257 | proc_lock.acquire() 258 | r = conn.execute('SELECT count(*) FROM proxies') 259 | sum_proxies_cnt = r.fetchone()[0] 260 | r.close() 261 | 262 | r = conn.execute('SELECT count(*) FROM proxies WHERE validated=?', (True,)) 263 | validated_proxies_cnt = r.fetchone()[0] 264 | r.close() 265 | 266 | r = conn.execute('SELECT count(*) FROM proxies WHERE to_validate_date<=?', (datetime.datetime.now(),)) 267 | pending_proxies_cnt = r.fetchone()[0] 268 | r.close() 269 | 
conn_lock.release() 270 | proc_lock.release() 271 | return dict( 272 | sum_proxies_cnt=sum_proxies_cnt, 273 | validated_proxies_cnt=validated_proxies_cnt, 274 | pending_proxies_cnt=pending_proxies_cnt 275 | ) 276 | 277 | def pushClearFetchersStatus(): 278 | """ 279 | 清空爬取器的统计信息,包括sum_proxies_cnt,last_proxies_cnt,last_fetch_date 280 | """ 281 | conn_lock.acquire() 282 | proc_lock.acquire() 283 | c = conn.cursor() 284 | c.execute('BEGIN EXCLUSIVE TRANSACTION;') 285 | c.execute('UPDATE fetchers SET sum_proxies_cnt=?, last_proxies_cnt=?, last_fetch_date=?', (0, 0, None)) 286 | c.close() 287 | conn.commit() 288 | conn_lock.release() 289 | proc_lock.release() 290 | -------------------------------------------------------------------------------- /db/init.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from config import DATABASE_PATH 4 | from .Proxy import Proxy 5 | from .Fetcher import Fetcher 6 | from fetchers import fetchers 7 | import sqlite3 8 | 9 | def init(): 10 | """ 11 | 初始化数据库 12 | """ 13 | 14 | conn = sqlite3.connect(DATABASE_PATH) 15 | 16 | create_tables = Proxy.ddls + Fetcher.ddls 17 | for sql in create_tables: 18 | conn.execute(sql) 19 | conn.commit() 20 | 21 | # 注册所有的爬取器 22 | c = conn.cursor() 23 | c.execute('BEGIN EXCLUSIVE TRANSACTION;') 24 | for item in fetchers: 25 | c.execute('SELECT * FROM fetchers WHERE name=?', (item.name,)) 26 | if c.fetchone() is None: 27 | f = Fetcher() 28 | f.name = item.name 29 | c.execute('INSERT INTO fetchers VALUES(?,?,?,?,?)', f.params()) 30 | c.close() 31 | conn.commit() 32 | 33 | conn.close() 34 | -------------------------------------------------------------------------------- /docs/screenshot1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OxOOo/ProxyPoolWithUI/0e9aa3cf1c38f50aeafd53da71e8cd464c2723eb/docs/screenshot1.png -------------------------------------------------------------------------------- /docs/screenshot2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OxOOo/ProxyPoolWithUI/0e9aa3cf1c38f50aeafd53da71e8cd464c2723eb/docs/screenshot2.png -------------------------------------------------------------------------------- /docs/term.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OxOOo/ProxyPoolWithUI/0e9aa3cf1c38f50aeafd53da71e8cd464c2723eb/docs/term.png -------------------------------------------------------------------------------- /docs/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OxOOo/ProxyPoolWithUI/0e9aa3cf1c38f50aeafd53da71e8cd464c2723eb/docs/workflow.png -------------------------------------------------------------------------------- /fetchers/BaseFetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | class BaseFetcher(object): 4 | """ 5 | 所有爬取器的基类 6 | """ 7 | 8 | def fetch(self): 9 | """ 10 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 11 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 12 | """ 13 | raise NotImplementedError() 14 | -------------------------------------------------------------------------------- /fetchers/GoubanjiaFetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from 
.BaseFetcher import BaseFetcher 4 | import requests 5 | from pyquery import PyQuery as pq 6 | import re 7 | 8 | class GoubanjiaFetcher(BaseFetcher): 9 | """ 10 | http://www.goubanjia.com/ 11 | """ 12 | 13 | def fetch(self): 14 | """ 15 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 16 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 17 | """ 18 | 19 | proxies = [] 20 | 21 | headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'} 22 | html = requests.get('http://www.goubanjia.com/', headers=headers, timeout=10).text 23 | doc = pq(html) 24 | for item in doc('table tbody tr').items(): 25 | ipport = item.find('td.ip').html() 26 | # 以下对ipport进行整理 27 | hide_reg = re.compile(r']*style="display:[^<>]*none;"[^<>]*>[^<>]*
</p>
') 28 | ipport = re.sub(hide_reg, '', ipport) 29 | tag_reg = re.compile(r'<[^<>]*>') 30 | ipport = re.sub(tag_reg, '', ipport) 31 | 32 | ip = ipport.split(':')[0] 33 | port = self.pde(item.find('td.ip').find('span.port').attr('class').split(' ')[1]) 34 | proxies.append(('http', ip, int(port))) 35 | 36 | return list(set(proxies)) 37 | 38 | def pde(self, class_key): # 解密函数,端口是加密过的 39 | """ 40 | key是class内容 41 | """ 42 | class_key = str(class_key) 43 | f = [] 44 | for i in range(len(class_key)): 45 | f.append(str('ABCDEFGHIZ'.index(class_key[i]))) 46 | return str(int(''.join(f)) >> 0x3) 47 | -------------------------------------------------------------------------------- /fetchers/IHuanFetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from .BaseFetcher import BaseFetcher 4 | import requests 5 | from pyquery import PyQuery as pq 6 | import re 7 | 8 | class IHuanFetcher(BaseFetcher): 9 | """ 10 | https://ip.ihuan.me/ 11 | 爬这个网站要温柔点,站长表示可能会永久关站 12 | """ 13 | 14 | def fetch(self): 15 | """ 16 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 17 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 18 | """ 19 | 20 | proxies = [] 21 | ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$') 22 | port_regex = re.compile(r'^\d+$') 23 | 24 | pending_urls = ['https://ip.ihuan.me/'] 25 | while len(pending_urls) > 0: 26 | url = pending_urls[0] 27 | pending_urls = pending_urls[1:] 28 | 29 | headers = { 30 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 31 | 'Accept-Encoding': 'gzip, deflate', 32 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 33 | 'Cache-Control': 'no-cache', 34 | 'Connection': 'keep-alive', 35 | 'Pragma': 'no-cache', 36 | 'Upgrade-Insecure-Requests': '1', 37 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.130 Chrome/79.0.3945.130 Safari/537.36' 38 | } 39 | try: 40 | html = requests.get(url, headers=headers, timeout=10).text 41 | except Exception as e: 42 | print('ERROR in ip.ihuan.me:' + str(e)) 43 | continue 44 | doc = pq(html) 45 | for line in doc('tbody tr').items(): 46 | tds = list(line('td').items()) 47 | if len(tds) == 10: 48 | ip = tds[0].text().strip() 49 | port = tds[1].text().strip() 50 | if re.match(ip_regex, ip) is not None and re.match(port_regex, port) is not None: 51 | proxies.append(('http', ip, int(port))) 52 | 53 | if url.endswith('/'): # 当前是第一页,解析后面几页的链接 54 | for item in list(doc('.pagination a').items())[1:-1]: 55 | href = item.attr('href') 56 | if href is not None and href.startswith('?page='): 57 | pending_urls.append('https://ip.ihuan.me/' + href) 58 | 59 | return list(set(proxies)) 60 | -------------------------------------------------------------------------------- /fetchers/IP3366Fetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from .BaseFetcher import BaseFetcher 4 | import requests 5 | from pyquery import PyQuery as pq 6 | import re 7 | 8 | class IP3366Fetcher(BaseFetcher): 9 | """ 10 | http://www.ip3366.net/free/?stype=1 11 | """ 12 | 13 | def fetch(self): 14 | """ 15 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 16 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 17 | """ 18 | 19 | urls = [] 20 | for stype in ['1', '2']: 21 | for page in range(1, 6): 22 | url = 
f'http://www.ip3366.net/free/?stype={stype}&page={page}' 23 | urls.append(url) 24 | 25 | proxies = [] 26 | ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$') 27 | port_regex = re.compile(r'^\d+$') 28 | 29 | for url in urls: 30 | headers = { 31 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 32 | 'Accept-Encoding': 'gzip, deflate', 33 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 34 | 'Cache-Control': 'no-cache', 35 | 'Connection': 'keep-alive', 36 | 'Pragma': 'no-cache', 37 | 'Upgrade-Insecure-Requests': '1', 38 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.130 Chrome/79.0.3945.130 Safari/537.36' 39 | } 40 | html = requests.get(url, headers=headers, timeout=10).text 41 | doc = pq(html) 42 | for line in doc('tr').items(): 43 | tds = list(line('td').items()) 44 | if len(tds) == 7: 45 | ip = tds[0].text().strip() 46 | port = tds[1].text().strip() 47 | if re.match(ip_regex, ip) is not None and re.match(port_regex, port) is not None: 48 | proxies.append(('http', ip, int(port))) 49 | 50 | return list(set(proxies)) 51 | -------------------------------------------------------------------------------- /fetchers/IP66Fetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from .BaseFetcher import BaseFetcher 4 | import requests 5 | from pyquery import PyQuery as pq 6 | import re 7 | 8 | class IP66Fetcher(BaseFetcher): 9 | """ 10 | http://www.66ip.cn/ 11 | """ 12 | 13 | def fetch(self): 14 | """ 15 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 16 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 17 | """ 18 | 19 | urls = [] 20 | for areaindex in range(10): 21 | for page in range(1, 6): 22 | if areaindex == 0: 23 | url = f'http://www.66ip.cn/{page}.html' 24 | else: 25 | url = f'http://www.66ip.cn/areaindex_{areaindex}/{page}.html' 26 | urls.append(url) 27 | 28 | proxies = [] 29 | ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$') 30 | port_regex = re.compile(r'^\d+$') 31 | 32 | for url in urls: 33 | headers = { 34 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 35 | 'Accept-Encoding': 'gzip, deflate', 36 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 37 | 'Cache-Control': 'no-cache', 38 | 'Connection': 'keep-alive', 39 | 'Pragma': 'no-cache', 40 | 'Upgrade-Insecure-Requests': '1', 41 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.130 Chrome/79.0.3945.130 Safari/537.36' 42 | } 43 | html = requests.get(url, headers=headers, timeout=10).text 44 | doc = pq(html) 45 | for line in doc('table tr').items(): 46 | tds = list(line('td').items()) 47 | if len(tds) == 5: 48 | ip = tds[0].text().strip() 49 | port = tds[1].text().strip() 50 | if re.match(ip_regex, ip) is not None and re.match(port_regex, port) is not None: 51 | proxies.append(('http', ip, int(port))) 52 | 53 | return list(set(proxies)) 54 | -------------------------------------------------------------------------------- /fetchers/IP89Fetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from .BaseFetcher import BaseFetcher 4 | import requests 5 | from pyquery import PyQuery as pq 6 | import re 7 | 8 | class IP89Fetcher(BaseFetcher): 9 | """ 10 | https://www.89ip.cn/ 11 | """ 
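    # fetch() below follows the same pattern as most fetchers in this directory:
    # build the paginated list URLs (index_1.html ... index_5.html), request each page with
    # browser-like headers, parse the <tr> rows with pyquery, keep rows whose ip/port pass the
    # regex checks, and return the de-duplicated ('http', ip, port) tuples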
12 | 13 | def fetch(self): 14 | """ 15 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 16 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 17 | """ 18 | 19 | urls = [] 20 | for page in range(1, 6): 21 | url = f'https://www.89ip.cn/index_{page}.html' 22 | urls.append(url) 23 | 24 | proxies = [] 25 | ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$') 26 | port_regex = re.compile(r'^\d+$') 27 | 28 | for url in urls: 29 | headers = { 30 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 31 | 'Accept-Encoding': 'gzip, deflate', 32 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 33 | 'Cache-Control': 'no-cache', 34 | 'Connection': 'keep-alive', 35 | 'Pragma': 'no-cache', 36 | 'Upgrade-Insecure-Requests': '1', 37 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.130 Chrome/79.0.3945.130 Safari/537.36' 38 | } 39 | html = requests.get(url, headers=headers, timeout=10).text 40 | doc = pq(html) 41 | for line in doc('tr').items(): 42 | tds = list(line('td').items()) 43 | if len(tds) == 5: 44 | ip = tds[0].text().strip() 45 | port = tds[1].text().strip() 46 | if re.match(ip_regex, ip) is not None and re.match(port_regex, port) is not None: 47 | proxies.append(('http', ip, int(port))) 48 | 49 | return list(set(proxies)) 50 | -------------------------------------------------------------------------------- /fetchers/JiangxianliFetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from .BaseFetcher import BaseFetcher 4 | import requests 5 | from pyquery import PyQuery as pq 6 | import re 7 | 8 | class JiangxianliFetcher(BaseFetcher): 9 | """ 10 | https://ip.jiangxianli.com/?page=1 11 | """ 12 | 13 | def fetch(self): 14 | """ 15 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 16 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 17 | """ 18 | 19 | urls = [] 20 | for page in range(1, 5): 21 | url = f'https://ip.jiangxianli.com/?page={page}' 22 | urls.append(url) 23 | 24 | proxies = [] 25 | ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$') 26 | port_regex = re.compile(r'^\d+$') 27 | 28 | for url in urls: 29 | headers = { 30 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 31 | 'Accept-Encoding': 'gzip, deflate', 32 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 33 | 'Cache-Control': 'no-cache', 34 | 'Connection': 'keep-alive', 35 | 'Pragma': 'no-cache', 36 | 'Upgrade-Insecure-Requests': '1', 37 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.130 Chrome/79.0.3945.130 Safari/537.36' 38 | } 39 | html = requests.get(url, headers=headers, timeout=10).text 40 | doc = pq(html) 41 | for line in doc('tr').items(): 42 | tds = list(line('td').items()) 43 | if len(tds) >= 2: 44 | ip = tds[0].text().strip() 45 | port = tds[1].text().strip() 46 | if re.match(ip_regex, ip) is not None and re.match(port_regex, port) is not None: 47 | proxies.append(('http', ip, int(port))) 48 | 49 | return list(set(proxies)) 50 | -------------------------------------------------------------------------------- /fetchers/KaiXinFetcher.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | 4 | import requests 5 | from pyquery import PyQuery as pq 6 
| 7 | from .BaseFetcher import BaseFetcher 8 | 9 | class KaiXinFetcher(BaseFetcher): 10 | """ 11 | http://www.kxdaili.com/dailiip.html 12 | 代码由 [Zealot666](https://github.com/Zealot666) 提供 13 | """ 14 | 15 | def fetch(self): 16 | """ 17 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocol是协议名称,目前主要为http 18 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 19 | """ 20 | 21 | urls = [] 22 | urls = urls + [f'http://www.kxdaili.com/dailiip/1/{page}.html' for page in range(1, 11)] 23 | urls = urls + [f'http://www.kxdaili.com/dailiip/2/{page}.html' for page in range(1, 11)] 24 | 25 | proxies = [] 26 | ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$') 27 | port_regex = re.compile(r'^\d+$') 28 | 29 | for url in urls: 30 | html = requests.get(url, timeout=10).text 31 | doc = pq(html) 32 | for line in doc('tr').items(): 33 | tds = list(line('td').items()) 34 | if len(tds) >= 2: 35 | ip = tds[0].text().strip() 36 | port = tds[1].text().strip() 37 | if re.match(ip_regex, ip) is not None and re.match(port_regex, port) is not None: 38 | proxies.append(('http', ip, int(port))) 39 | 40 | return list(set(proxies)) 41 | -------------------------------------------------------------------------------- /fetchers/KuaidailiFetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from .BaseFetcher import BaseFetcher 4 | import requests 5 | from pyquery import PyQuery as pq 6 | 7 | class KuaidailiFetcher(BaseFetcher): 8 | """ 9 | https://www.kuaidaili.com/free 10 | """ 11 | 12 | def fetch(self): 13 | """ 14 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 15 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 16 | """ 17 | 18 | urls = [] 19 | urls = urls + [f'https://www.kuaidaili.com/free/inha/{page}/' for page in range(1, 11)] 20 | urls = urls + [f'https://www.kuaidaili.com/free/intr/{page}/' for page in range(1, 11)] 21 | 22 | proxies = [] 23 | 24 | for url in urls: 25 | html = requests.get(url, timeout=10).text 26 | doc = pq(html) 27 | for item in doc('table tbody tr').items(): 28 | ip = item.find('td[data-title="IP"]').text() 29 | port = int(item.find('td[data-title="PORT"]').text()) 30 | proxies.append(('http', ip, port)) 31 | 32 | return list(set(proxies)) 33 | -------------------------------------------------------------------------------- /fetchers/ProxyListFetcher.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import requests 4 | 5 | from .BaseFetcher import BaseFetcher 6 | 7 | 8 | class ProxyListFetcher(BaseFetcher): 9 | """ 10 | https://www.proxy-list.download/api/v1/get?type={{ protocol }}&_t={{ timestamp }} 11 | """ 12 | 13 | def fetch(self): 14 | proxies = [] 15 | type_list = ['socks4', 'socks5', 'http', 'https'] 16 | for protocol in type_list: 17 | url = "https://www.proxy-list.download/api/v1/get?type=" + protocol + "&_t=" + str(time.time()) 18 | proxies_list = requests.get(url).text.split("\n") 19 | for data in proxies_list: 20 | flag_idx = data.find(":") 21 | ip = data[:flag_idx] 22 | port = data[flag_idx + 1:-1] 23 | proxies.append((protocol, ip, port)) 24 | 25 | return list(set(proxies)) 26 | -------------------------------------------------------------------------------- /fetchers/ProxyScrapeFetcher.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import requests 4 | 5 | from .BaseFetcher import BaseFetcher 6 | 7 | 8 | class ProxyScrapeFetcher(BaseFetcher): 9 | """ 
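    Note: according to the project README, this source is not directly reachable from mainland China.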
10 | https://api.proxyscrape.com/?request=displayproxies&proxytype={{ protocol }}&_t={{ timestamp }} 11 | """ 12 | 13 | def fetch(self): 14 | proxies = [] 15 | type_list = ['socks4', 'socks5', 'http', 'https'] 16 | for protocol in type_list: 17 | url = "https://api.proxyscrape.com/?request=displayproxies&proxytype=" + protocol + "&_t=" + str( 18 | time.time()) 19 | resp = requests.get(url).text 20 | for data in resp.split("\n"): 21 | flag_idx = data.find(":") 22 | ip = data[:flag_idx] 23 | port = data[flag_idx + 1:-1] 24 | proxies.append((protocol, ip, port)) 25 | 26 | return list(set(proxies)) 27 | -------------------------------------------------------------------------------- /fetchers/ProxyscanFetcher.py: -------------------------------------------------------------------------------- 1 | from .BaseFetcher import BaseFetcher 2 | import requests 3 | import time 4 | 5 | class ProxyscanFetcher(BaseFetcher): 6 | """ 7 | https://www.proxyscan.io/api/proxy?last_check=9800&uptime=50&limit=20&_t={{ timestamp }} 8 | """ 9 | 10 | def fetch(self): 11 | """ 12 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocol是协议名称,目前主要为http 13 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 14 | """ 15 | proxies = [] 16 | # 此API为随机获取接口,获取策略为:重复取十次后去重 17 | for _ in range(10): 18 | url = "https://www.proxyscan.io/api/proxy?last_check=9800&uptime=50&limit=20&_t=" + str(time.time()) 19 | resp = requests.get(url).json() 20 | for data in resp: 21 | protocol = str.lower(data['Type'][0]) 22 | proxies.append((protocol, data['Ip'], data['Port'])) 23 | 24 | return list(set(proxies)) -------------------------------------------------------------------------------- /fetchers/README.md: -------------------------------------------------------------------------------- 1 | # 爬取器 2 | 3 | 所有的爬取器都在这个目录中,并且在`__init__.py`中进行了注册。 4 | 5 | ## 添加新的爬取器 6 | 7 | 本项目默认包含了数量不少的免费公开代理源,并且会持续更新,如果你发现有不错的免费代理源,欢迎通过Issues反馈给我们。 8 | 9 | 1. 编写爬取器代码 10 | 11 | 爬取器需要继承基类`BaseFetcher`,然后实现`fetch`函数。 12 | 13 | `fetch`函数没有输入参数,每次运行都返回一个列表,列表中包含本次爬取到的代理。返回的格式为(代理协议类型,代理IP,端口)。 14 | 15 | 示例: 16 | 17 | ```python 18 | class CustomFetcher(BaseFetcher): 19 | def fetch(self): 20 | return [('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 21 | ``` 22 | 23 | 2. 注册爬取器 24 | 25 | 编写好爬取器之后,还需要在`__init__.py`文件中进行注册,添加如下代码: 26 | 27 | **注意:爬取器的名称(name)一定不能重复。** 28 | 29 | ```python 30 | from .CustomFetcher import CustomFetcher 31 | 32 | fetchers = [ 33 | ... 34 | Fetcher(name='www.custom.com', fetcher=CustomFetcher), 35 | ... 36 | ] 37 | ``` 38 | 39 | 3. 
重启 40 | 41 | 完成上述步骤之后,重启进程即可。代码会自动将新爬取器添加到数据库中,爬取进程也会自动运行新爬取器。 42 | -------------------------------------------------------------------------------- /fetchers/UUFetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from .BaseFetcher import BaseFetcher 4 | import requests 5 | import json 6 | 7 | class UUFetcher(BaseFetcher): 8 | """ 9 | https://uu-proxy.com/ 10 | """ 11 | 12 | def fetch(self): 13 | """ 14 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocal是协议名称,目前主要为http 15 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 16 | """ 17 | 18 | headers = { 19 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 20 | 'Accept-Encoding': 'gzip, deflate', 21 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 22 | 'Cache-Control': 'no-cache', 23 | 'Connection': 'keep-alive', 24 | 'Pragma': 'no-cache', 25 | 'Upgrade-Insecure-Requests': '1', 26 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.130 Chrome/79.0.3945.130 Safari/537.36' 27 | } 28 | data = requests.get('https://uu-proxy.com/api/free', headers=headers, timeout=10).text 29 | free = json.loads(data)['free'] 30 | proxies = [(item['scheme'], item['ip'], item['port']) for item in free['proxies']] 31 | 32 | return list(set(proxies)) 33 | -------------------------------------------------------------------------------- /fetchers/XiLaFetcher.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | import random 4 | 5 | import requests 6 | from pyquery import PyQuery as pq 7 | 8 | from .BaseFetcher import BaseFetcher 9 | 10 | class XiLaFetcher(BaseFetcher): 11 | """ 12 | http://www.xiladaili.com/gaoni/ 13 | 代码由 [Zealot666](https://github.com/Zealot666) 提供 14 | """ 15 | def __init__(self): 16 | super().__init__() 17 | self.index = 0 18 | 19 | def fetch(self): 20 | """ 21 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocol是协议名称,目前主要为http 22 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 23 | """ 24 | self.index += 1 25 | new_index = self.index % 30 26 | 27 | urls = [] 28 | urls = urls + [f'http://www.xiladaili.com/gaoni/{page}/' for page in range(new_index, new_index + 11)] 29 | urls = urls + [f'http://www.xiladaili.com/http/{page}/' for page in range(new_index, new_index + 11)] 30 | 31 | proxies = [] 32 | ip_regex = re.compile(r'^\d+\.\d+\.\d+\.\d+$') 33 | port_regex = re.compile(r'^\d+$') 34 | 35 | for url in urls: 36 | time.sleep(1) 37 | html = requests.get(url, timeout=10).text 38 | doc = pq(html) 39 | for line in doc('tr').items(): 40 | tds = list(line('td').items()) 41 | if len(tds) >= 2: 42 | ip = tds[0].text().strip().split(":")[0] 43 | port = tds[0].text().strip().split(":")[1] 44 | if re.match(ip_regex, ip) is not None and re.match(port_regex, port) is not None: 45 | proxies.append(('http', ip, int(port))) 46 | 47 | proxies = list(set(proxies)) 48 | 49 | # 这个代理源数据太多了,验证器跑不过来 50 | # 所以只取一部分,一般来说也够用了 51 | if len(proxies) > 200: 52 | proxies = random.sample(proxies, 200) 53 | 54 | return proxies 55 | -------------------------------------------------------------------------------- /fetchers/XiaoShuFetcher.py: -------------------------------------------------------------------------------- 1 | import re 2 | import time 3 | import random 4 | 5 | import requests 6 | from pyquery import PyQuery as pq 7 | 8 | from .BaseFetcher import 
BaseFetcher 9 | 10 | class XiaoShuFetcher(BaseFetcher): 11 | """ 12 | http://www.xsdaili.cn/ 13 | 代码由 [Zealot666](https://github.com/Zealot666) 提供 14 | """ 15 | def __init__(self): 16 | super().__init__() 17 | self.index = 0 18 | 19 | def fetch(self): 20 | """ 21 | 执行一次爬取,返回一个数组,每个元素是(protocol, ip, port),portocol是协议名称,目前主要为http 22 | 返回示例:[('http', '127.0.0.1', 8080), ('http', '127.0.0.1', 1234)] 23 | """ 24 | self.index += 1 25 | new_index = self.index % 10 26 | 27 | urls = set() 28 | proxies = [] 29 | headers = { 30 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36" 31 | } 32 | for page in range(new_index, new_index + 1): 33 | response = requests.get("http://www.xsdaili.cn/dayProxy/" + str(page) + ".html", headers=headers, timeout=10) 34 | for item in pq(response.text)('a').items(): 35 | try: 36 | if "/dayProxy/ip" in item.attr("href"): 37 | urls.add("http://www.xsdaili.cn" + item.attr("href")) 38 | except Exception: 39 | continue 40 | for url in urls: 41 | response = requests.get(url, headers=headers, timeout=8) 42 | doc = pq(response.text) 43 | for item in doc(".cont").items(): 44 | for line in item.text().split("\n"): 45 | ip = line.split('@')[0].split(':')[0] 46 | port = line.split('@')[0].split(':')[1] 47 | proxies.append(("http", ip, port)) 48 | 49 | proxies = list(set(proxies)) 50 | 51 | # 这个代理源数据太多了,验证器跑不过来 52 | # 所以只取一部分,一般来说也够用了 53 | if len(proxies) > 200: 54 | proxies = random.sample(proxies, 200) 55 | 56 | return proxies 57 | -------------------------------------------------------------------------------- /fetchers/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | from collections import namedtuple 4 | 5 | Fetcher = namedtuple('Fetcher', ['name', 'fetcher']) 6 | 7 | from .UUFetcher import UUFetcher 8 | from .KuaidailiFetcher import KuaidailiFetcher 9 | from .GoubanjiaFetcher import GoubanjiaFetcher 10 | from .IP66Fetcher import IP66Fetcher 11 | from .IP3366Fetcher import IP3366Fetcher 12 | from .JiangxianliFetcher import JiangxianliFetcher 13 | from .IHuanFetcher import IHuanFetcher 14 | from .IP89Fetcher import IP89Fetcher 15 | from .ProxyscanFetcher import ProxyscanFetcher 16 | from .KaiXinFetcher import KaiXinFetcher 17 | from .XiLaFetcher import XiLaFetcher 18 | from .XiaoShuFetcher import XiaoShuFetcher 19 | from .ProxyListFetcher import ProxyListFetcher 20 | from .ProxyScrapeFetcher import ProxyScrapeFetcher 21 | 22 | fetchers = [ 23 | Fetcher(name='uu-proxy.com', fetcher=UUFetcher), 24 | Fetcher(name='www.kuaidaili.com', fetcher=KuaidailiFetcher), 25 | Fetcher(name='www.goubanjia.com', fetcher=GoubanjiaFetcher), 26 | Fetcher(name='www.66ip.cn', fetcher=IP66Fetcher), 27 | Fetcher(name='www.ip3366.net', fetcher=IP3366Fetcher), 28 | Fetcher(name='ip.jiangxianli.com', fetcher=JiangxianliFetcher), 29 | Fetcher(name='ip.ihuan.me', fetcher=IHuanFetcher), 30 | Fetcher(name='www.proxyscan.io', fetcher=ProxyscanFetcher), 31 | Fetcher(name='www.89ip.cn', fetcher=IP89Fetcher), 32 | Fetcher(name='www.kxdaili.com', fetcher=KaiXinFetcher), 33 | Fetcher(name='www.xiladaili.com', fetcher=XiLaFetcher), 34 | Fetcher(name='www.xsdaili.cn', fetcher=XiaoShuFetcher), 35 | Fetcher(name='www.proxy-list.download', fetcher=ProxyListFetcher), 36 | Fetcher(name='proxyscrape.com', fetcher=ProxyScrapeFetcher) 37 | ] 38 | -------------------------------------------------------------------------------- /frontend/README.md: 
-------------------------------------------------------------------------------- 1 | # Frontend directory 2 | 3 | This directory holds the code for the web management UI. The code has already been compiled into static files (in the `deployment` directory), so it normally does not need to be modified. 4 | 5 | `api/api.py` automatically serves the contents of `deployment` as a static web site. 6 | -------------------------------------------------------------------------------- /frontend/deployment/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OxOOo/ProxyPoolWithUI/0e9aa3cf1c38f50aeafd53da71e8cd464c2723eb/frontend/deployment/.nojekyll -------------------------------------------------------------------------------- /frontend/deployment/200.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 代理池网页管理界面 5 | 6 | 7 |
Loading...
8 | 9 | 10 | -------------------------------------------------------------------------------- /frontend/deployment/_nuxt/4e6036a.js: -------------------------------------------------------------------------------- 1 | !function(e){function r(data){for(var r,n,l=data[0],f=data[1],d=data[2],i=0,h=[];i 28 | * Released under the MIT License. 29 | */ 30 | 31 | /*! 32 | * vue-no-ssr v1.1.1 33 | * (c) 2018-present egoist <0x142857@gmail.com> 34 | * Released under the MIT License. 35 | */ 36 | 37 | //! moment.js 38 | 39 | //! moment.js locale configuration 40 | -------------------------------------------------------------------------------- /frontend/deployment/_nuxt/c6103f9.js: -------------------------------------------------------------------------------- 1 | (window.webpackJsonp=window.webpackJsonp||[]).push([[2],{1198:function(t,e,n){"use strict";n.r(e);n(133),n(79),n(80);var r=n(20),c=n(3),o=n.n(c),l=[{title:"名称",dataIndex:"name"},{title:"当前可用代理数量",dataIndex:"validated_cnt"},{dataIndex:"in_db_cnt",slots:{title:"inDbCntTitle"}},{title:"总共爬取代理数量",dataIndex:"sum_proxies_cnt"},{title:"上次爬取代理数量",dataIndex:"last_proxies_cnt"},{title:"上次爬取时间",dataIndex:"last_fetch_date",customRender:function(t){return t?o()(t).format("YYYY-MM-DD HH:mm:ss"):""}},{dataIndex:"enable",slots:{title:"enableTitle"},scopedSlots:{customRender:"enable"}}],d={data:function(){return{fetchers:[],columns:l,autoupdate:!0,lastupdate:"",handle:null}},mounted:function(){var t=this;this.handle=setInterval((function(){t.autoupdate&&t.update()}),2e3),this.update()},destroyed:function(){this.handle&&clearInterval(this.handle),this.handle=null},methods:{update:function(){var t=this;return Object(r.a)(regeneratorRuntime.mark((function e(){var data;return regeneratorRuntime.wrap((function(e){for(;;)switch(e.prev=e.next){case 0:return e.next=2,t.$http.get("/fetchers_status");case 2:data=e.sent,t.fetchers=data.fetchers,t.lastupdate=o()().format("HH:mm:ss");case 5:case"end":return e.stop()}}),e)})))()},clearStatus:function(){var t=this;return Object(r.a)(regeneratorRuntime.mark((function e(){return regeneratorRuntime.wrap((function(e){for(;;)switch(e.prev=e.next){case 0:return e.next=2,t.$http.get("/clear_fetchers_status");case 2:return t.$message.success("清空成功"),e.next=5,t.update();case 5:case"end":return e.stop()}}),e)})))()},enableChange:function(t){var e=this;return Object(r.a)(regeneratorRuntime.mark((function n(){return regeneratorRuntime.wrap((function(n){for(;;)switch(n.prev=n.next){case 0:if(!t.enable){n.next=5;break}return n.next=3,e.$http.get("/fetcher_enable",{name:t.name,enable:"0"});case 3:n.next=7;break;case 5:return n.next=7,e.$http.get("/fetcher_enable",{name:t.name,enable:"1"});case 7:return e.$message.success("修改成功"),n.next=10,e.update();case 10:case"end":return n.stop()}}),n)})))()}}},f=n(100),component=Object(f.a)(d,(function(){var t=this,e=t.$createElement,n=t._self._c||e;return n("div",[n("a-row",{attrs:{gutter:16}},[n("a-col",{attrs:{span:4}},[n("a-card",{attrs:{"body-style":{padding:"20px 24px 4px"}}},[n("p",[t._v("\n 自动刷新:\n "),n("a-switch",{model:{value:t.autoupdate,callback:function(e){t.autoupdate=e},expression:"autoupdate"}})],1),t._v(" "),n("p",[t._v("刷新时间:"+t._s(t.lastupdate))])])],1),t._v(" "),n("a-col",{attrs:{span:4}},[n("a-card",{attrs:{"body-style":{padding:"20px 24px 43px"}}},[n("div",{staticStyle:{"text-align":"center"}},[n("a-button",{attrs:{type:"primary"},on:{click:t.clearStatus}},[t._v("\n 清空统计信息\n ")]),t._v(" 
"),n("a-tooltip",{attrs:{title:"清空`总共爬取代理数量`等,已经爬取到的代理不会删除"}},[n("a-icon",{attrs:{type:"question-circle"}})],1)],1)])],1)],1),t._v(" "),n("br"),t._v(" "),n("a-table",{attrs:{columns:t.columns,"data-source":t.fetchers,"row-key":"name",pagination:!1,bordered:!0},scopedSlots:t._u([{key:"enable",fn:function(e,r){return[n("a-switch",{attrs:{"default-checked":e},on:{change:function(e){return t.enableChange(r)}}})]}}])},[n("span",{attrs:{slot:"inDbCntTitle"},slot:"inDbCntTitle"},[t._v("\n 数据库中的代理数量\n "),n("a-tooltip",{scopedSlots:t._u([{key:"title",fn:function(){return[n("span",[t._v("\n 当前数据库中,有多少代理是这个爬取器爬到的。\n 和`总共爬取代理数量`不同的地方在于,这个去掉了重复的和已经删除的代理。\n ")])]},proxy:!0}])},[t._v(" "),n("a-icon",{attrs:{type:"question-circle"}})],1)],1),t._v(" "),n("span",{attrs:{slot:"enableTitle"},slot:"enableTitle"},[t._v("\n 是否启用\n "),n("a-tooltip",{scopedSlots:t._u([{key:"title",fn:function(){return[n("span",[t._v("\n 在禁用之后,将不会再运行该爬取器。\n ")])]},proxy:!0}])},[t._v(" "),n("a-icon",{attrs:{type:"question-circle"}})],1)],1)])],1)}),[],!1,null,null,null);e.default=component.exports}}]); -------------------------------------------------------------------------------- /frontend/deployment/_nuxt/fda1702.js: -------------------------------------------------------------------------------- 1 | (window.webpackJsonp=window.webpackJsonp||[]).push([[0],{290:function(t,e,n){"use strict";var r=n(9),o=n(634),j=n.n(o);r.default.use(j.a)},291:function(t,e,n){"use strict";n(39),n(79),n(80);var r=n(186),o=n(187),j=n(20),c=n(636),l=n.n(c),f=n(639),m=l.a.create({baseURL:"/",timeout:1e4,withCredentials:!0});function h(t,title,content){return new Promise((function(e){setTimeout((function(){f.a.destroyAll(),f.a[t]({title:title,content:content,onOk:function(){e()}})}),500)}))}function d(){return new Promise((function(){}))}function v(t){return k.apply(this,arguments)}function k(){return(k=Object(j.a)(regeneratorRuntime.mark((function t(e){var data,n;return regeneratorRuntime.wrap((function(t){for(;;)switch(t.prev=t.next){case 0:return data={},t.prev=1,t.next=4,e;case 4:n=t.sent,data=n.data,t.next=14;break;case 8:return t.prev=8,t.t0=t.catch(1),t.next=12,h("error","网络错误",t.t0.message);case 12:return t.next=14,d();case 14:if(data.success){t.next=19;break}return t.next=17,h("info","错误",data.message);case 17:return t.next=19,d();case 19:return t.abrupt("return",data);case 20:case"end":return t.stop()}}),t,null,[[1,8]])})))).apply(this,arguments)}var y=function(){function t(){Object(r.a)(this,t),this.baseURL="/"}var e,n;return Object(o.a)(t,[{key:"get",value:(n=Object(j.a)(regeneratorRuntime.mark((function t(e,n){return regeneratorRuntime.wrap((function(t){for(;;)switch(t.prev=t.next){case 0:return n=n||{},t.next=3,v(m.get(e,{params:n}));case 3:return t.abrupt("return",t.sent);case 4:case"end":return t.stop()}}),t)}))),function(t,e){return n.apply(this,arguments)})},{key:"post",value:(e=Object(j.a)(regeneratorRuntime.mark((function t(e,n,data){return regeneratorRuntime.wrap((function(t){for(;;)switch(t.prev=t.next){case 0:return n=n||{},data=data||{},t.next=4,v(m.post(e,data,{params:n}));case 4:return t.abrupt("return",t.sent);case 5:case"end":return t.stop()}}),t)}))),function(t,n,r){return e.apply(this,arguments)})}]),t}();e.a=function(t,e){t.req;e("http",new y({}))}},472:function(t,e,n){var content=n(700);"string"==typeof content&&(content=[[t.i,content,""]]),content.locals&&(t.exports=content.locals);(0,n(128).default)("41b8fd4d",content,!0,{sourceMap:!1})},473:function(t,e,n){var content=n(702);"string"==typeof 
content&&(content=[[t.i,content,""]]),content.locals&&(t.exports=content.locals);(0,n(128).default)("6bdae85e",content,!0,{sourceMap:!1})},638:function(t,e,n){"use strict";n(71);var r=n(633),o=n.n(r),j=n(3);n.n(j).a.locale("zh-cn");var c={data:function(){return{locale:o.a,url_path:[]}},watch:{$route:function(){this.updateNav()}},mounted:function(){this.updateNav()},methods:{updateNav:function(){var data=/^\/[^/]*/.exec(this.$route.path||"");this.url_path=data?[data[0]]:[]}}},l=(n(699),n(701),n(100)),component=Object(l.a)(c,(function(){var t=this,e=t.$createElement,n=t._self._c||e;return n("div",[n("a-config-provider",{attrs:{locale:t.locale}},[n("a-layout",{staticClass:"layout-main"},[n("a-layout-sider",{attrs:{collapsible:""}},[n("div",{staticClass:"logo"}),t._v(" "),n("a-menu",{attrs:{theme:"dark",mode:"inline"},model:{value:t.url_path,callback:function(e){t.url_path=e},expression:"url_path"}},[n("a-menu-item",{key:"/"},[n("NuxtLink",{attrs:{to:"/"}},[n("a-icon",{attrs:{type:"home"}}),t._v(" "),n("span",[t._v("可用代理")])],1)],1),t._v(" "),n("a-menu-item",{key:"/fetchers"},[n("NuxtLink",{attrs:{to:"/fetchers"}},[n("a-icon",{attrs:{type:"retweet"}}),t._v(" "),n("span",[t._v("爬取器状态")])],1)],1),t._v(" "),n("a-menu-item",{key:"github"},[n("a",{attrs:{href:"https://github.com/OxOOo/ProxyPoolWithUI",target:"_blank"}},[n("a-icon",{attrs:{type:"github"}}),t._v(" "),n("span",[t._v("Github主页")])],1)])],1)],1),t._v(" "),n("a-layout",[n("a-layout-header",{staticStyle:{background:"#fff",padding:"0"}}),t._v(" "),n("a-layout-content",{style:{margin:"24px 16px",padding:"24px",background:"#fff"}},[n("Nuxt")],1)],1)],1)],1)],1)}),[],!1,null,"9df1b954",null);e.a=component.exports},640:function(t,e,n){t.exports=n(641)},687:function(t,e,n){var map={"./af":328,"./af.js":328,"./ar":329,"./ar-dz":330,"./ar-dz.js":330,"./ar-kw":331,"./ar-kw.js":331,"./ar-ly":332,"./ar-ly.js":332,"./ar-ma":333,"./ar-ma.js":333,"./ar-sa":334,"./ar-sa.js":334,"./ar-tn":335,"./ar-tn.js":335,"./ar.js":329,"./az":336,"./az.js":336,"./be":337,"./be.js":337,"./bg":338,"./bg.js":338,"./bm":339,"./bm.js":339,"./bn":340,"./bn-bd":341,"./bn-bd.js":341,"./bn.js":340,"./bo":342,"./bo.js":342,"./br":343,"./br.js":343,"./bs":344,"./bs.js":344,"./ca":345,"./ca.js":345,"./cs":346,"./cs.js":346,"./cv":347,"./cv.js":347,"./cy":348,"./cy.js":348,"./da":349,"./da.js":349,"./de":350,"./de-at":351,"./de-at.js":351,"./de-ch":352,"./de-ch.js":352,"./de.js":350,"./dv":353,"./dv.js":353,"./el":354,"./el.js":354,"./en-au":355,"./en-au.js":355,"./en-ca":356,"./en-ca.js":356,"./en-gb":357,"./en-gb.js":357,"./en-ie":358,"./en-ie.js":358,"./en-il":359,"./en-il.js":359,"./en-in":360,"./en-in.js":360,"./en-nz":361,"./en-nz.js":361,"./en-sg":362,"./en-sg.js":362,"./eo":363,"./eo.js":363,"./es":364,"./es-do":365,"./es-do.js":365,"./es-mx":366,"./es-mx.js":366,"./es-us":367,"./es-us.js":367,"./es.js":364,"./et":368,"./et.js":368,"./eu":369,"./eu.js":369,"./fa":370,"./fa.js":370,"./fi":371,"./fi.js":371,"./fil":372,"./fil.js":372,"./fo":373,"./fo.js":373,"./fr":374,"./fr-ca":375,"./fr-ca.js":375,"./fr-ch":376,"./fr-ch.js":376,"./fr.js":374,"./fy":377,"./fy.js":377,"./ga":378,"./ga.js":378,"./gd":379,"./gd.js":379,"./gl":380,"./gl.js":380,"./gom-deva":381,"./gom-deva.js":381,"./gom-latn":382,"./gom-latn.js":382,"./gu":383,"./gu.js":383,"./he":384,"./he.js":384,"./hi":385,"./hi.js":385,"./hr":386,"./hr.js":386,"./hu":387,"./hu.js":387,"./hy-am":388,"./hy-am.js":388,"./id":389,"./id.js":389,"./is":390,"./is.js":390,"./it":391,"./it-ch":392,"./it-ch.js":392,"./it.js":391,"./
ja":393,"./ja.js":393,"./jv":394,"./jv.js":394,"./ka":395,"./ka.js":395,"./kk":396,"./kk.js":396,"./km":397,"./km.js":397,"./kn":398,"./kn.js":398,"./ko":399,"./ko.js":399,"./ku":400,"./ku.js":400,"./ky":401,"./ky.js":401,"./lb":402,"./lb.js":402,"./lo":403,"./lo.js":403,"./lt":404,"./lt.js":404,"./lv":405,"./lv.js":405,"./me":406,"./me.js":406,"./mi":407,"./mi.js":407,"./mk":408,"./mk.js":408,"./ml":409,"./ml.js":409,"./mn":410,"./mn.js":410,"./mr":411,"./mr.js":411,"./ms":412,"./ms-my":413,"./ms-my.js":413,"./ms.js":412,"./mt":414,"./mt.js":414,"./my":415,"./my.js":415,"./nb":416,"./nb.js":416,"./ne":417,"./ne.js":417,"./nl":418,"./nl-be":419,"./nl-be.js":419,"./nl.js":418,"./nn":420,"./nn.js":420,"./oc-lnc":421,"./oc-lnc.js":421,"./pa-in":422,"./pa-in.js":422,"./pl":423,"./pl.js":423,"./pt":424,"./pt-br":425,"./pt-br.js":425,"./pt.js":424,"./ro":426,"./ro.js":426,"./ru":427,"./ru.js":427,"./sd":428,"./sd.js":428,"./se":429,"./se.js":429,"./si":430,"./si.js":430,"./sk":431,"./sk.js":431,"./sl":432,"./sl.js":432,"./sq":433,"./sq.js":433,"./sr":434,"./sr-cyrl":435,"./sr-cyrl.js":435,"./sr.js":434,"./ss":436,"./ss.js":436,"./sv":437,"./sv.js":437,"./sw":438,"./sw.js":438,"./ta":439,"./ta.js":439,"./te":440,"./te.js":440,"./tet":441,"./tet.js":441,"./tg":442,"./tg.js":442,"./th":443,"./th.js":443,"./tk":444,"./tk.js":444,"./tl-ph":445,"./tl-ph.js":445,"./tlh":446,"./tlh.js":446,"./tr":447,"./tr.js":447,"./tzl":448,"./tzl.js":448,"./tzm":449,"./tzm-latn":450,"./tzm-latn.js":450,"./tzm.js":449,"./ug-cn":451,"./ug-cn.js":451,"./uk":452,"./uk.js":452,"./ur":453,"./ur.js":453,"./uz":454,"./uz-latn":455,"./uz-latn.js":455,"./uz.js":454,"./vi":456,"./vi.js":456,"./x-pseudo":457,"./x-pseudo.js":457,"./yo":458,"./yo.js":458,"./zh-cn":459,"./zh-cn.js":459,"./zh-hk":460,"./zh-hk.js":460,"./zh-mo":461,"./zh-mo.js":461,"./zh-tw":462,"./zh-tw.js":462};function r(t){var e=o(t);return n(e)}function o(t){if(!n.o(map,t)){var e=new Error("Cannot find module '"+t+"'");throw e.code="MODULE_NOT_FOUND",e}return map[t]}r.keys=function(){return Object.keys(map)},r.resolve=o,t.exports=r,r.id=687},699:function(t,e,n){"use strict";n(472)},700:function(t,e,n){(e=n(113)(!1)).push([t.i,".layout-main[data-v-9df1b954]{min-height:100vh}.logo[data-v-9df1b954]{height:32px;background:hsla(0,0%,100%,.2);margin:16px}",""]),t.exports=e},701:function(t,e,n){"use strict";n(473)},702:function(t,e,n){(e=n(113)(!1)).push([t.i,'html{font-family:"Source Sans Pro",-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,sans-serif;font-size:16px;word-spacing:1px;-ms-text-size-adjust:100%;-webkit-text-size-adjust:100%;-moz-osx-font-smoothing:grayscale;-webkit-font-smoothing:antialiased;box-sizing:border-box}',""]),t.exports=e}},[[640,4,1,5]]]); -------------------------------------------------------------------------------- /frontend/deployment/_nuxt/static/1630852693/fetchers/payload.js: -------------------------------------------------------------------------------- 1 | __NUXT_JSONP__("/fetchers", {data:[{}],fetch:[],mutations:void 0}); -------------------------------------------------------------------------------- /frontend/deployment/_nuxt/static/1630852693/manifest.js: -------------------------------------------------------------------------------- 1 | __NUXT_JSONP__("manifest.js", {routes:["\u002F","\u002Ffetchers"]}) -------------------------------------------------------------------------------- /frontend/deployment/_nuxt/static/1630852693/payload.js: 
-------------------------------------------------------------------------------- 1 | __NUXT_JSONP__("/", {data:[{}],fetch:[],mutations:void 0}); -------------------------------------------------------------------------------- /frontend/src/.editorconfig: -------------------------------------------------------------------------------- 1 | # editorconfig.org 2 | root = true 3 | 4 | [*] 5 | indent_style = space 6 | indent_size = 4 7 | end_of_line = lf 8 | charset = utf-8 9 | trim_trailing_whitespace = true 10 | insert_final_newline = true 11 | 12 | [*.md] 13 | trim_trailing_whitespace = false 14 | -------------------------------------------------------------------------------- /frontend/src/.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | root: true, 3 | env: { 4 | browser: true, 5 | node: true 6 | }, 7 | parserOptions: { 8 | parser: 'babel-eslint' 9 | }, 10 | extends: [ 11 | '@nuxtjs', 12 | 'plugin:nuxt/recommended' 13 | ], 14 | plugins: [ 15 | ], 16 | // add your custom rules here 17 | rules: { 18 | semi: ['error', 'always'], 19 | indent: ['error', 4], 20 | 'vue/html-indent': ['error', 4], 21 | camelcase: 'off', 22 | 'no-return-await': 'off', 23 | 'vue/no-parsing-error': 'off', 24 | 'no-unused-vars': 'warn', 25 | 'vue/html-self-closing': 'off', 26 | 'prefer-const': 'warn', 27 | 'vue/singleline-html-element-content-newline': 'off', 28 | 'vue/no-unused-components': 'warn', 29 | 'import/no-named-as-default': 'off', 30 | 'vue/no-unused-vars': 'warn' 31 | } 32 | }; 33 | -------------------------------------------------------------------------------- /frontend/src/.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Node template 3 | # Logs 4 | /logs 5 | *.log 6 | npm-debug.log* 7 | yarn-debug.log* 8 | yarn-error.log* 9 | 10 | # Runtime data 11 | pids 12 | *.pid 13 | *.seed 14 | *.pid.lock 15 | 16 | # Directory for instrumented libs generated by jscoverage/JSCover 17 | lib-cov 18 | 19 | # Coverage directory used by tools like istanbul 20 | coverage 21 | 22 | # nyc test coverage 23 | .nyc_output 24 | 25 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 26 | .grunt 27 | 28 | # Bower dependency directory (https://bower.io/) 29 | bower_components 30 | 31 | # node-waf configuration 32 | .lock-wscript 33 | 34 | # Compiled binary addons (https://nodejs.org/api/addons.html) 35 | build/Release 36 | 37 | # Dependency directories 38 | node_modules/ 39 | jspm_packages/ 40 | 41 | # TypeScript v1 declaration files 42 | typings/ 43 | 44 | # Optional npm cache directory 45 | .npm 46 | 47 | # Optional eslint cache 48 | .eslintcache 49 | 50 | # Optional REPL history 51 | .node_repl_history 52 | 53 | # Output of 'npm pack' 54 | *.tgz 55 | 56 | # Yarn Integrity file 57 | .yarn-integrity 58 | 59 | # dotenv environment variables file 60 | .env 61 | 62 | # parcel-bundler cache (https://parceljs.org/) 63 | .cache 64 | 65 | # next.js build output 66 | .next 67 | 68 | # nuxt.js build output 69 | .nuxt 70 | 71 | # Nuxt generate 72 | dist 73 | 74 | # vuepress build output 75 | .vuepress/dist 76 | 77 | # Serverless directories 78 | .serverless 79 | 80 | # IDE / Editor 81 | .idea 82 | 83 | # Service worker 84 | sw.* 85 | 86 | # macOS 87 | .DS_Store 88 | 89 | # Vim swap files 90 | *.swp 91 | -------------------------------------------------------------------------------- /frontend/src/README.md: 
-------------------------------------------------------------------------------- 1 | # 前端目录 2 | 3 | 如果不修改网页,则不需要修改本目录下的文件,以下内容也可以忽略。 4 | 5 | ## 软件需求 6 | 7 | 1. node 14.15 : 不一定需要这么高版本,但是最好使用LTS版本 8 | 2. npm 9 | 10 | ## 安装依赖 11 | 12 | ```bash 13 | $ npm i 14 | ``` 15 | 16 | ## 命令说明 17 | 18 | 1. 运行开发服务器 19 | 20 | ```bash 21 | $ npm run dev 22 | ``` 23 | 24 | 2. 生成静态代码 25 | 26 | ```bash 27 | $ ./build.sh 28 | ``` 29 | 30 | 3. 进行代码格式化与检查 31 | 32 | ```bash 33 | $ npm run lint 34 | ``` 35 | -------------------------------------------------------------------------------- /frontend/src/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | cd `dirname $0` 6 | 7 | rm -rf dist # 删除已经存在的目录 8 | npm run generate # 生成静态文件 9 | 10 | rm -rf ../deployment 11 | mv dist ../deployment 12 | 13 | echo 'Done.' 14 | -------------------------------------------------------------------------------- /frontend/src/jsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "baseUrl": ".", 4 | "paths": { 5 | "~/*": ["./*"], 6 | "@/*": ["./*"], 7 | "~~/*": ["./*"], 8 | "@@/*": ["./*"] 9 | } 10 | }, 11 | "exclude": ["node_modules", ".nuxt", "dist"] 12 | } 13 | -------------------------------------------------------------------------------- /frontend/src/layouts/default.vue: -------------------------------------------------------------------------------- 1 | 41 | 42 | 75 | 76 | 86 | 87 | 107 | -------------------------------------------------------------------------------- /frontend/src/nuxt.config.js: -------------------------------------------------------------------------------- 1 | export default { 2 | // Target: https://go.nuxtjs.dev/config-target 3 | target: 'static', 4 | 5 | // Global page headers: https://go.nuxtjs.dev/config-head 6 | head: { 7 | title: '代理池网页管理界面', 8 | htmlAttrs: { 9 | lang: 'zh-CN' 10 | }, 11 | meta: [ 12 | { charset: 'utf-8' }, 13 | { name: 'viewport', content: 'width=device-width, initial-scale=1' }, 14 | { hid: 'description', name: 'description', content: '' } 15 | ], 16 | link: [ 17 | { rel: 'icon', type: 'image/x-icon', href: '/favicon.ico' } 18 | ] 19 | }, 20 | 21 | // Global CSS: https://go.nuxtjs.dev/config-css 22 | css: [ 23 | 'ant-design-vue/dist/antd.css' 24 | ], 25 | 26 | // Plugins to run before rendering page: https://go.nuxtjs.dev/config-plugins 27 | plugins: [ 28 | '@/plugins/antd-ui', 29 | '@/plugins/axios' 30 | ], 31 | 32 | // Auto import components: https://go.nuxtjs.dev/config-components 33 | components: true, 34 | 35 | // Modules for dev and build (recommended): https://go.nuxtjs.dev/config-modules 36 | buildModules: [ 37 | // https://go.nuxtjs.dev/eslint 38 | '@nuxtjs/eslint-module' 39 | ], 40 | 41 | // Modules: https://go.nuxtjs.dev/config-modules 42 | modules: [], 43 | 44 | // Build Configuration: https://go.nuxtjs.dev/config-build 45 | build: {}, 46 | 47 | router: { 48 | base: '/web/' 49 | }, 50 | 51 | env: { 52 | AXIOS_BASE_URL: // 浏览器访问后端的地址 53 | process.env.NODE_ENV === 'production' ? 
'/' : 'http://localhost:5000' 54 | } 55 | }; 56 | -------------------------------------------------------------------------------- /frontend/src/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "frontend", 3 | "version": "1.0.0", 4 | "private": true, 5 | "scripts": { 6 | "dev": "nuxt", 7 | "generate": "NODE_ENV=production nuxt generate", 8 | "lint:js": "eslint --ext \".js,.vue\" --ignore-path .gitignore --fix .", 9 | "lint": "npm run lint:js" 10 | }, 11 | "dependencies": { 12 | "ant-design-vue": "^1.7.2", 13 | "axios": "^0.21.1", 14 | "core-js": "^3.8.3", 15 | "moment": "^2.29.1", 16 | "nuxt": "^2.14.12" 17 | }, 18 | "devDependencies": { 19 | "@nuxtjs/eslint-config": "^5.0.0", 20 | "@nuxtjs/eslint-module": "^3.0.2", 21 | "babel-eslint": "^10.1.0", 22 | "eslint": "^7.18.0", 23 | "eslint-plugin-nuxt": "^2.0.0", 24 | "eslint-plugin-vue": "^7.5.0" 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /frontend/src/pages/fetchers.vue: -------------------------------------------------------------------------------- 1 | 63 | 64 | 155 | -------------------------------------------------------------------------------- /frontend/src/pages/index.vue: -------------------------------------------------------------------------------- 1 | 96 | 97 | 186 | -------------------------------------------------------------------------------- /frontend/src/plugins/antd-ui.js: -------------------------------------------------------------------------------- 1 | import Vue from 'vue'; 2 | import Antd from 'ant-design-vue/lib'; 3 | 4 | Vue.use(Antd); 5 | -------------------------------------------------------------------------------- /frontend/src/plugins/axios.js: -------------------------------------------------------------------------------- 1 | import axios from 'axios'; 2 | import { Modal } from 'ant-design-vue'; 3 | 4 | const baseURL = process.env.AXIOS_BASE_URL; 5 | 6 | const instance = axios.create({ 7 | baseURL, 8 | timeout: 10000, 9 | withCredentials: true 10 | }); 11 | 12 | function showModel (type, title, content) { 13 | return new Promise((resolve) => { 14 | setTimeout(() => { 15 | Modal.destroyAll(); 16 | Modal[type]({ 17 | title, 18 | content, 19 | onOk: () => { 20 | resolve(); 21 | } 22 | }); 23 | }, 500); 24 | }); 25 | } 26 | 27 | function never () { 28 | return new Promise(() => {}); 29 | } 30 | 31 | async function handle (req) { 32 | let data = {}; 33 | try { 34 | const res = await req; 35 | data = res.data; 36 | } catch (e) { 37 | await showModel('error', '网络错误', e.message); 38 | // throw e; // IE上会弹出错误提示 39 | await never(); 40 | } 41 | if (!data.success) { 42 | await showModel('info', '错误', data.message); 43 | // throw new Error(data.message); 44 | await never(); 45 | } 46 | return data; 47 | } 48 | 49 | class Http { 50 | constructor () { 51 | this.baseURL = baseURL; 52 | } 53 | 54 | async get (url, params) { 55 | params = params || {}; 56 | return await handle(instance.get(url, { params })); 57 | } 58 | 59 | async post (url, params, data) { 60 | params = params || {}; 61 | data = data || {}; 62 | return await handle(instance.post(url, data, { params })); 63 | } 64 | } 65 | 66 | export default ({ req }, inject) => { 67 | inject('http', new Http({ })); 68 | }; 69 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import sys, os, signal 4 | 
sys.path.append(os.path.dirname(__file__) + os.sep + '../') 5 | from multiprocessing import Process 6 | import time 7 | from proc import run_fetcher, run_validator 8 | from api import api 9 | import multiprocessing 10 | 11 | # 进程锁 12 | proc_lock = multiprocessing.Lock() 13 | 14 | class Item: 15 | def __init__(self, target, name): 16 | self.target = target 17 | self.name = name 18 | self.process = None 19 | self.start_time = 0 20 | 21 | def main(): 22 | processes = [] 23 | processes.append(Item(target=run_fetcher.main, name='fetcher')) 24 | processes.append(Item(target=run_validator.main, name='validator')) 25 | processes.append(Item(target=api.main, name='api')) 26 | 27 | while True: 28 | for p in processes: 29 | if p.process is None: 30 | p.process = Process(target=p.target, name=p.name, daemon=False, args=(proc_lock, )) 31 | p.process.start() 32 | print(f'启动{p.name}进程,pid={p.process.pid}') 33 | p.start_time = time.time() 34 | 35 | for p in processes: 36 | if p.process is not None: 37 | if not p.process.is_alive(): 38 | print(f'进程{p.name}异常退出, exitcode={p.process.exitcode}') 39 | p.process.terminate() 40 | p.process = None 41 | # 解除进程锁 42 | try: 43 | proc_lock.release() 44 | except ValueError: 45 | pass 46 | elif p.start_time + 60 * 60 < time.time(): # 最长运行1小时就重启 47 | print(f'进程{p.name}运行太久,重启') 48 | p.process.terminate() 49 | p.process = None 50 | # 解除进程锁 51 | try: 52 | proc_lock.release() 53 | except ValueError: 54 | pass 55 | 56 | time.sleep(0.2) 57 | 58 | def citest(): 59 | """ 60 | 此函数仅用于检查程序是否可运行,一般情况下使用本项目可忽略 61 | """ 62 | processes = [] 63 | processes.append(Item(target=run_fetcher.main, name='fetcher')) 64 | processes.append(Item(target=run_validator.main, name='validator')) 65 | processes.append(Item(target=api.main, name='api')) 66 | 67 | for p in processes: 68 | assert p.process is None 69 | p.process = Process(target=p.target, name=p.name, daemon=False) 70 | p.process.start() 71 | print(f'running {p.name}, pid={p.process.pid}') 72 | p.start_time = time.time() 73 | 74 | time.sleep(10) 75 | 76 | for p in processes: 77 | assert p.process is not None 78 | assert p.process.is_alive() 79 | p.process.terminate() 80 | 81 | if __name__ == '__main__': 82 | try: 83 | if len(sys.argv) >= 2 and sys.argv[1] == 'citest': 84 | citest() 85 | else: 86 | main() 87 | sys.exit(0) 88 | except Exception as e: 89 | print('========FATAL ERROR=========') 90 | print(e) 91 | sys.exit(1) 92 | -------------------------------------------------------------------------------- /proc/README.md: -------------------------------------------------------------------------------- 1 | 2 | 包含爬取器和验证器的代码。 3 | 4 | 爬取器会定时运行注册的爬取器,并将爬取到的代理放入数据库中,详见代码`run_fetcher.py`。 5 | 6 | 验证器会不断从数据库中获取待验证的代理(代理的`下次待验证时间`小于当前时间),并进行验证,详见代码`run_validator.py`。 7 | -------------------------------------------------------------------------------- /proc/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | -------------------------------------------------------------------------------- /proc/run_fetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | """ 3 | 定时运行爬取器 4 | """ 5 | 6 | import sys 7 | import threading 8 | from queue import Queue 9 | import logging 10 | import time 11 | from db import conn 12 | from fetchers import fetchers 13 | from config import PROC_FETCHER_SLEEP 14 | from func_timeout import func_set_timeout 15 | from func_timeout.exceptions import FunctionTimedOut 16 | 17 | 
logging.basicConfig(stream=sys.stdout, format="%(asctime)s-%(levelname)s:%(name)s:%(message)s", level='INFO') 18 | 19 | def main(proc_lock): 20 | """ 21 | 定时运行爬取器 22 | 主要逻辑: 23 | While True: 24 | for 爬取器 in 所有爬取器: 25 | 查询数据库,判断当前爬取器是否需要运行 26 | 如果需要运行,那么启动线程运行该爬取器 27 | 等待所有线程结束 28 | 将爬取到的代理放入数据库中 29 | 睡眠一段时间 30 | """ 31 | logger = logging.getLogger('fetcher') 32 | conn.set_proc_lock(proc_lock) 33 | 34 | while True: 35 | logger.info('开始运行一轮爬取器') 36 | status = conn.getProxiesStatus() 37 | if status['pending_proxies_cnt'] > 2000: 38 | logger.info(f"还有{status['pending_proxies_cnt']}个代理等待验证,数量过多,跳过本次爬取") 39 | time.sleep(PROC_FETCHER_SLEEP) 40 | continue 41 | 42 | @func_set_timeout(30) 43 | def fetch_worker(fetcher): 44 | f = fetcher() 45 | proxies = f.fetch() 46 | return proxies 47 | 48 | def run_thread(name, fetcher, que): 49 | """ 50 | name: 爬取器名称 51 | fetcher: 爬取器class 52 | que: 队列,用于返回数据 53 | """ 54 | try: 55 | proxies = fetch_worker(fetcher) 56 | que.put((name, proxies)) 57 | except Exception as e: 58 | logger.error(f'运行爬取器{name}出错:' + str(e)) 59 | que.put((name, [])) 60 | except FunctionTimedOut: 61 | pass 62 | 63 | threads = [] 64 | que = Queue() 65 | for item in fetchers: 66 | data = conn.getFetcher(item.name) 67 | if data is None: 68 | logger.error(f'没有在数据库中找到对应的信息:{item.name}') 69 | raise ValueError('不可恢复错误') 70 | if not data.enable: 71 | logger.info(f'跳过爬取器{item.name}') 72 | continue 73 | threads.append(threading.Thread(target=run_thread, args=(item.name, item.fetcher, que))) 74 | [t.start() for t in threads] 75 | [t.join() for t in threads] 76 | while not que.empty(): 77 | fetcher_name, proxies = que.get() 78 | for proxy in proxies: 79 | conn.pushNewFetch(fetcher_name, proxy[0], proxy[1], proxy[2]) 80 | conn.pushFetcherResult(fetcher_name, len(proxies)) 81 | 82 | logger.info(f'完成运行{len(threads)}个爬取器,睡眠{PROC_FETCHER_SLEEP}秒') 83 | time.sleep(PROC_FETCHER_SLEEP) 84 | -------------------------------------------------------------------------------- /proc/run_validator.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | """ 3 | 验证器逻辑 4 | """ 5 | 6 | import sys 7 | import socket 8 | import threading 9 | from queue import Queue 10 | import logging 11 | import time 12 | import requests 13 | from func_timeout import func_set_timeout 14 | from func_timeout.exceptions import FunctionTimedOut 15 | from db import conn 16 | from config import PROC_VALIDATOR_SLEEP, VALIDATE_THREAD_NUM 17 | from config import VALIDATE_METHOD, VALIDATE_KEYWORD, VALIDATE_HEADER, VALIDATE_URL, VALIDATE_TIMEOUT, VALIDATE_MAX_FAILS 18 | 19 | logging.basicConfig(stream=sys.stdout, format="%(asctime)s-%(levelname)s:%(name)s:%(message)s", level='INFO') 20 | 21 | def main(proc_lock): 22 | """ 23 | 验证器 24 | 主要逻辑: 25 | 创建VALIDATE_THREAD_NUM个验证线程,这些线程会不断运行 26 | While True: 27 | 检查验证线程是否返回了代理的验证结果 28 | 从数据库中获取若干当前待验证的代理 29 | 将代理发送给前面创建的线程 30 | """ 31 | logger = logging.getLogger('validator') 32 | conn.set_proc_lock(proc_lock) 33 | 34 | in_que = Queue() 35 | out_que = Queue() 36 | running_proxies = set() # 储存哪些代理正在运行,以字符串的形式储存 37 | 38 | threads = [] 39 | for _ in range(VALIDATE_THREAD_NUM): 40 | threads.append(threading.Thread(target=validate_thread, args=(in_que, out_que))) 41 | [_.start() for _ in threads] 42 | 43 | while True: 44 | out_cnt = 0 45 | while not out_que.empty(): 46 | proxy, success, latency = out_que.get() 47 | conn.pushValidateResult(proxy, success, latency) 48 | uri = f'{proxy.protocol}://{proxy.ip}:{proxy.port}' 49 | running_proxies.remove(uri) 50 | 
out_cnt = out_cnt + 1 51 | if out_cnt > 0: 52 | logger.info(f'完成了{out_cnt}个代理的验证') 53 | 54 | # 如果正在进行验证的代理足够多,那么就不着急添加新代理 55 | if len(running_proxies) >= VALIDATE_THREAD_NUM * 2: 56 | time.sleep(PROC_VALIDATOR_SLEEP) 57 | continue 58 | 59 | # 找一些新的待验证的代理放入队列中 60 | added_cnt = 0 61 | for proxy in conn.getToValidate(VALIDATE_THREAD_NUM * 4): 62 | uri = f'{proxy.protocol}://{proxy.ip}:{proxy.port}' 63 | # 这里找出的代理有可能是正在进行验证的代理,要避免重复加入 64 | if uri not in running_proxies: 65 | running_proxies.add(uri) 66 | in_que.put(proxy) 67 | added_cnt += 1 68 | 69 | if added_cnt == 0: 70 | time.sleep(PROC_VALIDATOR_SLEEP) 71 | 72 | @func_set_timeout(VALIDATE_TIMEOUT * 2) 73 | def validate_once(proxy): 74 | """ 75 | 进行一次验证,如果验证成功则返回True,否则返回False或者是异常 76 | """ 77 | proxies = { 78 | 'http': f'{proxy.protocol}://{proxy.ip}:{proxy.port}', 79 | 'https': f'{proxy.protocol}://{proxy.ip}:{proxy.port}' 80 | } 81 | if VALIDATE_METHOD == "GET": 82 | r = requests.get(VALIDATE_URL, timeout=VALIDATE_TIMEOUT, proxies=proxies) 83 | r.encoding = "utf-8" 84 | html = r.text 85 | if VALIDATE_KEYWORD in html: 86 | return True 87 | return False 88 | else: 89 | r = requests.get(VALIDATE_URL, timeout=VALIDATE_TIMEOUT, proxies=proxies, allow_redirects=False) 90 | resp_headers = r.headers 91 | if VALIDATE_HEADER in resp_headers.keys() and VALIDATE_KEYWORD in resp_headers[VALIDATE_HEADER]: 92 | return True 93 | return False 94 | 95 | def validate_thread(in_que, out_que): 96 | """ 97 | 验证函数,这个函数会在一个线程中被调用 98 | in_que: 输入队列,用于接收验证任务 99 | out_que: 输出队列,用于返回验证结果 100 | in_que和out_que都是线程安全队列,并且如果队列为空,调用in_que.get()会阻塞线程 101 | """ 102 | 103 | while True: 104 | proxy = in_que.get() 105 | 106 | success = False 107 | latency = None 108 | for _ in range(VALIDATE_MAX_FAILS): 109 | try: 110 | start_time = time.time() 111 | if validate_once(proxy): 112 | end_time = time.time() 113 | latency = int((end_time-start_time)*1000) 114 | success = True 115 | break 116 | except Exception as e: 117 | pass 118 | except FunctionTimedOut: 119 | pass 120 | 121 | out_que.put((proxy, success, latency)) 122 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | certifi==2020.12.5 2 | chardet==4.0.0 3 | click==7.1.2 4 | cssselect==1.1.0 5 | Flask==1.1.2 6 | idna==2.10 7 | itsdangerous==1.1.0 8 | Jinja2==2.11.2 9 | lxml==4.6.2 10 | MarkupSafe==1.1.1 11 | pyquery==1.4.3 12 | requests==2.25.1 13 | urllib3==1.26.3 14 | Werkzeug==1.0.1 15 | PySocks==1.7.1 16 | func-timeout==4.3.5 17 | -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | 2 | 一些测试脚本。 3 | -------------------------------------------------------------------------------- /test/testDB.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import sys,os 4 | sys.path.append(os.path.dirname(__file__) + os.sep + '../') 5 | from db import conn 6 | 7 | def run(): 8 | assert len(conn.getToValidate(10)) == 0 9 | conn.pushNewFetch('test', 'http', '127.0.0.1', 8080) 10 | assert len(conn.getToValidate(10)) == 1 11 | 12 | conn.pushNewFetch('test', 'http', '127.0.0.2', 8080) 13 | conn.pushNewFetch('test', 'http', '127.0.0.3', 8080) 14 | conn.pushNewFetch('test', 'http', '127.0.0.4', 8080) 15 | assert len(conn.getToValidate(2)) == 2 16 | proxies = conn.getToValidate(10) 17 | assert len(proxies) == 4 18 | 
assert proxies[0].ip == '127.0.0.1' 19 | assert proxies[1].ip == '127.0.0.2' 20 | assert proxies[2].ip == '127.0.0.3' 21 | assert proxies[3].ip == '127.0.0.4' 22 | 23 | p = conn.getToValidate(1)[0] # 设置一个代理通过验证 24 | conn.pushValidateResult(p, True) 25 | assert len(conn.getToValidate(10)) == 3 26 | p = conn.getToValidate(1)[0] # 设置一个代理没有通过验证 27 | conn.pushValidateResult(p, False) 28 | assert len(conn.getToValidate(10)) == 2 29 | assert len(conn.getValidatedRandom(1)) == 1 30 | assert len(conn.getValidatedRandom(-1)) == 1 31 | p = conn.getValidatedRandom(1)[0] 32 | assert p.ip == '127.0.0.1' 33 | p = conn.getToValidate(1)[0] # 设置一个代理通过验证 34 | conn.pushValidateResult(p, True) 35 | assert len(conn.getValidatedRandom(1)) == 1 36 | assert len(conn.getValidatedRandom(-1)) == 2 37 | 38 | proxies_status = conn.getProxiesStatus() 39 | assert proxies_status['sum_proxies_cnt'] == 4 40 | assert proxies_status['validated_proxies_cnt'] == 2 41 | assert proxies_status['pending_proxies_cnt'] == 1 42 | 43 | fetchers = conn.getAllFetchers() 44 | for item in fetchers: 45 | # 所有爬取器都应该是默认参数 46 | assert item.enable == True 47 | assert item.sum_proxies_cnt == 0 48 | assert item.last_proxies_cnt == 0 49 | assert item.last_fetch_date is None 50 | conn.pushFetcherResult('www.kuaidaili.com', 10) 51 | conn.pushFetcherResult('www.kuaidaili.com', 20) 52 | conn.pushFetcherEnable('www.kuaidaili.com', False) 53 | f = conn.getFetcher('www.kuaidaili.com') 54 | assert f is not None 55 | # www.kuaidaili.com的参数应该被修改了 56 | assert f.enable == False 57 | assert f.sum_proxies_cnt == 30 58 | assert f.last_proxies_cnt == 20 59 | assert f.last_fetch_date is not None 60 | 61 | conn.pushClearFetchersStatus() 62 | f = conn.getFetcher('www.kuaidaili.com') 63 | assert f is not None 64 | # www.kuaidaili.com的参数应该被修改了 65 | assert f.sum_proxies_cnt == 0 66 | assert f.last_proxies_cnt == 0 67 | assert f.last_fetch_date is None 68 | 69 | if __name__ == '__main__': 70 | print(u'请确保运行本脚本之前删除或备份`data.db`文件') 71 | run() 72 | print(u'测试通过') 73 | -------------------------------------------------------------------------------- /test/testFetcher.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | import sys,os 4 | sys.path.append(os.path.dirname(__file__) + os.sep + '../') 5 | from fetchers import fetchers 6 | 7 | def run(): 8 | proxies_cnt = dict() 9 | for item in fetchers: 10 | if item.name != 'www.xsdaili.cn': continue # 这行表示只测试特定的爬取器 11 | 12 | print('='*10, 'RUNNING ' + item.name, '='*10) 13 | fetcher = item.fetcher() # 实例化爬取器 14 | try: 15 | proxies = fetcher.fetch() 16 | except Exception as e: 17 | print(e) 18 | proxies = [] 19 | print(proxies) 20 | proxies_cnt[item.name] = len(proxies) 21 | 22 | print('='*10, 'PROXIES CNT', '='*10) 23 | print(proxies_cnt) 24 | 25 | if __name__ == '__main__': 26 | run() 27 | --------------------------------------------------------------------------------
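As a worked example of the fetcher interface documented in `fetchers/README.md` and exercised by `test/testFetcher.py`, the sketch below shows what a minimal custom fetcher might look like. It is an illustration only: the source URL, the `ExampleFetcher` class name, and the registered name `example.com` are hypothetical placeholders, not part of the project; the `BaseFetcher` import and the (protocol, ip, port) return format follow the existing fetchers.

```python
# fetchers/ExampleFetcher.py -- hypothetical sketch, not a fetcher shipped with this repository
import requests

from .BaseFetcher import BaseFetcher


class ExampleFetcher(BaseFetcher):
    """
    Example only: fetches an "ip:port"-per-line plain-text list from a placeholder URL.
    """

    def fetch(self):
        # Must return a list of (protocol, ip, port) tuples, e.g. [('http', '127.0.0.1', 8080)]
        url = 'https://example.com/http-proxies.txt'  # placeholder URL (assumption)
        proxies = []
        text = requests.get(url, timeout=10).text
        for line in text.split('\n'):
            ip, sep, port = line.strip().partition(':')
            if sep and port.isdigit():  # skip blank or malformed lines
                proxies.append(('http', ip, int(port)))
        return list(set(proxies))
```

Registration would then follow the pattern already used in `fetchers/__init__.py`, for example `Fetcher(name='example.com', fetcher=ExampleFetcher)`, after which the filter line in `test/testFetcher.py` can be pointed at the new name to try the fetcher in isolation before running the full pool.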