├── .gitignore
├── LICENSE
├── README.md
├── async_proxy_pool
│   ├── __init__.py
│   ├── config.py
│   ├── crawler.py
│   ├── database.py
│   ├── logger.py
│   ├── scheduler.py
│   ├── utils.py
│   ├── validator.py
│   ├── webapi_flask.py
│   └── webapi_sanic.py
├── client.py
├── requirements.txt
├── server_flask.py
├── server_sanic.py
└── test
    └── test_proxy.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | .idea
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .coverage
43 | .coverage.*
44 | .cache
45 | nosetests.xml
46 | coverage.xml
47 | *.cover
48 | .hypothesis/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | .static_storage/
57 | .media/
58 | local_settings.py
59 |
60 | # Flask stuff:
61 | instance/
62 | .webassets-cache
63 |
64 | # Scrapy stuff:
65 | .scrapy
66 |
67 | # Sphinx documentation
68 | docs/_build/
69 |
70 | # PyBuilder
71 | target/
72 |
73 | # Jupyter Notebook
74 | .ipynb_checkpoints
75 |
76 | # pyenv
77 | .python-version
78 |
79 | # celery beat schedule file
80 | celerybeat-schedule
81 |
82 | # SageMath parsed files
83 | *.sage.py
84 |
85 | # Environments
86 | .env
87 | .venv
88 | env/
89 | venv/
90 | ENV/
91 | env.bak/
92 | venv.bak/
93 |
94 | # Spyder project settings
95 | .spyderproject
96 | .spyproject
97 |
98 | # Rope project settings
99 | .ropeproject
100 |
101 | # mkdocs documentation
102 | /site
103 |
104 | # mypy
105 | .mypy_cache/
106 |
107 | # for vscode
108 | .vscode/
109 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018~now chenjiandongx
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Async Proxy Pool
2 |
3 | An async proxy pool for crawlers, built on Python asyncio and designed to take full advantage of Python's asynchronous performance.
4 |
5 |
6 | ### Requirements
7 |
8 | The project uses [sanic](https://github.com/channelcat/sanic), an asynchronous web framework (a Flask version is also provided), so Python 3.5+ is recommended. Note that sanic does not support Windows; Windows users (like me 😄) can consider Ubuntu on Windows.
9 |
10 |
11 | ### How to Use
12 |
13 | #### Install Redis
14 | The project uses [Redis](https://redis.io/) as its database. Redis is an open source (BSD licensed), in-memory data structure store that can serve as a database, cache, and message broker. Make sure Redis is properly installed in your environment; see the official guide for installation instructions.
15 |
16 | #### Download the source code
17 | ```bash
18 | $ git clone https://github.com/chenjiandongx/async-proxy-pool.git
19 | ```
20 |
21 | #### Install dependencies
22 | Using requirements.txt
23 | ```bash
24 | $ pip install -r requirements.txt
25 | ```
26 |
27 | #### Configuration
28 | The configuration file [config.py](https://github.com/chenjiandongx/async-proxy-pool/blob/master/async_proxy_pool/config.py) holds every configuration option the project uses, as shown below. Adjust them to your needs, or simply keep the defaults.
29 | ```python
30 | #!/usr/bin/env python
31 | # coding=utf-8
32 |
33 | # Request timeout (seconds)
34 | REQUEST_TIMEOUT = 15
35 | # Request delay (seconds)
36 | REQUEST_DELAY = 0
37 |
38 | # Redis host
39 | REDIS_HOST = "localhost"
40 | # Redis port
41 | REDIS_PORT = 6379
42 | # Redis password
43 | REDIS_PASSWORD = None
44 | # Redis key for the proxy sorted set
45 | REDIS_KEY = "proxies:ranking"
46 | # Maximum number of connections in the Redis connection pool
47 | REDIS_MAX_CONNECTION = 20
48 |
49 | # Maximum proxy score
50 | MAX_SCORE = 10
51 | # Minimum proxy score
52 | MIN_SCORE = 0
53 | # Initial proxy score
54 | INIT_SCORE = 9
55 |
56 | # server web host
57 | SERVER_HOST = "localhost"
58 | # server web port
59 | SERVER_PORT = 3289
60 | # Whether to enable access logging
61 | SERVER_ACCESS_LOG = True
62 |
63 | # Number of proxies validated per batch
64 | VALIDATOR_BATCH_COUNT = 256
65 | # Website used by the validator; change it to the site you actually want to crawl, e.g. Sina or Zhihu
66 | VALIDATOR_BASE_URL = "https://httpbin.org/"
67 | # Validator run cycle (minutes)
68 | VALIDATOR_RUN_CYCLE = 15
69 |
70 |
71 | # Crawler run cycle (minutes)
72 | CRAWLER_RUN_CYCLE = 30
73 | # Request headers
74 | HEADERS = {
75 | "X-Requested-With": "XMLHttpRequest",
76 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
77 | "(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
78 | }
79 | ```
80 |
81 | ### Running the Project
82 |
83 | **Run the client to start the crawler and the validator**
84 | ```bash
85 | # The validation website can be set via an environment variable: set/export VALIDATOR_BASE_URL="https://example.com"
86 | $ python client.py
87 | 2018-05-16 23:41:39,234 - Crawler working...
88 | 2018-05-16 23:41:40,509 - Crawler √ http://202.83.123.33:3128
89 | 2018-05-16 23:41:40,509 - Crawler √ http://123.53.118.122:61234
90 | 2018-05-16 23:41:40,510 - Crawler √ http://212.237.63.84:8888
91 | 2018-05-16 23:41:40,510 - Crawler √ http://36.73.102.245:8080
92 | 2018-05-16 23:41:40,511 - Crawler √ http://78.137.90.253:8080
93 | 2018-05-16 23:41:40,512 - Crawler √ http://5.45.70.39:1490
94 | 2018-05-16 23:41:40,512 - Crawler √ http://117.102.97.162:8080
95 | 2018-05-16 23:41:40,513 - Crawler √ http://109.185.149.65:8080
96 | 2018-05-16 23:41:40,513 - Crawler √ http://189.39.143.172:20183
97 | 2018-05-16 23:41:40,514 - Crawler √ http://186.225.112.62:20183
98 | 2018-05-16 23:41:40,514 - Crawler √ http://189.126.66.154:20183
99 | ...
100 | 2018-05-16 23:41:55,866 - Validator working...
101 | 2018-05-16 23:41:56,951 - Validator × https://114.113.126.82:80
102 | 2018-05-16 23:41:56,953 - Validator × https://114.199.125.242:80
103 | 2018-05-16 23:41:56,955 - Validator × https://114.228.75.17:6666
104 | 2018-05-16 23:41:56,957 - Validator × https://115.227.3.86:9000
105 | 2018-05-16 23:41:56,960 - Validator × https://115.229.88.191:9000
106 | 2018-05-16 23:41:56,964 - Validator × https://115.229.89.100:9000
107 | 2018-05-16 23:41:56,966 - Validator × https://103.18.180.194:8080
108 | 2018-05-16 23:41:56,967 - Validator × https://115.229.90.207:9000
109 | 2018-05-16 23:41:56,968 - Validator × https://103.216.144.17:8080
110 | 2018-05-16 23:41:56,969 - Validator × https://117.65.43.29:31588
111 | 2018-05-16 23:41:56,971 - Validator × https://103.248.232.135:8080
112 | 2018-05-16 23:41:56,972 - Validator × https://117.94.69.166:61234
113 | 2018-05-16 23:41:56,975 - Validator × https://103.26.56.109:8080
114 | ...
115 | ```
116 |
117 | **Run the server to start the web service**
118 |
119 | #### Sanic
120 | ```bash
121 | $ python server_sanic.py
122 | [2018-05-16 23:36:22 +0800] [108] [INFO] Goin' Fast @ http://localhost:3289
123 | [2018-05-16 23:36:22 +0800] [108] [INFO] Starting worker [108]
124 | ```
125 |
126 | #### Flask
127 | ```bash
128 | $ python server_flask.py
129 | * Serving Flask app "async_proxy_pool.webapi_flask" (lazy loading)
130 | * Environment: production
131 | WARNING: Do not use the development server in a production environment.
132 | Use a production WSGI server instead.
133 | * Debug mode: on
134 | * Restarting with stat
135 | * Debugger is active!
136 | * Debugger PIN: 322-954-449
137 | * Running on http://localhost:3289/ (Press CTRL+C to quit)
138 | ```
139 |
140 | ### Architecture
141 |
142 | The project's main modules are the crawler, storage, validator, scheduler, and web API modules.
143 |
144 | [Crawler module](https://github.com/chenjiandongx/async-proxy-pool/blob/master/async_proxy_pool/crawler.py): crawls proxy websites and stores the proxies it finds in the database; each proxy starts with a score of INIT_SCORE.
145 |
146 | [Storage module](https://github.com/chenjiandongx/async-proxy-pool/blob/master/async_proxy_pool/database.py): wraps the Redis operations the project needs and provides a Redis connection pool.
147 |
148 | [Validator module](https://github.com/chenjiandongx/async-proxy-pool/blob/master/async_proxy_pool/validator.py): checks whether a proxy is usable. A working proxy gains 1 point up to MAX_SCORE; a failing proxy loses 1 point, and once its score drops to 0 it is removed from the database.
149 |
150 | [Scheduler module](https://github.com/chenjiandongx/async-proxy-pool/blob/master/async_proxy_pool/scheduler.py): schedules the crawler and validator runs.
151 |
152 | [Web API module](https://github.com/chenjiandongx/async-proxy-pool/blob/master/async_proxy_pool/webapi_sanic.py): provides the **web API** using sanic (a Flask version lives in webapi_flask.py).
153 |
154 |
155 | `/`
156 |
157 | Welcome page
158 | ```bash
159 | $ http http://localhost:3289/
160 | HTTP/1.1 200 OK
161 | Connection: keep-alive
162 | Content-Length: 42
163 | Content-Type: application/json
164 | Keep-Alive: 5
165 |
166 | {
167 | "Welcome": "This is a proxy pool system."
168 | }
169 | ```
170 |
171 |
172 | **`/pop`**
173 |
174 | Returns a random proxy, trying up to three times:
175 | 1. Try to return a proxy with score MAX_SCORE, i.e. one that most recently passed validation.
176 | 2. Try to return a random proxy with a score between (MAX_SCORE - 3) and MAX_SCORE.
177 | 3. Try to return a proxy with a score between 0 and MAX_SCORE.
178 | ```bash
179 | $ http http://localhost:3289/pop
180 | HTTP/1.1 200 OK
181 | Connection: keep-alive
182 | Content-Length: 38
183 | Content-Type: application/json
184 | Keep-Alive: 5
185 |
186 | {
187 | "http": "http://46.48.105.235:8080"
188 | }
189 | ```
190 |
191 |
192 | **`/get/<count:int>`**
193 |
194 | Returns the specified number of proxies, sorted by score from high to low.
195 | ```bash
196 | $ http http://localhost:3289/get/10
197 | HTTP/1.1 200 OK
198 | Connection: keep-alive
199 | Content-Length: 393
200 | Content-Type: application/json
201 | Keep-Alive: 5
202 |
203 | [
204 | {
205 | "http": "http://94.177.214.215:3128"
206 | },
207 | {
208 | "http": "http://94.139.242.70:53281"
209 | },
210 | {
211 | "http": "http://94.130.92.40:3128"
212 | },
213 | {
214 | "http": "http://82.78.28.139:8080"
215 | },
216 | {
217 | "http": "http://82.222.153.227:9090"
218 | },
219 | {
220 | "http": "http://80.211.228.238:8888"
221 | },
222 | {
223 | "http": "http://80.211.180.224:3128"
224 | },
225 | {
226 | "http": "http://79.101.98.2:53281"
227 | },
228 | {
229 | "http": "http://66.96.233.182:8080"
230 | },
231 | {
232 | "http": "http://61.228.45.165:8080"
233 | }
234 | ]
235 | ```
236 |
237 |
238 | **`/count`**
239 |
240 | Returns the total number of proxies in the pool.
241 | ```bash
242 | $ http http://localhost:3289/count
243 | HTTP/1.1 200 OK
244 | Connection: keep-alive
245 | Content-Length: 15
246 | Content-Type: application/json
247 | Keep-Alive: 5
248 |
249 | {
250 | "count": "698"
251 | }
252 | ```
253 |
254 |
255 | **`/count/<score:int>`**
256 |
257 | Returns the number of proxies with the specified score.
258 | ```bash
259 | $ http http://localhost:3289/count/10
260 | HTTP/1.1 200 OK
261 | Connection: keep-alive
262 | Content-Length: 15
263 | Content-Type: application/json
264 | Keep-Alive: 5
265 |
266 | {
267 | "count": "143"
268 | }
269 |
270 | ```
271 |
272 |
273 | **`/clear/<score:int>`**
274 |
275 | Deletes all proxies with a score less than or equal to the given score.
276 | ```bash
277 | $ http http://localhost:3289/clear/0
278 | HTTP/1.1 200 OK
279 | Connection: keep-alive
280 | Content-Length: 22
281 | Content-Type: application/json
282 | Keep-Alive: 5
283 |
284 | {
285 | "Clear": "Successful"
286 | }
287 | ```
288 |
289 |
290 | ### Adding More Proxy Sources
291 |
292 | Add your own crawl method to crawler.py; a fuller example follows the skeleton below.
293 | ```python
294 | class Crawler:
295 |
296 |     @staticmethod
297 |     def run():
298 |         ...
299 |
300 |     # Add your own crawl method
301 |     @staticmethod
302 |     @collect_funcs  # decorator that registers the function so the crawler will run it
303 |     def crawl_xxx():
304 |         ...  # crawl logic goes here
305 | ```
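
For reference, here is a fuller sketch of such a method, modeled on the existing crawlers in crawler.py. It would live inside the `Crawler` class; the site URL and CSS selectors are hypothetical placeholders that you would adapt to the target site.

```python
# Inside class Crawler in async_proxy_pool/crawler.py.
# Relies on the module's existing imports: pyquery and `requests` from .utils.
@staticmethod
@collect_funcs
def crawl_example():
    """
    Hypothetical proxy site: http://proxy.example.com (placeholder URL)
    """
    html = requests("http://proxy.example.com/free")
    if html:
        doc = pyquery.PyQuery(html)
        # Adjust the selectors to the target site's table layout
        for row in doc("table tr").items():
            ip = row("td:nth-child(1)").text()
            port = row("td:nth-child(2)").text()
            if ip and port:
                yield "http://{}:{}".format(ip, port)
```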
306 |
307 | ### Choosing Another Web Framework
308 |
309 | This project uses Sanic, but you can switch to any other web framework you prefer. The web module is completely independent, so replacing the framework does not affect the rest of the project. The steps are listed below, followed by a minimal sketch.
310 |
311 | 1. Swap the framework in [webapi_sanic.py](https://github.com/chenjiandongx/async-proxy-pool/blob/master/async_proxy_pool/webapi_sanic.py) (or webapi_flask.py).
312 | 2. Adjust how the app is started in [server_sanic.py](https://github.com/chenjiandongx/async-proxy-pool/blob/master/server_sanic.py) (or server_flask.py).
313 |
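Because the web layer only talks to `RedisClient`, a replacement can stay very small. Below is a minimal sketch using aiohttp.web (aiohttp is already a project dependency); the module name `webapi_aiohttp.py` and the subset of endpoints shown are illustrative, not part of the project.

```python
# webapi_aiohttp.py — a hypothetical, minimal alternative to webapi_sanic.py
from aiohttp import web

from async_proxy_pool.database import RedisClient
from async_proxy_pool.config import SERVER_HOST, SERVER_PORT

redis_conn = RedisClient()


async def index(request):
    return web.json_response({"Welcome": "This is a proxy pool system."})


async def count_all_proxies(request):
    return web.json_response({"count": str(redis_conn.count_all_proxies())})


app = web.Application()
app.add_routes([web.get("/", index), web.get("/count", count_all_proxies)])

if __name__ == "__main__":
    # The equivalent of server_sanic.py / server_flask.py for this sketch
    web.run_app(app, host=SERVER_HOST, port=SERVER_PORT)
```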
314 |
315 | ### Sanic Benchmarks
316 |
317 | The server was load-tested with [wrk](https://github.com/wg/wrk): 30-second benchmarks using 12 threads and 400 concurrent HTTP connections.
318 |
319 | Testing http://127.0.0.1:3289/pop
320 | ```bash
321 | $ wrk -t12 -c400 -d30s http://127.0.0.1:3289/pop
322 | Running 30s test @ http://127.0.0.1:3289/pop
323 | 12 threads and 400 connections
324 | Thread Stats Avg Stdev Max +/- Stdev
325 | Latency 350.37ms 118.99ms 660.41ms 60.94%
326 | Req/Sec 98.18 35.94 277.00 79.43%
327 | 33694 requests in 30.10s, 4.77MB read
328 | Socket errors: connect 0, read 340, write 0, timeout 0
329 | Requests/sec: 1119.44
330 | Transfer/sec: 162.23KB
331 | ```
332 |
333 | Testing http://127.0.0.1:3289/get/10
334 | ```bash
335 | Running 30s test @ http://127.0.0.1:3289/get/10
336 | 12 threads and 400 connections
337 | Thread Stats Avg Stdev Max +/- Stdev
338 | Latency 254.90ms 95.43ms 615.14ms 63.51%
339 | Req/Sec 144.84 61.52 320.00 66.58%
340 | 46538 requests in 30.10s, 22.37MB read
341 | Socket errors: connect 0, read 28, write 0, timeout 0
342 | Requests/sec: 1546.20
343 | Transfer/sec: 761.02KB
344 | ```
345 |
346 | Performance is quite decent. Next, testing http://127.0.0.1:3289/, which involves no Redis operations:
347 | ```bash
348 | $ wrk -t12 -c400 -d30s http://127.0.0.1:3289/
349 | Running 30s test @ http://127.0.0.1:3289/
350 | 12 threads and 400 connections
351 | Thread Stats Avg Stdev Max +/- Stdev
352 | Latency 127.86ms 41.71ms 260.69ms 55.22%
353 | Req/Sec 258.56 92.25 520.00 68.90%
354 | 92766 requests in 30.10s, 13.45MB read
355 | Requests/sec: 3081.87
356 | Transfer/sec: 457.47KB
357 | ```
358 | ⭐️ **Requests/sec: 3081.87**
359 |
360 | With sanic access logging disabled, testing http://127.0.0.1:3289/:
361 | ```bash
362 | $ wrk -t12 -c400 -d30s http://127.0.0.1:3289/
363 | Running 30s test @ http://127.0.0.1:3289/
364 | 12 threads and 400 connections
365 | Thread Stats Avg Stdev Max +/- Stdev
366 | Latency 34.63ms 12.66ms 96.28ms 58.07%
367 | Req/Sec 0.96k 137.29 2.21k 73.29%
368 | 342764 requests in 30.10s, 49.69MB read
369 | Requests/sec: 11387.89
370 | Transfer/sec: 1.65MB
371 | ```
372 | ⭐️ **Requests/sec: 11387.89**
373 |
374 |
375 | ### Real-World Proxy Tests
376 |
377 | [test_proxy.py](https://github.com/chenjiandongx/async-proxy-pool/blob/master/test/test_proxy.py) measures how the proxies perform in practice.
378 |
379 | #### Running the test
380 |
381 | ```bash
382 | $ cd test
383 | $ python test_proxy.py
384 |
385 | # Environment variables that can be set
386 | TEST_COUNT = os.environ.get("TEST_COUNT") or 1000
387 | TEST_WEBSITE = os.environ.get("TEST_WEBSITE") or "https://httpbin.org/"
388 | TEST_PROXIES = os.environ.get("TEST_PROXIES") or "http://localhost:3289/get/20"
389 | ```
390 |
391 | #### Results
392 |
393 | **https://httpbin.org/**
394 | ```
395 | Proxy source: http://localhost:3289/get/20
396 | Test website: https://httpbin.org/
397 | Test count: 1000
398 | Successes: 1000
399 | Failures: 0
400 | Success rate: 1.0
401 | ```
402 |
403 | **https://taobao.com**
404 | ```
405 | Proxy source: http://localhost:3289/get/20
406 | Test website: https://taobao.com/
407 | Test count: 1000
408 | Successes: 984
409 | Failures: 16
410 | Success rate: 0.984
411 | ```
412 |
413 | **https://baidu.com**
414 | ```
415 | Proxy source: http://localhost:3289/get/20
416 | Test website: https://baidu.com
417 | Test count: 1000
418 | Successes: 975
419 | Failures: 25
420 | Success rate: 0.975
421 | ```
422 |
423 | **https://zhihu.com**
424 | ```
425 | Proxy source: http://localhost:3289/get/20
426 | Test website: https://zhihu.com
427 | Test count: 1000
428 | Successes: 1000
429 | Failures: 0
430 | Success rate: 1.0
431 | ```
432 |
433 | As you can see, the proxies perform really well, with a very high success rate. 😉
434 |
435 |
436 | ### Usage Example
437 |
438 | ```python
439 | import random
440 |
441 | import requests
442 |
443 | # Make sure the sanic server is running
444 | # Fetch several proxies and pick one at random
445 |
446 | try:
447 | proxies = requests.get("http://localhost:3289/get/20").json()
448 | req = requests.get("https://example.com", proxies=random.choice(proxies))
449 | except:
450 | raise
451 |
452 | # Or pop a single proxy
453 |
454 | try:
455 | proxy = requests.get("http://localhost:3289/pop").json()
456 | req = requests.get("https://example.com", proxies=proxy)
457 | except:
458 | raise
459 | ```
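
One caveat, since the pool serves `http` proxies: requests only applies a proxy when the URL scheme matches a key in the `proxies` dict, so a dict like `{"http": ...}` is not used for an `https://` URL. test/test_proxy.py handles this by mirroring the proxy onto the `https` key, and you can do the same:

```python
# Route https:// requests through the pool's http proxies as well,
# mirroring what test/test_proxy.py does.
proxy = requests.get("http://localhost:3289/pop").json()
if "http" in proxy:
    proxy["https"] = proxy["http"]
req = requests.get("https://example.com", proxies=proxy)
```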
460 |
461 |
462 | ### An aiohttp Gotcha
463 |
464 | The whole project is built on the async networking library aiohttp, whose documentation describes proxy support like this:
465 |
466 | 
467 |
468 | **Key point: aiohttp supports HTTP/HTTPS proxies**
469 |
470 | Except it does not actually support https proxies at all; its source code says:
471 |
472 | 
473 |
474 | **Key point: Only http proxies are supported**
475 |
476 | My feelings about this are complicated. 😲 Still, having only http proxies works well enough and has little practical impact; see the test results above.
477 |
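For completeness, this is roughly how the project hands a proxy to aiohttp (see validator.py and utils.py): the proxy is passed as a plain `proxy=` URL, and because of the limitation above it must be an `http://` address. The target URL and proxy address below are placeholders.

```python
import asyncio

import aiohttp


async def fetch_via_proxy(url, proxy):
    # aiohttp accepts the proxy as a plain URL string; only http:// proxies work,
    # which is why the pool stores and serves proxies in that form.
    async with aiohttp.ClientSession() as session:
        async with session.get(url, proxy=proxy, timeout=15) as resp:
            return resp.status


status = asyncio.get_event_loop().run_until_complete(
    fetch_via_proxy("http://httpbin.org/ip", "http://127.0.0.1:8080")
)
print(status)
```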
478 |
479 | ### Related Projects
480 |
481 | ✨🍰✨
482 |
483 | * [ProxyPool](https://github.com/WiseDoge/ProxyPool)
484 | * [proxy_pool](https://github.com/jhao104/proxy_pool)
485 |
486 | ### License
487 |
488 | MIT [©chenjiandongx](https://github.com/chenjiandongx)
489 |
--------------------------------------------------------------------------------
/async_proxy_pool/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenjiandongx/async-proxy-pool/b6869e39ab949700b90b84df58489c41f8d6e3e2/async_proxy_pool/__init__.py
--------------------------------------------------------------------------------
/async_proxy_pool/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 |
4 | # Request timeout (seconds)
5 | REQUEST_TIMEOUT = 15
6 | # Request delay (seconds)
7 | REQUEST_DELAY = 0
8 |
9 | # Redis host
10 | REDIS_HOST = "localhost"
11 | # Redis port
12 | REDIS_PORT = 6379
13 | # Redis password
14 | REDIS_PASSWORD = None
15 | # Redis key for the proxy sorted set
16 | REDIS_KEY = "proxies:ranking"
17 | # Maximum number of connections in the Redis connection pool
18 | REDIS_MAX_CONNECTION = 20
19 |
20 | # Maximum proxy score
21 | MAX_SCORE = 10
22 | # Minimum proxy score
23 | MIN_SCORE = 0
24 | # Initial proxy score
25 | INIT_SCORE = 9
26 |
27 | # server web host
28 | SERVER_HOST = "localhost"
29 | # server web port
30 | SERVER_PORT = 3289
31 | # Whether to enable access logging
32 | SERVER_ACCESS_LOG = True
33 |
34 | # Number of proxies validated per batch
35 | VALIDATOR_BATCH_COUNT = 256
36 | # Website used by the validator; change it to the site you actually want to crawl, e.g. Sina or Zhihu
37 | VALIDATOR_BASE_URL = "http://baidu.com"
38 | # Validator run cycle (minutes)
39 | VALIDATOR_RUN_CYCLE = 15
40 |
41 |
42 | # Crawler run cycle (minutes)
43 | CRAWLER_RUN_CYCLE = 30
44 | # Request headers
45 | HEADERS = {
46 | "X-Requested-With": "XMLHttpRequest",
47 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
48 | "(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
49 | }
50 |
--------------------------------------------------------------------------------
/async_proxy_pool/crawler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 |
4 | import re
5 |
6 | import pyquery
7 |
8 | from .utils import requests
9 | from .database import RedisClient
10 | from .logger import logger
11 |
12 |
13 | redis_conn = RedisClient()
14 | all_funcs = []
15 |
16 |
17 | def collect_funcs(func):
18 | """
19 |     Decorator that registers a crawl function.
20 | """
21 | all_funcs.append(func)
22 | return func
23 |
24 |
25 | class Crawler:
26 | @staticmethod
27 | def run():
28 | """
29 |         Start the crawler.
30 | """
31 | logger.info("Crawler working...")
32 | for func in all_funcs:
33 | for proxy in func():
34 | redis_conn.add_proxy(proxy)
35 | logger.info("Crawler √ {}".format(proxy))
36 | logger.info("Crawler resting...")
37 |
38 | @staticmethod
39 | @collect_funcs
40 | def crawl_66ip():
41 | """
42 |         66ip proxies: http://www.66ip.cn
43 | """
44 | url = (
45 | "http://www.66ip.cn/nmtq.php?getnum=100&isp=0"
46 | "&anonymoustype=0&area=0&proxytype={}&api=66ip"
47 | )
48 |         pattern = r"\d+\.\d+\.\d+\.\d+:\d+"
49 |
50 | items = [(0, "http://{}"), (1, "https://{}")]
51 | for item in items:
52 | proxy_type, host = item
53 | html = requests(url.format(proxy_type))
54 | if html:
55 | for proxy in re.findall(pattern, html):
56 | yield host.format(proxy)
57 |
58 | @staticmethod
59 | @collect_funcs
60 | def crawl_xici():
61 | """
62 |         Xici proxies: http://www.xicidaili.com
63 | """
64 | url = "http://www.xicidaili.com/{}"
65 |
66 | items = []
67 | for page in range(1, 21):
68 | items.append(("wt/{}".format(page), "http://{}:{}"))
69 | items.append(("wn/{}".format(page), "https://{}:{}"))
70 |
71 | for item in items:
72 | proxy_type, host = item
73 | html = requests(url.format(proxy_type))
74 | if html:
75 | doc = pyquery.PyQuery(html)
76 | for proxy in doc("table tr").items():
77 | ip = proxy("td:nth-child(2)").text()
78 | port = proxy("td:nth-child(3)").text()
79 | if ip and port:
80 | yield host.format(ip, port)
81 |
82 | @staticmethod
83 | @collect_funcs
84 | def crawl_kuaidaili():
85 | """
86 |         Kuaidaili proxies: https://www.kuaidaili.com
87 | """
88 | url = "https://www.kuaidaili.com/free/{}"
89 |
90 | items = ["inha/1/"]
91 | for proxy_type in items:
92 | html = requests(url.format(proxy_type))
93 | if html:
94 | doc = pyquery.PyQuery(html)
95 | for proxy in doc(".table-bordered tr").items():
96 | ip = proxy("[data-title=IP]").text()
97 | port = proxy("[data-title=PORT]").text()
98 | if ip and port:
99 | yield "http://{}:{}".format(ip, port)
100 |
101 | @staticmethod
102 | @collect_funcs
103 | def crawl_ip3366():
104 | """
105 |         Yun (ip3366) proxies: http://www.ip3366.net
106 | """
107 | url = "http://www.ip3366.net/?stype=1&page={}"
108 |
109 | items = [p for p in range(1, 8)]
110 | for page in items:
111 | html = requests(url.format(page))
112 | if html:
113 | doc = pyquery.PyQuery(html)
114 | for proxy in doc(".table-bordered tr").items():
115 | ip = proxy("td:nth-child(1)").text()
116 | port = proxy("td:nth-child(2)").text()
117 | schema = proxy("td:nth-child(4)").text()
118 | if ip and port and schema:
119 | yield "{}://{}:{}".format(schema.lower(), ip, port)
120 |
121 | @staticmethod
122 | @collect_funcs
123 | def crawl_data5u():
124 | """
125 |         Data5u proxies: http://www.data5u.com/
126 | """
127 | url = "http://www.data5u.com/"
128 |
129 | html = requests(url)
130 | if html:
131 | doc = pyquery.PyQuery(html)
132 | for index, item in enumerate(doc("li ul").items()):
133 | if index > 0:
134 | ip = item("span:nth-child(1)").text()
135 | port = item("span:nth-child(2)").text()
136 | schema = item("span:nth-child(4)").text()
137 | if ip and port and schema:
138 | yield "{}://{}:{}".format(schema, ip, port)
139 |
140 | @staticmethod
141 | @collect_funcs
142 | def crawl_iphai():
143 | """
144 |         IPhai proxies: http://www.iphai.com
145 | """
146 | url = "http://www.iphai.com/free/{}"
147 |
148 | items = ["ng", "np", "wg", "wp"]
149 | for proxy_type in items:
150 | html = requests(url.format(proxy_type))
151 | if html:
152 | doc = pyquery.PyQuery(html)
153 | for item in doc(".table-bordered tr").items():
154 | ip = item("td:nth-child(1)").text()
155 | port = item("td:nth-child(2)").text()
156 | schema = item("td:nth-child(4)").text().split(",")[0]
157 | if ip and port and schema:
158 | yield "{}://{}:{}".format(schema.lower(), ip, port)
159 |
160 | @staticmethod
161 | @collect_funcs
162 | def crawl_swei360():
163 | """
164 |         Swei360 proxies: http://www.swei360.com
165 | """
166 | url = "http://www.swei360.com/free/?stype={}"
167 |
168 | items = [p for p in range(1, 5)]
169 | for proxy_type in items:
170 | html = requests(url.format(proxy_type))
171 | if html:
172 | doc = pyquery.PyQuery(html)
173 | for item in doc(".table-bordered tr").items():
174 | ip = item("td:nth-child(1)").text()
175 | port = item("td:nth-child(2)").text()
176 | schema = item("td:nth-child(4)").text()
177 | if ip and port and schema:
178 | yield "{}://{}:{}".format(schema.lower(), ip, port)
179 |
180 |
181 | crawler = Crawler()
182 |
--------------------------------------------------------------------------------
/async_proxy_pool/database.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 |
4 | import random
5 |
6 | import redis
7 |
8 | from .config import (
9 | REDIS_KEY,
10 | REDIS_PORT,
11 | REDIS_PASSWORD,
12 | REDIS_HOST,
13 | REDIS_MAX_CONNECTION,
14 | MAX_SCORE,
15 | MIN_SCORE,
16 | INIT_SCORE,
17 | )
18 |
19 |
20 | class RedisClient:
21 | """
22 |     The proxy pool relies on Redis and uses its sorted-set data structure
23 |     (members can be ranked by score and must be unique).
24 | """
25 |
26 | def __init__(self, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD):
27 | conn_pool = redis.ConnectionPool(
28 | host=host,
29 | port=port,
30 | password=password,
31 | max_connections=REDIS_MAX_CONNECTION,
32 | )
33 | self.redis = redis.Redis(connection_pool=conn_pool)
34 |
35 | def add_proxy(self, proxy, score=INIT_SCORE):
36 | """
37 |         Add a proxy with an initial score of INIT_SCORE < MAX_SCORE. This ensures
38 |         that a proxy fetched right after the crawler runs, before the validator has run,
39 |         is not reported with MAX_SCORE while actually being unverified and possibly unusable.
40 |
41 |         :param proxy: proxy to add
42 |         :param score: initial score
43 | """
44 | if not self.redis.zscore(REDIS_KEY, proxy):
45 | self.redis.zadd(REDIS_KEY, proxy, score)
46 |
47 | def reduce_proxy_score(self, proxy):
48 | """
49 |         Validation failed: decrease the score by one.
50 |
51 |         :param proxy: the proxy that was validated
52 | """
53 | score = self.redis.zscore(REDIS_KEY, proxy)
54 | if score and score > MIN_SCORE:
55 | self.redis.zincrby(REDIS_KEY, proxy, -1)
56 | else:
57 | self.redis.zrem(REDIS_KEY, proxy)
58 |
59 | def increase_proxy_score(self, proxy):
60 | """
61 |         Validation passed: increase the score by one.
62 |
63 |         :param proxy: the proxy that was validated
64 | """
65 | score = self.redis.zscore(REDIS_KEY, proxy)
66 | if score and score < MAX_SCORE:
67 | self.redis.zincrby(REDIS_KEY, proxy, 1)
68 |
69 | def pop_proxy(self):
70 | """
71 |         Return a single proxy.
72 | """
73 |         # First try: proxies with the top score, i.e. the most recently validated ones
74 | first_chance = self.redis.zrangebyscore(REDIS_KEY, MAX_SCORE, MAX_SCORE)
75 | if first_chance:
76 | return random.choice(first_chance)
77 |
78 | else:
79 |             # Second try: any proxy scored between MAX_SCORE - 3 and MAX_SCORE
80 | second_chance = self.redis.zrangebyscore(
81 | REDIS_KEY, MAX_SCORE - 3, MAX_SCORE
82 | )
83 | if second_chance:
84 | return random.choice(second_chance)
85 |             # Last resort: any proxy at all
86 | else:
87 | last_chance = self.redis.zrangebyscore(REDIS_KEY, MIN_SCORE, MAX_SCORE)
88 | if last_chance:
89 | return random.choice(last_chance)
90 |
91 | def get_proxies(self, count=1):
92 | """
93 |         Return the specified number of proxies, sorted by score from high to low.
94 |
95 |         :param count: number of proxies
96 | """
97 | proxies = self.redis.zrevrange(REDIS_KEY, 0, count - 1)
98 | for proxy in proxies:
99 | yield proxy.decode("utf-8")
100 |
101 | def count_all_proxies(self):
102 | """
103 |         Return the total number of proxies.
104 | """
105 | return self.redis.zcard(REDIS_KEY)
106 |
107 | def count_score_proxies(self, score):
108 | """
109 |         Return the number of proxies with the specified score.
110 |
111 |         :param score: proxy score
112 | """
113 | if 0 <= score <= 10:
114 | proxies = self.redis.zrangebyscore(REDIS_KEY, score, score)
115 | return len(proxies)
116 | return -1
117 |
118 | def clear_proxies(self, score):
119 | """
120 |         Delete all proxies with a score less than or equal to the given score.
121 | """
122 | if 0 <= score <= 10:
123 | proxies = self.redis.zrangebyscore(REDIS_KEY, 0, score)
124 | for proxy in proxies:
125 | self.redis.zrem(REDIS_KEY, proxy)
126 | return True
127 | return False
128 |
129 | def all_proxies(self):
130 | """
131 |         Return all proxies.
132 | """
133 | return self.redis.zrangebyscore(REDIS_KEY, MIN_SCORE, MAX_SCORE)
134 |
--------------------------------------------------------------------------------
/async_proxy_pool/logger.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 |
4 | import logging
5 |
6 |
7 | def get_logger():
8 | """
9 |     Create the logger instance.
10 | """
11 | formatter = logging.Formatter("%(asctime)s - %(message)s")
12 | logger = logging.getLogger("monitor")
13 | logger.setLevel(logging.INFO)
14 |
15 | ch = logging.StreamHandler()
16 | ch.setFormatter(formatter)
17 | logger.addHandler(ch)
18 | return logger
19 |
20 |
21 | logger = get_logger()
22 |
--------------------------------------------------------------------------------
/async_proxy_pool/scheduler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 |
4 | import time
5 |
6 | import schedule
7 |
8 | from .config import CRAWLER_RUN_CYCLE, VALIDATOR_RUN_CYCLE
9 |
10 | from .crawler import crawler
11 | from .validator import validator
12 | from .logger import logger
13 |
14 |
15 | def run_schedule():
16 | """
17 |     Start the client (crawler and validator scheduling loop).
18 | """
19 |     # Start the crawler
20 | schedule.every(CRAWLER_RUN_CYCLE).minutes.do(crawler.run).run()
21 |     # Start the validator
22 | schedule.every(VALIDATOR_RUN_CYCLE).minutes.do(validator.run).run()
23 |
24 | while True:
25 | try:
26 | schedule.run_pending()
27 | time.sleep(1)
28 | except KeyboardInterrupt:
29 | logger.info("You have canceled all jobs")
30 | return
31 |
--------------------------------------------------------------------------------
/async_proxy_pool/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 |
4 | import asyncio
5 |
6 | import aiohttp
7 |
8 | from .config import HEADERS, REQUEST_TIMEOUT, REQUEST_DELAY
9 |
10 |
11 | LOOP = asyncio.get_event_loop()
12 |
13 |
14 | async def _get_page(url, sleep):
15 | """
16 |     Fetch and return the page content.
17 | """
18 | async with aiohttp.ClientSession() as session:
19 | try:
20 | await asyncio.sleep(sleep)
21 | async with session.get(
22 | url, headers=HEADERS, timeout=REQUEST_TIMEOUT
23 | ) as resp:
24 | return await resp.text()
25 | except:
26 | return ""
27 |
28 |
29 | def requests(url, sleep=REQUEST_DELAY):
30 | """
31 |     Request helper used to fetch page content.
32 |
33 |     :param url: request URL
34 |     :param sleep: delay time (seconds)
35 | """
36 | html = LOOP.run_until_complete(asyncio.gather(_get_page(url, sleep)))
37 | if html:
38 | return "".join(html)
39 |
--------------------------------------------------------------------------------
/async_proxy_pool/validator.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 |
4 | import os
5 | import asyncio
6 |
7 | import aiohttp
8 |
9 | from .config import VALIDATOR_BASE_URL, VALIDATOR_BATCH_COUNT, REQUEST_TIMEOUT
10 | from .logger import logger
11 | from .database import RedisClient
12 |
13 |
14 | VALIDATOR_BASE_URL = os.environ.get("VALIDATOR_BASE_URL") or VALIDATOR_BASE_URL
15 |
16 |
17 | class Validator:
18 | def __init__(self):
19 | self.redis = RedisClient()
20 |
21 | async def test_proxy(self, proxy):
22 | """
23 |         Test a proxy.
24 |
25 |         :param proxy: proxy to test
26 | """
27 | async with aiohttp.ClientSession() as session:
28 | try:
29 | if isinstance(proxy, bytes):
30 | proxy = proxy.decode("utf8")
31 | async with session.get(
32 | VALIDATOR_BASE_URL, proxy=proxy, timeout=REQUEST_TIMEOUT
33 | ) as resp:
34 | if resp.status == 200:
35 | self.redis.increase_proxy_score(proxy)
36 | logger.info("Validator √ {}".format(proxy))
37 | else:
38 | self.redis.reduce_proxy_score(proxy)
39 | logger.info("Validator × {}".format(proxy))
40 | except:
41 | self.redis.reduce_proxy_score(proxy)
42 | logger.info("Validator × {}".format(proxy))
43 |
44 | def run(self):
45 | """
46 |         Start the validator.
47 | """
48 | logger.info("Validator working...")
49 | logger.info("Validator website is {}".format(VALIDATOR_BASE_URL))
50 | proxies = self.redis.all_proxies()
51 | loop = asyncio.get_event_loop()
52 | for i in range(0, len(proxies), VALIDATOR_BATCH_COUNT):
53 | _proxies = proxies[i : i + VALIDATOR_BATCH_COUNT]
54 | tasks = [self.test_proxy(proxy) for proxy in _proxies]
55 | if tasks:
56 | loop.run_until_complete(asyncio.wait(tasks))
57 | logger.info("Validator resting...")
58 |
59 |
60 | validator = Validator()
61 |
--------------------------------------------------------------------------------
/async_proxy_pool/webapi_flask.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 |
4 | from flask import Flask, jsonify
5 | from async_proxy_pool.database import RedisClient
6 |
7 | app = Flask(__name__)
8 | redis_conn = RedisClient()
9 |
10 |
11 | @app.route("/")
12 | def index():
13 | return jsonify({"Welcome": "This is a proxy pool system."})
14 |
15 |
16 | @app.route("/pop")
17 | def pop_proxy():
18 | proxy = redis_conn.pop_proxy().decode("utf8")
19 | if proxy[:5] == "https":
20 | return jsonify({"https": proxy})
21 | else:
22 | return jsonify({"http": proxy})
23 |
24 |
25 | @app.route("/get/<int:count>")
26 | def get_proxy(count):
27 | res = []
28 | for proxy in redis_conn.get_proxies(count):
29 | if proxy[:5] == "https":
30 | res.append({"https": proxy})
31 | else:
32 | res.append({"http": proxy})
33 | return jsonify(res)
34 |
35 |
36 | @app.route("/count")
37 | def count_all_proxies():
38 | count = redis_conn.count_all_proxies()
39 | return jsonify({"count": str(count)})
40 |
41 |
42 | @app.route("/count/<int:score>")
43 | def count_score_proxies(score):
44 | count = redis_conn.count_score_proxies(score)
45 | return jsonify({"count": str(count)})
46 |
47 |
48 | @app.route("/clear/<int:score>")
49 | def clear_proxies(score):
50 | if redis_conn.clear_proxies(score):
51 | return jsonify({"Clear": "Successful"})
52 | return jsonify({"Clear": "Score should >= 0 and <= 10"})
53 |
--------------------------------------------------------------------------------
/async_proxy_pool/webapi_sanic.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 |
4 | from sanic import Sanic
5 | from sanic.response import json
6 |
7 | from async_proxy_pool.database import RedisClient
8 |
9 | app = Sanic()
10 | redis_conn = RedisClient()
11 |
12 |
13 | @app.route("/")
14 | async def index(request):
15 | return json({"Welcome": "This is a proxy pool system."})
16 |
17 |
18 | @app.route("/pop")
19 | async def pop_proxy(request):
20 | proxy = redis_conn.pop_proxy().decode("utf8")
21 | if proxy[:5] == "https":
22 | return json({"https": proxy})
23 | else:
24 | return json({"http": proxy})
25 |
26 |
27 | @app.route("/get/<count:int>")
28 | async def get_proxy(request, count):
29 | res = []
30 | for proxy in redis_conn.get_proxies(count):
31 | if proxy[:5] == "https":
32 | res.append({"https": proxy})
33 | else:
34 | res.append({"http": proxy})
35 | return json(res)
36 |
37 |
38 | @app.route("/count")
39 | async def count_all_proxies(request):
40 | count = redis_conn.count_all_proxies()
41 | return json({"count": str(count)})
42 |
43 |
44 | @app.route("/count/<score:int>")
45 | async def count_score_proxies(request, score):
46 | count = redis_conn.count_score_proxies(score)
47 | return json({"count": str(count)})
48 |
49 |
50 | @app.route("/clear/<score:int>")
51 | async def clear_proxies(request, score):
52 | if redis_conn.clear_proxies(score):
53 | return json({"Clear": "Successful"})
54 | return json({"Clear": "Score should >= 0 and <= 10"})
55 |
--------------------------------------------------------------------------------
/client.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 |
4 | from async_proxy_pool.scheduler import run_schedule
5 |
6 |
7 | run_schedule()
8 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | redis<=2.10.6
2 | aiohttp
3 | schedule
4 | pyquery
5 | requests
6 | flask
7 | sanic;sys_platform!='win32'
8 |
--------------------------------------------------------------------------------
/server_flask.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 |
4 | from async_proxy_pool.webapi_flask import app
5 | from async_proxy_pool.config import SERVER_HOST, SERVER_PORT, SERVER_ACCESS_LOG
6 |
7 | # Start the Flask server app
8 | app.run(host=SERVER_HOST, port=SERVER_PORT, debug=SERVER_ACCESS_LOG)
9 |
--------------------------------------------------------------------------------
/server_sanic.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 |
4 | from async_proxy_pool.webapi_sanic import app
5 | from async_proxy_pool.config import SERVER_HOST, SERVER_PORT, SERVER_ACCESS_LOG
6 |
7 | # Start the Sanic server app
8 | app.run(host=SERVER_HOST, port=SERVER_PORT, access_log=SERVER_ACCESS_LOG)
9 |
--------------------------------------------------------------------------------
/test/test_proxy.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 |
4 | import os
5 | import random
6 | from concurrent.futures import ThreadPoolExecutor
7 |
8 | import requests
9 |
10 |
11 | HEADERS = {
12 | "X-Requested-With": "XMLHttpRequest",
13 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
14 | "(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
15 | }
16 |
17 | SUCCESS = 0
18 | FAIL = 0
19 | TIMEOUT = 15
20 |
21 | TEST_COUNT = os.environ.get("TEST_COUNT") or 1000
22 | TEST_WEBSITE = os.environ.get("TEST_WEBSITE") or "https://zhihu.com"
23 | TEST_PROXIES = os.environ.get("TEST_PROXIES") or "http://localhost:3289/get/20"
24 |
25 |
26 | def get_proxies():
27 | _proxies = requests.get(TEST_PROXIES, timeout=TIMEOUT).json()
28 | for proxy in _proxies:
29 | if "http" in proxy.keys():
30 | proxy["https"] = proxy["http"]
31 | return _proxies
32 |
33 |
34 | def test_one_proxy(proxy):
35 | global SUCCESS, FAIL
36 | try:
37 | req = requests.get(
38 | TEST_WEBSITE, proxies=proxy, timeout=TIMEOUT, headers=HEADERS
39 | )
40 | if req.status_code == 200:
41 | SUCCESS += 1
42 | else:
43 | FAIL += 1
44 | except:
45 | FAIL += 1
46 |
47 |
48 | if __name__ == "__main__":
49 | proxies = get_proxies()
50 | tasks = [random.choice(proxies) for _ in range(int(TEST_COUNT))]
51 | with ThreadPoolExecutor(max_workers=64) as executor:
52 | executor.map(test_one_proxy, tasks)
53 |     print("Proxy source:", TEST_PROXIES)
54 |     print("Test website:", TEST_WEBSITE)
55 |     print("Test count:", TEST_COUNT)
56 |     print("Successes:", SUCCESS)
57 |     print("Failures:", FAIL)
58 |     print("Success rate:", SUCCESS / int(TEST_COUNT))
59 |
--------------------------------------------------------------------------------