├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── docs ├── 1.png ├── 2.png ├── 3.png ├── 4.png └── 5.png ├── examples └── example.py ├── proxypool ├── __init__.py ├── conf.py ├── dbop.py ├── errors.py ├── schedule │ ├── __init__.py │ ├── adder.py │ ├── schedule.py │ └── tester.py ├── spiders.py ├── utils.py └── webapi.py ├── requirements.txt ├── run.py ├── setup.py └── tests ├── test_dbop.py ├── test_spiders.py └── test_webapi.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.vscode 2 | *.pyc 3 | *.db 4 | .idea 5 | venv -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | 5 | services: 6 | - redis-server 7 | 8 | script: 9 | - python3 setup.py install 10 | - cd tests 11 | - python3 test_dbop.py 12 | - python3 test_spiders.py 13 | 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ProxyPool 2 | [](https://travis-ci.org/WiseDoge/ProxyPool) 3 | 跨语言高性能IP代理池,Python实现。 4 | 5 | 注意:请运行程序前先更新一下抓取代理的爬虫。 6 | 7 | ## 运行环境 8 | 9 | * Python 3.6 10 | 11 | (请务必保证Python的版本在3.6以上,否则异步检验无法使用。) 12 | 13 | * Redis 14 | 15 | Redis官网并没有提供Windows的安装版,Windows用户可以[点击此处](http://pan.baidu.com/s/1kVe6lc7)下载一个我自己编译的二进制版本(3.2版本2.7MB,VS 2015编译)。 16 | 17 | ## 安装 18 | 19 | ### ① 直接使用 20 | 21 | #### 安装依赖 22 | 23 | `$ pip install -r requirements.txt` 24 | 25 | *Windows用户如果无法安装lxml库请[点击这里](http://www.lfd.uci.edu/~gohlke/pythonlibs/)*。 26 | 27 | #### 打开代理池和API 28 | 29 | `$ cd proxypool` 30 | 31 | `$ python3 run.py ` 32 | 33 | ### ② 安装使用 34 | 35 | #### 安装 36 | 37 | `$ cd proxypool` 38 | 39 | `$ python setup.py install` 40 | 41 | #### 打开代理池和API 42 | 43 | `$ proxypool_run` 44 | 45 | 46 | ## 使用API获取代理 47 | 48 | 访问`http://127.0.0.1:5000/`进入主页,如果显示'Welcome',证明成功启动。 49 | 50 |  51 | 52 | 访问`http://127.0.0.1:5000/get`可以获取一个可用代理。 53 | 54 |  55 | 56 | 访问`http://127.0.0.1:5000/count`可以获取代理池中可用代理的数量。 57 | 58 |  59 | 60 | 也可以在程序代码中用相应的语言获取,例如: 61 | 62 | ``` 63 | import requests 64 | from bs4 import BeautifulSoup 65 | import lxml 66 | 67 | def get_proxy(): 68 | r = requests.get('http://127.0.0.1:5000/get') 69 | proxy = BeautifulSoup(r.text, "lxml").get_text() 70 | return proxy 71 | ``` 72 | ## 文件结构 73 |  74 |  75 | -------------------------------------------------------------------------------- /docs/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuslnlp/ProxyPool/0dabf9aae980a493f904362d70d23467ede66127/docs/1.png -------------------------------------------------------------------------------- /docs/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuslnlp/ProxyPool/0dabf9aae980a493f904362d70d23467ede66127/docs/2.png -------------------------------------------------------------------------------- /docs/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuslnlp/ProxyPool/0dabf9aae980a493f904362d70d23467ede66127/docs/3.png -------------------------------------------------------------------------------- /docs/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuslnlp/ProxyPool/0dabf9aae980a493f904362d70d23467ede66127/docs/4.png -------------------------------------------------------------------------------- /docs/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuslnlp/ProxyPool/0dabf9aae980a493f904362d70d23467ede66127/docs/5.png -------------------------------------------------------------------------------- /examples/example.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import lxml 6 | parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 7 | sys.path.insert(0, parentdir) 8 | 9 | # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 10 | # Before run this example, Please run 'run.py'.!! 11 | # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 12 | 13 | def get_proxy(): 14 | r = requests.get('http://127.0.0.1:5000/get') 15 | proxy = BeautifulSoup(r.text, "lxml").get_text() 16 | return proxy 17 | 18 | 19 | def crawl(url, proxy): 20 | proxies = {'http': proxy} 21 | r = requests.get(url, proxies=proxies) 22 | return r.text 23 | 24 | 25 | def main(): 26 | proxy = get_proxy() 27 | html = crawl('http://docs.jinkan.org/docs/flask/', proxy) 28 | print(html) 29 | 30 | if __name__ == '__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /proxypool/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'WiseDoge' 2 | __url__ = 'https://github.com/WiseDoge/ProxyPool' 3 | __version__ = 'V2.0.0' 4 | 5 | 6 | def main(): 7 | """运行""" 8 | from .schedule import ProxyCountCheckProcess, ExpireCheckProcess 9 | from .conf import VALID_CHECK_CYCLE, POOL_LEN_CHECK_CYCLE, \ 10 | POOL_UPPER_THRESHOLD, POOL_LOWER_THRESHOLD 11 | p1 = ProxyCountCheckProcess(POOL_LOWER_THRESHOLD, POOL_UPPER_THRESHOLD, POOL_LEN_CHECK_CYCLE) 12 | p2 = ExpireCheckProcess(VALID_CHECK_CYCLE) 13 | p1.start() 14 | p2.start() 15 | -------------------------------------------------------------------------------- /proxypool/conf.py: -------------------------------------------------------------------------------- 1 | # Redis Host 2 | HOST = 'localhost' 3 | # Redis PORT 4 | PORT = 6379 5 | 6 | # Redis 中存放代理池的 Set 名 7 | POOL_NAME = 'proxies' 8 | 9 | # Pool 的低阈值和高阈值 10 | POOL_LOWER_THRESHOLD = 10 11 | POOL_UPPER_THRESHOLD = 40 12 | 13 | # 两个调度进程的周期 14 | VALID_CHECK_CYCLE = 600 15 | POOL_LEN_CHECK_CYCLE = 20 16 | 17 | # headers 18 | HEADERS = { 19 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \ 20 | (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36', 21 | 'Accept-Encoding': 'gzip, deflate, sdch', 22 | 'Accept-Language': 'zh-CN,zh;q=0.8' 23 | } 24 | -------------------------------------------------------------------------------- /proxypool/dbop.py: -------------------------------------------------------------------------------- 1 | from .conf import HOST, PORT, POOL_NAME 2 | import redis 3 | 4 | redis_pool = redis.ConnectionPool(host=HOST, port=PORT, max_connections=20) 5 | 6 | 7 | class RedisOperator(object): 8 | """Redis 操作类""" 9 | 10 | def __init__(self): 11 | """初始化 Redis 连接""" 12 | # self._conn = redis.Po(HOST, PORT) 13 | self._conn = redis.Redis(connection_pool=redis_pool) 14 | 15 | def gets(self, total=1): 16 | """从池中返回给定数量的代理(取出但不删除),当 total > pool.size 17 | 时,将返回 pool.size 个代理。 18 | :param total: 返回的数量 19 | :return: proxies, size=total 20 | """ 21 | tmp = self._conn.srandmember(POOL_NAME, total) 22 | return [s.decode('utf-8') for s in tmp] 23 | 24 | def puts(self, proxies): 25 | """将一定量的代理压入 pool 中 26 | :param proxies: 27 | :return: 28 | """ 29 | self._conn.sadd(POOL_NAME, *proxies) 30 | 31 | def pop(self): 32 | """弹出一个代理(取出并删除) 33 | :return: proxy 34 | """ 35 | # if self.size == 0: 36 | # raise PoolEmptyError 37 | return self._conn.spop(POOL_NAME).decode('utf-8') 38 | 39 | @property 40 | def size(self): 41 | """返回 pool 的 size 42 | :return: pool.size 43 | """ 44 | return self._conn.scard(POOL_NAME) 45 | 46 | def _flush(self): 47 | """清空 Redis 中的全部内容 48 | :return: None 49 | """ 50 | self._conn.flushall() 51 | 52 | -------------------------------------------------------------------------------- /proxypool/errors.py: -------------------------------------------------------------------------------- 1 | class PoolEmptyError(Exception): 2 | """空池异常. 3 | """ 4 | 5 | def __init__(self): 6 | Exception.__init__(self) 7 | 8 | def __str__(self): 9 | return repr('The proxy source is exhausted.') 10 | 11 | 12 | class ResourceDepletionError(Exception): 13 | """资源枯竭异常,如果长时间抓取不到可用的 14 | 代理,则触发此异常. 15 | """ 16 | def __init__(self): 17 | Exception.__init__(self) 18 | 19 | def __str__(self): 20 | return repr('There are not more proxies in internet.') 21 | 22 | 23 | class RewriteSpiderError(Exception): 24 | """重写爬虫异常,当用户自己编写的爬虫类没有按照规定时, 25 | 将触发此异常. 26 | """ 27 | 28 | def __init__(self, cls_name): 29 | self.cls_name = cls_name 30 | Exception.__init__(self) 31 | 32 | def __str__(self): 33 | return repr(f'The spider `{self.cls_name}` does not has func `gets`.') 34 | -------------------------------------------------------------------------------- /proxypool/schedule/__init__.py: -------------------------------------------------------------------------------- 1 | from .adder import PoolAdder 2 | from .tester import UsabilityTester 3 | from .schedule import ExpireCheckProcess, ProxyCountCheckProcess 4 | -------------------------------------------------------------------------------- /proxypool/schedule/adder.py: -------------------------------------------------------------------------------- 1 | from ..conf import POOL_UPPER_THRESHOLD 2 | from ..dbop import RedisOperator 3 | from ..errors import ResourceDepletionError 4 | from ..spiders import SpiderMeta 5 | from .tester import UsabilityTester 6 | from concurrent import futures 7 | 8 | 9 | class PoolAdder(object): 10 | """添加器,负责启动爬虫补充代理""" 11 | 12 | def __init__(self): 13 | self._threshold = POOL_UPPER_THRESHOLD 14 | self._pool = RedisOperator() 15 | self._tester = UsabilityTester() 16 | 17 | def is_over(self): 18 | """ 判断池中代理的数量是否达到阈值 19 | :return: 达到阈值返回 True, 否则返回 False. 20 | """ 21 | return True if self._pool.size >= self._threshold else False 22 | 23 | def add_to_pool(self): 24 | """补充代理 25 | :return: None 26 | """ 27 | print('PoolAdder is working') 28 | spiders = [cls() for cls in SpiderMeta.spiders] 29 | flag = 0 30 | while not self.is_over(): 31 | flag += 1 32 | raw_proxies = [] 33 | with futures.ThreadPoolExecutor(max_workers=len(spiders)) as executor: 34 | future_to_down = {executor.submit(spiders[i].gets, 10): i for i in range(len(spiders))} 35 | for future in futures.as_completed(future_to_down): 36 | raw_proxies.extend(future.result()) 37 | print(raw_proxies) 38 | self._tester.set_raw_proxies(raw_proxies) 39 | self._tester.test() 40 | proxies = self._tester.usable_proxies 41 | if len(proxies) != 0: 42 | self._pool.puts(proxies) 43 | if self.is_over(): 44 | break 45 | if flag >= 20: 46 | raise ResourceDepletionError 47 | 48 | -------------------------------------------------------------------------------- /proxypool/schedule/schedule.py: -------------------------------------------------------------------------------- 1 | from . import UsabilityTester 2 | from . import PoolAdder 3 | from ..dbop import RedisOperator 4 | from multiprocessing import Process 5 | import time 6 | 7 | 8 | class ExpireCheckProcess(Process): 9 | """过期性检验进程,每隔一段时间从 Pool 中提取出 1/4 的数据,检验 10 | 其是否过期,没过期的重新入池,否则丢弃。 11 | """ 12 | def __init__(self, cycle): 13 | """ 14 | :param cycle: 扫描周期 15 | """ 16 | Process.__init__(self) 17 | self._cycle = cycle 18 | 19 | self._tester = UsabilityTester() 20 | # self.daemon = True 21 | 22 | 23 | def run(self): 24 | pool = RedisOperator() 25 | print('Expire Check Process is working..') 26 | while True: 27 | time.sleep(self._cycle) 28 | total = int(0.25 * pool.size) 29 | if total < 4: 30 | continue 31 | raw_proxies = [pool.pop() for _ in range(total)] 32 | self._tester.set_raw_proxies(raw_proxies) 33 | self._tester.test() 34 | proxies = self._tester.usable_proxies 35 | if len(proxies) != 0: 36 | pool.puts(proxies) 37 | 38 | 39 | class ProxyCountCheckProcess(Process): 40 | """proxy 数量监控进程,负责监控 Pool 中的代理数。当 Pool 中的 41 | 代理数量低于下阈值时,将触发添加器,启动爬虫补充代理,当代理的数量 42 | 打到上阈值时,添加器停止工作。 43 | """ 44 | def __init__(self, lower_threshold, upper_threshold, cycle): 45 | """ 46 | :param lower_threshold: 下阈值 47 | :param upper_threshold: 上阈值 48 | :param cycle: 扫描周期 49 | """ 50 | Process.__init__(self) 51 | self._lower_threshold = lower_threshold 52 | self._upper_threshold = upper_threshold 53 | self._cycle = cycle 54 | 55 | def run(self): 56 | adder = PoolAdder() 57 | pool = RedisOperator() 58 | while True: 59 | if pool.size < self._lower_threshold: 60 | adder.add_to_pool() 61 | time.sleep(self._cycle) 62 | 63 | -------------------------------------------------------------------------------- /proxypool/schedule/tester.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import aiohttp 3 | 4 | 5 | class UsabilityTester(object): 6 | """检验器,负责检验给定代理的可用性。""" 7 | test_api = 'https://www.baidu.com' 8 | 9 | def __init__(self): 10 | self.raw_proxies = None 11 | self._usable_proxies = None 12 | 13 | def set_raw_proxies(self, raw_proxies): 14 | self.raw_proxies = raw_proxies 15 | 16 | self._usable_proxies = [] 17 | 18 | async def test_single_proxy(self, proxy): 19 | async with aiohttp.ClientSession() as sess: 20 | try: 21 | real_proxy = 'http://' + proxy 22 | async with sess.get(self.test_api, proxy=real_proxy, timeout=15) as resp: 23 | self._usable_proxies.append(proxy) 24 | except Exception: 25 | print(Exception.__name__) 26 | 27 | def test(self): 28 | print('Usability tester is working...') 29 | loop = asyncio.get_event_loop() 30 | tasks = [self.test_single_proxy(proxy) for proxy in self.raw_proxies] 31 | loop.run_until_complete(asyncio.wait(tasks, loop=loop)) 32 | 33 | @property 34 | def usable_proxies(self): 35 | return self._usable_proxies 36 | -------------------------------------------------------------------------------- /proxypool/spiders.py: -------------------------------------------------------------------------------- 1 | """爬虫模块,包含`SpiderMeta`类和一些初始的 2 | 爬虫类,如果用户需要定义自己的爬虫类,必须要继承 3 | `SpiderMeta`类,并重写`gets`方法,`gets` 4 | 方法要求返回 ip:port 形式的代理。 5 | """ 6 | 7 | from .errors import RewriteSpiderError 8 | from .utils import get_page 9 | import time 10 | 11 | 12 | class SpiderMeta(type): 13 | spiders = [] 14 | 15 | def _init(cls): 16 | """子类的构造方法 17 | :return: None 18 | """ 19 | cls._counter = 1 20 | 21 | def _increment(cls, count): 22 | """子类用于增加计数器的方法 23 | :param count: 计数器增加量 24 | :return: None 25 | """ 26 | cls._counter += count 27 | 28 | def _flush(cls): 29 | """计数器刷新为 1 30 | :return: None 31 | """ 32 | cls._counter = 1 33 | 34 | def __new__(cls, *args, **kwargs): 35 | """构造子类 36 | :param args: args[0] = name, args[1] = bases, args[2] = attrs. 37 | :param kwargs: No. 38 | :return: 新类 39 | """ 40 | 41 | # 爬虫类必须要有 `get` 方法。 42 | if 'gets' not in args[2]: 43 | raise RewriteSpiderError(args[0]) 44 | 45 | # 给爬虫类添加一些默认方法 46 | args[2]['__init__'] = lambda self: SpiderMeta._init(self) 47 | args[2]['increment'] = lambda self, count: SpiderMeta._increment(self, count) 48 | args[2]['flush'] = lambda self: SpiderMeta._flush(self) 49 | 50 | # 将爬虫类加入到 `spiders` 列表中 51 | SpiderMeta.spiders.append(type.__new__(cls, *args, **kwargs)) 52 | return type.__new__(cls, *args, **kwargs) 53 | 54 | 55 | class Proxy360Spider(metaclass=SpiderMeta): 56 | start_url = 'http://www.proxy360.cn/default.aspx' 57 | 58 | def gets(self, page_total=None): 59 | ans = [] 60 | soup = get_page(self.start_url) 61 | for proxy in soup.find_all('div', {'class': 'proxylistitem'}): 62 | item = proxy.find_all('span', {"class": "tbBottomLine"}) 63 | ip = item[0].get_text().replace('\r\n', '').replace(' ', '') 64 | port = item[1].get_text().replace('\r\n', '').replace(' ', '') 65 | ans.append(':'.join([ip, port])) 66 | return ans 67 | 68 | 69 | class Daili666Spider(metaclass=SpiderMeta): 70 | start_url = 'http://www.66ip.cn/{}.html' 71 | 72 | def gets(self, page_total=3): 73 | urls = [self.start_url.format(i) 74 | for i in range(self._counter, self._counter + page_total)] 75 | self.increment(page_total) 76 | ans = [] 77 | for url in urls: 78 | soup = get_page(url) 79 | # 防止被 Ban, 加 1s 的间隔。 80 | time.sleep(1) 81 | proxy_list = soup.find('table', {"border": "2px"}) 82 | for proxy in proxy_list.find_all('tr')[1:]: 83 | ip = proxy.find_all('td')[0].get_text() 84 | port = proxy.find_all('td')[1].get_text() 85 | ans.append(':'.join([ip, port])) 86 | return ans 87 | 88 | 89 | class KuaidailiSpider(metaclass=SpiderMeta): 90 | start_url = 'http://www.kuaidaili.com/free/inha/{}/' 91 | 92 | def gets(self, page_total=2): 93 | urls = [self.start_url.format(i) 94 | for i in range(self._counter, self._counter + page_total)] 95 | self.increment(page_total) 96 | ans = [] 97 | for url in urls: 98 | soup = get_page(url) 99 | time.sleep(1) 100 | proxy_list = soup.find('table', 101 | {'class': 'table table-bordered table-striped'}) \ 102 | .find('tbody') 103 | for proxy in proxy_list.find_all('tr'): 104 | tmp = proxy.find_all('td') 105 | ip = tmp[0].get_text() 106 | port = tmp[1].get_text() 107 | ans.append(':'.join([ip, port])) 108 | return ans 109 | 110 | 111 | class XiciSpider(metaclass=SpiderMeta): 112 | start_url = 'http://www.xicidaili.com/nn/{}' 113 | 114 | def gets(self, page_total=2): 115 | urls = [self.start_url.format(i) 116 | for i in range(self._counter, self._counter + page_total)] 117 | self.increment(page_total) 118 | ans = [] 119 | for url in urls: 120 | soup = get_page(url) 121 | time.sleep(1) 122 | proxy_list = soup.find('table', {'id': 'ip_list'}) \ 123 | .find_all('tr')[1:] 124 | for proxy in proxy_list: 125 | tmp = proxy.find_all('td') 126 | ip = tmp[1].get_text() 127 | port = tmp[2].get_text() 128 | ans.append(':'.join([ip, port])) 129 | return ans 130 | 131 | # 请在此处继续扩展你的爬虫类。 -------------------------------------------------------------------------------- /proxypool/utils.py: -------------------------------------------------------------------------------- 1 | from .conf import HEADERS 2 | import requests 3 | from bs4 import BeautifulSoup 4 | import asyncio 5 | import aiohttp 6 | 7 | 8 | def get_page(url): 9 | """将网页解析为 BeautifulSoup 对象并返回 10 | :param url: web url 11 | :return: BeautifulSoup 12 | """ 13 | r = requests.get(url, headers=HEADERS) 14 | try: 15 | soup = BeautifulSoup(r.content.decode("utf-8"), 'lxml') 16 | except UnicodeDecodeError: 17 | soup = BeautifulSoup(r.text, 'lxml') 18 | return soup 19 | 20 | 21 | class Downloader(object): 22 | """ 23 | 一个异步下载器,可以用该类代替`get_page`函数。 24 | 由于下载速度过快,爬虫很容易被BAN。 25 | """ 26 | 27 | def __init__(self, urls): 28 | self.urls = urls 29 | self._htmls = [] 30 | 31 | async def download_single_page(self, url): 32 | async with aiohttp.ClientSession() as session: 33 | async with session.get(url) as resp: 34 | self._htmls.append(await resp.text()) 35 | 36 | def download(self): 37 | loop = asyncio.get_event_loop() 38 | tasks = [self.download_single_page(url) for url in self.urls] 39 | loop.run_until_complete(asyncio.wait(tasks)) 40 | 41 | @property 42 | def htmls(self): 43 | self.download() 44 | return self._htmls 45 | -------------------------------------------------------------------------------- /proxypool/webapi.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, g 2 | from .dbop import RedisOperator 3 | 4 | __all__ = ['app'] 5 | 6 | app = Flask(__name__) 7 | 8 | 9 | 10 | def get_conn(): 11 | """获取 Redis 连接 12 | :return: RedisOperator 13 | """ 14 | if not hasattr(g, 'redis_connect'): 15 | g.redis_connect = RedisOperator() 16 | return g.redis_connect 17 | 18 | 19 | @app.route('/') 20 | def index(): 21 | """Web API 主页的 HTML 代码 22 | :return: HTML 23 | """ 24 | return '