├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── getproxy ├── __init__.py ├── cli.py ├── data │ └── GeoLite2-Country.mmdb ├── getproxy.py ├── plugin │ ├── __init__.py │ ├── cnproxy.py │ ├── coolproxy.py │ ├── freeproxylist.py │ ├── ip181.py │ ├── proxylist.py │ ├── txt.py │ └── xicidaili.py └── utils.py ├── requirements_dev.txt ├── setup.cfg ├── setup.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | .idea/ 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | 56 | # Sphinx documentation 57 | docs/_build/ 58 | 59 | # PyBuilder 60 | target/ 61 | 62 | # pyenv python configuration file 63 | .python-version 64 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | matrix: 4 | include: 5 | - python: "2.7" 6 | env: TOXENV=py27 7 | - python: "3.4" 8 | env: TOXENV=py34 9 | - python: "3.5" 10 | env: TOXENV=py35 11 | - python: "3.6" 12 | env: TOXENV=py36 13 | 14 | install: pip install -U tox 15 | 16 | script: tox -e ${TOXENV} 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | BSD License 3 | 4 | Copyright (c) 2017, fate0 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, this 14 | list of conditions and the following disclaimer in the documentation and/or 15 | other materials provided with the distribution. 16 | 17 | * Neither the name of the copyright holder nor the names of its 18 | contributors may be used to endorse or promote products derived from this 19 | software without specific prior written permission. 
20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 24 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 25 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 26 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 28 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 29 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 30 | OF THE POSSIBILITY OF SUCH DAMAGE. 31 | 32 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | include getproxy/data/GeoLite2-Country.mmdb -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # getproxy 2 | 3 | [![Build Status](https://travis-ci.org/fate0/getproxy.svg?branch=master)](https://travis-ci.org/fate0/getproxy) 4 | [![Updates](https://pyup.io/repos/github/fate0/getproxy/shield.svg)](https://pyup.io/repos/github/fate0/getproxy/) 5 | [![PyPI](https://img.shields.io/pypi/v/getproxy.svg)](https://pypi.python.org/pypi/getproxy) 6 | [![PyPI](https://img.shields.io/pypi/pyversions/getproxy.svg)](https://pypi.python.org/pypi/getproxy) 7 | 8 | getproxy 是一个抓取发放代理网站,获取 http/https 代理的程序, 9 | 每 15 min 会更新数据至 [fate0/proxylist](https://github.com/fate0/proxylist) 10 | 11 | 12 | ## 1. 安装 13 | 14 | ``` 15 | pip install -U getproxy 16 | ``` 17 | 18 | ## 2. 
使用 19 | 20 | ### 帮助信息 21 | ``` 22 | ➜ ~ getproxy --help 23 | Usage: getproxy [OPTIONS] 24 | 25 | Options: 26 | --in-proxy TEXT Input proxy file 27 | --out-proxy TEXT Output proxy file 28 | --help Show this message and exit. 29 | ``` 30 | 31 | * `--in-proxy` 可选参数,待验证的 proxies 列表文件 32 | * `--out-proxy` 可选参数,输出已验证的 proxies 列表文件,如果为空,则直接输出到终端 33 | 34 | `--in-proxy` 文件格式和 `--out-proxy` 文件格式一致 35 | 36 | ### 使用例子 37 | 38 | ``` 39 | (test2.7) ➜ ~ getproxy 40 | INFO:getproxy.getproxy:[*] Init 41 | INFO:getproxy.getproxy:[*] Current Ip Address: 1.1.1.1 42 | INFO:getproxy.getproxy:[*] Load input proxies 43 | INFO:getproxy.getproxy:[*] Validate input proxies 44 | INFO:getproxy.getproxy:[*] Load plugins 45 | INFO:getproxy.getproxy:[*] Grab proxies 46 | INFO:getproxy.getproxy:[*] Validate web proxies 47 | INFO:getproxy.getproxy:[*] Check 6666 proxies, Got 666 valid proxies 48 | 49 | ... 50 | ``` 51 | 52 | 53 | ## 3. 输入/返回格式 54 | 55 | 每一行结果都是一个 json 字符串,格式如下: 56 | ```json 57 | { 58 | "type": "http", 59 | "host": "1.1.1.1", 60 | "port": 8080, 61 | "anonymity": "transparent", 62 | "country": "CN", 63 | "response_time": 3.14, 64 | "from": "txt" 65 | } 66 | ``` 67 | 68 | | 属性   | 类型   | 描述   | 可选值   | 69 | |------- |--------|-------- |----------| 70 | | type | str   | proxy 类型 | `http`, `https`| 71 | | host         | str   | proxy 地址 | | 72 | | port         | int   | 端口 | | 73 | | anonymity     | str   | 匿名性 | `transparent`, `anonymous`, `high_anonymous` | 74 | | country | str | proxy 国家 | | 75 | | response_time | float | 响应时间 | | 76 | | from         | str   | 来源 | | 77 | 78 | 79 | ## 4. 
Plugin 相关 80 | 81 | ### Plugin 代码格式 82 | 83 | ``` python 84 | 85 | class Proxy(object): 86 | def __init__(self): 87 | self.result = [] 88 | self.proxies = [] 89 | 90 | def start(self): 91 | pass 92 | ``` 93 | 94 | ### Plugin 返回结果 95 | 96 | ``` 97 | { 98 | "host": "1.1.1.1", 99 | "port": 8080, 100 | "from": "plugin name" 101 | } 102 | ``` 103 | 104 | ### Plugin 小提示 105 | 106 | * 不要在 plugin 内使用多线程、gevent 等方法 107 | * 如果目标网站存在分页,请在获取每页内容之后,自行添加 delay 108 | * 如果目标网站存在分页,请在获取每页结果之后,及时放入 `self.result` 中 109 | * 如果被目标网站 ban 了,可以利用已经验证的 proxies (也就是 `self.proxies`) 110 | 111 | ## 5. 第三方程序调用 112 | 113 | 直接运行 `getproxy` 等同于执行下面程序: 114 | 115 | ``` python 116 | #! /usr/bin/env python 117 | # -*- coding: utf-8 -*- 118 | 119 | from getproxy import GetProxy 120 | 121 | g = GetProxy() 122 | 123 | # 1. 初始化,必须步骤 124 | g.init() 125 | 126 | # 2. 加载 input proxies 列表 127 | g.load_input_proxies() 128 | 129 | # 3. 验证 input proxies 列表 130 | g.validate_input_proxies() 131 | 132 | # 4. 加载 plugin 133 | g.load_plugins() 134 | 135 | # 5. 抓取 web proxies 列表 136 | g.grab_web_proxies() 137 | 138 | # 6. 验证 web proxies 列表 139 | g.validate_web_proxies() 140 | 141 | # 7. 保存当前所有已验证的 proxies 列表 142 | g.save_proxies() 143 | 144 | ``` 145 | 146 | 如果只想验证 proxies 列表,并不需要抓取别人的 proxies,则可以: 147 | 148 | ``` python 149 | g.init() 150 | g.load_input_proxies() 151 | g.validate_input_proxies() 152 | 153 | print(g.valid_proxies) 154 | ``` 155 | 156 | 如果当前程序不需要输出 proxies 列表,而是在程序中直接使用,则可以: 157 | 158 | ``` python 159 | g.init() 160 | g.load_plugins() 161 | g.grab_web_proxies() 162 | g.validate_web_proxies() 163 | 164 | print(g.valid_proxies) 165 | ``` 166 | 167 | ## 6. Q & A 168 | 169 | * 为什么不使用 xxx 数据库? 170 | 171 | 数据量并不大,就算用文本格式全读进内存,也占用不了多少内存,就算真的需要存储至数据库,自己再多写几行代码就搞定。 172 | 另外使用文本格式还有另外一个好处是可以创建这个项目 [fate0/proxylist](https://github.com/fate0/proxylist) 173 | 174 | * 和 xxx 有什么区别? 175 | 176 | 简单、方便、快捷,除了 Python 环境,其他都不用设置。 177 | 178 | * 报错啦,怎么办? 179 | 180 | 仔细看看错误信息,是不是一些 plugin 报错误,而且错误都是和网络相关的? 
# Command-line entry point.
#
# Deliberately documented with comments rather than a docstring, so the text
# that ``--help`` prints stays exactly as it is today.
@click.command()
@click.option('--in-proxy', help='Input proxy file')
@click.option('--out-proxy', help='Output proxy file')
def main(in_proxy, out_proxy):
    # Both options are optional; they are handed to GetProxy unchanged and
    # the whole pipeline is run through its start() method.
    GetProxy(in_proxy, out_proxy).start()
class GetProxy(object):
    """Collect proxies from plugins and/or an input file, validate them and export them.

    The usual sequence is the one implemented by :meth:`start`:
    ``init`` -> ``load_input_proxies`` -> ``validate_input_proxies`` ->
    ``load_plugins`` -> ``grab_web_proxies`` -> ``validate_web_proxies`` ->
    ``save_proxies``.  Each step can also be called on its own.
    """

    # Directory of this module; the GeoIP database and the plugin package are
    # looked up relative to it.
    base_dir = os.path.dirname(os.path.realpath(__file__))

    def __init__(self, input_proxies_file=None, output_proxies_file=None):
        """Create an idle instance.

        :param input_proxies_file: optional path of a file with one JSON
            proxy description per line, to be re-validated.
        :param output_proxies_file: optional path the validated proxies are
            written to; when empty they are written to stdout.
        """
        self.pool = gevent.pool.Pool(500)
        self.plugins = []
        self.web_proxies = []
        self.valid_proxies = []
        self.input_proxies = []
        self.input_proxies_file = input_proxies_file
        self.output_proxies_file = output_proxies_file
        # "scheme://host:port" -> True for every combination already probed.
        self.proxies_hash = {}
        self.origin_ip = None
        self.geoip_reader = None

    def _collect_result(self):
        """Append the ``result`` list of every loaded plugin to ``web_proxies``."""
        for plugin in self.plugins:
            if not plugin.result:
                continue

            self.web_proxies.extend(plugin.result)

    def _validate_proxy(self, proxy, scheme='http'):
        """Probe one proxy through httpbin.org and describe it if it works.

        :param proxy: dict with at least ``host`` and ``port``; ``country``
            and ``from`` are used when present.
        :param scheme: ``'http'`` or ``'https'``.
        :return: a dict describing the working proxy, or ``None`` when this
            scheme/host/port was already probed, the request failed, or the
            echoed ``cur`` argument does not match what was sent.
        """
        country = proxy.get('country')
        host = proxy.get('host')
        port = proxy.get('port')

        proxy_hash = '%s://%s:%s' % (scheme, host, port)
        if proxy_hash in self.proxies_hash:
            return

        self.proxies_hash[proxy_hash] = True
        request_proxies = {
            scheme: "%s:%s" % (host, port)
        }

        request_begin = time.time()
        try:
            response_json = requests.get(
                "%s://httpbin.org/get?show_env=1&cur=%s" % (scheme, request_begin),
                proxies=request_proxies,
                timeout=5
            ).json()
        except:  # noqa: E722
            # NOTE(review): left broad on purpose.  This runs inside pool
            # tasks that _validate_proxy_list kills after its timeout, and
            # narrowing the clause would change what a killed task hands to
            # the result callback -- confirm against gevent before tightening.
            return

        request_end = time.time()

        # The timestamp we sent must be echoed back unchanged; otherwise the
        # answer did not come from the expected endpoint.
        if str(request_begin) != response_json.get('args', {}).get('cur', ''):
            return

        anonymity = self._check_proxy_anonymity(response_json)
        export_address = self._check_export_address(response_json)

        try:
            # A country supplied with the proxy wins over the GeoIP lookup.
            country = country or self.geoip_reader.country(host).country.iso_code
        except Exception:
            country = "unknown"

        return {
            "type": scheme,
            "host": host,
            "export_address": export_address,
            "port": port,
            "anonymity": anonymity,
            "country": country,
            "response_time": round(request_end - request_begin, 2),
            "from": proxy.get('from')
        }

    def _validate_proxy_list(self, proxies, timeout=300):
        """Validate every proxy for both http and https using the pool.

        :param proxies: iterable of proxy dicts.
        :param timeout: seconds to wait for the pool before the remaining
            tasks are killed.
        :return: list of the descriptions returned by :meth:`_validate_proxy`.
        """
        valid_proxies = []

        def save_result(p):
            if p:
                valid_proxies.append(p)

        for proxy in proxies:
            self.pool.apply_async(self._validate_proxy, args=(proxy, 'http'), callback=save_result)
            self.pool.apply_async(self._validate_proxy, args=(proxy, 'https'), callback=save_result)

        self.pool.join(timeout=timeout)
        self.pool.kill()

        return valid_proxies

    def _check_proxy_anonymity(self, response):
        """Classify a proxy from the httpbin response it relayed.

        :return: ``'transparent'`` when our own address shows up anywhere in
            the response, ``'anonymous'`` when a ``Via`` header other than
            ``"1.1 vegur"`` is present, ``'high_anonymous'`` otherwise.
        """
        via = response.get('headers', {}).get('Via', '')

        # Only test for our address when it is actually known: an empty
        # string is contained in every string (everything would be reported
        # as transparent) and ``None`` would raise TypeError.
        if self.origin_ip and self.origin_ip in json.dumps(response):
            return 'transparent'
        elif via and via != "1.1 vegur":
            return 'anonymous'
        else:
            return 'high_anonymous'

    def _check_export_address(self, response):
        """Return the addresses listed in ``origin``, minus our own address."""
        origin = response.get('origin', '').split(', ')
        if self.origin_ip in origin:
            origin.remove(self.origin_ip)
        return origin

    def _request_force_stop(self, signum, _):
        """Second-signal handler: save what has been validated and exit."""
        logger.warning("[-] Cold shut down")
        self.save_proxies()

        raise SystemExit()

    def _request_stop(self, signum, _):
        """First-signal handler: arm the cold-shutdown handler and warn the user."""
        logger.debug("Got signal %s" % signal_name(signum))

        signal.signal(signal.SIGINT, self._request_force_stop)
        signal.signal(signal.SIGTERM, self._request_force_stop)

        logger.warning("[-] Press Ctrl+C again for a cold shutdown.")

    def init(self):
        """Install signal handlers, learn our public IP and open the GeoIP database."""
        logger.info("[*] Init")
        signal.signal(signal.SIGINT, self._request_stop)
        signal.signal(signal.SIGTERM, self._request_stop)

        rp = requests.get('http://httpbin.org/get')
        self.origin_ip = rp.json().get('origin', '')
        logger.info("[*] Current Ip Address: %s" % self.origin_ip)

        self.geoip_reader = geoip2.database.Reader(os.path.join(self.base_dir, 'data/GeoLite2-Country.mmdb'))

    def load_input_proxies(self):
        """Read ``input_proxies_file`` (one JSON object per line) into ``input_proxies``.

        Lines that are not valid JSON, or that do not decode to an object,
        are skipped.  Nothing happens when no file was given or it does not
        exist.
        """
        logger.info("[*] Load input proxies")

        if self.input_proxies_file and os.path.exists(self.input_proxies_file):
            with open(self.input_proxies_file) as fd:
                for line in fd:
                    try:
                        entry = json.loads(line)
                    except ValueError:
                        # Malformed line: ignore it and keep reading.
                        continue

                    # Validation calls ``.get`` on every entry, so anything
                    # that is not a JSON object cannot be a proxy.
                    if isinstance(entry, dict):
                        self.input_proxies.append(entry)

    def validate_input_proxies(self):
        """Validate the loaded input proxies and store the survivors in ``valid_proxies``."""
        logger.info("[*] Validate input proxies")
        self.valid_proxies = self._validate_proxy_list(self.input_proxies)
        logger.info("[*] Check %s input proxies, Got %s valid input proxies" %
                    (len(self.proxies_hash), len(self.valid_proxies)))

    def load_plugins(self):
        """Instantiate the ``Proxy`` class of every module in the ``plugin`` directory.

        Each instance receives a deep copy of the proxies validated so far
        in its ``proxies`` attribute.  Plugins that fail to load are logged
        and skipped.
        """
        logger.info("[*] Load plugins")
        for plugin_name in os.listdir(os.path.join(self.base_dir, 'plugin')):
            if os.path.splitext(plugin_name)[1] != '.py' or plugin_name == '__init__.py':
                continue

            try:
                cls = load_object("getproxy.plugin.%s.Proxy" % os.path.splitext(plugin_name)[0])
            except Exception as e:
                logger.info("[-] Load Plugin %s error: %s" % (plugin_name, str(e)))
                continue

            inst = cls()
            inst.proxies = copy.deepcopy(self.valid_proxies)
            self.plugins.append(inst)

    def grab_web_proxies(self):
        """Run every plugin in the pool (8 minutes at most) and collect their results."""
        logger.info("[*] Grab proxies")

        for plugin in self.plugins:
            self.pool.spawn(plugin.start)

        self.pool.join(timeout=8 * 60)
        self.pool.kill()

        self._collect_result()

    def validate_web_proxies(self):
        """Validate the proxies gathered by the plugins and add them to ``valid_proxies``."""
        logger.info("[*] Validate web proxies")
        input_proxies_len = len(self.proxies_hash)

        valid_proxies = self._validate_proxy_list(self.web_proxies)
        self.valid_proxies.extend(valid_proxies)

        output_proxies_len = len(self.proxies_hash) - input_proxies_len

        logger.info("[*] Check %s output proxies, Got %s valid output proxies" %
                    (output_proxies_len, len(valid_proxies)))
        logger.info("[*] Check %s proxies, Got %s valid proxies" %
                    (len(self.proxies_hash), len(self.valid_proxies)))

    def save_proxies(self):
        """Write every validated proxy as one JSON line.

        The target is ``output_proxies_file`` when set, stdout otherwise.
        """
        if self.output_proxies_file:
            outfile = open(self.output_proxies_file, 'w')
        else:
            outfile = sys.stdout

        try:
            for item in self.valid_proxies:
                outfile.write("%s\n" % json.dumps(item))

            outfile.flush()
        finally:
            # Close a file we opened even if serialising or writing failed;
            # stdout is never closed.
            if outfile is not sys.stdout:
                outfile.close()

    def start(self):
        """Run the complete pipeline, from initialisation to saving the result."""
        self.init()
        self.load_input_proxies()
        self.validate_input_proxies()
        self.load_plugins()
        self.grab_web_proxies()
        self.validate_web_proxies()
        self.save_proxies()
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import unicode_literals, absolute_import, division, print_function 5 | 6 | import re 7 | import time 8 | import logging 9 | import retrying 10 | import requests 11 | 12 | 13 | """ 14 | http://www.cnproxy.com/proxy1.html 15 | """ 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | class Proxy(object): 22 | def __init__(self): 23 | self.url = 'http://www.cnproxy.com/proxy{page}.html' # 从1-10 24 | self.re_ip_pattern = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})document.write\(":"([+\w]{2,10})\)') 26 | 27 | self.port_dict = { 28 | 'v': '3', 29 | 'm': '4', 30 | 'a': '2', 31 | 'l': '9', 32 | 'q': '0', 33 | 'b': '5', 34 | 'i': '7', 35 | 'w': '6', 36 | 'r': '8', 37 | 'c': '1', 38 | '+': '' 39 | } 40 | 41 | self.cur_proxy = None 42 | self.proxies = [] 43 | self.result = [] 44 | 45 | @retrying.retry(stop_max_attempt_number=3) 46 | def extract_proxy(self, page_num): 47 | try: 48 | rp = requests.get(self.url.format(page=page_num), proxies=self.cur_proxy, timeout=10) 49 | 50 | re_ip_result = self.re_ip_pattern.findall(rp.text) 51 | re_port_encode_result = self.re_port_encode_pattern.findall(rp.text) 52 | 53 | if not len(re_ip_result) or not len(re_port_encode_result): 54 | raise Exception("empty") 55 | 56 | if len(re_ip_result) != len(re_port_encode_result): 57 | raise Exception("len(host) != len(port)") 58 | 59 | except Exception as e: 60 | logger.error("[-] Request page {page} error: {error}".format(page=page_num, error=str(e))) 61 | while self.proxies: 62 | new_proxy = self.proxies.pop(0) 63 | self.cur_proxy = {new_proxy['type']: "%s:%s" % (new_proxy['host'], new_proxy['port'])} 64 | raise e 65 | else: 66 | return [] 67 | 68 | re_port_result = [] 69 | for each_result in re_port_encode_result: 70 | each_result = each_result.strip() 71 | re_port_result.append(int(''.join(list(map(lambda x: self.port_dict.get(x, ''), each_result))))) 72 | 73 | result_dict = dict(zip(re_ip_result, 
class Proxy(object):
    """Plugin for cool-proxy.net, where hosts are published rot13 + base64 encoded."""

    def __init__(self):
        # Listing URL; ``{page}`` is replaced by the page number.
        self.url = 'http://www.cool-proxy.net/proxies/http_proxy_list/sort:score/direction:desc/page:{page}'
        # Captures the encoded host inside Base64.decode(str_rot13("...")).
        self.re_ip_encode_pattern = re.compile(r'Base64.decode\(str_rot13\("([^"]+)"\)\)', re.I)
        self.re_port_pattern = re.compile(r'(\d{1,5})', re.I)

        # Proxy mapping passed to requests; None means a direct connection.
        self.cur_proxy = None
        # Already validated proxies, assigned by the caller after construction.
        self.proxies = []
        # Proxies found so far.
        self.result = []

    @retrying.retry(stop_max_attempt_number=3)
    def extract_proxy(self, page_num):
        """Fetch one listing page and return the proxies found on it.

        On any failure the error is logged.  If a validated proxy is left in
        ``self.proxies`` it becomes the proxy for the next request and the
        error is re-raised (so the retry decorator tries again); otherwise
        an empty list is returned.
        """
        try:
            page = requests.get(self.url.format(page=page_num), proxies=self.cur_proxy, timeout=10)

            encoded_hosts = self.re_ip_encode_pattern.findall(page.text)
            ports = self.re_port_pattern.findall(page.text)

            if not (encoded_hosts and ports):
                raise Exception("empty")

            if len(encoded_hosts) != len(ports):
                raise Exception("len(host) != len(port)")

        except Exception as e:
            logger.error("[-] Request page {page} error: {error}".format(page=page_num, error=str(e)))
            if not self.proxies:
                return []

            # Switch to the next validated proxy before failing this attempt.
            fallback = self.proxies.pop(0)
            self.cur_proxy = {fallback['type']: "%s:%s" % (fallback['host'], fallback['port'])}
            raise e

        hosts = []
        for token in encoded_hosts:
            unrotated = codecs.decode(token.strip(), 'rot-13')
            raw_host = base64.b64decode(unrotated).strip()
            hosts.append(raw_host.decode('utf-8'))

        # Going through a dict keeps a single port per host.
        port_by_host = dict(zip(hosts, ports))
        return [{"host": host, "port": int(port), "from": "coolproxy"} for host, port in port_by_host.items()]

    def start(self):
        """Walk the listing pages, stopping at the first page without results."""
        for page in range(1, 10):
            found = self.extract_proxy(page)
            # Pause before moving on to the next page.
            time.sleep(3)

            if found:
                self.result.extend(found)
            else:
                return
class Proxy(object):
    """Plugin for free-proxy-list.net (a single listing page)."""

    def __init__(self):
        self.url = 'https://free-proxy-list.net/'
        # Two groups: the dotted host and the port.
        self.re_ip_port_pattern = re.compile(
            r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(\d{1,5})", re.I)

        # Proxy mapping passed to requests; None means a direct connection.
        self.cur_proxy = None
        # Already validated proxies, assigned by the caller after construction.
        self.proxies = []
        # Proxies found so far.
        self.result = []

    @retrying.retry(stop_max_attempt_number=3)
    def extract_proxy(self, page_num):
        """Fetch the listing and return the proxies found on it.

        ``page_num`` is formatted into the URL (which has no placeholder)
        and used in the error message.  On any failure the error is logged;
        if a validated proxy is left in ``self.proxies`` it becomes the proxy
        for the next request and the error is re-raised, otherwise an empty
        list is returned.
        """
        try:
            page = requests.get(self.url.format(page=page_num), proxies=self.cur_proxy, timeout=10)
            pairs = self.re_ip_port_pattern.findall(page.text)

            if not pairs:
                raise Exception("empty")

        except Exception as e:
            logger.error("[-] Request page {page} error: {error}".format(page=page_num, error=str(e)))
            if not self.proxies:
                return []

            # Switch to the next validated proxy before failing this attempt.
            fallback = self.proxies.pop(0)
            self.cur_proxy = {fallback['type']: "%s:%s" % (fallback['host'], fallback['port'])}
            raise e

        found = []
        for host, port in pairs:
            found.append({"host": host, "port": int(port), "from": "freeproxylist"})
        return found

    def start(self):
        """Fetch the page once and keep whatever was found."""
        found = self.extract_proxy(0)
        if found:
            self.result.extend(found)
class Proxy(object):
    """Plugin for ip181.com (a single listing page)."""

    def __init__(self):
        self.url = 'http://www.ip181.com/'
        # Two groups: the dotted host and the port, each preceded by whitespace.
        self.re_ip_port_pattern = re.compile(
            r"\s+(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s+(\d{1,5})", re.I)

        # Proxy mapping passed to requests; None means a direct connection.
        self.cur_proxy = None
        # Already validated proxies, assigned by the caller after construction.
        self.proxies = []
        # Proxies found so far.
        self.result = []

    @retrying.retry(stop_max_attempt_number=3)
    def extract_proxy(self, page_num):
        """Fetch the listing and return the proxies found on it.

        ``page_num`` only appears in the error message.  On any failure the
        error is logged; if a validated proxy is left in ``self.proxies`` it
        becomes the proxy for the next request and the error is re-raised,
        otherwise an empty list is returned.
        """
        try:
            page = requests.get(self.url, proxies=self.cur_proxy, timeout=10)
            pairs = self.re_ip_port_pattern.findall(page.text)

            if not pairs:
                raise Exception("empty")

        except Exception as e:
            logger.error("[-] Request page {page} error: {error}".format(page=page_num, error=str(e)))
            if not self.proxies:
                return []

            # Switch to the next validated proxy before failing this attempt.
            fallback = self.proxies.pop(0)
            self.cur_proxy = {fallback['type']: "%s:%s" % (fallback['host'], fallback['port'])}
            raise e

        found = []
        for host, port in pairs:
            found.append({"host": host, "port": int(port), "from": "ip181"})
        return found

    def start(self):
        """Fetch the page once and keep whatever was found."""
        found = self.extract_proxy(0)
        if found:
            self.result.extend(found)
class Proxy(object):
    """Plugin for proxy-list.org, where entries are base64-encoded ``host:port`` strings."""

    def __init__(self):
        # Listing URL; the original note says pages run from 1 to 10.
        # NOTE(review): start() iterates range(1, 10), i.e. pages 1-9 --
        # confirm whether leaving out page 10 is intended.
        self.url = 'http://proxy-list.org/english/index.php?p={page}'
        # Captures the encoded payload passed to Proxy('...').
        self.re_ip_port_encode_pattern = re.compile(r"Proxy\(\'([\w\d=+]+)\'\)", re.I)

        # Proxy mapping passed to requests; None means a direct connection.
        self.cur_proxy = None
        # Already validated proxies, assigned by the caller after construction.
        self.proxies = []
        # Proxies found so far.
        self.result = []

    @retrying.retry(stop_max_attempt_number=3)
    def extract_proxy(self, page_num):
        """Fetch one listing page and return the proxies found on it.

        On any failure while fetching or matching, the error is logged; if a
        validated proxy is left in ``self.proxies`` it becomes the proxy for
        the next request and the error is re-raised, otherwise an empty list
        is returned.  Decoding happens after that handler, so a malformed
        entry raises directly.
        """
        try:
            page = requests.get(self.url.format(page=page_num), proxies=self.cur_proxy, timeout=10)
            encoded_entries = self.re_ip_port_encode_pattern.findall(page.text)

            if not encoded_entries:
                raise Exception("empty")

        except Exception as e:
            logger.error("[-] Request page {page} error: {error}".format(page=page_num, error=str(e)))
            if not self.proxies:
                return []

            # Switch to the next validated proxy before failing this attempt.
            fallback = self.proxies.pop(0)
            self.cur_proxy = {fallback['type']: "%s:%s" % (fallback['host'], fallback['port'])}
            raise e

        found = []
        for token in encoded_entries:
            address = base64.b64decode(token).decode('utf-8')
            host, port = address.split(':')
            found.append({"host": host, "port": int(port), "from": "proxylist"})

        return found

    def start(self):
        """Walk the listing pages, stopping at the first page without results."""
        for page in range(1, 10):
            found = self.extract_proxy(page)
            # Pause before moving on to the next page.
            time.sleep(3)

            if found:
                self.result.extend(found)
            else:
                return
class Proxy(object):
    """Plugin that reads plain-text ``host:port`` proxy lists from several URLs."""

    def __init__(self):
        # Two groups: the dotted host and the port after the colon.
        self.re_ip_port_pattern = re.compile(r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):([\d]{1,5})")

        # Proxy mapping passed to requests; None means a direct connection.
        self.cur_proxy = None
        # Already validated proxies, assigned by the caller after construction.
        self.proxies = []
        # Proxies found so far.
        self.result = []

        self.txt_list = [
            # 'http://api.xicidaili.com/free2016.txt',
            'http://static.fatezero.org/tmp/proxy.txt',
            'http://pubproxy.com/api/proxy?limit=20&format=txt&type=http',
            'http://comp0.ru/downloads/proxylist.txt',
            'http://www.proxylists.net/http_highanon.txt',
            'http://www.proxylists.net/http.txt',
            'http://ab57.ru/downloads/proxylist.txt',
            'https://www.rmccurdy.com/scripts/proxy/good.txt'
        ]

    @retrying.retry(stop_max_attempt_number=3)
    def extract_proxy(self, url):
        """Download one list and return the proxies found in it.

        On any failure the error is logged; if a validated proxy is left in
        ``self.proxies`` it becomes the proxy for the next request and the
        error is re-raised (so the retry decorator tries again), otherwise
        an empty list is returned.
        """
        try:
            headers = {
                'User-Agent': "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) "
                              "Chrome/21.0.1180.89 Safari/537.1'"
            }
            rp = requests.get(url, proxies=self.cur_proxy, headers=headers, timeout=10)

            re_ip_port_result = self.re_ip_port_pattern.findall(rp.text)

            if not re_ip_port_result:
                raise Exception("empty")

        except Exception as e:
            logger.error("[-] Request url {url} error: {error}".format(url=url, error=str(e)))
            if not self.proxies:
                return []

            # Switch to the next validated proxy before failing this attempt.
            new_proxy = self.proxies.pop(0)
            self.cur_proxy = {new_proxy['type']: "%s:%s" % (new_proxy['host'], new_proxy['port'])}
            raise e

        return [{'host': host, 'port': int(port), 'from': 'txt'} for host, port in re_ip_port_result]

    def start(self):
        """Try every URL in ``txt_list``; a source that fails is skipped."""
        for url in self.txt_list:
            try:
                page_result = self.extract_proxy(url)
            except Exception:
                # One unreachable list must not stop the others.  Only
                # ordinary errors are swallowed, so interpreter-exit and
                # interrupt requests still end the loop.
                continue

            if not page_result:
                continue

            self.result.extend(page_result)
# Reverse lookup table: signal number -> "SIGxxx" name.  Names containing an
# underscore (e.g. SIG_DFL, SIG_IGN) are handlers, not signals, and are skipped.
_signames = {
    getattr(signal, attr): attr
    for attr in dir(signal)
    if attr.startswith('SIG') and '_' not in attr
}


def signal_name(signum):
    """Return the symbolic name of signal number ``signum``.

    ``signal.Signals`` is used on Python 3.5 and newer, the module-level
    lookup table otherwise.  Unknown numbers yield ``'SIG_UNKNOWN'``.
    """
    use_signals_enum = sys.version_info[:2] >= (3, 5)

    try:
        if use_signals_enum:
            return signal.Signals(signum).name
        return _signames[signum]
    except (KeyError, ValueError):
        # KeyError: missing from the table; ValueError: rejected by Signals().
        return 'SIG_UNKNOWN'


def load_object(path):
    """Import and return the object designated by the dotted ``path``.

    Everything before the last dot is imported as a module and the remainder
    is looked up as an attribute of that module.

    :raises ValueError: if ``path`` contains no dot.
    :raises NameError: if the module has no attribute with that name.
    """
    module_path, dot, attr_name = path.rpartition('.')
    if not dot:
        raise ValueError("Error loading object '%s': not a full path" % path)

    loaded_module = import_module(module_path)

    try:
        return getattr(loaded_module, attr_name)
    except AttributeError:
        raise NameError("Module '%s' doesn't define any object named '%s'" % (module_path, attr_name))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Packaging script for getproxy."""

import ast
import io
import re

from setuptools import setup, find_packages

# Decode the README explicitly as UTF-8 (as is done for __init__.py below) so
# the build does not depend on the locale's default encoding.
with io.open('README.md', encoding='utf-8') as readme_file:
    readme = readme_file.read()


_version_re = re.compile(r'__version__\s+=\s+(.*)')


with open('getproxy/__init__.py', 'rb') as f:
    _version_match = _version_re.search(f.read().decode('utf-8'))

# Fail with a clear message instead of an AttributeError on None.
if _version_match is None:
    raise RuntimeError("Unable to find __version__ in getproxy/__init__.py")

# The captured text is the Python literal assigned to __version__.
version = str(ast.literal_eval(_version_match.group(1)))


requirements = [
    'click==6.0',
    'requests==2.13.0',
    'gevent==1.2.1',
    'geoip2==2.5.0',
    'maxminddb==1.5.2',
    'retrying==1.3.3'
]

setup(
    name='getproxy',
    version=version,
    description="get proxy",
    long_description=readme,
    # README.md is Markdown; declare it so the description is rendered as such.
    long_description_content_type='text/markdown',
    author="fate0",
    author_email='fate0@fatezero.org',
    url='https://github.com/fate0/getproxy',
    packages=find_packages(),
    package_dir={},
    entry_points={
        'console_scripts': [
            'getproxy=getproxy.cli:main'
        ]
    },
    include_package_data=True,
    install_requires=requirements,
    license="BSD license",
    zip_safe=False,
    keywords='getproxy',
    classifiers=[
        'Development Status :: 2 - Pre-Alpha',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: BSD License',
        'Natural Language :: English',
        "Programming Language :: Python :: 2",
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
    ],
)