├── .gitignore
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── README.md
├── getproxy
├── __init__.py
├── cli.py
├── data
│ └── GeoLite2-Country.mmdb
├── getproxy.py
├── plugin
│ ├── __init__.py
│ ├── cnproxy.py
│ ├── coolproxy.py
│ ├── freeproxylist.py
│ ├── ip181.py
│ ├── proxylist.py
│ ├── txt.py
│ └── xicidaili.py
└── utils.py
├── requirements_dev.txt
├── setup.cfg
├── setup.py
└── tox.ini
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | .idea/
3 | __pycache__/
4 | *.py[cod]
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | env/
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *,cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 |
56 | # Sphinx documentation
57 | docs/_build/
58 |
59 | # PyBuilder
60 | target/
61 |
62 | # pyenv python configuration file
63 | .python-version
64 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 |
3 | matrix:
4 | include:
5 | - python: "2.7"
6 | env: TOXENV=py27
7 | - python: "3.4"
8 | env: TOXENV=py34
9 | - python: "3.5"
10 | env: TOXENV=py35
11 | - python: "3.6"
12 | env: TOXENV=py36
13 |
14 | install: pip install -U tox
15 |
16 | script: tox -e ${TOXENV}
17 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | BSD License
3 |
4 | Copyright (c) 2017, fate0
5 | All rights reserved.
6 |
7 | Redistribution and use in source and binary forms, with or without modification,
8 | are permitted provided that the following conditions are met:
9 |
10 | * Redistributions of source code must retain the above copyright notice, this
11 | list of conditions and the following disclaimer.
12 |
13 | * Redistributions in binary form must reproduce the above copyright notice, this
14 | list of conditions and the following disclaimer in the documentation and/or
15 | other materials provided with the distribution.
16 |
17 | * Neither the name of the copyright holder nor the names of its
18 | contributors may be used to endorse or promote products derived from this
19 | software without specific prior written permission.
20 |
21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
25 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
28 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
29 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
30 | OF THE POSSIBILITY OF SUCH DAMAGE.
31 |
32 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include README.md
3 | include getproxy/data/GeoLite2-Country.mmdb
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # getproxy
2 |
[![Build Status](https://travis-ci.org/fate0/getproxy.svg?branch=master)](https://travis-ci.org/fate0/getproxy)
[![Updates](https://pyup.io/repos/github/fate0/getproxy/shield.svg)](https://pyup.io/repos/github/fate0/getproxy/)
[![PyPI](https://img.shields.io/pypi/v/getproxy.svg)](https://pypi.python.org/pypi/getproxy)
[![PyPI](https://img.shields.io/pypi/pyversions/getproxy.svg)](https://pypi.python.org/pypi/getproxy)
7 |
8 | getproxy 是一个抓取发放代理网站,获取 http/https 代理的程序,
9 | 每 15 min 会更新数据至 [fate0/proxylist](https://github.com/fate0/proxylist)
10 |
11 |
12 | ## 1. 安装
13 |
14 | ```
15 | pip install -U getproxy
16 | ```
17 |
18 | ## 2. 使用
19 |
20 | ### 帮助信息
21 | ```
22 | ➜ ~ getproxy --help
23 | Usage: getproxy [OPTIONS]
24 |
25 | Options:
26 | --in-proxy TEXT Input proxy file
27 | --out-proxy TEXT Output proxy file
28 | --help Show this message and exit.
29 | ```
30 |
31 | * `--in-proxy` 可选参数,待验证的 proxies 列表文件
32 | * `--out-proxy` 可选参数,输出已验证的 proxies 列表文件,如果为空,则直接输出到终端
33 |
34 | `--in-proxy` 文件格式和 `--out-proxy` 文件格式一致
35 |
36 | ### 使用例子
37 |
38 | ```
39 | (test2.7) ➜ ~ getproxy
40 | INFO:getproxy.getproxy:[*] Init
41 | INFO:getproxy.getproxy:[*] Current Ip Address: 1.1.1.1
42 | INFO:getproxy.getproxy:[*] Load input proxies
43 | INFO:getproxy.getproxy:[*] Validate input proxies
44 | INFO:getproxy.getproxy:[*] Load plugins
45 | INFO:getproxy.getproxy:[*] Grab proxies
46 | INFO:getproxy.getproxy:[*] Validate web proxies
47 | INFO:getproxy.getproxy:[*] Check 6666 proxies, Got 666 valid proxies
48 |
49 | ...
50 | ```
51 |
52 |
53 | ## 3. 输入/返回格式
54 |
55 | 每一行结果都是一个 json 字符串,格式如下:
56 | ```json
57 | {
58 | "type": "http",
59 | "host": "1.1.1.1",
60 | "port": 8080,
61 | "anonymity": "transparent",
62 | "country": "CN",
63 | "response_time": 3.14,
64 | "from": "txt"
65 | }
66 | ```
67 |
68 | | 属性 | 类型 | 描述 | 可选值 |
69 | |------- |--------|-------- |----------|
70 | | type | str | proxy 类型 | `http`, `https`|
71 | | host | str | proxy 地址 | |
72 | | port | int | 端口 | |
73 | | anonymity | str | 匿名性 | `transparent`, `anonymous`, `high_anonymous` |
74 | | country | str | proxy 国家 | |
75 | | response_time | float | 响应时间 | |
76 | | from | str | 来源 | |
77 |
78 |
79 | ## 4. Plugin 相关
80 |
81 | ### Plugin 代码格式
82 |
83 | ``` python
84 |
85 | class Proxy(object):
86 | def __init__(self):
87 | self.result = []
88 | self.proxies = []
89 |
90 | def start(self):
91 | pass
92 | ```
93 |
94 | ### Plugin 返回结果
95 |
96 | ```
97 | {
98 | "host": "1.1.1.1",
99 | "port": 8080,
100 | "from": "plugin name"
101 | }
102 | ```
103 |
104 | ### Plugin 小提示
105 |
106 | * 不要在 plugin 内使用多线程、gevent 等方法
107 | * 如果目标网站存在分页,请在获取每页内容之后,自行添加 delay
108 | * 如果目标网站存在分页,请在获取每页结果之后,及时放入 `self.result` 中
109 | * 如果被目标网站 ban 了,可以利用已经验证的 proxies (也就是 `self.proxies`)
110 |
111 | ## 5. 第三方程序调用
112 |
113 | 直接运行 `getproxy` 等同于执行下面程序:
114 |
115 | ``` python
116 | #! /usr/bin/env python
117 | # -*- coding: utf-8 -*-
118 |
119 | from getproxy import GetProxy
120 |
121 | g = GetProxy()
122 |
123 | # 1. 初始化,必须步骤
124 | g.init()
125 |
126 | # 2. 加载 input proxies 列表
127 | g.load_input_proxies()
128 |
129 | # 3. 验证 input proxies 列表
130 | g.validate_input_proxies()
131 |
132 | # 4. 加载 plugin
133 | g.load_plugins()
134 |
135 | # 5. 抓取 web proxies 列表
136 | g.grab_web_proxies()
137 |
138 | # 6. 验证 web proxies 列表
139 | g.validate_web_proxies()
140 |
141 | # 7. 保存当前所有已验证的 proxies 列表
142 | g.save_proxies()
143 |
144 | ```
145 |
146 | 如果只想验证 proxies 列表,并不需要抓取别人的 proxies,则可以:
147 |
148 | ``` python
149 | g.init()
150 | g.load_input_proxies()
151 | g.validate_input_proxies()
152 |
153 | print(g.valid_proxies)
154 | ```
155 |
156 | 如果当前程序不需要输出 proxies 列表,而是在程序中直接使用,则可以:
157 |
158 | ``` python
159 | g.init()
160 | g.load_plugins()
161 | g.grab_web_proxies()
162 | g.validate_web_proxies()
163 |
164 | print(g.valid_proxies)
165 | ```
166 |
167 | ## 6. Q & A
168 |
169 | * 为什么不使用 xxx 数据库?
170 |
171 | 数据量并不大,就算用文本格式全读进内存,也占用不了多少内存,就算真的需要存储至数据库,自己再多写几行代码就搞定。
172 | 另外使用文本格式还有另外一个好处是可以创建这个项目 [fate0/proxylist](https://github.com/fate0/proxylist)
173 |
174 | * 和 xxx 有什么区别?
175 |
176 | 简单、方便、快捷,除了 Python 环境,其他都不用设置。
177 |
178 | * 报错啦,怎么办?
179 |
180 | 仔细看看错误信息,是不是一些 plugin 报错误,而且错误都是和网络相关的?
181 | 如果是的话,可能这些 plugin 访问的网站由于众所周知的原因被 block 了。
182 | 如果不是,赶紧提 Issue。
183 |
184 | * 还继续添加新的 plugin 吗?
185 |
186 | 主要看这个项目 [fate0/proxylist](https://github.com/fate0/proxylist) 中的 `proxy.list` 数量,
187 | 如果 `proxy.list` 行数接近 5000 个,那就不再继续添加新的 plugin,防止 travis 15min 内不结束。
--------------------------------------------------------------------------------
/getproxy/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Package metadata for getproxy.
__author__ = """fate0"""
__email__ = 'fate0@fatezero.org'
__version__ = '0.2.3'


# Re-export the pipeline class so callers can write `from getproxy import GetProxy`.
from .getproxy import GetProxy
--------------------------------------------------------------------------------
/getproxy/cli.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import click
4 | from getproxy import GetProxy
5 |
6 |
@click.command()
@click.option('--in-proxy', help='Input proxy file')
@click.option('--out-proxy', help='Output proxy file')
def main(in_proxy, out_proxy):
    # Command-line entry point: run the whole grab/validate/save pipeline.
    # Both options are optional; when --out-proxy is omitted the valid
    # proxies are written to stdout (see GetProxy.save_proxies).
    g = GetProxy(in_proxy, out_proxy)
    g.start()


if __name__ == "__main__":
    main()
17 |
--------------------------------------------------------------------------------
/getproxy/data/GeoLite2-Country.mmdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fate0/getproxy/26563b22bbb5a80b55db7edee11447568ecb220d/getproxy/data/GeoLite2-Country.mmdb
--------------------------------------------------------------------------------
/getproxy/getproxy.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from __future__ import unicode_literals, absolute_import, division, print_function
5 |
6 | import gevent.monkey
7 | gevent.monkey.patch_all()
8 |
9 | import os
10 | import sys
11 | import json
12 | import time
13 | import copy
14 | import signal
15 | import logging
16 |
17 | import requests
18 | import gevent.pool
19 | import geoip2.database
20 |
21 | from .utils import signal_name, load_object
22 |
23 |
24 | logger = logging.getLogger(__name__)
25 | logging.basicConfig(level=logging.INFO)
26 |
27 |
class GetProxy(object):
    """Grab proxies from plugin sites, validate them against httpbin.org,
    and write the valid ones to a file or stdout.

    Typical usage is ``GetProxy(in_file, out_file).start()``.  The individual
    pipeline steps (``init``, ``load_input_proxies``, ``validate_input_proxies``,
    ``load_plugins``, ``grab_web_proxies``, ``validate_web_proxies``,
    ``save_proxies``) are public so callers can run only part of the pipeline.
    """

    # Directory containing this module; used to locate the plugin package
    # and the bundled GeoLite2 database.
    base_dir = os.path.dirname(os.path.realpath(__file__))

    def __init__(self, input_proxies_file=None, output_proxies_file=None):
        self.pool = gevent.pool.Pool(500)       # concurrency cap for grabbing/validating
        self.plugins = []                        # loaded plugin instances
        self.web_proxies = []                    # raw proxies collected from plugins
        self.valid_proxies = []                  # proxies that passed validation
        self.input_proxies = []                  # proxies loaded from input_proxies_file
        self.input_proxies_file = input_proxies_file
        self.output_proxies_file = output_proxies_file
        self.proxies_hash = {}                   # dedup map: "scheme://host:port" -> True
        self.origin_ip = None                    # our own public IP, filled in by init()
        self.geoip_reader = None                 # geoip2 Reader, filled in by init()

    def _collect_result(self):
        # Gather whatever each plugin managed to scrape into web_proxies.
        for plugin in self.plugins:
            if not plugin.result:
                continue

            self.web_proxies.extend(plugin.result)

    def _validate_proxy(self, proxy, scheme='http'):
        """Probe a single proxy by requesting httpbin.org through it.

        Returns a result dict on success, or None for duplicates and failures.
        """
        country = proxy.get('country')
        host = proxy.get('host')
        port = proxy.get('port')

        proxy_hash = '%s://%s:%s' % (scheme, host, port)
        if proxy_hash in self.proxies_hash:
            return

        self.proxies_hash[proxy_hash] = True
        request_proxies = {
            scheme: "%s:%s" % (host, port)
        }

        request_begin = time.time()
        try:
            response_json = requests.get(
                "%s://httpbin.org/get?show_env=1&cur=%s" % (scheme, request_begin),
                proxies=request_proxies,
                timeout=5
            ).json()
        except BaseException:
            # Deliberately broad: besides network/JSON errors this must also
            # swallow gevent.GreenletExit (a BaseException) raised when the
            # pool is killed on timeout, so the apply_async callback receives
            # None instead of an exception object.
            return

        request_end = time.time()

        # The echoed "cur" argument must round-trip exactly; otherwise the
        # proxy tampered with the response or served a cached/injected page.
        if str(request_begin) != response_json.get('args', {}).get('cur', ''):
            return

        anonymity = self._check_proxy_anonymity(response_json)
        export_address = self._check_export_address(response_json)

        try:
            country = country or self.geoip_reader.country(host).country.iso_code
        except Exception:
            country = "unknown"

        return {
            "type": scheme,
            "host": host,
            "export_address": export_address,
            "port": port,
            "anonymity": anonymity,
            "country": country,
            "response_time": round(request_end - request_begin, 2),
            "from": proxy.get('from')
        }

    def _validate_proxy_list(self, proxies, timeout=300):
        """Validate proxies concurrently (http and https) for at most
        *timeout* seconds; stragglers are killed and discarded."""
        valid_proxies = []

        def save_result(p):
            # p is None for duplicates/failures (see _validate_proxy).
            if p:
                valid_proxies.append(p)

        for proxy in proxies:
            self.pool.apply_async(self._validate_proxy, args=(proxy, 'http'), callback=save_result)
            self.pool.apply_async(self._validate_proxy, args=(proxy, 'https'), callback=save_result)

        self.pool.join(timeout=timeout)
        self.pool.kill()  # abort anything still running; the pool stays reusable

        return valid_proxies

    def _check_proxy_anonymity(self, response):
        """Classify the proxy from the httpbin echo: our own IP visible
        anywhere means transparent; a non-Heroku Via header means anonymous;
        otherwise high_anonymous."""
        via = response.get('headers', {}).get('Via', '')

        if self.origin_ip in json.dumps(response):
            return 'transparent'
        elif via and via != "1.1 vegur":
            return 'anonymous'
        else:
            return 'high_anonymous'

    def _check_export_address(self, response):
        """Return the IPs httpbin saw, minus our own origin IP."""
        origin = response.get('origin', '').split(', ')
        if self.origin_ip in origin:
            origin.remove(self.origin_ip)
        return origin

    def _request_force_stop(self, signum, _):
        # Second Ctrl+C: dump whatever we have and exit immediately.
        logger.warning("[-] Cold shut down")
        self.save_proxies()

        raise SystemExit()

    def _request_stop(self, signum, _):
        # First Ctrl+C: arm the cold-shutdown handler and keep running.
        logger.debug("Got signal %s" % signal_name(signum))

        signal.signal(signal.SIGINT, self._request_force_stop)
        signal.signal(signal.SIGTERM, self._request_force_stop)

        logger.warning("[-] Press Ctrl+C again for a cold shutdown.")

    def init(self):
        """Install signal handlers, discover our public IP and open the
        GeoIP database.  Must be called before any other pipeline step."""
        logger.info("[*] Init")
        signal.signal(signal.SIGINT, self._request_stop)
        signal.signal(signal.SIGTERM, self._request_stop)

        rp = requests.get('http://httpbin.org/get')
        self.origin_ip = rp.json().get('origin', '')
        logger.info("[*] Current Ip Address: %s" % self.origin_ip)

        self.geoip_reader = geoip2.database.Reader(os.path.join(self.base_dir, 'data/GeoLite2-Country.mmdb'))

    def load_input_proxies(self):
        """Load one-JSON-object-per-line proxies from input_proxies_file,
        silently skipping unparsable lines."""
        logger.info("[*] Load input proxies")

        if self.input_proxies_file and os.path.exists(self.input_proxies_file):
            with open(self.input_proxies_file) as fd:
                for line in fd:
                    try:
                        self.input_proxies.append(json.loads(line))
                    except ValueError:
                        # Narrowed from a bare except: json.loads raises
                        # ValueError (JSONDecodeError) on malformed lines.
                        continue

    def validate_input_proxies(self):
        """Validate the loaded input proxies into valid_proxies."""
        logger.info("[*] Validate input proxies")
        self.valid_proxies = self._validate_proxy_list(self.input_proxies)
        logger.info("[*] Check %s input proxies, Got %s valid input proxies" %
                    (len(self.proxies_hash), len(self.valid_proxies)))

    def load_plugins(self):
        """Import every getproxy.plugin.<name>.Proxy class and instantiate it,
        seeding each instance with a copy of the currently valid proxies."""
        logger.info("[*] Load plugins")
        for plugin_name in os.listdir(os.path.join(self.base_dir, 'plugin')):
            if os.path.splitext(plugin_name)[1] != '.py' or plugin_name == '__init__.py':
                continue

            try:
                cls = load_object("getproxy.plugin.%s.Proxy" % os.path.splitext(plugin_name)[0])
            except Exception as e:
                logger.info("[-] Load Plugin %s error: %s" % (plugin_name, str(e)))
                continue

            inst = cls()
            # deepcopy so plugins can pop from their proxy list freely.
            inst.proxies = copy.deepcopy(self.valid_proxies)
            self.plugins.append(inst)

    def grab_web_proxies(self):
        """Run every plugin's start() concurrently for at most 8 minutes,
        then collect whatever they scraped."""
        logger.info("[*] Grab proxies")

        for plugin in self.plugins:
            self.pool.spawn(plugin.start)

        self.pool.join(timeout=8 * 60)
        self.pool.kill()

        self._collect_result()

    def validate_web_proxies(self):
        """Validate the scraped proxies and append survivors to valid_proxies."""
        logger.info("[*] Validate web proxies")
        input_proxies_len = len(self.proxies_hash)

        valid_proxies = self._validate_proxy_list(self.web_proxies)
        self.valid_proxies.extend(valid_proxies)

        output_proxies_len = len(self.proxies_hash) - input_proxies_len

        logger.info("[*] Check %s output proxies, Got %s valid output proxies" %
                    (output_proxies_len, len(valid_proxies)))
        logger.info("[*] Check %s proxies, Got %s valid proxies" %
                    (len(self.proxies_hash), len(self.valid_proxies)))

    def save_proxies(self):
        """Dump all valid proxies, one JSON object per line, to
        output_proxies_file (or stdout when no file was given)."""
        lines = ["%s\n" % json.dumps(item) for item in self.valid_proxies]

        if self.output_proxies_file:
            # `with` guarantees the file is flushed and closed even if a
            # write fails (the original leaked the handle on error).
            with open(self.output_proxies_file, 'w') as outfile:
                outfile.writelines(lines)
        else:
            sys.stdout.writelines(lines)
            sys.stdout.flush()

    def start(self):
        """Run the full pipeline end to end."""
        self.init()
        self.load_input_proxies()
        self.validate_input_proxies()
        self.load_plugins()
        self.grab_web_proxies()
        self.validate_web_proxies()
        self.save_proxies()
235 |
236 |
if __name__ == '__main__':
    # Allow running this module directly as a script.
    proxy_tool = GetProxy()
    proxy_tool.start()
240 |
--------------------------------------------------------------------------------
/getproxy/plugin/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fate0/getproxy/26563b22bbb5a80b55db7edee11447568ecb220d/getproxy/plugin/__init__.py
--------------------------------------------------------------------------------
/getproxy/plugin/cnproxy.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from __future__ import unicode_literals, absolute_import, division, print_function
5 |
6 | import re
7 | import time
8 | import logging
9 | import retrying
10 | import requests
11 |
12 |
13 | """
14 | http://www.cnproxy.com/proxy1.html
15 | """
16 |
17 |
18 | logger = logging.getLogger(__name__)
19 |
20 |
class Proxy(object):
    """Scrape proxies from www.cnproxy.com (listing pages 1-9).

    The site obfuscates each port as javascript along the lines of
    ``document.write(":"+v+m+a)`` where every letter stands for one digit;
    ``port_dict`` is the decoding table and '+' is just a separator.
    """

    def __init__(self):
        self.url = 'http://www.cnproxy.com/proxy{page}.html'  # pages run 1-10
        # NOTE(review): both patterns below were corrupted in this copy of the
        # file (the HTML tags were stripped out, leaving an unterminated raw
        # string and losing re_port_encode_pattern entirely, which is used in
        # extract_proxy).  They are reconstructed from the site's markup --
        # confirm against a live page before relying on them.
        self.re_ip_pattern = re.compile(
            r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})')
        self.re_port_encode_pattern = re.compile(
            r'document\.write\(":"([+\w]+)\)', re.I)

        # javascript variable name -> port digit; '+' is a separator.
        self.port_dict = {
            'v': '3',
            'm': '4',
            'a': '2',
            'l': '9',
            'q': '0',
            'b': '5',
            'i': '7',
            'w': '6',
            'r': '8',
            'c': '1',
            '+': ''
        }

        self.cur_proxy = None   # proxies= dict used for outgoing requests
        self.proxies = []       # validated proxies we may rotate through on failure
        self.result = []        # scraped {"host", "port", "from"} dicts

    @retrying.retry(stop_max_attempt_number=3)
    def extract_proxy(self, page_num):
        """Fetch one listing page and return its proxies.

        On failure, rotate to the next known-good proxy (if any) and re-raise
        so @retrying retries through it; with no proxies left, give up with [].
        """
        try:
            rp = requests.get(self.url.format(page=page_num), proxies=self.cur_proxy, timeout=10)

            re_ip_result = self.re_ip_pattern.findall(rp.text)
            re_port_encode_result = self.re_port_encode_pattern.findall(rp.text)

            if not len(re_ip_result) or not len(re_port_encode_result):
                raise Exception("empty")

            if len(re_ip_result) != len(re_port_encode_result):
                raise Exception("len(host) != len(port)")

        except Exception as e:
            logger.error("[-] Request page {page} error: {error}".format(page=page_num, error=str(e)))
            while self.proxies:
                new_proxy = self.proxies.pop(0)
                self.cur_proxy = {new_proxy['type']: "%s:%s" % (new_proxy['host'], new_proxy['port'])}
                raise e
            else:
                # No fallback proxies left: give up on this page.
                return []

        # Decode each obfuscated port string into its digits.
        re_port_result = []
        for each_result in re_port_encode_result:
            each_result = each_result.strip()
            re_port_result.append(int(''.join(list(map(lambda x: self.port_dict.get(x, ''), each_result)))))

        result_dict = dict(zip(re_ip_result, re_port_result))
        return [{"host": host, "port": int(port), "from": "cnproxy"} for host, port in result_dict.items()]

    def start(self):
        """Walk pages 1-9, sleeping between requests to stay polite."""
        for page in range(1, 10):
            page_result = self.extract_proxy(page)
            time.sleep(3)

            if not page_result:
                return

            self.result.extend(page_result)


if __name__ == '__main__':
    p = Proxy()
    p.start()

    for i in p.result:
        print(i)

    print(len(p.result))
95 |
--------------------------------------------------------------------------------
/getproxy/plugin/coolproxy.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from __future__ import unicode_literals, absolute_import, division, print_function
5 |
6 | import re
7 | import time
8 | import codecs
9 | import base64
10 | import logging
11 | import retrying
12 | import requests
13 |
14 |
15 | logger = logging.getLogger(__name__)
16 |
17 |
class Proxy(object):
    """Scrape proxies from www.cool-proxy.net.

    The site hides each IP as ``Base64.decode(str_rot13("..."))``; we apply
    rot13 then base64 to recover the dotted-quad address.
    """

    def __init__(self):
        self.url = 'http://www.cool-proxy.net/proxies/http_proxy_list/sort:score/direction:desc/page:{page}'
        self.re_ip_encode_pattern = re.compile(r'Base64.decode\(str_rot13\("([^"]+)"\)\)', re.I)
        # NOTE(review): this pattern was corrupted in this copy of the file
        # (the HTML tags were stripped); reconstructed as the port table cell
        # -- confirm against a live page before relying on it.
        self.re_port_pattern = re.compile(r'<td>(\d{1,5})</td>', re.I)

        self.cur_proxy = None   # proxies= dict used for outgoing requests
        self.proxies = []       # validated proxies we may rotate through on failure
        self.result = []        # scraped {"host", "port", "from"} dicts

    @retrying.retry(stop_max_attempt_number=3)
    def extract_proxy(self, page_num):
        """Fetch one listing page and return its proxies.

        On failure, rotate to the next known-good proxy (if any) and re-raise
        so @retrying retries through it; with no proxies left, give up with [].
        """
        try:
            rp = requests.get(self.url.format(page=page_num), proxies=self.cur_proxy, timeout=10)

            re_ip_encode_result = self.re_ip_encode_pattern.findall(rp.text)
            re_port_result = self.re_port_pattern.findall(rp.text)

            if not len(re_ip_encode_result) or not len(re_port_result):
                raise Exception("empty")

            if len(re_ip_encode_result) != len(re_port_result):
                raise Exception("len(host) != len(port)")

        except Exception as e:
            logger.error("[-] Request page {page} error: {error}".format(page=page_num, error=str(e)))
            while self.proxies:
                new_proxy = self.proxies.pop(0)
                self.cur_proxy = {new_proxy['type']: "%s:%s" % (new_proxy['host'], new_proxy['port'])}
                raise e
            else:
                # No fallback proxies left: give up on this page.
                return []

        # Undo the rot13+base64 obfuscation applied to each IP.
        re_ip_result = []
        for each_result in re_ip_encode_result:
            decode_ip = base64.b64decode(codecs.decode(each_result.strip(), 'rot-13')).strip()
            re_ip_result.append(decode_ip.decode('utf-8'))

        result_dict = dict(zip(re_ip_result, re_port_result))
        return [{"host": host, "port": int(port), "from": "coolproxy"} for host, port in result_dict.items()]

    def start(self):
        """Walk pages 1-9, sleeping between requests to stay polite."""
        for page in range(1, 10):
            page_result = self.extract_proxy(page)
            time.sleep(3)

            if not page_result:
                return

            self.result.extend(page_result)


if __name__ == '__main__':
    p = Proxy()
    p.start()

    for i in p.result:
        print(i)
76 |
--------------------------------------------------------------------------------
/getproxy/plugin/freeproxylist.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from __future__ import unicode_literals, absolute_import, division, print_function
5 |
6 | import re
7 | import logging
8 | import retrying
9 | import requests
10 |
11 |
12 | logger = logging.getLogger(__name__)
13 |
14 |
class Proxy(object):
    """Scrape proxies from free-proxy-list.net (single page)."""

    def __init__(self):
        self.url = 'https://free-proxy-list.net/'
        # NOTE(review): this pattern was corrupted in this copy of the file
        # (the HTML tags were stripped, leaving an unterminated raw string);
        # reconstructed as adjacent IP/port table cells -- confirm against a
        # live page before relying on it.
        self.re_ip_port_pattern = re.compile(
            r"<tr><td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td><td>(\d{1,5})</td>", re.I)

        self.cur_proxy = None   # proxies= dict used for outgoing requests
        self.proxies = []       # validated proxies we may rotate through on failure
        self.result = []        # scraped {"host", "port", "from"} dicts

    @retrying.retry(stop_max_attempt_number=3)
    def extract_proxy(self, page_num):
        """Fetch the listing page and return its proxies.

        On failure, rotate to the next known-good proxy (if any) and re-raise
        so @retrying retries through it; with no proxies left, give up with [].
        """
        try:
            # The URL has no {page} placeholder, so format() is a no-op here.
            rp = requests.get(self.url.format(page=page_num), proxies=self.cur_proxy, timeout=10)
            re_ip_port_result = self.re_ip_port_pattern.findall(rp.text)

            if not re_ip_port_result:
                raise Exception("empty")

        except Exception as e:
            logger.error("[-] Request page {page} error: {error}".format(page=page_num, error=str(e)))
            while self.proxies:
                new_proxy = self.proxies.pop(0)
                self.cur_proxy = {new_proxy['type']: "%s:%s" % (new_proxy['host'], new_proxy['port'])}
                raise e
            else:
                # No fallback proxies left: give up.
                return []

        return [{"host": host, "port": int(port), "from": "freeproxylist"} for host, port in re_ip_port_result]

    def start(self):
        page_result = self.extract_proxy(0)
        if not page_result:
            return

        self.result.extend(page_result)


if __name__ == '__main__':
    p = Proxy()
    p.start()

    for i in p.result:
        print(i)
59 |
--------------------------------------------------------------------------------
/getproxy/plugin/ip181.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from __future__ import unicode_literals, absolute_import, division, print_function
5 |
6 | import re
7 | import logging
8 | import retrying
9 | import requests
10 |
11 |
12 | logger = logging.getLogger(__name__)
13 |
14 |
class Proxy(object):
    """Scrape proxies from www.ip181.com (front page only)."""

    def __init__(self):
        self.url = 'http://www.ip181.com/'
        # NOTE(review): this pattern was corrupted in this copy of the file
        # (the HTML tags were stripped, leaving an unterminated raw string);
        # reconstructed as adjacent IP/port table cells with leading
        # whitespace -- confirm against a live page before relying on it.
        self.re_ip_port_pattern = re.compile(
            r"<td>\s+(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td><td>\s+(\d{1,5})</td>", re.I)

        self.cur_proxy = None   # proxies= dict used for outgoing requests
        self.proxies = []       # validated proxies we may rotate through on failure
        self.result = []        # scraped {"host", "port", "from"} dicts

    @retrying.retry(stop_max_attempt_number=3)
    def extract_proxy(self, page_num):
        """Fetch the front page and return its proxies.

        On failure, rotate to the next known-good proxy (if any) and re-raise
        so @retrying retries through it; with no proxies left, give up with [].
        """
        try:
            rp = requests.get(self.url, proxies=self.cur_proxy, timeout=10)
            re_ip_port_result = self.re_ip_port_pattern.findall(rp.text)

            if not re_ip_port_result:
                raise Exception("empty")

        except Exception as e:
            logger.error("[-] Request page {page} error: {error}".format(page=page_num, error=str(e)))
            while self.proxies:
                new_proxy = self.proxies.pop(0)
                self.cur_proxy = {new_proxy['type']: "%s:%s" % (new_proxy['host'], new_proxy['port'])}
                raise e
            else:
                # No fallback proxies left: give up.
                return []

        return [{"host": host, "port": int(port), "from": "ip181"} for host, port in re_ip_port_result]

    def start(self):
        page_result = self.extract_proxy(0)
        if not page_result:
            return

        self.result.extend(page_result)


if __name__ == '__main__':
    p = Proxy()
    p.start()

    for i in p.result:
        print(i)
59 |
--------------------------------------------------------------------------------
/getproxy/plugin/proxylist.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from __future__ import unicode_literals, absolute_import, division, print_function
5 |
6 | import re
7 | import time
8 | import base64
9 | import logging
10 | import retrying
11 | import requests
12 |
13 |
14 | logger = logging.getLogger(__name__)
15 |
16 |
class Proxy(object):
    """Scrape proxies from proxy-list.org, which base64-encodes each
    "host:port" pair inside a javascript Proxy('...') call."""

    def __init__(self):
        self.url = 'http://proxy-list.org/english/index.php?p={page}'  # pages run 1-10
        self.re_ip_port_encode_pattern = re.compile(r"Proxy\(\'([\w\d=+]+)\'\)", re.I)

        self.cur_proxy = None   # proxies= dict used for outgoing requests
        self.proxies = []       # validated proxies we may rotate through on failure
        self.result = []        # scraped {"host", "port", "from"} dicts

    @retrying.retry(stop_max_attempt_number=3)
    def extract_proxy(self, page_num):
        """Fetch one listing page and decode its base64 "host:port" entries."""
        try:
            response = requests.get(self.url.format(page=page_num), proxies=self.cur_proxy, timeout=10)
            encoded_pairs = self.re_ip_port_encode_pattern.findall(response.text)

            if not encoded_pairs:
                raise Exception("empty")

        except Exception as e:
            logger.error("[-] Request page {page} error: {error}".format(page=page_num, error=str(e)))
            if not self.proxies:
                # Nothing left to rotate to: give up on this page.
                return []
            # Switch to the next known-good proxy and re-raise so that
            # @retrying makes another attempt through it.
            fallback = self.proxies.pop(0)
            self.cur_proxy = {fallback['type']: "%s:%s" % (fallback['host'], fallback['port'])}
            raise e

        decoded = []
        for token in encoded_pairs:
            host, port = base64.b64decode(token).decode('utf-8').split(':')
            decoded.append({"host": host, "port": int(port), "from": "proxylist"})

        return decoded

    def start(self):
        """Walk pages 1-9, sleeping between requests to stay polite."""
        for page_num in range(1, 10):
            entries = self.extract_proxy(page_num)
            time.sleep(3)

            if not entries:
                return

            self.result.extend(entries)


if __name__ == '__main__':
    p = Proxy()
    p.start()

    for i in p.result:
        print(i)
69 |
--------------------------------------------------------------------------------
/getproxy/plugin/txt.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from __future__ import unicode_literals, absolute_import, division, print_function
5 |
6 | import re
7 | import logging
8 | import retrying
9 | import requests
10 |
11 |
12 | logger = logging.getLogger(__name__)
13 |
14 |
class Proxy(object):
    """Collect proxies from plain-text "ip:port" lists hosted at fixed URLs."""

    def __init__(self):
        self.re_ip_port_pattern = re.compile(r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):([\d]{1,5})")

        self.cur_proxy = None   # proxies= dict used for outgoing requests
        self.proxies = []       # validated proxies we may rotate through on failure
        self.result = []        # scraped {"host", "port", "from"} dicts

        # Plain-text sources; every "ip:port" match anywhere in the body counts.
        self.txt_list = [
            # 'http://api.xicidaili.com/free2016.txt',
            'http://static.fatezero.org/tmp/proxy.txt',
            'http://pubproxy.com/api/proxy?limit=20&format=txt&type=http',
            'http://comp0.ru/downloads/proxylist.txt',
            'http://www.proxylists.net/http_highanon.txt',
            'http://www.proxylists.net/http.txt',
            'http://ab57.ru/downloads/proxylist.txt',
            'https://www.rmccurdy.com/scripts/proxy/good.txt'
        ]

    @retrying.retry(stop_max_attempt_number=3)
    def extract_proxy(self, url):
        """Fetch one text list and return its proxies.

        On failure, rotate to the next known-good proxy (if any) and re-raise
        so @retrying retries through it; with no proxies left, give up with [].
        """
        try:
            headers = {
                # Fixed: the UA string used to end with a stray apostrophe.
                'User-Agent': "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) "
                              "Chrome/21.0.1180.89 Safari/537.1"
            }
            rp = requests.get(url, proxies=self.cur_proxy, headers=headers, timeout=10)

            re_ip_port_result = self.re_ip_port_pattern.findall(rp.text)

            if not re_ip_port_result:
                raise Exception("empty")

        except Exception as e:
            logger.error("[-] Request url {url} error: {error}".format(url=url, error=str(e)))
            while self.proxies:
                new_proxy = self.proxies.pop(0)
                self.cur_proxy = {new_proxy['type']: "%s:%s" % (new_proxy['host'], new_proxy['port'])}
                raise e
            else:
                # No fallback proxies left: give up on this url.
                return []

        return [{'host': host, 'port': int(port), 'from': 'txt'} for host, port in re_ip_port_result]

    def start(self):
        """Try every source url; a failed source is skipped, not fatal."""
        for url in self.txt_list:
            try:
                page_result = self.extract_proxy(url)
            except Exception:
                # Narrowed from a bare except so that KeyboardInterrupt and
                # gevent's GreenletExit are no longer swallowed here.
                continue

            if not page_result:
                continue

            self.result.extend(page_result)


if __name__ == '__main__':
    p = Proxy()
    p.start()

    for i in p.result:
        print(i)

    print(len(p.result))
80 |
--------------------------------------------------------------------------------
/getproxy/plugin/xicidaili.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from __future__ import unicode_literals, absolute_import, division, print_function
5 |
6 | import re
7 | import time
8 | import logging
9 | import retrying
10 | import requests
11 |
12 |
13 | logger = logging.getLogger(__name__)
14 |
15 |
class Proxy(object):
    """Scrape proxy host/port pairs from xicidaili.com listing pages."""

    def __init__(self):
        # Listing pages: /nn/ = high-anonymity proxies, /wt/ = plain HTTP.
        self.urls = ['http://www.xicidaili.com/nn/', 'http://www.xicidaili.com/wt/']
        # NOTE(review): both patterns alternate with a bare " " branch, which
        # matches lone spaces and yields empty capture groups; they look like
        # they lost surrounding markup -- verify against the live page HTML.
        self.re_ip_pattern = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) | ', re.I)
        self.re_port_pattern = re.compile(r'(\d{1,5}) | ', re.I)

        self.cur_proxy = None   # requests-style proxies dict, or None for a direct connection
        self.proxies = []       # fallback proxies to rotate through on failure
        self.result = []        # accumulated {'host', 'port', 'from'} dicts

    @retrying.retry(stop_max_attempt_number=3)
    def extract_proxy(self, url):
        """Fetch *url* and return a list of {'host', 'port', 'from'} dicts.

        On a failed or empty fetch, rotates to the next fallback proxy and
        re-raises so ``@retrying.retry`` tries again (up to 3 attempts);
        once the fallback pool is exhausted it gives up with ``[]``.
        """
        try:
            headers = {
                # Bug fix: dropped the stray trailing "'" that was embedded
                # in the User-Agent value.
                'User-Agent': "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) "
                              "Chrome/21.0.1180.89 Safari/537.1"
            }
            rp = requests.get(url, proxies=self.cur_proxy, headers=headers, timeout=10)

            re_ip_result = self.re_ip_pattern.findall(rp.text)
            re_port_result = self.re_port_pattern.findall(rp.text)

            if not re_ip_result or not re_port_result:
                raise Exception("empty")

            # Bug fix: the original compared len(re_port_result) with itself,
            # so a host/port count mismatch was never detected.
            if len(re_ip_result) != len(re_port_result):
                raise Exception("len(host) != len(port)")

        except Exception as e:
            logger.error("[-] Request url {url} error: {error}".format(url=url, error=str(e)))
            # Rotate to the next fallback proxy (if any) and re-raise so the
            # retry decorator repeats the request through it; with an empty
            # pool, return [] and stop retrying.
            while self.proxies:
                new_proxy = self.proxies.pop(0)
                self.cur_proxy = {new_proxy['type']: "%s:%s" % (new_proxy['host'], new_proxy['port'])}
                raise e
            else:
                return []

        # Pair hosts with ports; dict() also de-duplicates repeated hosts.
        result_dict = dict(zip(re_ip_result, re_port_result))
        return [{"host": host, "port": int(port), "from": "xicidaili"} for host, port in result_dict.items()]

    def start(self):
        """Scrape every listing page, skipping pages that ultimately fail."""
        for url in self.urls:
            try:
                page_result = self.extract_proxy(url)
            except Exception:
                # Bug fix: a page that still fails after all retries used to
                # propagate and abort the whole run; skip it instead
                # (consistent with the other plugins, e.g. txt.py).
                continue
            time.sleep(3)  # be polite between page fetches

            if not page_result:
                # Bug fix: originally `return`, which silently dropped every
                # remaining page as soon as one page yielded nothing.
                continue

            self.result.extend(page_result)
65 |
66 |
if __name__ == '__main__':
    # Manual smoke test: scrape both listing pages and print each proxy.
    plugin = Proxy()
    plugin.start()

    for proxy_entry in plugin.result:
        print(proxy_entry)
73 |
--------------------------------------------------------------------------------
/getproxy/utils.py:
--------------------------------------------------------------------------------
1 |
2 | import sys
3 | import signal
4 | from importlib import import_module
5 |
6 |
# Map signal numbers to their symbolic names (e.g. 2 -> 'SIGINT').
# Names containing '_' (SIG_DFL, SIG_IGN, ...) are not real signals and
# are excluded.  Fallback table for Python < 3.5, where signal.Signals
# does not exist.
_signames = {
    getattr(signal, signame): signame
    for signame in dir(signal)
    if signame.startswith('SIG') and '_' not in signame
}
10 |
11 |
def signal_name(signum):
    """Return the symbolic name (e.g. 'SIGINT') for *signum*.

    Falls back to 'SIG_UNKNOWN' when the number does not correspond to a
    known signal on this platform.
    """
    try:
        if sys.version_info[:2] >= (3, 5):
            # signal.Signals exists from 3.5 on; ValueError for bad numbers.
            return signal.Signals(signum).name
        # Older interpreters: precomputed lookup table; KeyError for bad numbers.
        return _signames[signum]
    except (KeyError, ValueError):
        return 'SIG_UNKNOWN'
23 |
24 |
def load_object(path):
    """Import and return the object at dotted *path*, e.g. 'pkg.mod.attr'.

    Raises ValueError when *path* contains no dot, and NameError when the
    module imports but lacks the named attribute.
    """
    module, sep, name = path.rpartition('.')
    if not sep:
        # No dot at all: the caller passed a bare name, not a full path.
        raise ValueError("Error loading object '%s': not a full path" % path)

    mod = import_module(module)

    try:
        return getattr(mod, name)
    except AttributeError:
        raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name))
40 |
--------------------------------------------------------------------------------
/requirements_dev.txt:
--------------------------------------------------------------------------------
1 | click>=6.0
2 | requests>=2.13.0
3 | gevent>=1.2.1
4 | geoip2>=2.5.0
5 | retrying>=1.3.3
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal = 1
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | import ast
6 | from setuptools import setup, find_packages
7 |
# Long description for PyPI comes straight from the README.
with open('README.md') as readme_file:
    readme = readme_file.read()


# Extract __version__ from the package without importing it (importing
# would pull in runtime dependencies at build time).
_version_re = re.compile(r'__version__\s+=\s+(.*)')


with open('getproxy/__init__.py', 'rb') as f:
    _match = _version_re.search(f.read().decode('utf-8'))
    version = str(ast.literal_eval(_match.group(1)))


# Runtime dependencies, pinned to exact versions.
requirements = [
    'click==6.0',
    'requests==2.13.0',
    'gevent==1.2.1',
    'geoip2==2.5.0',
    'maxminddb==1.5.2',
    'retrying==1.3.3'
]

setup(
    name='getproxy',
    version=version,
    description="get proxy",
    long_description=readme,
    author="fate0",
    author_email='fate0@fatezero.org',
    url='https://github.com/fate0/getproxy',
    packages=find_packages(),
    package_dir={},
    entry_points={
        'console_scripts': [
            'getproxy=getproxy.cli:main'
        ]
    },
    include_package_data=True,
    install_requires=requirements,
    license="BSD license",
    zip_safe=False,
    keywords='getproxy',
    classifiers=[
        'Development Status :: 2 - Pre-Alpha',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: BSD License',
        'Natural Language :: English',
        "Programming Language :: Python :: 2",
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
    ],
)
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
envlist = py27, py33, py34, py35, py36, flake8
3 |
4 | [testenv:flake8]
5 | basepython=python
6 | deps=flake8
7 | commands=flake8 --ignore=E501,F401 getproxy
8 |
9 | [testenv]
10 | setenv =
11 | PYTHONPATH = {toxinidir}:{toxinidir}/getproxy
12 |
13 | commands = python setup.py test
14 |
--------------------------------------------------------------------------------