├── .gitignore
├── IPProxy.py
├── README.md
├── api
│   ├── __init__.py
│   └── apiServer.py
├── config.py
├── data
│   └── qqwry.dat
├── db
│   ├── DataStore.py
│   ├── ISqlHelper.py
│   ├── MongoHelper.py
│   ├── RedisHelper.py
│   ├── SqlHelper.py
│   └── __init__.py
├── qiye2.jpg
├── requirements.txt
├── spider
│   ├── HtmlDownloader.py
│   ├── HtmlPraser.py
│   ├── ProxyCrawl.py
│   └── __init__.py
├── start.bat
├── test
│   ├── __init__.py
│   ├── test.py
│   ├── testIPAddress.py
│   ├── testIPType.py
│   ├── testbase64.py
│   ├── testhttpserver.py
│   ├── testlist.py
│   ├── testlxml.py
│   ├── testqueue.py
│   └── testsql.py
├── util
│   ├── IPAddress.py
│   ├── __init__.py
│   ├── compatibility.py
│   ├── exception.py
│   └── logger.py
└── validator
    ├── Validator.py
    └── __init__.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 | .idea/
91 | *.db
--------------------------------------------------------------------------------
/IPProxy.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | from multiprocessing import Value, Queue, Process
4 | from api.apiServer import start_api_server
5 | from db.DataStore import store_data
6 |
7 | from validator.Validator import validator, getMyIP
8 | from spider.ProxyCrawl import startProxyCrawl
9 |
10 | from config import TASK_QUEUE_SIZE
11 |
12 | if __name__ == "__main__":
13 | myip = getMyIP()
14 | DB_PROXY_NUM = Value('i', 0)
15 | q1 = Queue(maxsize=TASK_QUEUE_SIZE)
16 | q2 = Queue()
17 | p0 = Process(target=start_api_server)
18 | p1 = Process(target=startProxyCrawl, args=(q1, DB_PROXY_NUM,myip))
19 | p2 = Process(target=validator, args=(q1, q2, myip))
20 | p3 = Process(target=store_data, args=(q2, DB_PROXY_NUM))
21 | p0.start()
22 | p1.start()
23 | p2.start()
24 | p3.start()
25 | p0.join()
26 | p1.join()
27 | p2.join()
28 | p3.join()
29 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # IPProxyPool
2 | IPProxyPool is a proxy-pool project that provides proxy IPs. Both Python 2 and Python 3 are supported.
3 | ### My new book [《Python爬虫开发与项目实战》](https://item.jd.com/12206762.html) has been published; if you are interested, take a look at the [sample chapters](http://pan.baidu.com/s/1hrWEOYg).
4 |
5 | For detailed usage instructions, see my blog post:
6 | http://www.cnblogs.com/qiyeboy/p/5693128.html
7 |
8 | I am currently adding a second-level proxy to IPProxyPool to make scheduling easier. You can follow my WeChat official account; I will announce updates there.
9 |
10 |
11 | #### My WeChat official account:
12 |
13 | ![](qiye2.jpg)
14 |
15 | Please suggest more proxy sites; the number of good, usable proxy IPs crawled so far is still too small.
16 |
17 | Thanks also to [super1-chen](https://github.com/super1-chen), [fancoo](https://github.com/fancoo) and [Leibnizhu](https://github.com/Leibnizhu) for their contributions to the project.
18 |
19 |
20 | ## Project dependencies
21 |
22 | #### Ubuntu, Debian
23 |
24 | 1. Install the SQLite database (usually preinstalled):
25 | apt-get install sqlite3
26 |
27 | 2. Install requests, chardet, web.py, sqlalchemy, gevent and psutil:
28 | pip install requests chardet web.py sqlalchemy gevent psutil
29 |
30 | 3. Install lxml:
31 | apt-get install python-lxml
32 |
33 | Notes:
34 |
35 | * Under Python 3, use pip3 instead of pip
36 | * An outdated gevent version can make the program exit on its own; upgrade it with pip install gevent --upgrade
37 | * Under Python 3, web.py cannot be installed with pip; download the py3 [source code](https://codeload.github.com/webpy/webpy/zip/py3) and install it from there
38 |
39 | #### Windows
40 |
41 | 1. Download [sqlite](http://www.sqlite.org/download.html) and add its path to the environment variables
42 |
43 | 2. Install requests, chardet, web.py, sqlalchemy and gevent:
44 | pip install requests chardet web.py sqlalchemy gevent
45 |
46 | 3. Install lxml:
47 | pip install lxml, or download the [lxml Windows build](https://pypi.python.org/pypi/lxml/)
48 |
49 | Notes:
50 |
51 | * Under Python 3, use pip3 instead of pip
52 | * An outdated gevent version can make the program exit on its own; upgrade it with pip install gevent --upgrade
53 | * Under Python 3, web.py cannot be installed with pip; download the py3 [source code](https://codeload.github.com/webpy/webpy/zip/py3) and install it from there
54 |
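On Python 2 you can also install all of the pinned dependencies in one step from the bundled requirements.txt (use the pip that belongs to the interpreter you will run the pool with):

pip install -r requirements.txt
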
55 | #### Extension notes
56 |
57 | The default database is SQLite, but the project uses the SQLAlchemy ORM, so the reserved interfaces make it possible to switch to MySQL, MongoDB and other databases.
58 | Configuration:
59 |
60 | 1. MySQL configuration
61 | ```
62 | Step 1: install the MySQL server and start it
63 | Step 2: install MySQLdb or pymysql (recommended)
64 | Step 3: configure DB_CONFIG in config.py. If the MySQLdb module is installed, configure it as follows:
65 | DB_CONFIG={
66 | 'DB_CONNECT_TYPE':'sqlalchemy',
67 | 'DB_CONNECT_STRING':'mysql+mysqldb://root:root@localhost/proxy?charset=utf8'
68 | }
69 | If the pymysql module is installed, configure it as follows:
70 | DB_CONFIG={
71 | 'DB_CONNECT_TYPE':'sqlalchemy',
72 | 'DB_CONNECT_STRING':'mysql+pymysql://root:root@localhost/proxy?charset=utf8'
73 | }
74 | ```
75 | For the format of DB_CONNECT_STRING under SQLAlchemy, see the list of [supported databases](http://docs.sqlalchemy.org/en/latest/core/engines.html#supported-databases). In principle this configuration works for any database SQLAlchemy supports, not only MySQL, but only MySQL has been tested.
76 |
77 | 2. MongoDB configuration
78 | ```
79 | Step 1: install the MongoDB server and start it
80 | Step 2: install the pymongo module
81 | Step 3: configure DB_CONFIG in config.py, similar to:
82 | DB_CONFIG={
83 | 'DB_CONNECT_TYPE':'pymongo',
84 | 'DB_CONNECT_STRING':'mongodb://localhost:27017/'
85 | }
86 | ```
87 | Since SQLAlchemy does not support MongoDB, a separate pymongo mode was added; DB_CONNECT_STRING follows the pymongo connection-string format.
88 |
89 | ##### Note
90 |
91 | If you want to support another database, subclass the ISqlHelper class under db, implement its methods (my code can serve as a reference), and then import your class in DataStore:
92 | ```
93 | try:
94 | if DB_CONFIG['DB_CONNECT_TYPE'] == 'pymongo':
95 | from db.MongoHelper import MongoHelper as SqlHelper
96 | else:
97 | from db.SqlHelper import SqlHelper as SqlHelper
98 | sqlhelper = SqlHelper()
99 | sqlhelper.init_db()
100 | except Exception as e:
101 | raise Con_DB_Fail
102 | ```
103 | A Redis back end (db/RedisHelper.py) has since been added in exactly this way; a minimal skeleton for a new helper is sketched below.
104 |
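For orientation, here is a minimal, self-contained skeleton of such a helper. It is only an illustration: the class name MyDBHelper and its in-memory list are placeholders of mine, not part of the project. A real helper would talk to your database, and select() must return data in the same shape as SqlHelper/MongoHelper, i.e. a list of (ip, port, score) tuples.

```
# sketch only: replace the in-memory list with calls to your own database driver
from db.ISqlHelper import ISqlHelper


class MyDBHelper(ISqlHelper):
    def init_db(self):
        self.proxies = []  # open your database connection here instead

    def drop_db(self):
        self.proxies = []

    def insert(self, value=None):
        # value is a dict with ip, port, types, protocol, country, area, speed
        self.proxies.append(value)

    def delete(self, conditions=None):
        return ('deleteNum', 0)

    def update(self, conditions=None, value=None):
        return {'updateNum': 0}

    def select(self, count=None, conditions=None):
        # must return a list of (ip, port, score) tuples, best proxies first
        return [(p['ip'], p['port'], p.get('score', 0)) for p in self.proxies][:count]
```

Once the helper works, import it in db/DataStore.py in place of SqlHelper, exactly as the snippet above does for MongoHelper.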
105 |
106 | ## How to use
107 |
108 | Clone the project into the current directory
109 |
110 |     $ git clone
111 |
112 | Change into the project directory
113 |
114 | ```
115 | $ cd IPProxyPool
116 | ```
117 |
118 | Run the script
119 |
120 | ```
121 | python IPProxy.py
122 | ```
123 | After a successful start, the following is printed:
124 | ```
125 | IPProxyPool----->>>>>>>>beginning
126 | http://0.0.0.0:8000/
127 | IPProxyPool----->>>>>>>>db exists ip:0
128 | IPProxyPool----->>>>>>>>now ip num < MINNUM,start crawling...
129 | IPProxyPool----->>>>>>>>Success ip num :134,Fail ip num:7882
130 | ```
131 |
132 | ## API usage
133 |
134 | #### Mode 1
135 | ```
136 | GET /
137 | ```
138 | This mode queries the proxy-IP data. A scoring mechanism is included: results are returned ordered by score from high to low and then by speed from fast to slow.
139 |
140 | #### Parameters
141 |
142 | | Name | Type | Description |
143 | | ----| ---- | ---- |
144 | | types | int | 0: elite (high anonymity), 1: anonymous, 2: transparent |
145 | | protocol | int | 0: http, 1: https, 2: http/https |
146 | | count | int | number of results |
147 | | country | str | either 国内 (domestic) or 国外 (foreign) |
148 | | area | str | region (province/city) |
149 |
150 |
151 |
152 | #### Examples
153 |
154 | ##### The API server listens on port 8000 by default; the port can be configured in config.py.
155 |
156 | ##### When testing on the local machine:
157 |
158 | 1. Fetch 5 elite (highly anonymous) proxies located in China: http://127.0.0.1:8000/?types=0&count=5&country=国内
159 |
160 | 2. The response is JSON, ordered by score from high to low and then by response speed from fast to slow:
161 |
162 | ```
163 | [["122.226.189.55", 138, 10], ["183.61.236.54", 3128, 10], ["61.132.241.109", 808, 10], ["183.61.236.53", 3128, 10], ["122.227.246.102", 808, 10]]
164 | ```
165 |
166 | Take ["122.226.189.55", 138, 10] as an example: the first element is the ip, the second is the port, and the third is the score.
167 |
168 | ```
169 | import requests
170 | import json
171 | r = requests.get('http://127.0.0.1:8000/?types=0&count=5&country=国内')
172 | ip_ports = json.loads(r.text)
173 | print(ip_ports)
174 | ip = ip_ports[0][0]
175 | port = ip_ports[0][1]
176 | proxies={
177 | 'http':'http://%s:%s'%(ip,port),
178 | 'https':'http://%s:%s'%(ip,port)
179 | }
180 | r = requests.get('http://ip.chinaz.com/',proxies=proxies)
181 | r.encoding='utf-8'
182 | print(r.text)
183 | ```
184 | #### Mode 2
185 | ```
186 | GET /delete
187 | ```
188 | This mode lets users delete proxy-IP records that match their own criteria.
189 |
190 | #### Parameters
191 |
192 | | Name | Type | Description |
193 | | ----| ---- | ---- |
194 | | ip | str | e.g. 192.168.1.1 |
195 | | port | int | e.g. 80 |
196 | | types | int | 0: elite (high anonymity), 1: anonymous, 2: transparent |
197 | | protocol | int | 0: http, 1: https, 2: http/https |
198 | | count | int | number of results |
199 | | country | str | either 国内 (domestic) or 国外 (foreign) |
200 | | area | str | region (province/city) |
201 |
202 | Data can be deleted by specifying one or more of the fields above.
203 |
204 | #### Examples
205 |
206 | ##### When testing on the local machine:
207 |
208 | 1. Delete the proxy with ip 120.92.3.127: http://127.0.0.1:8000/delete?ip=120.92.3.127
209 |
210 | 2. The response is JSON and returns either success/failure or the number of deleted records, for example:
211 | ["deleteNum", "ok"] or ["deleteNum", 1]
212 | ```
213 | import requests
214 | r = requests.get('http://127.0.0.1:8000/delete?ip=120.92.3.127')
215 | print(r.text)
216 | ```
217 | ## config.py parameters
218 | ```
219 | # parserList is the table of site-parsing rules; when you discover a new proxy site, add its URLs and extraction rule here so the crawler can use it.
220 | parserList = [
221 | {
222 | 'urls': ['http://www.66ip.cn/%s.html' % n for n in ['index'] + list(range(2, 12))],
223 | 'type': 'xpath',
224 | 'pattern': ".//*[@id='main']/div/div[1]/table/tr[position()>1]",
225 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]', 'protocol': ''}
226 | },
227 |
228 | ......
229 |
230 |
231 | {
232 | 'urls': ['http://www.cnproxy.com/proxy%s.html' % i for i in range(1, 11)],
233 | 'type': 'module',
234 | 'moduleName': 'CnproxyPraser',
235 |         'pattern': r'<tr><td>(\d+\.\d+\.\d+\.\d+)<SCRIPT type=text/javascript>document.write\(":"\+(.+)\)</SCRIPT></td><td>(HTTP|SOCKS4)\s*',
236 | 'position': {'ip': 0, 'port': 1, 'type': -1, 'protocol': 2}
237 | }
238 | ]
239 |
240 | # database configuration
241 |
242 | DB_CONFIG = {
243 |
244 |     'DB_CONNECT_TYPE': 'sqlalchemy',  # one of: 'sqlalchemy', 'pymongo', 'redis'
245 | # 'DB_CONNECT_STRING':'mongodb://localhost:27017/'
246 | 'DB_CONNECT_STRING': 'sqlite:///' + os.path.dirname(__file__) + '/data/proxy.db'
247 | # DB_CONNECT_STRING : 'mysql+mysqldb://root:root@localhost/proxy?charset=utf8'
248 |
249 |     # 'DB_CONNECT_TYPE': 'redis',  # one of: 'sqlalchemy', 'pymongo', 'redis'
250 | # 'DB_CONNECT_STRING': 'redis://localhost:6379/8',
251 |
252 | }
253 | # THREADNUM is the number of coroutines in the gevent pool
254 | THREADNUM = 5
255 |
256 | # API_PORT is the port of the API web server
257 | API_PORT = 8000
258 |
259 | # crawling and proxy-checking settings
260 | # no need to check whether an ip already exists, because stale entries are cleaned up periodically
261 | # UPDATE_TIME: check every half hour whether any proxy ips have gone stale
262 | UPDATE_TIME = 30 * 60
263 |
264 | # when the number of valid ips drops below MINNUM, the crawler starts
265 | MINNUM = 50
266 |
267 | # socket timeout
268 | TIMEOUT = 5
269 |
270 |
271 |
272 |
273 | # number of retries when the crawler downloads a page
274 | RETRY_TIME = 3
275 |
276 |
277 | # USER_AGENTS is a pool of request headers, picked at random to get around anti-crawler measures
278 |
279 | USER_AGENTS = [
280 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
281 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
282 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
283 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
284 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
285 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
286 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
287 | ]
288 | # each newly crawled ip starts with DEFAULT_SCORE points; every failed check costs one point, and the ip is removed from the database once the score reaches 0
289 | DEFAULT_SCORE = 10
290 |
291 | # CHECK_PROXY lets users register their own proxy-checking function; the default is CHECK_PROXY={'function':'checkProxy'}.
292 | # The current check target is httpbin.org, but even if an ip passes validation and checking,
293 | # that only proves the proxy can reach httpbin.org, not necessarily the site you want to crawl.
294 | # You can therefore plug in your own check function here; as an example I use Baidu as the target.
295 | # See the baidu_check and detect_proxy functions in Validator.py for details.
296 |
297 | CHECK_PROXY={'function':'checkProxy'}#{'function':'baidu_check'}
298 | ```
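
To make the CHECK_PROXY hook more concrete, below is a small, hypothetical example of a custom check function. The name my_check and the (selfip, proxies) signature are assumptions of mine; check the built-in checkProxy and baidu_check functions in Validator.py for the exact signature, add your function next to them, and then register it:

```
import requests

def my_check(selfip, proxies):
    # hypothetical checker: fetch the site you actually care about through the
    # candidate proxy and report whether the request succeeded
    try:
        r = requests.get('http://www.baidu.com', proxies=proxies, timeout=5)
        return r.ok and len(r.content) > 500
    except Exception:
        return False

# then, in config.py:
# CHECK_PROXY = {'function': 'my_check'}
```
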
299 | ## TODO
300 | 1. Add a squid proxy layer to simplify crawler-side configuration
301 |
302 |
303 |
304 | ## Changelog
305 | -----------------------------2017-4-6----------------------------
306 |
307 | 1. Updated the scoring mechanism.
308 |
309 | * Previously, every newly added proxy ip started at 0 points, was re-checked every half hour, gained points while it remained valid and was deleted as soon as it failed.
310 | * Now every new proxy ip starts with 10 points and is re-checked every half hour; the score stays unchanged while the ip is valid and drops by one each time it fails, and the ip is deleted once the score reaches 0. This avoids false deletions caused by an unstable check site.
311 |
312 | 2. Users can define their own check function; it is configured through the CHECK_PROXY variable in config.py.
313 | ```
314 | CHECK_PROXY lets users register their own proxy-checking function; the default is CHECK_PROXY={'function':'checkProxy'}.
315 | The current check target is httpbin.org, but even if an ip passes validation and checking,
316 | that only proves the proxy can reach httpbin.org, not necessarily the site you want to crawl.
317 | You can therefore plug in your own check function here; as an example I use Baidu as the target.
318 | See the baidu_check and detect_proxy functions in Validator.py for details.
319 |
320 | CHECK_PROXY={'function':'baidu_check'}
321 | ```
322 | 3. Thanks to everyone's joint effort, the zombie-process problem has been completely fixed.
323 |
324 | -----------------------------2017-1-16----------------------------
325 |
326 | 1. Merged the py2 and py3 versions into a single compatible code base
327 |
328 | 2. Fixed a pymongo query bug
329 |
330 | -----------------------------2017-1-11----------------------------
331 |
332 | 1. Use httpbin.org to check whether a proxy ip is truly highly anonymous (elite)
333 |
334 | 2. Use 国内 (domestic) and 国外 (foreign) as the query values for country
335 |
336 | 3. Changed the types and protocol parameters; pay close attention to how protocol is used, and try visiting both http://www.baidu.com and https://www.baidu.com
337 |
338 | 4. Cleaned up the code style
339 |
340 | -----------------------------2016-12-11----------------------------
341 | #### Large-scale refactoring, mainly covering the following:
342 | 1. Crawling and validation now use multiple processes plus coroutines, raising efficiency by more than 50x; all valid IPs can be collected within a few minutes
343 |
344 | 2. web.py is now used as the API server, and the HTTP interface was reworked
345 |
346 | 3. Added adapters for MySQL, MongoDB and other databases
347 |
348 | 4. Added three more proxy sites
349 |
350 | 5. Added the scoring mechanism to single out stable ips
351 |
352 | 6. Python 3 support
353 |
354 | -----------------------------2016-11-24----------------------------
355 |
356 | 1. Use chardet to detect page encodings
357 |
358 | 2. Worked around the anti-crawling limits of 66ip.cn
359 |
360 | -----------------------------2016-10-27----------------------------
361 |
362 | 1. Added proxy checking to verify that a proxy can really reach the target site
363 |
364 | 2. Added page parsing via regular expressions and pluggable parser modules
365 |
366 | 3. Added another new proxy site
367 |
368 |
369 | -----------------------------2016-7-20----------------------------
370 |
371 | 1. Fixed bugs and compacted the database
372 |
373 |
--------------------------------------------------------------------------------
/api/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Xaxdus'
2 |
--------------------------------------------------------------------------------
/api/apiServer.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | '''
3 | 定义几个关键字,count type,protocol,country,area,
4 | '''
5 | import json
6 | import sys
7 | import web
8 | import config
9 | from db.DataStore import sqlhelper
10 | from db.SqlHelper import Proxy
11 |
12 | urls = (
13 | '/', 'select',
14 | '/delete', 'delete'
15 | )
16 |
17 |
18 | def start_api_server():
19 | sys.argv.append('0.0.0.0:%s' % config.API_PORT)
20 | app = web.application(urls, globals())
21 | app.run()
22 |
23 |
24 | class select(object):
25 | def GET(self):
26 | inputs = web.input()
27 | json_result = json.dumps(sqlhelper.select(inputs.get('count', None), inputs))
28 | return json_result
29 |
30 |
31 | class delete(object):
32 | params = {}
33 |
34 | def GET(self):
35 | inputs = web.input()
36 | json_result = json.dumps(sqlhelper.delete(inputs))
37 | return json_result
38 |
39 |
40 | if __name__ == '__main__':
41 | sys.argv.append('0.0.0.0:8000')
42 | app = web.application(urls, globals())
43 | app.run()
44 |
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | '''
3 | Parsing rule definitions. urls: list of URLs to crawl
4 | type: parsing method, one of regular (regular expression), xpath (XPath), module (custom third-party module)
5 | pattern: a regular expression or an XPath expression, matching the type above
6 | '''
7 | import os
8 | import random
9 |
10 | '''
11 | ip, port, type (0 elite/high anonymity, 1 transparent), protocol (0 http, 1 https), country, area (province/city), updatetime (last update time)
12 | speed (connection speed)
13 | '''
14 | parserList = [
15 | {
16 | 'urls': ['http://www.66ip.cn/%s.html' % n for n in ['index'] + list(range(2, 12))],
17 | 'type': 'xpath',
18 | 'pattern': ".//*[@id='main']/div/div[1]/table/tr[position()>1]",
19 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]', 'protocol': ''}
20 | },
21 | {
22 | 'urls': ['http://www.66ip.cn/areaindex_%s/%s.html' % (m, n) for m in range(1, 35) for n in range(1, 10)],
23 | 'type': 'xpath',
24 | 'pattern': ".//*[@id='footer']/div/table/tr[position()>1]",
25 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]', 'protocol': ''}
26 | },
27 | {
28 | 'urls': ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'],
29 | 'type': 'xpath',
30 | 'pattern': ".//table[@class='sortable']/tbody/tr",
31 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''}
32 |
33 | },
34 | {
35 | 'urls': ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 10)],
36 | 'type': 'xpath',
37 | 'pattern': ".//table[@class='list']/tr",
38 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''}
39 |
40 | },
41 | {
42 | 'urls': ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)],
43 | 'type': 'module',
44 | 'moduleName': 'proxy_listPraser',
45 | 'pattern': 'Proxy\(.+\)',
46 | 'position': {'ip': 0, 'port': -1, 'type': -1, 'protocol': 2}
47 |
48 | },
49 | {
50 | 'urls': ['http://incloak.com/proxy-list/%s#list' % n for n in
51 | ([''] + ['?start=%s' % (64 * m) for m in range(1, 10)])],
52 | 'type': 'xpath',
53 | 'pattern': ".//table[@class='proxy__t']/tbody/tr",
54 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''}
55 |
56 | },
57 | {
58 | 'urls': ['http://www.kuaidaili.com/proxylist/%s/' % n for n in range(1, 11)],
59 | 'type': 'xpath',
60 | 'pattern': ".//*[@id='index_free_list']/table/tbody/tr[position()>0]",
61 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[3]', 'protocol': './td[4]'}
62 | },
63 | {
64 | 'urls': ['http://www.kuaidaili.com/free/%s/%s/' % (m, n) for m in ['inha', 'intr', 'outha', 'outtr'] for n in
65 | range(1, 11)],
66 | 'type': 'xpath',
67 | 'pattern': ".//*[@id='list']/table/tbody/tr[position()>0]",
68 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[3]', 'protocol': './td[4]'}
69 | },
70 | {
71 | 'urls': ['http://www.cz88.net/proxy/%s' % m for m in
72 | ['index.shtml'] + ['http_%s.shtml' % n for n in range(2, 11)]],
73 | 'type': 'xpath',
74 | 'pattern': ".//*[@id='boxright']/div/ul/li[position()>1]",
75 | 'position': {'ip': './div[1]', 'port': './div[2]', 'type': './div[3]', 'protocol': ''}
76 |
77 | },
78 | {
79 | 'urls': ['http://www.ip181.com/daili/%s.html' % n for n in range(1, 11)],
80 | 'type': 'xpath',
81 | 'pattern': ".//div[@class='row']/div[3]/table/tbody/tr[position()>1]",
82 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[3]', 'protocol': './td[4]'}
83 |
84 | },
85 | {
86 | 'urls': ['http://www.xicidaili.com/%s/%s' % (m, n) for m in ['nn', 'nt', 'wn', 'wt'] for n in range(1, 8)],
87 | 'type': 'xpath',
88 | 'pattern': ".//*[@id='ip_list']/tr[position()>1]",
89 | 'position': {'ip': './td[2]', 'port': './td[3]', 'type': './td[5]', 'protocol': './td[6]'}
90 | },
91 | {
92 | 'urls': ['http://www.cnproxy.com/proxy%s.html' % i for i in range(1, 11)],
93 | 'type': 'module',
94 | 'moduleName': 'CnproxyPraser',
95 |         'pattern': r'<tr><td>(\d+\.\d+\.\d+\.\d+)<SCRIPT type=text/javascript>document.write\(":"\+(.+)\)</SCRIPT></td><td>(HTTP|SOCKS4)\s*',
96 | 'position': {'ip': 0, 'port': 1, 'type': -1, 'protocol': 2}
97 | }
98 | ]
99 | '''
100 | Database configuration
101 | '''
102 | DB_CONFIG = {
103 |
104 |     'DB_CONNECT_TYPE': 'sqlalchemy',  # one of: 'sqlalchemy', 'pymongo', 'redis'
105 | # 'DB_CONNECT_STRING':'mongodb://localhost:27017/'
106 | 'DB_CONNECT_STRING': 'sqlite:///' + os.path.dirname(__file__) + '/data/proxy.db'
107 | # DB_CONNECT_STRING : 'mysql+mysqldb://root:root@localhost/proxy?charset=utf8'
108 |
109 |     # 'DB_CONNECT_TYPE': 'redis',  # one of: 'sqlalchemy', 'pymongo', 'redis'
110 | # 'DB_CONNECT_STRING': 'redis://localhost:6379/8',
111 |
112 | }
113 | CHINA_AREA = ['河北', '山东', '辽宁', '黑龙江', '吉林'
114 | , '甘肃', '青海', '河南', '江苏', '湖北', '湖南',
115 | '江西', '浙江', '广东', '云南', '福建',
116 | '台湾', '海南', '山西', '四川', '陕西',
117 | '贵州', '安徽', '重庆', '北京', '上海', '天津', '广西', '内蒙', '西藏', '新疆', '宁夏', '香港', '澳门']
118 | QQWRY_PATH = os.path.dirname(__file__) + "/data/qqwry.dat"
119 | THREADNUM = 5
120 | API_PORT = 8000
121 | '''
122 | Crawling and proxy-checking settings
123 | There is no need to check whether an ip already exists, because stale entries are cleaned up periodically
124 | '''
125 | UPDATE_TIME = 30 * 60  # check every half hour whether any proxy ips have gone stale
126 | MINNUM = 50  # when the number of valid ips drops below MINNUM, the crawler starts
127 |
128 | TIMEOUT = 5  # socket timeout in seconds
129 | '''
130 | Anti-anti-crawler settings
131 | '''
132 | '''
133 | Number of download retries
134 | '''
135 | RETRY_TIME = 3
136 |
137 | '''
138 | USER_AGENTS: pool of request headers, picked at random
139 | '''
140 | USER_AGENTS = [
141 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
142 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
143 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
144 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
145 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
146 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
147 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
148 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
149 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
150 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
151 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
152 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
153 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
154 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
155 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
156 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
157 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
158 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
159 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
160 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
161 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
162 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
163 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
164 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
165 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
166 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
167 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
168 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
169 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
170 | "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
171 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
172 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
173 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
174 | "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
175 | ]
176 |
177 |
178 | def get_header():
179 | return {
180 | 'User-Agent': random.choice(USER_AGENTS),
181 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
182 | 'Accept-Language': 'en-US,en;q=0.5',
183 | 'Connection': 'keep-alive',
184 | 'Accept-Encoding': 'gzip, deflate',
185 | }
186 | # each newly crawled ip starts with DEFAULT_SCORE points; every failed check costs one point, and the ip is removed from the database once the score reaches 0
187 | DEFAULT_SCORE = 10
188 |
189 | TEST_URL = 'http://ip.chinaz.com/getip.aspx'
190 | TEST_IP = 'http://httpbin.org/ip'
191 | TEST_HTTP_HEADER = 'http://httpbin.org/get'
192 | TEST_HTTPS_HEADER = 'https://httpbin.org/get'
193 | # CHECK_PROXY lets users register their own proxy-checking function
194 | # The current check target is httpbin.org, but even if an ip passes validation and checking,
195 | # that only proves the proxy can reach httpbin.org, not necessarily the site you want to crawl.
196 | # You can therefore plug in your own check function here; as an example I use Baidu as the target.
197 | # See the baidu_check and detect_proxy functions in Validator.py for details.
198 |
199 | CHECK_PROXY = {'function': 'checkProxy'}  # {'function': 'baidu_check'}
200 |
201 | # squid settings below, not implemented yet
202 | # SQUID={'path':None,'confpath':'C:/squid/etc/squid.conf'}
203 |
204 | MAX_CHECK_PROCESS = 2  # maximum number of CHECK_PROXY processes
205 | MAX_CHECK_CONCURRENT_PER_PROCESS = 30  # maximum concurrency per process during CHECK_PROXY
206 | TASK_QUEUE_SIZE = 50  # size of the task queue
207 | MAX_DOWNLOAD_CONCURRENT = 3  # maximum concurrency when downloading from free proxy sites
208 | CHECK_WATI_TIME = 1  # wait time when the process count reaches the limit
--------------------------------------------------------------------------------
/data/qqwry.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiyeboy/IPProxyPool/127827829535641d57a1a05e20523cb0a192170b/data/qqwry.dat
--------------------------------------------------------------------------------
/db/DataStore.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import sys
3 | from config import DB_CONFIG
4 | from util.exception import Con_DB_Fail
5 |
6 |
7 | try:
8 | if DB_CONFIG['DB_CONNECT_TYPE'] == 'pymongo':
9 | from db.MongoHelper import MongoHelper as SqlHelper
10 | elif DB_CONFIG['DB_CONNECT_TYPE'] == 'redis':
11 | from db.RedisHelper import RedisHelper as SqlHelper
12 | else:
13 | from db.SqlHelper import SqlHelper as SqlHelper
14 | sqlhelper = SqlHelper()
15 | sqlhelper.init_db()
16 | except Exception as e:
17 | raise Con_DB_Fail
18 |
19 |
20 | def store_data(queue2, db_proxy_num):
21 | '''
22 | 读取队列中的数据,写入数据库中
23 | :param queue2:
24 | :return:
25 | '''
26 | successNum = 0
27 | failNum = 0
28 | while True:
29 | try:
30 | proxy = queue2.get(timeout=300)
31 | if proxy:
32 |
33 | sqlhelper.insert(proxy)
34 | successNum += 1
35 | else:
36 | failNum += 1
37 | str = 'IPProxyPool----->>>>>>>>Success ip num :%d,Fail ip num:%d' % (successNum, failNum)
38 | sys.stdout.write(str + "\r")
39 | sys.stdout.flush()
40 | except BaseException as e:
41 | if db_proxy_num.value != 0:
42 | successNum += db_proxy_num.value
43 | db_proxy_num.value = 0
44 | str = 'IPProxyPool----->>>>>>>>Success ip num :%d,Fail ip num:%d' % (successNum, failNum)
45 | sys.stdout.write(str + "\r")
46 | sys.stdout.flush()
47 | successNum = 0
48 | failNum = 0
49 |
50 |
51 |
--------------------------------------------------------------------------------
/db/ISqlHelper.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | class ISqlHelper(object):
4 | params = {'ip': None, 'port': None, 'types': None, 'protocol': None, 'country': None, 'area': None}
5 |
6 |     def init_db(self):
7 |         raise NotImplementedError
8 |
9 |     def drop_db(self):
10 |         raise NotImplementedError
11 |
12 |     def insert(self, value=None):
13 |         raise NotImplementedError
14 |
15 |     def delete(self, conditions=None):
16 |         raise NotImplementedError
17 |
18 |     def update(self, conditions=None, value=None):
19 |         raise NotImplementedError
20 |
21 |     def select(self, count=None, conditions=None):
22 |         raise NotImplementedError
--------------------------------------------------------------------------------
/db/MongoHelper.py:
--------------------------------------------------------------------------------
1 | import pymongo
2 | from config import DB_CONFIG, DEFAULT_SCORE
3 |
4 | from db.ISqlHelper import ISqlHelper
5 |
6 |
7 | class MongoHelper(ISqlHelper):
8 | def __init__(self):
9 | self.client = pymongo.MongoClient(DB_CONFIG['DB_CONNECT_STRING'], connect=False)
10 |
11 | def init_db(self):
12 | self.db = self.client.proxy
13 | self.proxys = self.db.proxys
14 |
15 | def drop_db(self):
16 | self.client.drop_database(self.db)
17 |
18 | def insert(self, value=None):
19 | if value:
20 | proxy = dict(ip=value['ip'], port=value['port'], types=value['types'], protocol=value['protocol'],
21 | country=value['country'],
22 | area=value['area'], speed=value['speed'], score=DEFAULT_SCORE)
23 | self.proxys.insert(proxy)
24 |
25 | def delete(self, conditions=None):
26 | if conditions:
27 | self.proxys.remove(conditions)
28 | return ('deleteNum', 'ok')
29 | else:
30 | return ('deleteNum', 'None')
31 |
32 | def update(self, conditions=None, value=None):
33 | # update({"UserName":"libing"},{"$set":{"Email":"libing@126.com","Password":"123"}})
34 | if conditions and value:
35 | self.proxys.update(conditions, {"$set": value})
36 | return {'updateNum': 'ok'}
37 | else:
38 | return {'updateNum': 'fail'}
39 |
40 | def select(self, count=None, conditions=None):
41 | if count:
42 | count = int(count)
43 | else:
44 | count = 0
45 | if conditions:
46 | conditions = dict(conditions)
47 | if 'count' in conditions:
48 | del conditions['count']
49 | conditions_name = ['types', 'protocol']
50 | for condition_name in conditions_name:
51 | value = conditions.get(condition_name, None)
52 | if value:
53 | conditions[condition_name] = int(value)
54 | else:
55 | conditions = {}
56 | items = self.proxys.find(conditions, limit=count).sort(
57 | [("speed", pymongo.ASCENDING), ("score", pymongo.DESCENDING)])
58 | results = []
59 | for item in items:
60 | result = (item['ip'], item['port'], item['score'])
61 | results.append(result)
62 | return results
63 |
64 |
65 | if __name__ == '__main__':
66 | # from db.MongoHelper import MongoHelper as SqlHelper
67 | # sqlhelper = SqlHelper()
68 | # sqlhelper.init_db()
69 | # # print sqlhelper.select(None,{'types':u'1'})
70 | # items= sqlhelper.proxys.find({'types':0})
71 | # for item in items:
72 | # print item
73 | # # # print sqlhelper.select(None,{'types':u'0'})
74 | pass
75 |
--------------------------------------------------------------------------------
/db/RedisHelper.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | from __future__ import unicode_literals
3 |
4 | from redis import Redis
5 |
6 | import config
7 | from db.ISqlHelper import ISqlHelper
8 | from db.SqlHelper import Proxy
9 |
10 |
11 | class RedisHelper(ISqlHelper):
12 | def __init__(self, url=None):
13 | self.index_names = ('types', 'protocol', 'country', 'area', 'score')
14 | self.redis_url = url or config.DB_CONFIG['DB_CONNECT_STRING']
15 |
16 | def get_proxy_name(self, ip=None, port=None, protocal=None, proxy=None):
17 | ip = ip or proxy.ip
18 | port = port or proxy.port
19 | protocal = protocal or proxy.protocol
20 | return "proxy::{}:{}:{}".format(ip, port, protocal)
21 |
22 | def get_index_name(self, index_name, value=None):
23 | if index_name == 'score':
24 | return 'index::score'
25 | return "index::{}:{}".format(index_name, value)
26 |
27 | def get_proxy_by_name(self, name):
28 | pd = self.redis.hgetall(name)
29 | if pd:
30 | return Proxy(**{k.decode('utf8'): v.decode('utf8') for k, v in pd.items()})
31 |
32 | def init_db(self, url=None):
33 | self.redis = Redis.from_url(url or self.redis_url)
34 |
35 | def drop_db(self):
36 | return self.redis.flushdb()
37 |
38 | def get_keys(self, conditions):
39 | select_keys = {self.get_index_name(key, conditions[key]) for key in conditions.keys() if
40 | key in self.index_names}
41 | if 'ip' in conditions and 'port' in conditions:
42 | return self.redis.keys(self.get_proxy_name(conditions['ip'], conditions['port'], '*'))
43 | if select_keys:
44 | return [name.decode('utf8') for name in self.redis.sinter(keys=select_keys)]
45 | return []
46 |
47 | def insert(self, value):
48 | proxy = Proxy(ip=value['ip'], port=value['port'], types=value['types'], protocol=value['protocol'],
49 | country=value['country'], area=value['area'],
50 | speed=value['speed'], score=value.get('score', config.DEFAULT_SCORE))
51 | mapping = proxy.__dict__
52 | for k in list(mapping.keys()):
53 | if k.startswith('_'):
54 | mapping.pop(k)
55 | object_name = self.get_proxy_name(proxy=proxy)
56 |         # store the proxy fields as a hash
57 |         insert_num = self.redis.hmset(object_name, mapping)
58 |         # build the secondary indexes
59 | if insert_num > 0:
60 | for index_name in self.index_names:
61 | self.create_index(index_name, object_name, proxy)
62 | return insert_num
63 |
64 | def create_index(self, index_name, object_name, proxy):
65 | redis_key = self.get_index_name(index_name, getattr(proxy, index_name))
66 | if index_name == 'score':
67 | return self.redis.zadd(redis_key, object_name, int(proxy.score))
68 | return self.redis.sadd(redis_key, object_name)
69 |
70 | def delete(self, conditions):
71 | proxy_keys = self.get_keys(conditions)
72 | index_keys = self.redis.keys(u"index::*")
73 | if not proxy_keys:
74 | return 0
75 |
76 | for iname in index_keys:
77 | if iname == b'index::score':
78 | self.redis.zrem(self.get_index_name('score'), *proxy_keys)
79 | else:
80 | self.redis.srem(iname, *proxy_keys)
81 | return self.redis.delete(*proxy_keys) if proxy_keys else 0
82 |
83 | def update(self, conditions, values):
84 | objects = self.get_keys(conditions)
85 | count = 0
86 | for name in objects:
87 | for k, v in values.items():
88 | if k == 'score':
89 | self.redis.zrem(self.get_index_name('score'), [name])
90 | self.redis.zadd(self.get_index_name('score'), name, int(v))
91 | self.redis.hset(name, key=k, value=v)
92 | count += 1
93 | return count
94 |
95 | def select(self, count=None, conditions=None):
96 |         count = (count and int(count)) or 1000  # return at most 1000 records
97 | count = 1000 if count > 1000 else count
98 |
99 | querys = {k: v for k, v in conditions.items() if k in self.index_names} if conditions else None
100 | if querys:
101 | objects = list(self.get_keys(querys))[:count]
102 | redis_name = self.get_index_name('score')
103 | objects.sort(key=lambda x: int(self.redis.zscore(redis_name, x)))
104 | else:
105 | objects = list(
106 | self.redis.zrevrangebyscore(self.get_index_name("score"), '+inf', '-inf', start=0, num=count))
107 |
108 | result = []
109 | for name in objects:
110 | p = self.get_proxy_by_name(name)
111 | result.append((p.ip, p.port, p.score))
112 | return result
113 |
114 |
115 | if __name__ == '__main__':
116 | sqlhelper = RedisHelper()
117 | sqlhelper.init_db('redis://localhost:6379/9')
118 | proxy = {'ip': '192.168.1.1', 'port': 80, 'type': 0, 'protocol': 0, 'country': '中国', 'area': '广州', 'speed': 11.123,
119 | 'types': 1}
120 | proxy2 = {'ip': 'localhost', 'port': 433, 'type': 1, 'protocol': 1, 'country': u'中国', 'area': u'广州', 'speed': 123,
121 | 'types': 0, 'score': 100}
122 | assert sqlhelper.insert(proxy) == True
123 | assert sqlhelper.insert(proxy2) == True
124 | assert sqlhelper.get_keys({'types': 1}) == ['proxy::192.168.1.1:80:0', ], sqlhelper.get_keys({'types': 1})
125 | assert sqlhelper.select(conditions={'protocol': 0}) == [('192.168.1.1', '80', '0')]
126 | assert sqlhelper.update({'types': 1}, {'score': 888}) == 1
127 | assert sqlhelper.select() == [('192.168.1.1', '80', '888'), ('localhost', '433', '100')]
128 | # assert sqlhelper.delete({'types': 1}) == 1
129 | # sqlhelper.drop_db()
130 | print('All pass.')
131 |
--------------------------------------------------------------------------------
/db/SqlHelper.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import datetime
3 | from sqlalchemy import Column, Integer, String, DateTime, Numeric, create_engine, VARCHAR
4 | from sqlalchemy.ext.declarative import declarative_base
5 | from sqlalchemy.orm import sessionmaker
6 | from config import DB_CONFIG, DEFAULT_SCORE
7 |
8 | from db.ISqlHelper import ISqlHelper
9 |
10 | '''
11 | Base class for SQL operations
12 | Fields: ip, port, types (0 elite/high anonymity, 1 transparent), protocol (0 http, 1 https), country, area (province/city), updatetime (last update time),
13 | speed (connection speed)
14 | '''
15 |
16 | BaseModel = declarative_base()
17 |
18 |
19 | class Proxy(BaseModel):
20 | __tablename__ = 'proxys'
21 | id = Column(Integer, primary_key=True, autoincrement=True)
22 | ip = Column(VARCHAR(16), nullable=False)
23 | port = Column(Integer, nullable=False)
24 | types = Column(Integer, nullable=False)
25 | protocol = Column(Integer, nullable=False, default=0)
26 | country = Column(VARCHAR(100), nullable=False)
27 | area = Column(VARCHAR(100), nullable=False)
28 | updatetime = Column(DateTime(), default=datetime.datetime.utcnow)
29 | speed = Column(Numeric(5, 2), nullable=False)
30 | score = Column(Integer, nullable=False, default=DEFAULT_SCORE)
31 |
32 |
33 | class SqlHelper(ISqlHelper):
34 | params = {'ip': Proxy.ip, 'port': Proxy.port, 'types': Proxy.types, 'protocol': Proxy.protocol,
35 | 'country': Proxy.country, 'area': Proxy.area, 'score': Proxy.score}
36 |
37 | def __init__(self):
38 | if 'sqlite' in DB_CONFIG['DB_CONNECT_STRING']:
39 | connect_args = {'check_same_thread': False}
40 | self.engine = create_engine(DB_CONFIG['DB_CONNECT_STRING'], echo=False, connect_args=connect_args)
41 | else:
42 | self.engine = create_engine(DB_CONFIG['DB_CONNECT_STRING'], echo=False)
43 | DB_Session = sessionmaker(bind=self.engine)
44 | self.session = DB_Session()
45 |
46 | def init_db(self):
47 | BaseModel.metadata.create_all(self.engine)
48 |
49 | def drop_db(self):
50 | BaseModel.metadata.drop_all(self.engine)
51 |
52 |
53 | def insert(self, value):
54 | proxy = Proxy(ip=value['ip'], port=value['port'], types=value['types'], protocol=value['protocol'],
55 | country=value['country'],
56 | area=value['area'], speed=value['speed'])
57 | self.session.add(proxy)
58 | self.session.commit()
59 |
60 |
61 | def delete(self, conditions=None):
62 | if conditions:
63 | conditon_list = []
64 | for key in list(conditions.keys()):
65 | if self.params.get(key, None):
66 | conditon_list.append(self.params.get(key) == conditions.get(key))
67 | conditions = conditon_list
68 | query = self.session.query(Proxy)
69 | for condition in conditions:
70 | query = query.filter(condition)
71 | deleteNum = query.delete()
72 | self.session.commit()
73 | else:
74 | deleteNum = 0
75 | return ('deleteNum', deleteNum)
76 |
77 |
78 | def update(self, conditions=None, value=None):
79 |         '''
80 |         conditions is a dict shaped like self.params
81 |         :param conditions:
82 |         :param value: also a dict, e.g. {'ip': '192.168.0.1'}
83 |         :return:
84 |         '''
85 | if conditions and value:
86 | conditon_list = []
87 | for key in list(conditions.keys()):
88 | if self.params.get(key, None):
89 | conditon_list.append(self.params.get(key) == conditions.get(key))
90 | conditions = conditon_list
91 | query = self.session.query(Proxy)
92 | for condition in conditions:
93 | query = query.filter(condition)
94 | updatevalue = {}
95 | for key in list(value.keys()):
96 | if self.params.get(key, None):
97 | updatevalue[self.params.get(key, None)] = value.get(key)
98 | updateNum = query.update(updatevalue)
99 | self.session.commit()
100 | else:
101 | updateNum = 0
102 | return {'updateNum': updateNum}
103 |
104 |
105 | def select(self, count=None, conditions=None):
106 |         '''
107 |         conditions is a dict shaped like self.params
108 |         :param count:
109 |         :param conditions:
110 |         :return:
111 |         '''
112 | if conditions:
113 | conditon_list = []
114 | for key in list(conditions.keys()):
115 | if self.params.get(key, None):
116 | conditon_list.append(self.params.get(key) == conditions.get(key))
117 | conditions = conditon_list
118 | else:
119 | conditions = []
120 |
121 | query = self.session.query(Proxy.ip, Proxy.port, Proxy.score)
122 | if len(conditions) > 0 and count:
123 | for condition in conditions:
124 | query = query.filter(condition)
125 | return query.order_by(Proxy.score.desc(), Proxy.speed).limit(count).all()
126 | elif count:
127 | return query.order_by(Proxy.score.desc(), Proxy.speed).limit(count).all()
128 | elif len(conditions) > 0:
129 | for condition in conditions:
130 | query = query.filter(condition)
131 | return query.order_by(Proxy.score.desc(), Proxy.speed).all()
132 | else:
133 | return query.order_by(Proxy.score.desc(), Proxy.speed).all()
134 |
135 |
136 | def close(self):
137 | pass
138 |
139 |
140 | if __name__ == '__main__':
141 | sqlhelper = SqlHelper()
142 | sqlhelper.init_db()
143 | proxy = {'ip': '192.168.1.1', 'port': 80, 'type': 0, 'protocol': 0, 'country': '中国', 'area': '广州', 'speed': 11.123, 'types': ''}
144 | sqlhelper.insert(proxy)
145 | sqlhelper.update({'ip': '192.168.1.1', 'port': 80}, {'score': 10})
146 | print(sqlhelper.select(1))
147 |
148 |
--------------------------------------------------------------------------------
/db/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Xaxdus'
2 |
--------------------------------------------------------------------------------
/qiye2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qiyeboy/IPProxyPool/127827829535641d57a1a05e20523cb0a192170b/qiye2.jpg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | chardet==2.3.0
2 | gevent==1.2.0
3 | greenlet==0.4.11
4 | lxml==3.7.1
5 | requests==2.12.4
6 | SQLAlchemy==1.1.4
7 | web.py==0.38
8 | redis==2.10.5
9 | psutil
10 |
--------------------------------------------------------------------------------
/spider/HtmlDownloader.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 |
3 | import random
4 | import config
5 | import json
6 | from db.DataStore import sqlhelper
7 |
8 | __author__ = 'qiye'
9 |
10 | import requests
11 | import chardet
12 |
13 |
14 | class Html_Downloader(object):
15 | @staticmethod
16 | def download(url):
17 | try:
18 | r = requests.get(url=url, headers=config.get_header(), timeout=config.TIMEOUT)
19 | r.encoding = chardet.detect(r.content)['encoding']
20 | if (not r.ok) or len(r.content) < 500:
21 | raise ConnectionError
22 | else:
23 | return r.text
24 |
25 | except Exception:
26 |             count = 0  # retry counter
27 | proxylist = sqlhelper.select(10)
28 | if not proxylist:
29 | return None
30 |
31 | while count < config.RETRY_TIME:
32 | try:
33 | proxy = random.choice(proxylist)
34 | ip = proxy[0]
35 | port = proxy[1]
36 | proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
37 |
38 | r = requests.get(url=url, headers=config.get_header(), timeout=config.TIMEOUT, proxies=proxies)
39 | r.encoding = chardet.detect(r.content)['encoding']
40 | if (not r.ok) or len(r.content) < 500:
41 | raise ConnectionError
42 | else:
43 | return r.text
44 | except Exception:
45 | count += 1
46 |
47 | return None
48 |
--------------------------------------------------------------------------------
/spider/HtmlPraser.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import base64
3 | from config import QQWRY_PATH, CHINA_AREA
4 | from util.IPAddress import IPAddresss
5 | import re
6 | from util.compatibility import text_
7 |
8 | __author__ = 'qiye'
9 | from lxml import etree
10 |
11 |
12 | class Html_Parser(object):
13 | def __init__(self):
14 | self.ips = IPAddresss(QQWRY_PATH)
15 |
16 | def parse(self, response, parser):
17 |         '''
18 |         Parse a downloaded page into a list of proxy dicts
19 |         :param response: response body
20 |         :param parser: rule dict from config.parserList; its 'type' field selects the parsing method
21 |         :return:
22 |         '''
23 | if parser['type'] == 'xpath':
24 | return self.XpathPraser(response, parser)
25 | elif parser['type'] == 'regular':
26 | return self.RegularPraser(response, parser)
27 | elif parser['type'] == 'module':
28 | return getattr(self, parser['moduleName'], None)(response, parser)
29 | else:
30 | return None
31 |
32 | def AuthCountry(self, addr):
33 |         '''
34 |         Decide whether the address belongs to China (used to set country to 国内/国外)
35 |         :param addr:
36 |         :return:
37 |         '''
38 | for area in CHINA_AREA:
39 | if text_(area) in addr:
40 | return True
41 | return False
42 |
43 |
44 | def XpathPraser(self, response, parser):
45 |         '''
46 |         Parse the page with XPath expressions
47 |         :param response:
48 |         :param parser:
49 |         :return:
50 |         '''
51 | proxylist = []
52 | root = etree.HTML(response)
53 | proxys = root.xpath(parser['pattern'])
54 | for proxy in proxys:
55 | try:
56 | ip = proxy.xpath(parser['position']['ip'])[0].text
57 | port = proxy.xpath(parser['position']['port'])[0].text
58 | type = 0
59 | protocol = 0
60 | addr = self.ips.getIpAddr(self.ips.str2ip(ip))
61 | country = text_('')
62 | area = text_('')
63 | if text_('省') in addr or self.AuthCountry(addr):
64 | country = text_('国内')
65 | area = addr
66 | else:
67 | country = text_('国外')
68 | area = addr
69 | except Exception as e:
70 | continue
71 | # updatetime = datetime.datetime.now()
72 |             # ip, port, type (0 elite, 1 transparent), protocol (0 http, 1 https), country, area (province/city), updatetime
73 |
74 | # proxy ={'ip':ip,'port':int(port),'type':int(type),'protocol':int(protocol),'country':country,'area':area,'updatetime':updatetime,'speed':100}
75 | proxy = {'ip': ip, 'port': int(port), 'types': int(type), 'protocol': int(protocol), 'country': country,
76 | 'area': area, 'speed': 100}
77 | proxylist.append(proxy)
78 | return proxylist
79 |
80 | def RegularPraser(self, response, parser):
81 |         '''
82 |         Parse the page with a regular expression
83 |         :param response:
84 |         :param parser:
85 |         :return:
86 |         '''
87 | proxylist = []
88 | pattern = re.compile(parser['pattern'])
89 | matchs = pattern.findall(response)
90 | if matchs != None:
91 | for match in matchs:
92 | try:
93 | ip = match[parser['position']['ip']]
94 | port = match[parser['position']['port']]
95 |                     # the type reported by these sites is unreliable, so use the default; it is re-checked later
96 | type = 0
97 | # if parser['postion']['protocol'] > 0:
98 | # protocol = match[parser['postion']['protocol']]
99 | # if protocol.lower().find('https')!=-1:
100 | # protocol = 1
101 | # else:
102 | # protocol = 0
103 | # else:
104 | protocol = 0
105 | addr = self.ips.getIpAddr(self.ips.str2ip(ip))
106 | country = text_('')
107 | area = text_('')
108 | # print(ip,port)
109 | if text_('省') in addr or self.AuthCountry(addr):
110 | country = text_('国内')
111 | area = addr
112 | else:
113 | country = text_('国外')
114 | area = addr
115 | except Exception as e:
116 | continue
117 |
118 | proxy = {'ip': ip, 'port': port, 'types': type, 'protocol': protocol, 'country': country, 'area': area,
119 | 'speed': 100}
120 |
121 | proxylist.append(proxy)
122 | return proxylist
123 |
124 |
125 | def CnproxyPraser(self, response, parser):
126 | proxylist = self.RegularPraser(response, parser)
127 | chardict = {'v': '3', 'm': '4', 'a': '2', 'l': '9', 'q': '0', 'b': '5', 'i': '7', 'w': '6', 'r': '8', 'c': '1'}
128 |
129 | for proxy in proxylist:
130 | port = proxy['port']
131 | new_port = ''
132 | for i in range(len(port)):
133 | if port[i] != '+':
134 | new_port += chardict[port[i]]
135 | new_port = int(new_port)
136 | proxy['port'] = new_port
137 | return proxylist
138 |
139 |
140 | def proxy_listPraser(self, response, parser):
141 | proxylist = []
142 | pattern = re.compile(parser['pattern'])
143 | matchs = pattern.findall(response)
144 | if matchs:
145 | for match in matchs:
146 | try:
147 |                     ip_port = base64.b64decode(match.replace("Proxy('", "").replace("')", "")).decode('utf-8')
148 | ip = ip_port.split(':')[0]
149 | port = ip_port.split(':')[1]
150 | type = 0
151 | protocol = 0
152 | addr = self.ips.getIpAddr(self.ips.str2ip(ip))
153 | country = text_('')
154 | area = text_('')
155 | # print(ip,port)
156 | if text_('省') in addr or self.AuthCountry(addr):
157 | country = text_('国内')
158 | area = addr
159 | else:
160 | country = text_('国外')
161 | area = addr
162 | except Exception as e:
163 | continue
164 | proxy = {'ip': ip, 'port': int(port), 'types': type, 'protocol': protocol, 'country': country,
165 | 'area': area, 'speed': 100}
166 | proxylist.append(proxy)
167 | return proxylist
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
--------------------------------------------------------------------------------
/spider/ProxyCrawl.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | from gevent import monkey
3 | monkey.patch_all()
4 |
5 | import sys
6 | import time
7 | import gevent
8 |
9 | from gevent.pool import Pool
10 | from multiprocessing import Queue, Process, Value
11 |
12 | from api.apiServer import start_api_server
13 | from config import THREADNUM, parserList, UPDATE_TIME, MINNUM, MAX_CHECK_CONCURRENT_PER_PROCESS, MAX_DOWNLOAD_CONCURRENT
14 | from db.DataStore import store_data, sqlhelper
15 | from spider.HtmlDownloader import Html_Downloader
16 | from spider.HtmlPraser import Html_Parser
17 | from validator.Validator import validator, getMyIP, detect_from_db
18 |
19 | '''
20 | Crawler logic: periodically re-check the proxies in the database and crawl new ones when the pool runs low
21 | '''
22 |
23 |
24 | def startProxyCrawl(queue, db_proxy_num,myip):
25 | crawl = ProxyCrawl(queue, db_proxy_num,myip)
26 | crawl.run()
27 |
28 |
29 | class ProxyCrawl(object):
30 | proxies = set()
31 |
32 | def __init__(self, queue, db_proxy_num,myip):
33 | self.crawl_pool = Pool(THREADNUM)
34 | self.queue = queue
35 | self.db_proxy_num = db_proxy_num
36 | self.myip = myip
37 |
38 |
39 | def run(self):
40 | while True:
41 | self.proxies.clear()
42 | str = 'IPProxyPool----->>>>>>>>beginning'
43 | sys.stdout.write(str + "\r\n")
44 | sys.stdout.flush()
45 | proxylist = sqlhelper.select()
46 |
47 | spawns = []
48 | for proxy in proxylist:
49 | spawns.append(gevent.spawn(detect_from_db, self.myip, proxy, self.proxies))
50 | if len(spawns) >= MAX_CHECK_CONCURRENT_PER_PROCESS:
51 | gevent.joinall(spawns)
52 | spawns= []
53 | gevent.joinall(spawns)
54 | self.db_proxy_num.value = len(self.proxies)
55 | str = 'IPProxyPool----->>>>>>>>db exists ip:%d' % len(self.proxies)
56 |
57 | if len(self.proxies) < MINNUM:
58 | str += '\r\nIPProxyPool----->>>>>>>>now ip num < MINNUM,start crawling...'
59 | sys.stdout.write(str + "\r\n")
60 | sys.stdout.flush()
61 | spawns = []
62 | for p in parserList:
63 | spawns.append(gevent.spawn(self.crawl, p))
64 | if len(spawns) >= MAX_DOWNLOAD_CONCURRENT:
65 | gevent.joinall(spawns)
66 | spawns= []
67 | gevent.joinall(spawns)
68 | else:
69 | str += '\r\nIPProxyPool----->>>>>>>>now ip num meet the requirement,wait UPDATE_TIME...'
70 | sys.stdout.write(str + "\r\n")
71 | sys.stdout.flush()
72 |
73 | time.sleep(UPDATE_TIME)
74 |
75 | def crawl(self, parser):
76 | html_parser = Html_Parser()
77 | for url in parser['urls']:
78 | response = Html_Downloader.download(url)
79 | if response is not None:
80 | proxylist = html_parser.parse(response, parser)
81 | if proxylist is not None:
82 | for proxy in proxylist:
83 | proxy_str = '%s:%s' % (proxy['ip'], proxy['port'])
84 | if proxy_str not in self.proxies:
85 | self.proxies.add(proxy_str)
86 | while True:
87 | if self.queue.full():
88 | time.sleep(0.1)
89 | else:
90 | self.queue.put(proxy)
91 | break
92 |
93 |
94 | if __name__ == "__main__":
95 |     myip = getMyIP()
96 |     DB_PROXY_NUM = Value('i', 0)
97 |     q1, q2 = Queue(), Queue()
98 |     p0 = Process(target=start_api_server)
99 |     p1 = Process(target=startProxyCrawl, args=(q1, DB_PROXY_NUM, myip))
100 |     p2 = Process(target=validator, args=(q1, q2, myip))
101 |     p3 = Process(target=store_data, args=(q2, DB_PROXY_NUM))
102 |
103 | p0.start()
104 | p1.start()
105 | p2.start()
106 | p3.start()
107 |
108 | # spider = ProxyCrawl()
109 | # spider.run()
110 |
--------------------------------------------------------------------------------
/spider/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Xaxdus'
2 |
--------------------------------------------------------------------------------
/start.bat:
--------------------------------------------------------------------------------
1 | py -2 IPProxy.py
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Xaxdus'
2 |
--------------------------------------------------------------------------------
/test/test.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import requests
3 | import json
4 |
5 | r = requests.get('http://127.0.0.1:8000/?types=0&count=5&country=国内')
6 | ip_ports = json.loads(r.text)
7 | print(ip_ports)
8 | ip = ip_ports[0][0]
9 | port = ip_ports[0][1]
10 | proxies = {
11 | 'http': 'http://%s:%s' % (ip, port),
12 | 'https': 'http://%s:%s' % (ip, port)
13 | }
14 | r = requests.get('http://www.baidu.com', proxies=proxies)
15 | r.encoding = 'utf-8'
16 | print(r.text)
17 |
--------------------------------------------------------------------------------
/test/testIPAddress.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import os
4 |
5 | import socket
6 | import struct
7 |
8 | import logging
9 |
10 |
11 | logger = logging.getLogger('util')
12 |
13 |
14 | class IPAddresss:
15 | def __init__(self, ipdbFile):
16 | self.ipdb = open(ipdbFile, "rb")
17 | str = self.ipdb.read(8)
18 | (self.firstIndex, self.lastIndex) = struct.unpack('II', str)
19 | self.indexCount = int((self.lastIndex - self.firstIndex) / 7 + 1)
20 | # print self.getVersion(), u" 纪录总数: %d 条 "%(self.indexCount)
21 |
22 | def getVersion(self):
23 | s = self.getIpAddr(0xffffff00)
24 | return s
25 |
26 | def getAreaAddr(self, offset=0):
27 | if offset:
28 | self.ipdb.seek(offset)
29 | str = self.ipdb.read(1)
30 | (byte,) = struct.unpack('B', str)
31 | if byte == 0x01 or byte == 0x02:
32 | p = self.getLong3()
33 | if p:
34 | return self.getString(p)
35 | else:
36 | return ""
37 | else:
38 | self.ipdb.seek(-1, 1)
39 | return self.getString(offset)
40 |
41 | def getAddr(self, offset, ip=0):
42 | self.ipdb.seek(offset + 4)
43 | countryAddr = ""
44 | areaAddr = ""
45 | str = self.ipdb.read(1)
46 | (byte,) = struct.unpack('B', str)
47 | if byte == 0x01:
48 | countryOffset = self.getLong3()
49 | self.ipdb.seek(countryOffset)
50 | str = self.ipdb.read(1)
51 | (b,) = struct.unpack('B', str)
52 | if b == 0x02:
53 | countryAddr = self.getString(self.getLong3())
54 | self.ipdb.seek(countryOffset + 4)
55 | else:
56 | countryAddr = self.getString(countryOffset)
57 | areaAddr = self.getAreaAddr()
58 | elif byte == 0x02:
59 | countryAddr = self.getString(self.getLong3())
60 | areaAddr = self.getAreaAddr(offset + 8)
61 | else:
62 | countryAddr = self.getString(offset + 4)
63 | areaAddr = self.getAreaAddr()
64 | return countryAddr + " " + areaAddr
65 |
66 | def dump(self, first, last):
67 | if last > self.indexCount:
68 | last = self.indexCount
69 | for index in range(first, last):
70 | offset = self.firstIndex + index * 7
71 | self.ipdb.seek(offset)
72 | buf = self.ipdb.read(7)
73 | (ip, of1, of2) = struct.unpack("IHB", buf)
74 | address = self.getAddr(of1 + (of2 << 16))
75 |             # convert the GBK address to UTF-8
76 | address = str(address, 'gbk').encode("utf-8")
77 | logger.info("%d %s %s" % (index, self.ip2str(ip), address))
78 |
79 | def setIpRange(self, index):
80 | offset = self.firstIndex + index * 7
81 | self.ipdb.seek(offset)
82 | buf = self.ipdb.read(7)
83 | (self.curStartIp, of1, of2) = struct.unpack("IHB", buf)
84 | self.curEndIpOffset = of1 + (of2 << 16)
85 | self.ipdb.seek(self.curEndIpOffset)
86 | buf = self.ipdb.read(4)
87 | (self.curEndIp,) = struct.unpack("I", buf)
88 |
89 | def getIpAddr(self, ip):
90 | L = 0
91 | R = self.indexCount - 1
92 | while L < R - 1:
93 | M = int((L + R) / 2)
94 | self.setIpRange(M)
95 | if ip == self.curStartIp:
96 | L = M
97 | break
98 | if ip > self.curStartIp:
99 | L = M
100 | else:
101 | R = M
102 | self.setIpRange(L)
103 | # version information, 255.255.255.X, urgy but useful
104 | if ip & 0xffffff00 == 0xffffff00:
105 | self.setIpRange(R)
106 | if self.curStartIp <= ip <= self.curEndIp:
107 | address = self.getAddr(self.curEndIpOffset)
108 |             # address is already decoded text; getString() handled the GBK bytes
109 | address = str(address)
110 | else:
111 | address = "未找到该IP的地址"
112 | return address
113 |
114 | def getIpRange(self, ip):
115 | self.getIpAddr(ip)
116 | range = self.ip2str(self.curStartIp) + ' - ' \
117 | + self.ip2str(self.curEndIp)
118 | return range
119 |
120 | def getString(self, offset=0):
121 | if offset:
122 | self.ipdb.seek(offset)
123 | str = b''
124 | ch = self.ipdb.read(1)
125 | (byte,) = struct.unpack('B', ch)
126 | while byte != 0:
127 | str += ch
128 | ch = self.ipdb.read(1)
129 | (byte,) = struct.unpack('B', ch)
130 | return str.decode('gbk')
131 |
132 | def ip2str(self, ip):
133 | return str(ip >> 24) + '.' + str((ip >> 16) & 0xff) + '.' + str((ip >> 8) & 0xff) + '.' + str(ip & 0xff)
134 |
135 | def str2ip(self, s):
136 | (ip,) = struct.unpack('I', socket.inet_aton(s))
137 | return ((ip >> 24) & 0xff) | ((ip & 0xff) << 24) | ((ip >> 8) & 0xff00) | ((ip & 0xff00) << 8)
138 |
139 | def getLong3(self, offset=0):
140 | if offset:
141 | self.ipdb.seek(offset)
142 | str = self.ipdb.read(3)
143 | (a, b) = struct.unpack('HB', str)
144 | return (b << 16) + a
145 |
146 |
147 | QQWRY_PATH = os.path.dirname(__file__) + "/../data/qqwry.dat"
148 | ips = IPAddresss(QQWRY_PATH)
149 | addr = ips.getIpAddr(ips.str2ip('183.61.236.53'))
150 | print(addr)
151 |
--------------------------------------------------------------------------------
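For reference, the index records that setIpRange() and dump() unpack above follow the qqwry.dat layout the parser assumes: an 8-byte header holding the offsets of the first and last index entries, then 7-byte entries consisting of a 4-byte start IP and a 3-byte offset to the matching record (which begins with the 4-byte end IP). Below is a minimal standalone sketch of reading one entry; the path and index value are illustrative only, not part of the project code.

# Minimal sketch: read one qqwry.dat index entry the same way setIpRange() does.
# QQWRY_PATH and INDEX are illustrative values, not part of the project code.
import struct

QQWRY_PATH = "data/qqwry.dat"
INDEX = 0  # which index entry to inspect

with open(QQWRY_PATH, "rb") as f:
    first_index, last_index = struct.unpack("II", f.read(8))  # header: index range offsets
    f.seek(first_index + INDEX * 7)
    start_ip, of1, of2 = struct.unpack("IHB", f.read(7))      # 4-byte start IP + 3-byte record offset
    record_offset = of1 + (of2 << 16)
    f.seek(record_offset)
    (end_ip,) = struct.unpack("I", f.read(4))                 # each record begins with the end IP
    print(start_ip, end_ip, record_offset)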
/test/testIPType.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | from lxml import etree
3 | import requests
4 | import config
5 |
6 |
7 | def checkProxyType(selfip, proxies):
8 | '''
9 |     Detect the proxy's anonymity type; the info published by free proxy sites is unreliable, so we test it ourselves.
10 |     :param proxies: the proxy to test (returns 0 elite, 1 anonymous, 2 transparent, 3 invalid)
11 | :return:
12 | '''
13 |
14 | try:
15 | r = requests.get(url='https://incloak.com/ip/', headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies)
16 |         print(r.text)
18 | # if r.ok:
19 | # root = etree.HTML(r.text)
20 | # ip = root.xpath('.//center[2]/table/tr[3]/td[2]')[0].text
21 | # http_x_forwared_for = root.xpath('.//center[2]/table/tr[8]/td[2]')[0].text
22 | # http_via = root.xpath('.//center[2]/table/tr[9]/td[2]')[0].text
23 | # # print ip,http_x_forwared_for,http_via,type(http_via),type(http_x_forwared_for)
24 | # if ip==selfip:
25 | # return 3
26 | # if http_x_forwared_for is None and http_via is None:
27 | # return 0
28 | # if http_via != None and http_x_forwared_for.find(selfip)== -1:
29 | # return 1
30 | #
31 | # if http_via != None and http_x_forwared_for.find(selfip)!= -1:
32 | # return 2
33 | # return 3
34 |
35 |
36 | except Exception as e:
37 |         print(str(e))
39 | return 3
40 |
41 |
42 | if __name__ == '__main__':
43 | ip = '61.132.241.109'
44 | port = '808'
45 | proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
46 | checkProxyType(None, proxies)
--------------------------------------------------------------------------------
/test/testbase64.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import base64
3 | import re
4 |
5 | str = '''
6 |
7 | '''
8 | match = re.search(r'Proxy\(.+\)', str)
9 | print(match.group())
10 | ip_port = base64.b64decode(match.group().replace("Proxy('", "").replace("')", ""))
11 | print(ip_port)
14 |
--------------------------------------------------------------------------------
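The sample markup that testbase64.py is meant to parse did not survive in this listing, so the script fails on an empty string as written. Below is a self-contained sketch of the intended flow, using an invented page snippet and an invented token ("1.2.3.4:8080" base64-encoded), not real site data.

# Self-contained sketch of the Proxy('...') decoding above; the page and token are invented.
import base64
import re

page = "document.write(Proxy('MS4yLjMuNDo4MDgw'))"
match = re.search(r"Proxy\('(.+?)'\)", page)
print(match.group())                    # Proxy('MS4yLjMuNDo4MDgw')
ip_port = base64.b64decode(match.group(1))
print(ip_port.decode('utf-8'))          # 1.2.3.4:8080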
/test/testhttpserver.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import BaseHTTPServer
3 | import json
4 | import urlparse
5 |
6 |
7 | class WebRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
8 | def do_GET(self):
9 | """
10 | """
11 |         print(self.path)
12 |         parsed_path = urlparse.urlparse(self.path)
13 |         print(parsed_path)
14 |         print(parsed_path.query)
18 | # message_parts = [
19 | # 'CLIENT VALUES:',
20 | # 'client_address=%s (%s)' % (self.client_address,
21 | # self.address_string()),
22 | # 'command=%s' % self.command,
23 | # 'path=%s' % self.path,
24 | # 'real path=%s' % parsed_path.path,
25 | # 'query=%s' % parsed_path.query,
26 | # 'request_version=%s' % self.request_version,
27 | # '',
28 | # 'SERVER VALUES:',
29 | # 'server_version=%s' % self.server_version,
30 | # 'sys_version=%s' % self.sys_version,
31 | # 'protocol_version=%s' % self.protocol_version,
32 | # '',
33 | # 'HEADERS RECEIVED:',
34 | # ]
35 | # for name, value in sorted(self.headers.items()):
36 | # message_parts.append('%s=%s' % (name, value.rstrip()))
37 | # message_parts.append('')
38 | # message = '\r\n'.join(message_parts)
39 | data1 = [{'ip': '192.168.0.0', 'port': 456}] * 10
40 | d1 = json.dumps(data1, sort_keys=True, indent=4)
41 | message = ('192.168.1.1', 80)
42 | self.send_response(200)
43 | self.end_headers()
44 | self.wfile.write(d1)
45 |
46 |
47 | server = BaseHTTPServer.HTTPServer(('0.0.0.0', 8000), WebRequestHandler)
48 | server.serve_forever()
--------------------------------------------------------------------------------
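testhttpserver.py above is Python 2 only (BaseHTTPServer and urlparse were renamed in Python 3). Below is a minimal Python 3 sketch of the same handler, for reference; it is not part of the repository.

# Minimal Python 3 sketch of the handler above, using http.server and urllib.parse.
import json
from http.server import BaseHTTPRequestHandler, HTTPServer
from urllib.parse import urlparse


class WebRequestHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        parsed_path = urlparse(self.path)
        print(self.path, parsed_path.query)
        data = [{'ip': '192.168.0.0', 'port': 456}] * 10
        body = json.dumps(data, sort_keys=True, indent=4)
        self.send_response(200)
        self.end_headers()
        self.wfile.write(body.encode('utf-8'))  # wfile expects bytes in Python 3


server = HTTPServer(('0.0.0.0', 8000), WebRequestHandler)
server.serve_forever()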
/test/testlist.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | from decimal import Decimal
3 |
4 | __author__ = 'Xaxdus'
5 |
6 |
7 | # list = ["www.baidu.com/%s" %m for m in ['index']+range(1,5)]
8 | #
9 | # list = [(1,10)]*10
10 | #
11 | # for m,n in list:
12 | # print m,n
13 | #
14 | #
15 | # list2 = ["www.baidu.com/%s/%s"%(i[0],i[1]) for i in list]
16 | # print list2
17 |
18 | # x=Decimal('0.998531571219').quantize(Decimal('0.00'))
19 | # a= 0.998531571219
20 | # value = round(a, 3)
21 | # print x,type(x),value
22 | # proxys=[]
23 | # proxy=[123,1234]
24 | # proxys.append(proxy)
25 | #
26 | # proxy=[123,1234]
27 | # proxys.append(proxy)
28 | #
29 | # print proxys
30 | # l = [{'ip':'123.1.1.1','port':80},{'ip':'123.1.1.1','port':80},{'ip':'123.1.2.1','port':80},{'ip':'123.1.1.1','port':81}]
31 | #
32 | # # for d in l:
33 | # # print [tuple(d.items())]
34 | # print [tuple(d.items()) for d in l]
35 | #
36 | # print [dict(t) for t in set([tuple(d.items()) for d in l])]
37 | import requests
38 |
39 | r = requests.get('http://127.0.0.1:8000/delete?ip=120.92.3.127')
40 | print(r.text)
--------------------------------------------------------------------------------
/test/testlxml.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | from lxml import etree
3 |
4 | __author__ = 'Xaxdus'
5 |
6 | html = '''
7 |
8 |
9 |
10 |
11 | 北京http代理ip_66免费代理ip提取网
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 | |
56 |
57 |
58 |
59 | |
60 |
61 |
62 |
63 |
108 |
109 |
110 |
111 |
112 |
113 |
116 |
135 |
136 |
137 |
162 |
163 |
164 |
165 | '''
166 |
167 | root = etree.HTML(html)
168 | proxys = root.xpath(".//*[@id='footer']/div/table/tr[position()>1]")
169 |
170 | for proxy in proxys:
171 |     print(proxy.xpath('./td[1]')[0].text)
--------------------------------------------------------------------------------
/test/testqueue.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | from multiprocessing import Queue
3 |
4 | try:
5 | q = Queue()
6 | q.get(timeout=5)
7 | except BaseException as e:
8 |     print('--' + str(e))
10 |
11 |
--------------------------------------------------------------------------------
/test/testsql.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | from db.SqlHelper import SqlHelper
3 | from util.exception import Con_DB_Fail
4 |
5 | try:
6 | sqlhelper = SqlHelper()
7 | sqlhelper.init_db()
8 | except Exception:
9 | raise Con_DB_Fail
10 |
11 | proxy = {'ip': '192.168.1.1', 'port': int('80'), 'type': 0, 'protocol': 0, 'country': u'中国', 'area': u'四川', 'speed': 0}
12 | sqlhelper.insert(proxy)
--------------------------------------------------------------------------------
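testsql.py only exercises insert(); validator/Validator.py (below) calls update() and delete() on the same record shape when a proxy fails a check. A minimal sketch of those two calls against the record inserted above; the score value is illustrative.

# Minimal sketch of the SqlHelper calls used by validator/Validator.py,
# applied to the proxy record inserted in testsql.py above.
from db.SqlHelper import SqlHelper

sqlhelper = SqlHelper()
sqlhelper.init_db()
# lower the score of a proxy that failed a check (mirrors detect_from_db)
sqlhelper.update({'ip': '192.168.1.1', 'port': 80}, {'score': 0})
# remove it entirely once its score is exhausted
sqlhelper.delete({'ip': '192.168.1.1', 'port': 80})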
/util/IPAddress.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 |
6 | import socket
7 | import struct
8 |
9 | import logging
10 | from util.compatibility import text_
11 |
12 | logger = logging.getLogger('util')
13 |
14 |
15 | class IPAddresss:
16 | def __init__(self, ipdbFile):
17 | self.ipdb = open(ipdbFile, "rb")
18 | str = self.ipdb.read(8)
19 | (self.firstIndex, self.lastIndex) = struct.unpack('II', str)
20 | self.indexCount = int((self.lastIndex - self.firstIndex) / 7 + 1)
21 | # print self.getVersion(), u" 纪录总数: %d 条 "%(self.indexCount)
22 |
23 | def getVersion(self):
24 | s = self.getIpAddr(0xffffff00)
25 | return s
26 |
27 | def getAreaAddr(self, offset=0):
28 | if offset:
29 | self.ipdb.seek(offset)
30 | str = self.ipdb.read(1)
31 | (byte,) = struct.unpack('B', str)
32 | if byte == 0x01 or byte == 0x02:
33 | p = self.getLong3()
34 | if p:
35 | return self.getString(p)
36 | else:
37 | return ""
38 | else:
39 | self.ipdb.seek(-1, 1)
40 | return self.getString(offset)
41 |
42 | def getAddr(self, offset, ip=0):
43 | self.ipdb.seek(offset + 4)
44 | countryAddr = text_("")
45 | areaAddr = text_("")
46 | str = self.ipdb.read(1)
47 | (byte,) = struct.unpack('B', str)
48 | if byte == 0x01:
49 | countryOffset = self.getLong3()
50 | self.ipdb.seek(countryOffset)
51 | str = self.ipdb.read(1)
52 | (b,) = struct.unpack('B', str)
53 | if b == 0x02:
54 | countryAddr = self.getString(self.getLong3())
55 | self.ipdb.seek(countryOffset + 4)
56 | else:
57 | countryAddr = self.getString(countryOffset)
58 | areaAddr = self.getAreaAddr()
59 | elif byte == 0x02:
60 | countryAddr = self.getString(self.getLong3())
61 | areaAddr = self.getAreaAddr(offset + 8)
62 | else:
63 | countryAddr = self.getString(offset + 4)
64 | areaAddr = self.getAreaAddr()
65 | return countryAddr + text_(" ") + areaAddr
66 |
67 | def dump(self, first, last):
68 | if last > self.indexCount:
69 | last = self.indexCount
70 | for index in range(first, last):
71 | offset = self.firstIndex + index * 7
72 | self.ipdb.seek(offset)
73 | buf = self.ipdb.read(7)
74 | (ip, of1, of2) = struct.unpack("IHB", buf)
75 | address = self.getAddr(of1 + (of2 << 16))
76 |             # getString() already decoded the GBK bytes, so just ensure it is text
77 |             address = text_(address)
78 | logger.info("%d %s %s" % (index, self.ip2str(ip), address))
79 |
80 | def setIpRange(self, index):
81 | offset = self.firstIndex + index * 7
82 | self.ipdb.seek(offset)
83 | buf = self.ipdb.read(7)
84 | (self.curStartIp, of1, of2) = struct.unpack("IHB", buf)
85 | self.curEndIpOffset = of1 + (of2 << 16)
86 | self.ipdb.seek(self.curEndIpOffset)
87 | buf = self.ipdb.read(4)
88 | (self.curEndIp,) = struct.unpack("I", buf)
89 |
90 | def getIpAddr(self, ip):
91 | L = 0
92 | R = self.indexCount - 1
93 | while L < R - 1:
94 | M = int((L + R) / 2)
95 | self.setIpRange(M)
96 | if ip == self.curStartIp:
97 | L = M
98 | break
99 | if ip > self.curStartIp:
100 | L = M
101 | else:
102 | R = M
103 | self.setIpRange(L)
104 |         # version information, 255.255.255.X, ugly but useful
105 | if ip & 0xffffff00 == 0xffffff00:
106 | self.setIpRange(R)
107 | if self.curStartIp <= ip <= self.curEndIp:
108 | address = self.getAddr(self.curEndIpOffset)
109 |             # address is already decoded text; getString() handled the GBK bytes
110 | address = text_(address)
111 | else:
112 | address = text_("未找到该IP的地址")
113 | return address
114 |
115 | def getIpRange(self, ip):
116 | self.getIpAddr(ip)
117 | range = self.ip2str(self.curStartIp) + ' - ' \
118 | + self.ip2str(self.curEndIp)
119 | return range
120 |
121 | def getString(self, offset=0):
122 | if offset:
123 | self.ipdb.seek(offset)
124 | str = b''
125 | ch = self.ipdb.read(1)
126 | (byte,) = struct.unpack('B', ch)
127 | while byte != 0:
128 | str += ch
129 | ch = self.ipdb.read(1)
130 | (byte,) = struct.unpack('B', ch)
131 | return str.decode('gbk')
132 |
133 | def ip2str(self, ip):
134 | return str(ip >> 24) + '.' + str((ip >> 16) & 0xff) + '.' + str((ip >> 8) & 0xff) + '.' + str(ip & 0xff)
135 |
136 | def str2ip(self, s):
137 | (ip,) = struct.unpack('I', socket.inet_aton(s))
138 | return ((ip >> 24) & 0xff) | ((ip & 0xff) << 24) | ((ip >> 8) & 0xff00) | ((ip & 0xff00) << 8)
139 |
140 | def getLong3(self, offset=0):
141 | if offset:
142 | self.ipdb.seek(offset)
143 | str = self.ipdb.read(3)
144 | (a, b) = struct.unpack('HB', str)
145 | return (b << 16) + a
146 |
147 |
148 |
149 |
--------------------------------------------------------------------------------
/util/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Xaxdus'
2 |
3 |
--------------------------------------------------------------------------------
/util/compatibility.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import sys
3 |
4 | PY3 = sys.version_info[0] == 3
5 | if PY3:
6 | text_type = str
7 | binary_type = bytes
8 | else:
9 | text_type = unicode
10 | binary_type = str
11 |
12 |
13 | def text_(s, encoding='utf-8', errors='strict'):
14 | if isinstance(s, binary_type):
15 | return s.decode(encoding, errors)
16 | return s
17 |
18 |
19 | def bytes_(s, encoding='utf-8', errors='strict'):
20 | if isinstance(s, text_type):
21 | return s.encode(encoding, errors)
22 | return s
23 |
--------------------------------------------------------------------------------
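text_() and bytes_() above simply normalise a value to text or bytes in whichever direction is needed, on both Python 2 and 3. A quick illustration; the byte values are examples only.

# Quick illustration of the compatibility helpers above (example values only).
from util.compatibility import text_, bytes_

raw = b'\xb1\xb1\xbe\xa9'        # GBK-encoded bytes for "北京"
print(text_(raw, 'gbk'))         # bytes in -> decoded text out
print(text_(u'already text'))    # text in  -> returned unchanged
print(bytes_(u'北京', 'gbk'))     # text in  -> GBK-encoded bytes out
print(bytes_(b'already bytes'))  # bytes in -> returned unchanged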
/util/exception.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import config
3 |
4 |
5 | class Test_URL_Fail(Exception):
6 | def __str__(self):
7 | str = "访问%s失败,请检查网络连接" % config.TEST_IP
8 | return str
9 |
10 |
11 | class Con_DB_Fail(Exception):
12 | def __str__(self):
13 | str = "使用DB_CONNECT_STRING:%s--连接数据库失败" % config.DB_CONNECT_STRING
14 | return str
15 |
--------------------------------------------------------------------------------
/util/logger.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import logging
3 |
4 | __author__ = 'qiye'
5 |
6 | logger = logging.getLogger()
7 |
8 |
9 | def logger_proxy(proxy):
10 | logger.setLevel(logging.INFO)
11 | logger.info(proxy)
12 |
--------------------------------------------------------------------------------
/validator/Validator.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import sys
3 |
4 | import chardet
5 | from gevent import monkey
6 | monkey.patch_all()
7 |
8 | import json
9 | import os
10 | import gevent
11 | import requests
12 | import time
13 | import psutil
14 | from multiprocessing import Process, Queue
15 |
16 | import config
17 | from db.DataStore import sqlhelper
18 | from util.exception import Test_URL_Fail
19 |
20 |
21 | def detect_from_db(myip, proxy, proxies_set):
22 | proxy_dict = {'ip': proxy[0], 'port': proxy[1]}
23 | result = detect_proxy(myip, proxy_dict)
24 | if result:
25 | proxy_str = '%s:%s' % (proxy[0], proxy[1])
26 | proxies_set.add(proxy_str)
27 |
28 | else:
29 | if proxy[2] < 1:
30 | sqlhelper.delete({'ip': proxy[0], 'port': proxy[1]})
31 | else:
32 | score = proxy[2]-1
33 | sqlhelper.update({'ip': proxy[0], 'port': proxy[1]}, {'score': score})
34 | proxy_str = '%s:%s' % (proxy[0], proxy[1])
35 | proxies_set.add(proxy_str)
36 |
37 |
38 |
39 | def validator(queue1, queue2, myip):
40 | tasklist = []
41 |     proc_pool = {}  # all running checker processes, keyed by pid
42 |     cntl_q = Queue()  # control queue: pids of finished child processes
43 | while True:
44 | if not cntl_q.empty():
45 |             # reap child processes that have finished
46 | try:
47 | pid = cntl_q.get()
48 | proc = proc_pool.pop(pid)
49 | proc_ps = psutil.Process(pid)
50 | proc_ps.kill()
51 | proc_ps.wait()
52 | except Exception as e:
53 | pass
54 | # print(e)
55 | # print(" we are unable to kill pid:%s" % (pid))
56 | try:
57 | # proxy_dict = {'source':'crawl','data':proxy}
58 | if len(proc_pool) >= config.MAX_CHECK_PROCESS:
59 | time.sleep(config.CHECK_WATI_TIME)
60 | continue
61 | proxy = queue1.get()
62 | tasklist.append(proxy)
63 | if len(tasklist) >= config.MAX_CHECK_CONCURRENT_PER_PROCESS:
64 | p = Process(target=process_start, args=(tasklist, myip, queue2, cntl_q))
65 | p.start()
66 | proc_pool[p.pid] = p
67 | tasklist = []
68 |
69 | except Exception as e:
70 | if len(tasklist) > 0:
71 | p = Process(target=process_start, args=(tasklist, myip, queue2, cntl_q))
72 | p.start()
73 | proc_pool[p.pid] = p
74 | tasklist = []
75 |
76 | def process_start(tasks, myip, queue2, cntl):
77 | spawns = []
78 | for task in tasks:
79 | spawns.append(gevent.spawn(detect_proxy, myip, task, queue2))
80 | gevent.joinall(spawns)
81 |     cntl.put(os.getpid())  # report this child's pid on the control queue when it exits
82 |
83 |
84 | def detect_proxy(selfip, proxy, queue2=None):
85 | '''
86 |     :param proxy: dict with the proxy's ip and port
87 | :return:
88 | '''
89 | ip = proxy['ip']
90 | port = proxy['port']
91 | proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
92 | protocol, types, speed = getattr(sys.modules[__name__],config.CHECK_PROXY['function'])(selfip, proxies)#checkProxy(selfip, proxies)
93 | if protocol >= 0:
94 | proxy['protocol'] = protocol
95 | proxy['types'] = types
96 | proxy['speed'] = speed
97 | else:
98 | proxy = None
99 | if queue2:
100 | queue2.put(proxy)
101 | return proxy
102 |
103 |
104 | def checkProxy(selfip, proxies):
105 | '''
106 |     Detect the proxy's protocol and anonymity type; the info published by free proxy sites is unreliable, so we test it ourselves.
107 | :param
108 | :return:
109 | '''
110 | protocol = -1
111 | types = -1
112 | speed = -1
113 | http, http_types, http_speed = _checkHttpProxy(selfip, proxies)
114 | https, https_types, https_speed = _checkHttpProxy(selfip, proxies, False)
115 | if http and https:
116 | protocol = 2
117 | types = http_types
118 | speed = http_speed
119 | elif http:
120 | types = http_types
121 | protocol = 0
122 | speed = http_speed
123 | elif https:
124 | types = https_types
125 | protocol = 1
126 | speed = https_speed
127 | else:
128 | types = -1
129 | protocol = -1
130 | speed = -1
131 | return protocol, types, speed
132 |
133 |
134 | def _checkHttpProxy(selfip, proxies, isHttp=True):
135 | types = -1
136 | speed = -1
137 | if isHttp:
138 | test_url = config.TEST_HTTP_HEADER
139 | else:
140 | test_url = config.TEST_HTTPS_HEADER
141 | try:
142 | start = time.time()
143 | r = requests.get(url=test_url, headers=config.get_header(), timeout=config.TIMEOUT, proxies=proxies)
144 | if r.ok:
145 | speed = round(time.time() - start, 2)
146 | content = json.loads(r.text)
147 | headers = content['headers']
148 | ip = content['origin']
149 | proxy_connection = headers.get('Proxy-Connection', None)
150 | if ',' in ip:
151 | types = 2
152 | elif proxy_connection:
153 | types = 1
154 | else:
155 | types = 0
156 |
157 | return True, types, speed
158 | else:
159 | return False, types, speed
160 | except Exception as e:
161 | return False, types, speed
162 |
163 |
164 | def baidu_check(selfip, proxies):
165 | '''
166 |     Check the proxy by fetching Baidu; the type info published by free proxy sites is unreliable, so we test it ourselves.
167 | :param
168 | :return:
169 | '''
170 | protocol = -1
171 | types = -1
172 | speed = -1
173 | # try:
174 | # #http://ip.chinaz.com/getip.aspx挺稳定,可以用来检测ip
175 | # r = requests.get(url=config.TEST_URL, headers=config.get_header(), timeout=config.TIMEOUT,
176 | # proxies=proxies)
177 | # r.encoding = chardet.detect(r.content)['encoding']
178 | # if r.ok:
179 | # if r.text.find(selfip)>0:
180 | # return protocol, types, speed
181 | # else:
182 | # return protocol,types,speed
183 | #
184 | #
185 | # except Exception as e:
186 | # return protocol, types, speed
187 | try:
188 | start = time.time()
189 | r = requests.get(url='https://www.baidu.com', headers=config.get_header(), timeout=config.TIMEOUT, proxies=proxies)
190 | r.encoding = chardet.detect(r.content)['encoding']
191 | if r.ok:
192 | speed = round(time.time() - start, 2)
193 | protocol= 0
194 | types=0
195 |
196 | else:
197 | speed = -1
198 | protocol= -1
199 | types=-1
200 | except Exception as e:
201 | speed = -1
202 | protocol = -1
203 | types = -1
204 | return protocol, types, speed
205 |
206 | def getMyIP():
207 | try:
208 | r = requests.get(url=config.TEST_IP, headers=config.get_header(), timeout=config.TIMEOUT)
209 | ip = json.loads(r.text)
210 | return ip['origin']
211 | except Exception as e:
212 | raise Test_URL_Fail
213 |
214 |
215 | if __name__ == '__main__':
216 | ip = '222.186.161.132'
217 | port = 3128
218 | proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
219 | _checkHttpProxy(None,proxies)
220 | # getMyIP()
221 | # str="{ip:'61.150.43.121',address:'陕西省西安市 西安电子科技大学'}"
222 | # j = json.dumps(str)
223 | # str = j['ip']
224 | # print str
225 |
--------------------------------------------------------------------------------
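checkProxy() above returns a (protocol, types, speed) tuple: reading the code, protocol is 0 for HTTP only, 1 for HTTPS only, 2 for both and -1 for a dead proxy; types follows _checkHttpProxy() (0 elite, 1 anonymous, 2 transparent); speed is the response time in seconds. A minimal sketch of calling it standalone; the proxy address is a placeholder.

# Minimal sketch of using checkProxy() directly; the proxy address is a placeholder.
from validator.Validator import checkProxy, getMyIP

myip = getMyIP()  # our own external IP, fetched from config.TEST_IP
ip, port = '1.2.3.4', 8080
proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
protocol, types, speed = checkProxy(myip, proxies)
if protocol >= 0:
    print('usable: protocol=%d types=%d speed=%.2fs' % (protocol, types, speed))
else:
    print('proxy failed both the HTTP and HTTPS checks')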
/validator/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Xaxdus'
2 |
--------------------------------------------------------------------------------