├── .gitignore ├── IPProxy.py ├── README.md ├── api ├── __init__.py └── apiServer.py ├── config.py ├── data └── qqwry.dat ├── db ├── DataStore.py ├── ISqlHelper.py ├── MongoHelper.py ├── RedisHelper.py ├── SqlHelper.py └── __init__.py ├── qiye2.jpg ├── requirements.txt ├── spider ├── HtmlDownloader.py ├── HtmlPraser.py ├── ProxyCrawl.py └── __init__.py ├── start.bat ├── test ├── __init__.py ├── test.py ├── testIPAddress.py ├── testIPType.py ├── testbase64.py ├── testhttpserver.py ├── testlist.py ├── testlxml.py ├── testqueue.py └── testsql.py ├── util ├── IPAddress.py ├── __init__.py ├── compatibility.py ├── exception.py └── logger.py └── validator ├── Validator.py └── __init__.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | .idea/ 91 | *.db -------------------------------------------------------------------------------- /IPProxy.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | from multiprocessing import Value, Queue, Process 4 | from api.apiServer import start_api_server 5 | from db.DataStore import store_data 6 | 7 | from validator.Validator import validator, getMyIP 8 | from spider.ProxyCrawl import startProxyCrawl 9 | 10 | from config import TASK_QUEUE_SIZE 11 | 12 | if __name__ == "__main__": 13 | myip = getMyIP() 14 | DB_PROXY_NUM = Value('i', 0) 15 | q1 = Queue(maxsize=TASK_QUEUE_SIZE) 16 | q2 = Queue() 17 | p0 = Process(target=start_api_server) 18 | p1 = Process(target=startProxyCrawl, args=(q1, DB_PROXY_NUM,myip)) 19 | p2 = Process(target=validator, args=(q1, q2, myip)) 20 | p3 = Process(target=store_data, args=(q2, DB_PROXY_NUM)) 21 | p0.start() 22 | p1.start() 23 | p2.start() 24 | p3.start() 25 | p0.join() 26 | p1.join() 27 | p2.join() 28 | p3.join() 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # IPProxyPool 2 | IPProxyPool代理池项目,提供代理ip。支持py2和py3两个版本。 3 | ### 
我的新书[《Python爬虫开发与项目实战》](https://item.jd.com/12206762.html)出版了,喜欢的话可以看一下[样章](http://pan.baidu.com/s/1hrWEOYg) 4 |
5 | 详细使用方式,请看我的博客: 6 | http://www.cnblogs.com/qiyeboy/p/5693128.html 7 |
8 | 最近正在为IPProxyPool添加二级代理,方便调度。大家可以关注我的公众号,更新我会及时通知。 9 |
10 | 11 | #### 我的微信公众号: 12 | 13 | ![](qiye2.jpg) 14 |
15 | 希望大家提供更多的代理网站,现在爬取的好用的代理ip还是太少。 16 |
17 | 同时感谢[super1-chen](https://github.com/super1-chen),[fancoo](https://github.com/fancoo),[Leibnizhu](https://github.com/Leibnizhu)对项目的贡献。 18 |
19 | 20 | ## 项目依赖 21 | 22 | #### Ubuntu,debian 23 | 24 | 1.安装sqlite数据库(一般系统内置): 25 | apt-get install sqlite3 26 |
27 | 2.安装requests,chardet,web.py,sqlalchemy,gevent,psutil: 28 | pip install requests chardet web.py sqlalchemy gevent psutil 29 |
30 | 3.安装lxml: 31 | apt-get install python-lxml 32 |
33 | 注意: 34 | 35 | * python3下的是pip3 36 | * 有时候使用的gevent版本过低会出现自动退出情况,请使用pip install gevent --upgrade进行更新 37 | * 在python3中安装web.py,不能使用pip,直接下载py3版本的[源码](https://codeload.github.com/webpy/webpy/zip/py3)进行安装 38 | 39 | #### Windows 40 | 41 | 1.下载[sqlite](http://www.sqlite.org/download.html),路径添加到环境变量 42 |
43 | 2.安装requests,chardet,web.py,sqlalchemy,gevent: 44 | pip install requests chardet web.py sqlalchemy gevent 45 |
46 | 3.安装lxml: 47 | pip install lxml或者下载[lxml windows版](https://pypi.python.org/pypi/lxml/) 48 |
49 | 注意: 50 | 51 | * python3下的是pip3 52 | * 有时候使用的gevent版本过低会出现自动退出情况,请使用pip install gevent --upgrade进行更新 53 | * 在python3中安装web.py,不能使用pip,直接下载py3版本的[源码](https://codeload.github.com/webpy/webpy/zip/py3)进行安装 54 | 55 | #### 扩展说明 56 | 57 | 本项目默认数据库是sqlite,但是采用sqlalchemy的ORM模型,通过预留接口可以拓展使用MySQL,MongoDB等数据库。 58 | 配置方法: 59 |
60 | 1.MySQL配置 61 | ``` 62 | 第一步:首先安装MySQL数据库并启动 63 | 第二步:安装MySQLdb或者pymysql(推荐) 64 | 第三步:在config.py文件中配置DB_CONFIG。如果安装的是MySQLdb模块,配置如下: 65 | DB_CONFIG={ 66 | 'DB_CONNECT_TYPE':'sqlalchemy', 67 | 'DB_CONNECT_STRING':'mysql+mysqldb://root:root@localhost/proxy?charset=utf8' 68 | } 69 | 如果安装的是pymysql模块,配置如下: 70 | DB_CONFIG={ 71 | 'DB_CONNECT_TYPE':'sqlalchemy', 72 | 'DB_CONNECT_STRING':'mysql+pymysql://root:root@localhost/proxy?charset=utf8' 73 | } 74 | ``` 75 | sqlalchemy下的DB_CONNECT_STRING参考[支持数据库](http://docs.sqlalchemy.org/en/latest/core/engines.html#supported-databases),理论上使用这种配置方式不只是适配MySQL,sqlalchemy支持的数据库都可以,但是仅仅测试过MySQL。 76 |
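配置完成后,可以先用下面这段示意代码单独验证 DB_CONNECT_STRING 能否正常连接(仅为示意,并非项目自带代码;连接串和驱动请换成自己的配置):
```
# 示意:单独验证 sqlalchemy 连接串是否可用(假设已安装 sqlalchemy 和对应驱动,如 pymysql)
from sqlalchemy import create_engine

DB_CONNECT_STRING = 'mysql+pymysql://root:root@localhost/proxy?charset=utf8'  # 换成自己的配置
engine = create_engine(DB_CONNECT_STRING, echo=False)
conn = engine.connect()  # 能成功建立连接,说明驱动和连接串都没有问题
print('connect ok')
conn.close()
```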
77 | 2.MongoDB配置 78 | ``` 79 | 第一步:首先安装MongoDB数据库并启动 80 | 第二步:安装pymongo模块 81 | 第三步:在config.py文件中配置DB_CONFIG。配置类似如下: 82 | DB_CONFIG={ 83 | 'DB_CONNECT_TYPE':'pymongo', 84 | 'DB_CONNECT_STRING':'mongodb://localhost:27017/' 85 | } 86 | ``` 87 | 由于sqlalchemy并不支持MongoDB,因此额外添加了pymongo模式,DB_CONNECT_STRING参考pymongo的连接字符串。 88 | 89 | ##### 注意 90 | 91 | 如果大家想拓展其他数据库,可以直接继承db下ISqlHelper类,实现其中的方法,具体实现参考我的代码,然后在DataStore中导入类即可。 92 | ``` 93 | try: 94 | if DB_CONFIG['DB_CONNECT_TYPE'] == 'pymongo': 95 | from db.MongoHelper import MongoHelper as SqlHelper 96 | else: 97 | from db.SqlHelper import SqlHelper as SqlHelper 98 | sqlhelper = SqlHelper() 99 | sqlhelper.init_db() 100 | except Exception,e: 101 | raise Con_DB_Fail 102 | ``` 103 | 有感兴趣的朋友,可以将Redis的实现方式添加进来。 104 | 105 | 106 | ## 如何使用 107 | 108 | 将项目目录clone到当前文件夹 109 | 110 | $ git clone 111 | 112 | 切换工程目录 113 | 114 | ``` 115 | $ cd IPProxyPool 116 | ``` 117 | 118 | 运行脚本 119 | 120 | ``` 121 | python IPProxy.py 122 | ``` 123 | 成功运行后,打印信息 124 | ``` 125 | IPProxyPool----->>>>>>>>beginning 126 | http://0.0.0.0:8000/ 127 | IPProxyPool----->>>>>>>>db exists ip:0 128 | IPProxyPool----->>>>>>>>now ip num < MINNUM,start crawling... 129 | IPProxyPool----->>>>>>>>Success ip num :134,Fail ip num:7882 130 | ``` 131 | 132 | ## API 使用方法 133 | 134 | #### 第一种模式 135 | ``` 136 | GET / 137 | ``` 138 | 这种模式用于查询代理ip数据,同时加入评分机制,返回数据的顺序是按照评分由高到低,速度由快到慢制定的。 139 | 140 | #### 参数 141 | 142 | | Name | Type | Description | 143 | | ----| ---- | ---- | 144 | | types | int | 0: 高匿,1:匿名,2 透明 | 145 | | protocol | int | 0: http, 1 https, 2 http/https | 146 | | count | int | 数量 | 147 | | country | str | 取值为 国内, 国外 | 148 | | area | str | 地区 | 149 | 150 | 151 | 152 | #### 例子 153 | 154 | ##### IPProxys默认端口为8000,端口可以在config.py中配置。 155 | 156 | ##### 如果是在本机上测试: 157 | 158 | 1.获取5个ip地址在中国的高匿代理:http://127.0.0.1:8000/?types=0&count=5&country=国内 159 |
160 | 2.响应为JSON格式,按照评分由高到低、响应速度由快到慢的顺序返回数据: 161 |
162 | ``` 163 | [["122.226.189.55", 138, 10], ["183.61.236.54", 3128, 10], ["61.132.241.109", 808, 10], ["183.61.236.53", 3128, 10], ["122.227.246.102", 808, 10]] 164 | ``` 165 |
166 | 以["122.226.189.55", 138, 10]为例,第一个元素是ip,第二个元素是port,第三个元素是分值score。 167 | 168 | ``` 169 | import requests 170 | import json 171 | r = requests.get('http://127.0.0.1:8000/?types=0&count=5&country=国内') 172 | ip_ports = json.loads(r.text) 173 | print ip_ports 174 | ip = ip_ports[0][0] 175 | port = ip_ports[0][1] 176 | proxies={ 177 | 'http':'http://%s:%s'%(ip,port), 178 | 'https':'http://%s:%s'%(ip,port) 179 | } 180 | r = requests.get('http://ip.chinaz.com/',proxies=proxies) 181 | r.encoding='utf-8' 182 | print r.text 183 | ``` 184 | #### 第二种模式 185 | ``` 186 | GET /delete 187 | ``` 188 | 这种模式用于方便用户根据自己的需求删除代理ip数据 189 | 190 | #### 参数 191 | 192 | | Name | Type | Description | 193 | | ----| ---- | ---- | 194 | | ip | str | 类似192.168.1.1 | 195 | | port | int | 类似 80 | 196 | | types | int | 0: 高匿,1:匿名,2 透明 | 197 | | protocol | int | 0: http, 1 https, 2 http/https | 198 | | count | int | 数量 | 199 | | country | str | 取值为 国内, 国外 | 200 | | area | str | 地区 | 201 | 202 | 大家可以根据指定以上一种或几种方式删除数据。 203 | 204 | #### 例子 205 | 206 | ##### 如果是在本机上测试: 207 | 208 | 1.删除ip为120.92.3.127的代理:http://127.0.0.1:8000/delete?ip=120.92.3.127 209 |
210 | 2.响应为JSON格式,返回删除的结果为成功,失败或者返回删除的个数,类似如下的效果: 211 | ["deleteNum", "ok"]或者["deleteNum", 1] 212 | ``` 213 | import requests 214 | r = requests.get('http://127.0.0.1:8000/delete?ip=120.92.3.127') 215 | print r.text 216 | ``` 217 | ## config.py参数配置 218 | ``` 219 | #parserList是网址解析规则表,大家可以将发现的代理网址,将提取规则添加到其中,方便爬虫的爬取。 220 | parserList = [ 221 | { 222 | 'urls': ['http://www.66ip.cn/%s.html' % n for n in ['index'] + list(range(2, 12))], 223 | 'type': 'xpath', 224 | 'pattern': ".//*[@id='main']/div/div[1]/table/tr[position()>1]", 225 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]', 'protocol': ''} 226 | }, 227 | 228 | ...... 229 | 230 | 231 | { 232 | 'urls': ['http://www.cnproxy.com/proxy%s.html' % i for i in range(1, 11)], 233 | 'type': 'module', 234 | 'moduleName': 'CnproxyPraser', 235 | 'pattern': r'(\d+\.\d+\.\d+\.\d+)(HTTP|SOCKS4)\s*', 236 | 'position': {'ip': 0, 'port': 1, 'type': -1, 'protocol': 2} 237 | } 238 | ] 239 | 240 | #数据库的配置 241 | 242 | DB_CONFIG = { 243 | 244 | 'DB_CONNECT_TYPE': 'sqlalchemy', # 'pymongo'sqlalchemy;redis 245 | # 'DB_CONNECT_STRING':'mongodb://localhost:27017/' 246 | 'DB_CONNECT_STRING': 'sqlite:///' + os.path.dirname(__file__) + '/data/proxy.db' 247 | # DB_CONNECT_STRING : 'mysql+mysqldb://root:root@localhost/proxy?charset=utf8' 248 | 249 | # 'DB_CONNECT_TYPE': 'redis', # 'pymongo'sqlalchemy;redis 250 | # 'DB_CONNECT_STRING': 'redis://localhost:6379/8', 251 | 252 | } 253 | #THREADNUM为gevent pool的协程数目 254 | THREADNUM = 5 255 | 256 | #API_PORT为API web服务器的端口 257 | API_PORT = 8000 258 | 259 | #爬虫爬取和检测ip的设置条件 260 | #不需要检测ip是否已经存在,因为会定时清理 261 | # UPDATE_TIME:每半个小时检测一次是否有代理ip失效 262 | UPDATE_TIME = 30 * 60 263 | 264 | # 当有效的ip值小于MINNUM时 需要启动爬虫进行爬取 265 | MINNUM = 50 266 | 267 | # socket超时 268 | TIMEOUT = 5 269 | 270 | 271 | 272 | 273 | #爬虫下载网页的重试次数 274 | RETRY_TIME = 3 275 | 276 | 277 | #USER_AGENTS 随机头信息,用来突破爬取网站的反爬虫 278 | 279 | USER_AGENTS = [ 280 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 281 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 282 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 283 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 284 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 285 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 286 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 287 | ] 288 | #默认给抓取的ip分配20分,每次连接失败,减一分,直到分数全部扣完从数据库中删除 289 | DEFAULT_SCORE=10 290 | 291 | #CHECK_PROXY变量是为了用户自定义检测代理的函数,,默认是CHECK_PROXY={'function':'checkProxy'}。 292 | #现在使用检测的网址是httpbin.org,但是即使ip通过了验证和检测 293 | #也只能说明通过此代理ip可以到达httpbin.org,但是不一定能到达用户爬取的网址 294 | #因此在这个地方用户可以自己添加检测函数,我以百度为访问网址尝试一下 295 | #大家可以看一下Validator.py文件中的baidu_check函数和detect_proxy函数就会明白 296 | 297 | CHECK_PROXY={'function':'checkProxy'}#{'function':'baidu_check'} 298 | ``` 299 | ## TODO 300 | 1.添加squid代理,简化爬虫配置 301 |
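补充:上面 config.py 中提到可以通过 CHECK_PROXY 自定义检测函数(可参考 Validator.py 中的 baidu_check 和 detect_proxy)。下面给出一个示意性的写法,函数名 my_check、参数形式和目标网址都是假设的,并非项目自带代码,实际接入时函数签名请以 Validator.py 中已有的检测函数为准:
```
# 示意:检测某个代理能否访问到自己要爬取的网站(函数名与参数均为假设)
import requests

def my_check(ip, port, timeout=5):
    proxies = {'http': 'http://%s:%s' % (ip, port),
               'https': 'http://%s:%s' % (ip, port)}
    try:
        r = requests.get('http://www.baidu.com', proxies=proxies, timeout=timeout)
        return r.status_code == 200  # 能正常返回 200 即认为该代理对目标网站可用
    except requests.RequestException:
        return False
```
接入项目时,需要把函数名写进 CHECK_PROXY={'function': '...'},并保持与 Validator.py 中现有检测函数一致的参数形式。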
302 | 303 | 304 | ## 更新进度 305 | -----------------------------2017-4-6---------------------------- 306 |
307 | 1.更新评分机制。 308 |
309 | * 之前的评分机制是刚添加进来每个代理ip为0分,每隔半个小时检测一次,检测之后依然有效则加分,无效则删除。 310 | * 现在的评分机制是每个新的代理ip分配10分,每隔半个小时检测一次,检测之后依然有效则分数不变,无效则分数减一,直至为0删除,可以避免由于检测网站不稳定导致的误删。 311 | 312 | 2.用户可以自定义检测函数,在config.py的CHECK_PROXY变量中可以配置。 313 | ``` 314 | CHECK_PROXY变量是为了用户自定义检测代理的函数,默认是CHECK_PROXY={'function':'checkProxy'} 315 | 现在使用检测的网址是httpbin.org,但是即使ip通过了验证和检测 316 | 也只能说明通过此代理ip可以到达httpbin.org,但是不一定能到达用户爬取的网址 317 | 因此在这个地方用户可以自己添加检测函数,我以百度为访问网址尝试一下 318 | 大家可以看一下Validator.py文件中的baidu_check函数和detect_proxy函数就会明白。 319 | 320 | CHECK_PROXY={'function':'baidu_check'} 321 | ``` 322 | 3.经过大家的共同努力,彻底解决了僵死进程的问题。 323 | 324 | -----------------------------2017-1-16---------------------------- 325 |
326 | 1.将py2和py3版本合并,并且兼容 327 |
328 | 2.修复pymongo查询bug 329 |
330 | -----------------------------2017-1-11---------------------------- 331 |
332 | 1.使用httpbin.org检测代理ip的高匿性 333 |
334 | 2.使用 国内 和 国外 作为country的查询条件 335 |
336 | 3.修改types和protocol参数,一定要注意protocol的使用,试试访问http://www.baidu.com和https://www.baidu.com 337 |
338 | 4.美化代码风格 339 |
340 | -----------------------------2016-12-11---------------------------- 341 | #### 大规模重构,主要包括以下几个方面: 342 | 1.使用多进程+协程的方式,将爬取和验证的效率提高了50倍以上,可以在几分钟之内获取所有的有效IP 343 |
344 | 2.使用web.py作为API服务器,重构HTTP接口 345 |
346 | 3.增加MySQL,MongoDB等数据库的适配 347 |
348 | 4.增加了三个代理网站 349 |
350 | 5.增加评分机制,评比稳定的ip 351 |
352 | 6.支持python3 353 |
354 | -----------------------------2016-11-24---------------------------- 355 |
356 | 1.增加chardet识别网页编码 357 |
358 | 2.突破66ip.cn反爬限制 359 |
360 | -----------------------------2016-10-27---------------------------- 361 |
362 | 1.增加对代理的检测,测试是否能真正访问到网址,实现代理 363 |
364 | 2.添加通过正则表达式和加载插件解析网页的方式 365 |
366 | 3.又增加一个新的代理网站 367 |
368 | 369 | -----------------------------2016-7-20---------------------------- 370 |
371 | 1.修复bug,将数据库进行压缩 372 |
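#### 附:拓展其他数据库的最小骨架(示意)
前面「扩展说明」的注意部分提到,可以继承 db/ISqlHelper.py 中的 ISqlHelper 类来接入其他数据库。下面是一个最小骨架,类名 MyHelper 为自拟、方法体为空,仅用来说明需要实现哪些接口,并非项目自带代码:
```
# 示意:自定义存储后端的最小骨架,方法名与 ISqlHelper 保持一致
from db.ISqlHelper import ISqlHelper

class MyHelper(ISqlHelper):
    def init_db(self):
        pass  # 建立连接、初始化表/集合等

    def drop_db(self):
        pass

    def insert(self, value=None):
        pass  # value 是包含 ip/port/types/protocol/country/area/speed 的字典

    def delete(self, conditions=None):
        pass

    def update(self, conditions=None, value=None):
        pass

    def select(self, count=None, conditions=None):
        return []  # 返回 (ip, port, score) 形式的列表,供 API 输出
```
实现之后,在 db/DataStore.py 中根据 DB_CONFIG['DB_CONNECT_TYPE'] 导入对应的类即可。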
373 | -------------------------------------------------------------------------------- /api/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Xaxdus' 2 | -------------------------------------------------------------------------------- /api/apiServer.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | ''' 3 | 定义几个关键字,count type,protocol,country,area, 4 | ''' 5 | import json 6 | import sys 7 | import web 8 | import config 9 | from db.DataStore import sqlhelper 10 | from db.SqlHelper import Proxy 11 | 12 | urls = ( 13 | '/', 'select', 14 | '/delete', 'delete' 15 | ) 16 | 17 | 18 | def start_api_server(): 19 | sys.argv.append('0.0.0.0:%s' % config.API_PORT) 20 | app = web.application(urls, globals()) 21 | app.run() 22 | 23 | 24 | class select(object): 25 | def GET(self): 26 | inputs = web.input() 27 | json_result = json.dumps(sqlhelper.select(inputs.get('count', None), inputs)) 28 | return json_result 29 | 30 | 31 | class delete(object): 32 | params = {} 33 | 34 | def GET(self): 35 | inputs = web.input() 36 | json_result = json.dumps(sqlhelper.delete(inputs)) 37 | return json_result 38 | 39 | 40 | if __name__ == '__main__': 41 | sys.argv.append('0.0.0.0:8000') 42 | app = web.application(urls, globals()) 43 | app.run() 44 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | ''' 3 | 定义规则 urls:url列表 4 | type:解析方式,取值 regular(正则表达式),xpath(xpath解析),module(自定义第三方模块解析) 5 | patten:可以是正则表达式,可以是xpath语句不过要和上面的相对应 6 | ''' 7 | import os 8 | import random 9 | 10 | ''' 11 | ip,端口,类型(0高匿名,1透明),protocol(0 http,1 https),country(国家),area(省市),updatetime(更新时间) 12 | speed(连接速度) 13 | ''' 14 | parserList = [ 15 | { 16 | 'urls': ['http://www.66ip.cn/%s.html' % n for n in ['index'] + list(range(2, 12))], 17 | 'type': 'xpath', 18 | 'pattern': ".//*[@id='main']/div/div[1]/table/tr[position()>1]", 19 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]', 'protocol': ''} 20 | }, 21 | { 22 | 'urls': ['http://www.66ip.cn/areaindex_%s/%s.html' % (m, n) for m in range(1, 35) for n in range(1, 10)], 23 | 'type': 'xpath', 24 | 'pattern': ".//*[@id='footer']/div/table/tr[position()>1]", 25 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]', 'protocol': ''} 26 | }, 27 | { 28 | 'urls': ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'], 29 | 'type': 'xpath', 30 | 'pattern': ".//table[@class='sortable']/tbody/tr", 31 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''} 32 | 33 | }, 34 | { 35 | 'urls': ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 10)], 36 | 'type': 'xpath', 37 | 'pattern': ".//table[@class='list']/tr", 38 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''} 39 | 40 | }, 41 | { 42 | 'urls': ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)], 43 | 'type': 'module', 44 | 'moduleName': 'proxy_listPraser', 45 | 'pattern': 'Proxy\(.+\)', 46 | 'position': {'ip': 0, 'port': -1, 'type': -1, 'protocol': 2} 47 | 48 | }, 49 | { 50 | 'urls': ['http://incloak.com/proxy-list/%s#list' % n for n in 51 | ([''] + ['?start=%s' % (64 * m) for m in range(1, 10)])], 52 | 'type': 'xpath', 53 | 'pattern': ".//table[@class='proxy__t']/tbody/tr", 54 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''} 55 | 56 | }, 57 | { 58 | 
'urls': ['http://www.kuaidaili.com/proxylist/%s/' % n for n in range(1, 11)], 59 | 'type': 'xpath', 60 | 'pattern': ".//*[@id='index_free_list']/table/tbody/tr[position()>0]", 61 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[3]', 'protocol': './td[4]'} 62 | }, 63 | { 64 | 'urls': ['http://www.kuaidaili.com/free/%s/%s/' % (m, n) for m in ['inha', 'intr', 'outha', 'outtr'] for n in 65 | range(1, 11)], 66 | 'type': 'xpath', 67 | 'pattern': ".//*[@id='list']/table/tbody/tr[position()>0]", 68 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[3]', 'protocol': './td[4]'} 69 | }, 70 | { 71 | 'urls': ['http://www.cz88.net/proxy/%s' % m for m in 72 | ['index.shtml'] + ['http_%s.shtml' % n for n in range(2, 11)]], 73 | 'type': 'xpath', 74 | 'pattern': ".//*[@id='boxright']/div/ul/li[position()>1]", 75 | 'position': {'ip': './div[1]', 'port': './div[2]', 'type': './div[3]', 'protocol': ''} 76 | 77 | }, 78 | { 79 | 'urls': ['http://www.ip181.com/daili/%s.html' % n for n in range(1, 11)], 80 | 'type': 'xpath', 81 | 'pattern': ".//div[@class='row']/div[3]/table/tbody/tr[position()>1]", 82 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[3]', 'protocol': './td[4]'} 83 | 84 | }, 85 | { 86 | 'urls': ['http://www.xicidaili.com/%s/%s' % (m, n) for m in ['nn', 'nt', 'wn', 'wt'] for n in range(1, 8)], 87 | 'type': 'xpath', 88 | 'pattern': ".//*[@id='ip_list']/tr[position()>1]", 89 | 'position': {'ip': './td[2]', 'port': './td[3]', 'type': './td[5]', 'protocol': './td[6]'} 90 | }, 91 | { 92 | 'urls': ['http://www.cnproxy.com/proxy%s.html' % i for i in range(1, 11)], 93 | 'type': 'module', 94 | 'moduleName': 'CnproxyPraser', 95 | 'pattern': r'(\d+\.\d+\.\d+\.\d+)(HTTP|SOCKS4)\s*', 96 | 'position': {'ip': 0, 'port': 1, 'type': -1, 'protocol': 2} 97 | } 98 | ] 99 | ''' 100 | 数据库的配置 101 | ''' 102 | DB_CONFIG = { 103 | 104 | 'DB_CONNECT_TYPE': 'sqlalchemy', # 'pymongo'sqlalchemy;redis 105 | # 'DB_CONNECT_STRING':'mongodb://localhost:27017/' 106 | 'DB_CONNECT_STRING': 'sqlite:///' + os.path.dirname(__file__) + '/data/proxy.db' 107 | # DB_CONNECT_STRING : 'mysql+mysqldb://root:root@localhost/proxy?charset=utf8' 108 | 109 | # 'DB_CONNECT_TYPE': 'redis', # 'pymongo'sqlalchemy;redis 110 | # 'DB_CONNECT_STRING': 'redis://localhost:6379/8', 111 | 112 | } 113 | CHINA_AREA = ['河北', '山东', '辽宁', '黑龙江', '吉林' 114 | , '甘肃', '青海', '河南', '江苏', '湖北', '湖南', 115 | '江西', '浙江', '广东', '云南', '福建', 116 | '台湾', '海南', '山西', '四川', '陕西', 117 | '贵州', '安徽', '重庆', '北京', '上海', '天津', '广西', '内蒙', '西藏', '新疆', '宁夏', '香港', '澳门'] 118 | QQWRY_PATH = os.path.dirname(__file__) + "/data/qqwry.dat" 119 | THREADNUM = 5 120 | API_PORT = 8000 121 | ''' 122 | 爬虫爬取和检测ip的设置条件 123 | 不需要检测ip是否已经存在,因为会定时清理 124 | ''' 125 | UPDATE_TIME = 30 * 60 # 每半个小时检测一次是否有代理ip失效 126 | MINNUM = 50 # 当有效的ip值小于一个时 需要启动爬虫进行爬取 127 | 128 | TIMEOUT = 5 # socket延时 129 | ''' 130 | 反爬虫的设置 131 | ''' 132 | ''' 133 | 重试次数 134 | ''' 135 | RETRY_TIME = 3 136 | 137 | ''' 138 | USER_AGENTS 随机头信息 139 | ''' 140 | USER_AGENTS = [ 141 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 142 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 143 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 144 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 145 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; 
Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 146 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 147 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 148 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 149 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 150 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 151 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 152 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", 153 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", 154 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 155 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", 156 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", 157 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11", 158 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER", 159 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)", 160 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)", 161 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER", 162 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)", 163 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)", 164 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", 165 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)", 166 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", 167 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)", 168 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", 169 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", 170 | "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5", 171 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre", 172 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 
Firefox/16.0", 173 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11", 174 | "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10" 175 | ] 176 | 177 | 178 | def get_header(): 179 | return { 180 | 'User-Agent': random.choice(USER_AGENTS), 181 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 182 | 'Accept-Language': 'en-US,en;q=0.5', 183 | 'Connection': 'keep-alive', 184 | 'Accept-Encoding': 'gzip, deflate', 185 | } 186 | #默认给抓取的ip分配20分,每次连接失败,减一分,直到分数全部扣完从数据库中删除 187 | DEFAULT_SCORE=10 188 | 189 | TEST_URL = 'http://ip.chinaz.com/getip.aspx' 190 | TEST_IP = 'http://httpbin.org/ip' 191 | TEST_HTTP_HEADER = 'http://httpbin.org/get' 192 | TEST_HTTPS_HEADER = 'https://httpbin.org/get' 193 | #CHECK_PROXY变量是为了用户自定义检测代理的函数 194 | #现在使用检测的网址是httpbin.org,但是即使ip通过了验证和检测 195 | #也只能说明通过此代理ip可以到达httpbin.org,但是不一定能到达用户爬取的网址 196 | #因此在这个地方用户可以自己添加检测函数,我以百度为访问网址尝试一下 197 | #大家可以看一下Validator.py文件中的baidu_check函数和detect_proxy函数就会明白 198 | 199 | CHECK_PROXY={'function':'checkProxy'}#{'function':'baidu_check'} 200 | 201 | #下面配置squid,现在还没实现 202 | #SQUID={'path':None,'confpath':'C:/squid/etc/squid.conf'} 203 | 204 | MAX_CHECK_PROCESS = 2 # CHECK_PROXY最大进程数 205 | MAX_CHECK_CONCURRENT_PER_PROCESS = 30 # CHECK_PROXY时每个进程的最大并发 206 | TASK_QUEUE_SIZE = 50 # 任务队列SIZE 207 | MAX_DOWNLOAD_CONCURRENT = 3 # 从免费代理网站下载时的最大并发 208 | CHECK_WATI_TIME = 1#进程数达到上限时的等待时间 -------------------------------------------------------------------------------- /data/qqwry.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiyeboy/IPProxyPool/127827829535641d57a1a05e20523cb0a192170b/data/qqwry.dat -------------------------------------------------------------------------------- /db/DataStore.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import sys 3 | from config import DB_CONFIG 4 | from util.exception import Con_DB_Fail 5 | 6 | 7 | try: 8 | if DB_CONFIG['DB_CONNECT_TYPE'] == 'pymongo': 9 | from db.MongoHelper import MongoHelper as SqlHelper 10 | elif DB_CONFIG['DB_CONNECT_TYPE'] == 'redis': 11 | from db.RedisHelper import RedisHelper as SqlHelper 12 | else: 13 | from db.SqlHelper import SqlHelper as SqlHelper 14 | sqlhelper = SqlHelper() 15 | sqlhelper.init_db() 16 | except Exception as e: 17 | raise Con_DB_Fail 18 | 19 | 20 | def store_data(queue2, db_proxy_num): 21 | ''' 22 | 读取队列中的数据,写入数据库中 23 | :param queue2: 24 | :return: 25 | ''' 26 | successNum = 0 27 | failNum = 0 28 | while True: 29 | try: 30 | proxy = queue2.get(timeout=300) 31 | if proxy: 32 | 33 | sqlhelper.insert(proxy) 34 | successNum += 1 35 | else: 36 | failNum += 1 37 | str = 'IPProxyPool----->>>>>>>>Success ip num :%d,Fail ip num:%d' % (successNum, failNum) 38 | sys.stdout.write(str + "\r") 39 | sys.stdout.flush() 40 | except BaseException as e: 41 | if db_proxy_num.value != 0: 42 | successNum += db_proxy_num.value 43 | db_proxy_num.value = 0 44 | str = 'IPProxyPool----->>>>>>>>Success ip num :%d,Fail ip num:%d' % (successNum, failNum) 45 | sys.stdout.write(str + "\r") 46 | sys.stdout.flush() 47 | successNum = 0 48 | failNum = 0 49 | 50 | 51 | -------------------------------------------------------------------------------- /db/ISqlHelper.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | class ISqlHelper(object): 4 | params = {'ip': None, 
'port': None, 'types': None, 'protocol': None, 'country': None, 'area': None} 5 | 6 | def init_db(self): 7 | raise NotImplemented 8 | 9 | def drop_db(self): 10 | raise NotImplemented 11 | 12 | def insert(self, value=None): 13 | raise NotImplemented 14 | 15 | def delete(self, conditions=None): 16 | raise NotImplemented 17 | 18 | def update(self, conditions=None, value=None): 19 | raise NotImplemented 20 | 21 | def select(self, count=None, conditions=None): 22 | raise NotImplemented -------------------------------------------------------------------------------- /db/MongoHelper.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | from config import DB_CONFIG, DEFAULT_SCORE 3 | 4 | from db.ISqlHelper import ISqlHelper 5 | 6 | 7 | class MongoHelper(ISqlHelper): 8 | def __init__(self): 9 | self.client = pymongo.MongoClient(DB_CONFIG['DB_CONNECT_STRING'], connect=False) 10 | 11 | def init_db(self): 12 | self.db = self.client.proxy 13 | self.proxys = self.db.proxys 14 | 15 | def drop_db(self): 16 | self.client.drop_database(self.db) 17 | 18 | def insert(self, value=None): 19 | if value: 20 | proxy = dict(ip=value['ip'], port=value['port'], types=value['types'], protocol=value['protocol'], 21 | country=value['country'], 22 | area=value['area'], speed=value['speed'], score=DEFAULT_SCORE) 23 | self.proxys.insert(proxy) 24 | 25 | def delete(self, conditions=None): 26 | if conditions: 27 | self.proxys.remove(conditions) 28 | return ('deleteNum', 'ok') 29 | else: 30 | return ('deleteNum', 'None') 31 | 32 | def update(self, conditions=None, value=None): 33 | # update({"UserName":"libing"},{"$set":{"Email":"libing@126.com","Password":"123"}}) 34 | if conditions and value: 35 | self.proxys.update(conditions, {"$set": value}) 36 | return {'updateNum': 'ok'} 37 | else: 38 | return {'updateNum': 'fail'} 39 | 40 | def select(self, count=None, conditions=None): 41 | if count: 42 | count = int(count) 43 | else: 44 | count = 0 45 | if conditions: 46 | conditions = dict(conditions) 47 | if 'count' in conditions: 48 | del conditions['count'] 49 | conditions_name = ['types', 'protocol'] 50 | for condition_name in conditions_name: 51 | value = conditions.get(condition_name, None) 52 | if value: 53 | conditions[condition_name] = int(value) 54 | else: 55 | conditions = {} 56 | items = self.proxys.find(conditions, limit=count).sort( 57 | [("speed", pymongo.ASCENDING), ("score", pymongo.DESCENDING)]) 58 | results = [] 59 | for item in items: 60 | result = (item['ip'], item['port'], item['score']) 61 | results.append(result) 62 | return results 63 | 64 | 65 | if __name__ == '__main__': 66 | # from db.MongoHelper import MongoHelper as SqlHelper 67 | # sqlhelper = SqlHelper() 68 | # sqlhelper.init_db() 69 | # # print sqlhelper.select(None,{'types':u'1'}) 70 | # items= sqlhelper.proxys.find({'types':0}) 71 | # for item in items: 72 | # print item 73 | # # # print sqlhelper.select(None,{'types':u'0'}) 74 | pass 75 | -------------------------------------------------------------------------------- /db/RedisHelper.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from __future__ import unicode_literals 3 | 4 | from redis import Redis 5 | 6 | import config 7 | from db.ISqlHelper import ISqlHelper 8 | from db.SqlHelper import Proxy 9 | 10 | 11 | class RedisHelper(ISqlHelper): 12 | def __init__(self, url=None): 13 | self.index_names = ('types', 'protocol', 'country', 'area', 'score') 14 | self.redis_url = url or 
config.DB_CONFIG['DB_CONNECT_STRING'] 15 | 16 | def get_proxy_name(self, ip=None, port=None, protocal=None, proxy=None): 17 | ip = ip or proxy.ip 18 | port = port or proxy.port 19 | protocal = protocal or proxy.protocol 20 | return "proxy::{}:{}:{}".format(ip, port, protocal) 21 | 22 | def get_index_name(self, index_name, value=None): 23 | if index_name == 'score': 24 | return 'index::score' 25 | return "index::{}:{}".format(index_name, value) 26 | 27 | def get_proxy_by_name(self, name): 28 | pd = self.redis.hgetall(name) 29 | if pd: 30 | return Proxy(**{k.decode('utf8'): v.decode('utf8') for k, v in pd.items()}) 31 | 32 | def init_db(self, url=None): 33 | self.redis = Redis.from_url(url or self.redis_url) 34 | 35 | def drop_db(self): 36 | return self.redis.flushdb() 37 | 38 | def get_keys(self, conditions): 39 | select_keys = {self.get_index_name(key, conditions[key]) for key in conditions.keys() if 40 | key in self.index_names} 41 | if 'ip' in conditions and 'port' in conditions: 42 | return self.redis.keys(self.get_proxy_name(conditions['ip'], conditions['port'], '*')) 43 | if select_keys: 44 | return [name.decode('utf8') for name in self.redis.sinter(keys=select_keys)] 45 | return [] 46 | 47 | def insert(self, value): 48 | proxy = Proxy(ip=value['ip'], port=value['port'], types=value['types'], protocol=value['protocol'], 49 | country=value['country'], area=value['area'], 50 | speed=value['speed'], score=value.get('score', config.DEFAULT_SCORE)) 51 | mapping = proxy.__dict__ 52 | for k in list(mapping.keys()): 53 | if k.startswith('_'): 54 | mapping.pop(k) 55 | object_name = self.get_proxy_name(proxy=proxy) 56 | # 存结构 57 | insert_num = self.redis.hmset(object_name, mapping) 58 | # 创建索引 59 | if insert_num > 0: 60 | for index_name in self.index_names: 61 | self.create_index(index_name, object_name, proxy) 62 | return insert_num 63 | 64 | def create_index(self, index_name, object_name, proxy): 65 | redis_key = self.get_index_name(index_name, getattr(proxy, index_name)) 66 | if index_name == 'score': 67 | return self.redis.zadd(redis_key, object_name, int(proxy.score)) 68 | return self.redis.sadd(redis_key, object_name) 69 | 70 | def delete(self, conditions): 71 | proxy_keys = self.get_keys(conditions) 72 | index_keys = self.redis.keys(u"index::*") 73 | if not proxy_keys: 74 | return 0 75 | 76 | for iname in index_keys: 77 | if iname == b'index::score': 78 | self.redis.zrem(self.get_index_name('score'), *proxy_keys) 79 | else: 80 | self.redis.srem(iname, *proxy_keys) 81 | return self.redis.delete(*proxy_keys) if proxy_keys else 0 82 | 83 | def update(self, conditions, values): 84 | objects = self.get_keys(conditions) 85 | count = 0 86 | for name in objects: 87 | for k, v in values.items(): 88 | if k == 'score': 89 | self.redis.zrem(self.get_index_name('score'), [name]) 90 | self.redis.zadd(self.get_index_name('score'), name, int(v)) 91 | self.redis.hset(name, key=k, value=v) 92 | count += 1 93 | return count 94 | 95 | def select(self, count=None, conditions=None): 96 | count = (count and int(count)) or 1000 # 最多返回1000条数据 97 | count = 1000 if count > 1000 else count 98 | 99 | querys = {k: v for k, v in conditions.items() if k in self.index_names} if conditions else None 100 | if querys: 101 | objects = list(self.get_keys(querys))[:count] 102 | redis_name = self.get_index_name('score') 103 | objects.sort(key=lambda x: int(self.redis.zscore(redis_name, x))) 104 | else: 105 | objects = list( 106 | self.redis.zrevrangebyscore(self.get_index_name("score"), '+inf', '-inf', start=0, num=count)) 107 
| 108 | result = [] 109 | for name in objects: 110 | p = self.get_proxy_by_name(name) 111 | result.append((p.ip, p.port, p.score)) 112 | return result 113 | 114 | 115 | if __name__ == '__main__': 116 | sqlhelper = RedisHelper() 117 | sqlhelper.init_db('redis://localhost:6379/9') 118 | proxy = {'ip': '192.168.1.1', 'port': 80, 'type': 0, 'protocol': 0, 'country': '中国', 'area': '广州', 'speed': 11.123, 119 | 'types': 1} 120 | proxy2 = {'ip': 'localhost', 'port': 433, 'type': 1, 'protocol': 1, 'country': u'中国', 'area': u'广州', 'speed': 123, 121 | 'types': 0, 'score': 100} 122 | assert sqlhelper.insert(proxy) == True 123 | assert sqlhelper.insert(proxy2) == True 124 | assert sqlhelper.get_keys({'types': 1}) == ['proxy::192.168.1.1:80:0', ], sqlhelper.get_keys({'types': 1}) 125 | assert sqlhelper.select(conditions={'protocol': 0}) == [('192.168.1.1', '80', '0')] 126 | assert sqlhelper.update({'types': 1}, {'score': 888}) == 1 127 | assert sqlhelper.select() == [('192.168.1.1', '80', '888'), ('localhost', '433', '100')] 128 | # assert sqlhelper.delete({'types': 1}) == 1 129 | # sqlhelper.drop_db() 130 | print('All pass.') 131 | -------------------------------------------------------------------------------- /db/SqlHelper.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import datetime 3 | from sqlalchemy import Column, Integer, String, DateTime, Numeric, create_engine, VARCHAR 4 | from sqlalchemy.ext.declarative import declarative_base 5 | from sqlalchemy.orm import sessionmaker 6 | from config import DB_CONFIG, DEFAULT_SCORE 7 | 8 | from db.ISqlHelper import ISqlHelper 9 | 10 | ''' 11 | sql操作的基类 12 | 包括ip,端口,types类型(0高匿名,1透明),protocol(0 http,1 https http),country(国家),area(省市),updatetime(更新时间) 13 | speed(连接速度) 14 | ''' 15 | 16 | BaseModel = declarative_base() 17 | 18 | 19 | class Proxy(BaseModel): 20 | __tablename__ = 'proxys' 21 | id = Column(Integer, primary_key=True, autoincrement=True) 22 | ip = Column(VARCHAR(16), nullable=False) 23 | port = Column(Integer, nullable=False) 24 | types = Column(Integer, nullable=False) 25 | protocol = Column(Integer, nullable=False, default=0) 26 | country = Column(VARCHAR(100), nullable=False) 27 | area = Column(VARCHAR(100), nullable=False) 28 | updatetime = Column(DateTime(), default=datetime.datetime.utcnow) 29 | speed = Column(Numeric(5, 2), nullable=False) 30 | score = Column(Integer, nullable=False, default=DEFAULT_SCORE) 31 | 32 | 33 | class SqlHelper(ISqlHelper): 34 | params = {'ip': Proxy.ip, 'port': Proxy.port, 'types': Proxy.types, 'protocol': Proxy.protocol, 35 | 'country': Proxy.country, 'area': Proxy.area, 'score': Proxy.score} 36 | 37 | def __init__(self): 38 | if 'sqlite' in DB_CONFIG['DB_CONNECT_STRING']: 39 | connect_args = {'check_same_thread': False} 40 | self.engine = create_engine(DB_CONFIG['DB_CONNECT_STRING'], echo=False, connect_args=connect_args) 41 | else: 42 | self.engine = create_engine(DB_CONFIG['DB_CONNECT_STRING'], echo=False) 43 | DB_Session = sessionmaker(bind=self.engine) 44 | self.session = DB_Session() 45 | 46 | def init_db(self): 47 | BaseModel.metadata.create_all(self.engine) 48 | 49 | def drop_db(self): 50 | BaseModel.metadata.drop_all(self.engine) 51 | 52 | 53 | def insert(self, value): 54 | proxy = Proxy(ip=value['ip'], port=value['port'], types=value['types'], protocol=value['protocol'], 55 | country=value['country'], 56 | area=value['area'], speed=value['speed']) 57 | self.session.add(proxy) 58 | self.session.commit() 59 | 60 | 61 | def delete(self, 
conditions=None): 62 | if conditions: 63 | conditon_list = [] 64 | for key in list(conditions.keys()): 65 | if self.params.get(key, None): 66 | conditon_list.append(self.params.get(key) == conditions.get(key)) 67 | conditions = conditon_list 68 | query = self.session.query(Proxy) 69 | for condition in conditions: 70 | query = query.filter(condition) 71 | deleteNum = query.delete() 72 | self.session.commit() 73 | else: 74 | deleteNum = 0 75 | return ('deleteNum', deleteNum) 76 | 77 | 78 | def update(self, conditions=None, value=None): 79 | ''' 80 | conditions的格式是个字典。类似self.params 81 | :param conditions: 82 | :param value:也是个字典:{'ip':192.168.0.1} 83 | :return: 84 | ''' 85 | if conditions and value: 86 | conditon_list = [] 87 | for key in list(conditions.keys()): 88 | if self.params.get(key, None): 89 | conditon_list.append(self.params.get(key) == conditions.get(key)) 90 | conditions = conditon_list 91 | query = self.session.query(Proxy) 92 | for condition in conditions: 93 | query = query.filter(condition) 94 | updatevalue = {} 95 | for key in list(value.keys()): 96 | if self.params.get(key, None): 97 | updatevalue[self.params.get(key, None)] = value.get(key) 98 | updateNum = query.update(updatevalue) 99 | self.session.commit() 100 | else: 101 | updateNum = 0 102 | return {'updateNum': updateNum} 103 | 104 | 105 | def select(self, count=None, conditions=None): 106 | ''' 107 | conditions的格式是个字典。类似self.params 108 | :param count: 109 | :param conditions: 110 | :return: 111 | ''' 112 | if conditions: 113 | conditon_list = [] 114 | for key in list(conditions.keys()): 115 | if self.params.get(key, None): 116 | conditon_list.append(self.params.get(key) == conditions.get(key)) 117 | conditions = conditon_list 118 | else: 119 | conditions = [] 120 | 121 | query = self.session.query(Proxy.ip, Proxy.port, Proxy.score) 122 | if len(conditions) > 0 and count: 123 | for condition in conditions: 124 | query = query.filter(condition) 125 | return query.order_by(Proxy.score.desc(), Proxy.speed).limit(count).all() 126 | elif count: 127 | return query.order_by(Proxy.score.desc(), Proxy.speed).limit(count).all() 128 | elif len(conditions) > 0: 129 | for condition in conditions: 130 | query = query.filter(condition) 131 | return query.order_by(Proxy.score.desc(), Proxy.speed).all() 132 | else: 133 | return query.order_by(Proxy.score.desc(), Proxy.speed).all() 134 | 135 | 136 | def close(self): 137 | pass 138 | 139 | 140 | if __name__ == '__main__': 141 | sqlhelper = SqlHelper() 142 | sqlhelper.init_db() 143 | proxy = {'ip': '192.168.1.1', 'port': 80, 'type': 0, 'protocol': 0, 'country': '中国', 'area': '广州', 'speed': 11.123, 'types': ''} 144 | sqlhelper.insert(proxy) 145 | sqlhelper.update({'ip': '192.168.1.1', 'port': 80}, {'score': 10}) 146 | print(sqlhelper.select(1)) 147 | 148 | -------------------------------------------------------------------------------- /db/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Xaxdus' 2 | -------------------------------------------------------------------------------- /qiye2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiyeboy/IPProxyPool/127827829535641d57a1a05e20523cb0a192170b/qiye2.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | chardet==2.3.0 2 | gevent==1.2.0 3 | greenlet==0.4.11 4 | lxml==3.7.1 5 
| requests==2.12.4 6 | SQLAlchemy==1.1.4 7 | web.py==0.38 8 | redis==2.10.5 9 | psutil 10 | -------------------------------------------------------------------------------- /spider/HtmlDownloader.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import random 4 | import config 5 | import json 6 | from db.DataStore import sqlhelper 7 | 8 | __author__ = 'qiye' 9 | 10 | import requests 11 | import chardet 12 | 13 | 14 | class Html_Downloader(object): 15 | @staticmethod 16 | def download(url): 17 | try: 18 | r = requests.get(url=url, headers=config.get_header(), timeout=config.TIMEOUT) 19 | r.encoding = chardet.detect(r.content)['encoding'] 20 | if (not r.ok) or len(r.content) < 500: 21 | raise ConnectionError 22 | else: 23 | return r.text 24 | 25 | except Exception: 26 | count = 0 # 重试次数 27 | proxylist = sqlhelper.select(10) 28 | if not proxylist: 29 | return None 30 | 31 | while count < config.RETRY_TIME: 32 | try: 33 | proxy = random.choice(proxylist) 34 | ip = proxy[0] 35 | port = proxy[1] 36 | proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)} 37 | 38 | r = requests.get(url=url, headers=config.get_header(), timeout=config.TIMEOUT, proxies=proxies) 39 | r.encoding = chardet.detect(r.content)['encoding'] 40 | if (not r.ok) or len(r.content) < 500: 41 | raise ConnectionError 42 | else: 43 | return r.text 44 | except Exception: 45 | count += 1 46 | 47 | return None 48 | -------------------------------------------------------------------------------- /spider/HtmlPraser.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import base64 3 | from config import QQWRY_PATH, CHINA_AREA 4 | from util.IPAddress import IPAddresss 5 | import re 6 | from util.compatibility import text_ 7 | 8 | __author__ = 'qiye' 9 | from lxml import etree 10 | 11 | 12 | class Html_Parser(object): 13 | def __init__(self): 14 | self.ips = IPAddresss(QQWRY_PATH) 15 | 16 | def parse(self, response, parser): 17 | ''' 18 | 19 | :param response: 响应 20 | :param type: 解析方式 21 | :return: 22 | ''' 23 | if parser['type'] == 'xpath': 24 | return self.XpathPraser(response, parser) 25 | elif parser['type'] == 'regular': 26 | return self.RegularPraser(response, parser) 27 | elif parser['type'] == 'module': 28 | return getattr(self, parser['moduleName'], None)(response, parser) 29 | else: 30 | return None 31 | 32 | def AuthCountry(self, addr): 33 | ''' 34 | 用来判断地址是哪个国家的 35 | :param addr: 36 | :return: 37 | ''' 38 | for area in CHINA_AREA: 39 | if text_(area) in addr: 40 | return True 41 | return False 42 | 43 | 44 | def XpathPraser(self, response, parser): 45 | ''' 46 | 针对xpath方式进行解析 47 | :param response: 48 | :param parser: 49 | :return: 50 | ''' 51 | proxylist = [] 52 | root = etree.HTML(response) 53 | proxys = root.xpath(parser['pattern']) 54 | for proxy in proxys: 55 | try: 56 | ip = proxy.xpath(parser['position']['ip'])[0].text 57 | port = proxy.xpath(parser['position']['port'])[0].text 58 | type = 0 59 | protocol = 0 60 | addr = self.ips.getIpAddr(self.ips.str2ip(ip)) 61 | country = text_('') 62 | area = text_('') 63 | if text_('省') in addr or self.AuthCountry(addr): 64 | country = text_('国内') 65 | area = addr 66 | else: 67 | country = text_('国外') 68 | area = addr 69 | except Exception as e: 70 | continue 71 | # updatetime = datetime.datetime.now() 72 | # ip,端口,类型(0高匿名,1透明),protocol(0 http,1 https http),country(国家),area(省市),updatetime(更新时间) 73 | 74 | # proxy 
={'ip':ip,'port':int(port),'type':int(type),'protocol':int(protocol),'country':country,'area':area,'updatetime':updatetime,'speed':100} 75 | proxy = {'ip': ip, 'port': int(port), 'types': int(type), 'protocol': int(protocol), 'country': country, 76 | 'area': area, 'speed': 100} 77 | proxylist.append(proxy) 78 | return proxylist 79 | 80 | def RegularPraser(self, response, parser): 81 | ''' 82 | 针对正则表达式进行解析 83 | :param response: 84 | :param parser: 85 | :return: 86 | ''' 87 | proxylist = [] 88 | pattern = re.compile(parser['pattern']) 89 | matchs = pattern.findall(response) 90 | if matchs != None: 91 | for match in matchs: 92 | try: 93 | ip = match[parser['position']['ip']] 94 | port = match[parser['position']['port']] 95 | # 网站的类型一直不靠谱所以还是默认,之后会检测 96 | type = 0 97 | # if parser['postion']['protocol'] > 0: 98 | # protocol = match[parser['postion']['protocol']] 99 | # if protocol.lower().find('https')!=-1: 100 | # protocol = 1 101 | # else: 102 | # protocol = 0 103 | # else: 104 | protocol = 0 105 | addr = self.ips.getIpAddr(self.ips.str2ip(ip)) 106 | country = text_('') 107 | area = text_('') 108 | # print(ip,port) 109 | if text_('省') in addr or self.AuthCountry(addr): 110 | country = text_('国内') 111 | area = addr 112 | else: 113 | country = text_('国外') 114 | area = addr 115 | except Exception as e: 116 | continue 117 | 118 | proxy = {'ip': ip, 'port': port, 'types': type, 'protocol': protocol, 'country': country, 'area': area, 119 | 'speed': 100} 120 | 121 | proxylist.append(proxy) 122 | return proxylist 123 | 124 | 125 | def CnproxyPraser(self, response, parser): 126 | proxylist = self.RegularPraser(response, parser) 127 | chardict = {'v': '3', 'm': '4', 'a': '2', 'l': '9', 'q': '0', 'b': '5', 'i': '7', 'w': '6', 'r': '8', 'c': '1'} 128 | 129 | for proxy in proxylist: 130 | port = proxy['port'] 131 | new_port = '' 132 | for i in range(len(port)): 133 | if port[i] != '+': 134 | new_port += chardict[port[i]] 135 | new_port = int(new_port) 136 | proxy['port'] = new_port 137 | return proxylist 138 | 139 | 140 | def proxy_listPraser(self, response, parser): 141 | proxylist = [] 142 | pattern = re.compile(parser['pattern']) 143 | matchs = pattern.findall(response) 144 | if matchs: 145 | for match in matchs: 146 | try: 147 | ip_port = base64.b64decode(match.replace("Proxy('", "").replace("')", "")) 148 | ip = ip_port.split(':')[0] 149 | port = ip_port.split(':')[1] 150 | type = 0 151 | protocol = 0 152 | addr = self.ips.getIpAddr(self.ips.str2ip(ip)) 153 | country = text_('') 154 | area = text_('') 155 | # print(ip,port) 156 | if text_('省') in addr or self.AuthCountry(addr): 157 | country = text_('国内') 158 | area = addr 159 | else: 160 | country = text_('国外') 161 | area = addr 162 | except Exception as e: 163 | continue 164 | proxy = {'ip': ip, 'port': int(port), 'types': type, 'protocol': protocol, 'country': country, 165 | 'area': area, 'speed': 100} 166 | proxylist.append(proxy) 167 | return proxylist 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /spider/ProxyCrawl.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from gevent import monkey 3 | monkey.patch_all() 4 | 5 | import sys 6 | import time 7 | import gevent 8 | 9 | from gevent.pool import Pool 10 | from multiprocessing import Queue, Process, Value 11 | 12 | from api.apiServer import start_api_server 13 | from config import THREADNUM, parserList, UPDATE_TIME, MINNUM, 
MAX_CHECK_CONCURRENT_PER_PROCESS, MAX_DOWNLOAD_CONCURRENT 14 | from db.DataStore import store_data, sqlhelper 15 | from spider.HtmlDownloader import Html_Downloader 16 | from spider.HtmlPraser import Html_Parser 17 | from validator.Validator import validator, getMyIP, detect_from_db 18 | 19 | ''' 20 | 这个类的作用是描述爬虫的逻辑 21 | ''' 22 | 23 | 24 | def startProxyCrawl(queue, db_proxy_num,myip): 25 | crawl = ProxyCrawl(queue, db_proxy_num,myip) 26 | crawl.run() 27 | 28 | 29 | class ProxyCrawl(object): 30 | proxies = set() 31 | 32 | def __init__(self, queue, db_proxy_num,myip): 33 | self.crawl_pool = Pool(THREADNUM) 34 | self.queue = queue 35 | self.db_proxy_num = db_proxy_num 36 | self.myip = myip 37 | 38 | 39 | def run(self): 40 | while True: 41 | self.proxies.clear() 42 | str = 'IPProxyPool----->>>>>>>>beginning' 43 | sys.stdout.write(str + "\r\n") 44 | sys.stdout.flush() 45 | proxylist = sqlhelper.select() 46 | 47 | spawns = [] 48 | for proxy in proxylist: 49 | spawns.append(gevent.spawn(detect_from_db, self.myip, proxy, self.proxies)) 50 | if len(spawns) >= MAX_CHECK_CONCURRENT_PER_PROCESS: 51 | gevent.joinall(spawns) 52 | spawns= [] 53 | gevent.joinall(spawns) 54 | self.db_proxy_num.value = len(self.proxies) 55 | str = 'IPProxyPool----->>>>>>>>db exists ip:%d' % len(self.proxies) 56 | 57 | if len(self.proxies) < MINNUM: 58 | str += '\r\nIPProxyPool----->>>>>>>>now ip num < MINNUM,start crawling...' 59 | sys.stdout.write(str + "\r\n") 60 | sys.stdout.flush() 61 | spawns = [] 62 | for p in parserList: 63 | spawns.append(gevent.spawn(self.crawl, p)) 64 | if len(spawns) >= MAX_DOWNLOAD_CONCURRENT: 65 | gevent.joinall(spawns) 66 | spawns= [] 67 | gevent.joinall(spawns) 68 | else: 69 | str += '\r\nIPProxyPool----->>>>>>>>now ip num meet the requirement,wait UPDATE_TIME...' 
70 | sys.stdout.write(str + "\r\n") 71 | sys.stdout.flush() 72 | 73 | time.sleep(UPDATE_TIME) 74 | 75 | def crawl(self, parser): 76 | html_parser = Html_Parser() 77 | for url in parser['urls']: 78 | response = Html_Downloader.download(url) 79 | if response is not None: 80 | proxylist = html_parser.parse(response, parser) 81 | if proxylist is not None: 82 | for proxy in proxylist: 83 | proxy_str = '%s:%s' % (proxy['ip'], proxy['port']) 84 | if proxy_str not in self.proxies: 85 | self.proxies.add(proxy_str) 86 | while True: 87 | if self.queue.full(): 88 | time.sleep(0.1) 89 | else: 90 | self.queue.put(proxy) 91 | break 92 | 93 | 94 | if __name__ == "__main__": 95 | DB_PROXY_NUM = Value('i', 0) 96 | q1 = Queue() 97 | q2 = Queue() 98 | p0 = Process(target=start_api_server) 99 | p1 = Process(target=startProxyCrawl, args=(q1, DB_PROXY_NUM)) 100 | p2 = Process(target=validator, args=(q1, q2)) 101 | p3 = Process(target=store_data, args=(q2, DB_PROXY_NUM)) 102 | 103 | p0.start() 104 | p1.start() 105 | p2.start() 106 | p3.start() 107 | 108 | # spider = ProxyCrawl() 109 | # spider.run() 110 | -------------------------------------------------------------------------------- /spider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Xaxdus' 2 | -------------------------------------------------------------------------------- /start.bat: -------------------------------------------------------------------------------- 1 | py -2 IPProxy.py -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Xaxdus' 2 | -------------------------------------------------------------------------------- /test/test.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import requests 3 | import json 4 | 5 | r = requests.get('http://127.0.0.1:8000/?types=0&count=5&country=中国') 6 | ip_ports = json.loads(r.text) 7 | print(ip_ports) 8 | ip = ip_ports[0][0] 9 | port = ip_ports[0][1] 10 | proxies = { 11 | 'http': 'http://%s:%s' % (ip, port), 12 | 'https': 'http://%s:%s' % (ip, port) 13 | } 14 | r = requests.get('http://www.baidu.com', proxies=proxies) 15 | r.encoding = 'utf-8' 16 | print(r.text) 17 | -------------------------------------------------------------------------------- /test/testIPAddress.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | 5 | import socket 6 | import struct 7 | 8 | import logging 9 | 10 | 11 | logger = logging.getLogger('util') 12 | 13 | 14 | class IPAddresss: 15 | def __init__(self, ipdbFile): 16 | self.ipdb = open(ipdbFile, "rb") 17 | str = self.ipdb.read(8) 18 | (self.firstIndex, self.lastIndex) = struct.unpack('II', str) 19 | self.indexCount = int((self.lastIndex - self.firstIndex) / 7 + 1) 20 | # print self.getVersion(), u" 纪录总数: %d 条 "%(self.indexCount) 21 | 22 | def getVersion(self): 23 | s = self.getIpAddr(0xffffff00) 24 | return s 25 | 26 | def getAreaAddr(self, offset=0): 27 | if offset: 28 | self.ipdb.seek(offset) 29 | str = self.ipdb.read(1) 30 | (byte,) = struct.unpack('B', str) 31 | if byte == 0x01 or byte == 0x02: 32 | p = self.getLong3() 33 | if p: 34 | return self.getString(p) 35 | else: 36 | return "" 37 | else: 38 | self.ipdb.seek(-1, 1) 39 | return self.getString(offset) 40 | 41 | def getAddr(self, offset, ip=0): 42 | self.ipdb.seek(offset + 4) 43 | countryAddr = "" 44 | areaAddr = "" 45 | str = self.ipdb.read(1) 46 | (byte,) = struct.unpack('B', str) 47 | if byte == 0x01: 48 | countryOffset = self.getLong3() 49 | self.ipdb.seek(countryOffset) 50 | str = self.ipdb.read(1) 51 | (b,) = struct.unpack('B', str) 52 | if b == 0x02: 53 | countryAddr = self.getString(self.getLong3()) 54 | self.ipdb.seek(countryOffset + 4) 55 | else: 56 | countryAddr = self.getString(countryOffset) 57 | areaAddr = self.getAreaAddr() 58 | elif byte == 0x02: 59 | countryAddr = self.getString(self.getLong3()) 60 | areaAddr = self.getAreaAddr(offset + 8) 61 | else: 62 | countryAddr = self.getString(offset + 4) 63 | areaAddr = self.getAreaAddr() 64 | return countryAddr + " " + areaAddr 65 | 66 | def dump(self, first, last): 67 | if last > self.indexCount: 68 | last = self.indexCount 69 | for index in range(first, last): 70 | offset = self.firstIndex + index * 7 71 | self.ipdb.seek(offset) 72 | buf = self.ipdb.read(7) 73 | (ip, of1, of2) = struct.unpack("IHB", buf) 74 | address = self.getAddr(of1 + (of2 << 16)) 75 | # 把GBK转为utf-8 76 | address = str(address, 'gbk').encode("utf-8") 77 | logger.info("%d %s %s" % (index, self.ip2str(ip), address)) 78 | 79 | def setIpRange(self, index): 80 | offset = self.firstIndex + index * 7 81 | self.ipdb.seek(offset) 82 | buf = self.ipdb.read(7) 83 | (self.curStartIp, of1, of2) = struct.unpack("IHB", buf) 84 | self.curEndIpOffset = of1 + (of2 << 16) 85 | self.ipdb.seek(self.curEndIpOffset) 86 | buf = self.ipdb.read(4) 87 | (self.curEndIp,) = struct.unpack("I", buf) 88 | 89 | def getIpAddr(self, ip): 90 | L = 0 91 | R = self.indexCount - 1 92 | while L < R - 1: 93 | M = int((L + R) / 2) 94 | self.setIpRange(M) 95 | if ip == self.curStartIp: 96 | L = M 97 | break 98 | if ip > self.curStartIp: 99 | L = M 100 | else: 101 | R = M 102 | self.setIpRange(L) 103 | # version information, 255.255.255.X, urgy but useful 104 | if ip & 0xffffff00 == 0xffffff00: 105 | self.setIpRange(R) 106 | if self.curStartIp <= ip <= self.curEndIp: 107 | address = self.getAddr(self.curEndIpOffset) 108 | # 把GBK转为utf-8 109 | address = str(address) 110 | else: 111 | address = "未找到该IP的地址" 112 | return address 113 | 114 | def getIpRange(self, ip): 115 | self.getIpAddr(ip) 116 | range = self.ip2str(self.curStartIp) + ' - ' \ 117 | + self.ip2str(self.curEndIp) 118 | return range 119 | 120 | def getString(self, offset=0): 121 | if offset: 122 | self.ipdb.seek(offset) 123 | str = b'' 124 | ch = self.ipdb.read(1) 125 | (byte,) = struct.unpack('B', ch) 126 | 
while byte != 0: 127 | str += ch 128 | ch = self.ipdb.read(1) 129 | (byte,) = struct.unpack('B', ch) 130 | return str.decode('gbk') 131 | 132 | def ip2str(self, ip): 133 | return str(ip >> 24) + '.' + str((ip >> 16) & 0xff) + '.' + str((ip >> 8) & 0xff) + '.' + str(ip & 0xff) 134 | 135 | def str2ip(self, s): 136 | (ip,) = struct.unpack('I', socket.inet_aton(s)) 137 | return ((ip >> 24) & 0xff) | ((ip & 0xff) << 24) | ((ip >> 8) & 0xff00) | ((ip & 0xff00) << 8) 138 | 139 | def getLong3(self, offset=0): 140 | if offset: 141 | self.ipdb.seek(offset) 142 | str = self.ipdb.read(3) 143 | (a, b) = struct.unpack('HB', str) 144 | return (b << 16) + a 145 | 146 | 147 | QQWRY_PATH = os.path.dirname(__file__) + "/../data/qqwry.dat" 148 | ips = IPAddresss(QQWRY_PATH) 149 | addr = ips.getIpAddr(ips.str2ip('183.61.236.53')) 150 | print(addr) 151 |
-------------------------------------------------------------------------------- /test/testIPType.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from lxml import etree 3 | import requests 4 | import config 5 | 6 | 7 | def checkProxyType(selfip, proxies): 8 | ''' 9 | 用来检测代理的类型,突然发现,免费网站写的信息不靠谱,还是要自己检测代理的类型 10 | :param proxies: 代理(0 高匿,1 匿名,2 透明 3 无效代理 11 | :return: 12 | ''' 13 | 14 | try: 15 | r = requests.get(url='https://incloak.com/ip/', headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies) 16 | print(r.text) 17 | 18 | # if r.ok: 19 | # root = etree.HTML(r.text) 20 | # ip = root.xpath('.//center[2]/table/tr[3]/td[2]')[0].text 21 | # http_x_forwared_for = root.xpath('.//center[2]/table/tr[8]/td[2]')[0].text 22 | # http_via = root.xpath('.//center[2]/table/tr[9]/td[2]')[0].text 23 | # # print ip,http_x_forwared_for,http_via,type(http_via),type(http_x_forwared_for) 24 | # if ip==selfip: 25 | # return 3 26 | # if http_x_forwared_for is None and http_via is None: 27 | # return 0 28 | # if http_via != None and http_x_forwared_for.find(selfip)== -1: 29 | # return 1 30 | # 31 | # if http_via != None and http_x_forwared_for.find(selfip)!= -1: 32 | # return 2 33 | # return 3 34 | 35 | 36 | except Exception as e: 37 | print(str(e)) 38 | return 3 39 | 40 | 41 | 42 | if __name__ == '__main__': 43 | ip = '61.132.241.109' 44 | port = '808' 45 | proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)} 46 | checkProxyType(None, proxies)
-------------------------------------------------------------------------------- /test/testbase64.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import base64 3 | import re 4 | 5 | str = ''' 6 | 7 | ''' 8 | match = re.search('Proxy\(.+\)', str) 9 | print(match.group()) 10 | 11 | ip_port = base64.b64decode(match.group().replace("Proxy('", "").replace("')", "")) 12 | print(ip_port) 13 | 14 |
-------------------------------------------------------------------------------- /test/testhttpserver.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import BaseHTTPServer 3 | import json 4 | import urlparse 5 | 6 | 7 | class WebRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler): 8 | def do_GET(self): 9 | """ 10 | """ 11 | print(self.path) 12 | 13 | parsed_path = urlparse.urlparse(self.path) 14 | print(parsed_path) 15 | 16 | print(parsed_path.query) 17 | 18 | # message_parts = [ 19 | # 'CLIENT VALUES:', 20 | # 'client_address=%s (%s)' % (self.client_address, 21 | # self.address_string()), 22 | # 'command=%s' % self.command, 23 | # 'path=%s' % self.path, 24 | # 'real path=%s' % parsed_path.path, 25 | # 'query=%s' % parsed_path.query, 26 | # 'request_version=%s' % self.request_version, 27 | # '', 28 | # 'SERVER VALUES:', 29 | # 'server_version=%s' % self.server_version, 30 | # 'sys_version=%s' % self.sys_version, 31 | # 'protocol_version=%s' % self.protocol_version, 32 | # '', 33 | # 'HEADERS RECEIVED:', 34 | # ] 35 | # for name, value in sorted(self.headers.items()): 36 | # message_parts.append('%s=%s' % (name, value.rstrip())) 37 | # message_parts.append('') 38 | # message = '\r\n'.join(message_parts) 39 | data1 = [{'ip': '192.168.0.0', 'port': 456}] * 10 40 | d1 = json.dumps(data1, sort_keys=True, indent=4) 41 | message = ('192.168.1.1', 80) 42 | self.send_response(200) 43 | self.end_headers() 44 | self.wfile.write(d1) 45 | 46 | 47 | server = BaseHTTPServer.HTTPServer(('0.0.0.0', 8000), WebRequestHandler) 48 | server.serve_forever()
-------------------------------------------------------------------------------- /test/testlist.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from decimal import Decimal 3 | 4 | __author__ = 'Xaxdus' 5 | 6 | 7 | # list = ["www.baidu.com/%s" %m for m in ['index']+range(1,5)] 8 | # 9 | # list = [(1,10)]*10 10 | # 11 | # for m,n in list: 12 | # print m,n 13 | # 14 | # 15 | # list2 = ["www.baidu.com/%s/%s"%(i[0],i[1]) for i in list] 16 | # print list2 17 | 18 | # x=Decimal('0.998531571219').quantize(Decimal('0.00')) 19 | # a= 0.998531571219 20 | # value = round(a, 3) 21 | # print x,type(x),value 22 | # proxys=[] 23 | # proxy=[123,1234] 24 | # proxys.append(proxy) 25 | # 26 | # proxy=[123,1234] 27 | # proxys.append(proxy) 28 | # 29 | # print proxys 30 | # l = [{'ip':'123.1.1.1','port':80},{'ip':'123.1.1.1','port':80},{'ip':'123.1.2.1','port':80},{'ip':'123.1.1.1','port':81}] 31 | # 32 | # # for d in l: 33 | # # print [tuple(d.items())] 34 | # print [tuple(d.items()) for d in l] 35 | # 36 | # print [dict(t) for t in set([tuple(d.items()) for d in l])] 37 | import requests 38 | 39 | r = requests.get('http://127.0.0.1:8000/delete?ip=120.92.3.127') 40 | print(r.text) 41 |
-------------------------------------------------------------------------------- /test/testlxml.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from lxml import etree 3 | 4 | __author__ = 'Xaxdus' 5 | 6 | html = ''' 7 | 北京http代理ip_66免费代理ip提取网 8 | <!-- the body of this sample page (a proxy-list table under id="footer") was stripped when this dump was generated; only the page title survives above -->
165 | ''' 166 | 167 | root = etree.HTML(html) 168 | proxys = root.xpath(".//*[@id='footer']/div/table/tr[position()>1]") 169 | 170 | for proxy in proxys: 171 | print(proxy.xpath('./td[1]')[0].text) 172 |
-------------------------------------------------------------------------------- /test/testqueue.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from multiprocessing import Queue 3 | 4 | try: 5 | q = Queue() 6 | q.get(timeout=5) 7 | except BaseException as e: 8 | print('--' + str(e)) 9 | 10 | 11 |
-------------------------------------------------------------------------------- /test/testsql.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from db.SqlHelper import SqlHelper 3 | from util.exception import Con_DB_Fail 4 | 5 | try: 6 | sqlhelper = SqlHelper() 7 | sqlhelper.init_db() 8 | except Exception: 9 | raise Con_DB_Fail 10 | 11 | proxy = {'ip': '192.168.1.1', 'port': int('80'), 'type': 0, 'protocol': 0, 'country': u'中国', 'area': u'四川', 'speed': 0} 12 | sqlhelper.insert(proxy)
-------------------------------------------------------------------------------- /util/IPAddress.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | 6 | import socket 7 | import struct 8 | 9 | import logging 10 | from util.compatibility import text_ 11 | 12 | logger = logging.getLogger('util') 13 | 14 | 15 | class IPAddresss: 16 | def __init__(self, ipdbFile): 17 | self.ipdb = open(ipdbFile, "rb") 18 | str = self.ipdb.read(8) 19 | (self.firstIndex, self.lastIndex) = struct.unpack('II', str) 20 | self.indexCount = int((self.lastIndex - self.firstIndex) / 7 + 1) 21 | # print self.getVersion(), u" 纪录总数: %d 条 "%(self.indexCount) 22 | 23 | def getVersion(self): 24 | s = self.getIpAddr(0xffffff00) 25 | return s 26 | 27 | def getAreaAddr(self, offset=0): 28 | if offset: 29 | self.ipdb.seek(offset) 30 | str = self.ipdb.read(1) 31 | (byte,) = struct.unpack('B', str) 32 | if byte == 0x01 or byte == 0x02: 33 | p = self.getLong3() 34 | if p: 35 | return self.getString(p) 36 | else: 37 | return "" 38 | else: 39 | self.ipdb.seek(-1, 1) 40 | return self.getString(offset) 41 | 42 | def getAddr(self, offset, ip=0): 43 | self.ipdb.seek(offset + 4) 44 | countryAddr = text_("") 45 | areaAddr = text_("") 46 | str = self.ipdb.read(1) 47 | (byte,) = struct.unpack('B', str) 48 | if byte == 0x01: 49 | countryOffset = self.getLong3() 50 | self.ipdb.seek(countryOffset) 51 | str = self.ipdb.read(1) 52 | (b,) = struct.unpack('B', str) 53 | if b == 0x02: 54 | countryAddr = self.getString(self.getLong3()) 55 | self.ipdb.seek(countryOffset + 4) 56 | else: 57 | countryAddr = self.getString(countryOffset) 58 | areaAddr = self.getAreaAddr() 59 | elif byte == 0x02: 60 | countryAddr = self.getString(self.getLong3()) 61 | areaAddr = self.getAreaAddr(offset + 8) 62 | else: 63 | countryAddr = self.getString(offset + 4) 64 | areaAddr = self.getAreaAddr() 65 | return countryAddr + text_(" ") + areaAddr 66 | 67 | def dump(self, first, last): 68 | if last > self.indexCount: 69 | last = self.indexCount 70 | for index in range(first, last): 71 | offset = self.firstIndex + index * 7 72 | self.ipdb.seek(offset) 73 | buf = self.ipdb.read(7) 74 | (ip, of1, of2) = struct.unpack("IHB", buf) 75 | address = self.getAddr(of1 + (of2 << 16)) 76 | # 把GBK转为utf-8 77 | address = text_(address, 'gbk').encode("utf-8") 78 | logger.info("%d %s
%s" % (index, self.ip2str(ip), address)) 79 | 80 | def setIpRange(self, index): 81 | offset = self.firstIndex + index * 7 82 | self.ipdb.seek(offset) 83 | buf = self.ipdb.read(7) 84 | (self.curStartIp, of1, of2) = struct.unpack("IHB", buf) 85 | self.curEndIpOffset = of1 + (of2 << 16) 86 | self.ipdb.seek(self.curEndIpOffset) 87 | buf = self.ipdb.read(4) 88 | (self.curEndIp,) = struct.unpack("I", buf) 89 | 90 | def getIpAddr(self, ip): 91 | L = 0 92 | R = self.indexCount - 1 93 | while L < R - 1: 94 | M = int((L + R) / 2) 95 | self.setIpRange(M) 96 | if ip == self.curStartIp: 97 | L = M 98 | break 99 | if ip > self.curStartIp: 100 | L = M 101 | else: 102 | R = M 103 | self.setIpRange(L) 104 | # version information, 255.255.255.X, urgy but useful 105 | if ip & 0xffffff00 == 0xffffff00: 106 | self.setIpRange(R) 107 | if self.curStartIp <= ip <= self.curEndIp: 108 | address = self.getAddr(self.curEndIpOffset) 109 | # 把GBK转为utf-8 110 | address = text_(address) 111 | else: 112 | address = text_("未找到该IP的地址") 113 | return address 114 | 115 | def getIpRange(self, ip): 116 | self.getIpAddr(ip) 117 | range = self.ip2str(self.curStartIp) + ' - ' \ 118 | + self.ip2str(self.curEndIp) 119 | return range 120 | 121 | def getString(self, offset=0): 122 | if offset: 123 | self.ipdb.seek(offset) 124 | str = b'' 125 | ch = self.ipdb.read(1) 126 | (byte,) = struct.unpack('B', ch) 127 | while byte != 0: 128 | str += ch 129 | ch = self.ipdb.read(1) 130 | (byte,) = struct.unpack('B', ch) 131 | return str.decode('gbk') 132 | 133 | def ip2str(self, ip): 134 | return str(ip >> 24) + '.' + str((ip >> 16) & 0xff) + '.' + str((ip >> 8) & 0xff) + '.' + str(ip & 0xff) 135 | 136 | def str2ip(self, s): 137 | (ip,) = struct.unpack('I', socket.inet_aton(s)) 138 | return ((ip >> 24) & 0xff) | ((ip & 0xff) << 24) | ((ip >> 8) & 0xff00) | ((ip & 0xff00) << 8) 139 | 140 | def getLong3(self, offset=0): 141 | if offset: 142 | self.ipdb.seek(offset) 143 | str = self.ipdb.read(3) 144 | (a, b) = struct.unpack('HB', str) 145 | return (b << 16) + a 146 | 147 | 148 | 149 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Xaxdus' 2 | 3 | -------------------------------------------------------------------------------- /util/compatibility.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import sys 3 | 4 | PY3 = sys.version_info[0] == 3 5 | if PY3: 6 | text_type = str 7 | binary_type = bytes 8 | else: 9 | text_type = unicode 10 | binary_type = str 11 | 12 | 13 | def text_(s, encoding='utf-8', errors='strict'): 14 | if isinstance(s, binary_type): 15 | return s.decode(encoding, errors) 16 | return s 17 | 18 | 19 | def bytes_(s, encoding='utf-8', errors='strict'): 20 | if isinstance(s, text_type): 21 | return s.encode(encoding, errors) 22 | return s 23 | -------------------------------------------------------------------------------- /util/exception.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import config 3 | 4 | 5 | class Test_URL_Fail(Exception): 6 | def __str__(self): 7 | str = "访问%s失败,请检查网络连接" % config.TEST_IP 8 | return str 9 | 10 | 11 | class Con_DB_Fail(Exception): 12 | def __str__(self): 13 | str = "使用DB_CONNECT_STRING:%s--连接数据库失败" % config.DB_CONNECT_STRING 14 | return str 15 | -------------------------------------------------------------------------------- 
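The Test_URL_Fail and Con_DB_Fail classes above build their message in __str__ from values in config, so callers raise them bare and the details (TEST_IP, DB_CONNECT_STRING) are filled in only when the error is printed. A minimal usage sketch, mirroring the pattern in test/testsql.py and validator/Validator.py; SqlHelper and the config values are assumed to be set up as in the rest of the project:

from db.SqlHelper import SqlHelper
from util.exception import Con_DB_Fail


def open_db():
    # init_db() can fail for many reasons (bad DB_CONNECT_STRING, server not running);
    # re-raising the project's own exception keeps the error message uniform.
    try:
        sqlhelper = SqlHelper()
        sqlhelper.init_db()
        return sqlhelper
    except Exception:
        raise Con_DB_Fail


if __name__ == '__main__':
    try:
        open_db()
    except Con_DB_Fail as e:
        print(e)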
/util/logger.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import logging 3 | 4 | __author__ = 'qiye' 5 | 6 | logger = logging.getLogger() 7 | 8 | 9 | def logger_proxy(proxy): 10 | logger.setLevel(logging.INFO) 11 | logger.info(proxy) 12 | -------------------------------------------------------------------------------- /validator/Validator.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import sys 3 | 4 | import chardet 5 | from gevent import monkey 6 | monkey.patch_all() 7 | 8 | import json 9 | import os 10 | import gevent 11 | import requests 12 | import time 13 | import psutil 14 | from multiprocessing import Process, Queue 15 | 16 | import config 17 | from db.DataStore import sqlhelper 18 | from util.exception import Test_URL_Fail 19 | 20 | 21 | def detect_from_db(myip, proxy, proxies_set): 22 | proxy_dict = {'ip': proxy[0], 'port': proxy[1]} 23 | result = detect_proxy(myip, proxy_dict) 24 | if result: 25 | proxy_str = '%s:%s' % (proxy[0], proxy[1]) 26 | proxies_set.add(proxy_str) 27 | 28 | else: 29 | if proxy[2] < 1: 30 | sqlhelper.delete({'ip': proxy[0], 'port': proxy[1]}) 31 | else: 32 | score = proxy[2]-1 33 | sqlhelper.update({'ip': proxy[0], 'port': proxy[1]}, {'score': score}) 34 | proxy_str = '%s:%s' % (proxy[0], proxy[1]) 35 | proxies_set.add(proxy_str) 36 | 37 | 38 | 39 | def validator(queue1, queue2, myip): 40 | tasklist = [] 41 | proc_pool = {} # 所有进程列表 42 | cntl_q = Queue() # 控制信息队列 43 | while True: 44 | if not cntl_q.empty(): 45 | # 处理已结束的进程 46 | try: 47 | pid = cntl_q.get() 48 | proc = proc_pool.pop(pid) 49 | proc_ps = psutil.Process(pid) 50 | proc_ps.kill() 51 | proc_ps.wait() 52 | except Exception as e: 53 | pass 54 | # print(e) 55 | # print(" we are unable to kill pid:%s" % (pid)) 56 | try: 57 | # proxy_dict = {'source':'crawl','data':proxy} 58 | if len(proc_pool) >= config.MAX_CHECK_PROCESS: 59 | time.sleep(config.CHECK_WATI_TIME) 60 | continue 61 | proxy = queue1.get() 62 | tasklist.append(proxy) 63 | if len(tasklist) >= config.MAX_CHECK_CONCURRENT_PER_PROCESS: 64 | p = Process(target=process_start, args=(tasklist, myip, queue2, cntl_q)) 65 | p.start() 66 | proc_pool[p.pid] = p 67 | tasklist = [] 68 | 69 | except Exception as e: 70 | if len(tasklist) > 0: 71 | p = Process(target=process_start, args=(tasklist, myip, queue2, cntl_q)) 72 | p.start() 73 | proc_pool[p.pid] = p 74 | tasklist = [] 75 | 76 | def process_start(tasks, myip, queue2, cntl): 77 | spawns = [] 78 | for task in tasks: 79 | spawns.append(gevent.spawn(detect_proxy, myip, task, queue2)) 80 | gevent.joinall(spawns) 81 | cntl.put(os.getpid()) # 子进程退出是加入控制队列 82 | 83 | 84 | def detect_proxy(selfip, proxy, queue2=None): 85 | ''' 86 | :param proxy: ip字典 87 | :return: 88 | ''' 89 | ip = proxy['ip'] 90 | port = proxy['port'] 91 | proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)} 92 | protocol, types, speed = getattr(sys.modules[__name__],config.CHECK_PROXY['function'])(selfip, proxies)#checkProxy(selfip, proxies) 93 | if protocol >= 0: 94 | proxy['protocol'] = protocol 95 | proxy['types'] = types 96 | proxy['speed'] = speed 97 | else: 98 | proxy = None 99 | if queue2: 100 | queue2.put(proxy) 101 | return proxy 102 | 103 | 104 | def checkProxy(selfip, proxies): 105 | ''' 106 | 用来检测代理的类型,突然发现,免费网站写的信息不靠谱,还是要自己检测代理的类型 107 | :param 108 | :return: 109 | ''' 110 | protocol = -1 111 | types = -1 112 | speed = -1 113 | http, http_types, http_speed = 
_checkHttpProxy(selfip, proxies) 114 | https, https_types, https_speed = _checkHttpProxy(selfip, proxies, False) 115 | if http and https: 116 | protocol = 2 117 | types = http_types 118 | speed = http_speed 119 | elif http: 120 | types = http_types 121 | protocol = 0 122 | speed = http_speed 123 | elif https: 124 | types = https_types 125 | protocol = 1 126 | speed = https_speed 127 | else: 128 | types = -1 129 | protocol = -1 130 | speed = -1 131 | return protocol, types, speed 132 | 133 | 134 | def _checkHttpProxy(selfip, proxies, isHttp=True): 135 | types = -1 136 | speed = -1 137 | if isHttp: 138 | test_url = config.TEST_HTTP_HEADER 139 | else: 140 | test_url = config.TEST_HTTPS_HEADER 141 | try: 142 | start = time.time() 143 | r = requests.get(url=test_url, headers=config.get_header(), timeout=config.TIMEOUT, proxies=proxies) 144 | if r.ok: 145 | speed = round(time.time() - start, 2) 146 | content = json.loads(r.text) 147 | headers = content['headers'] 148 | ip = content['origin'] 149 | proxy_connection = headers.get('Proxy-Connection', None) 150 | if ',' in ip: 151 | types = 2 152 | elif proxy_connection: 153 | types = 1 154 | else: 155 | types = 0 156 | 157 | return True, types, speed 158 | else: 159 | return False, types, speed 160 | except Exception as e: 161 | return False, types, speed 162 | 163 | 164 | def baidu_check(selfip, proxies): 165 | ''' 166 | 用来检测代理的类型,突然发现,免费网站写的信息不靠谱,还是要自己检测代理的类型 167 | :param 168 | :return: 169 | ''' 170 | protocol = -1 171 | types = -1 172 | speed = -1 173 | # try: 174 | # #http://ip.chinaz.com/getip.aspx挺稳定,可以用来检测ip 175 | # r = requests.get(url=config.TEST_URL, headers=config.get_header(), timeout=config.TIMEOUT, 176 | # proxies=proxies) 177 | # r.encoding = chardet.detect(r.content)['encoding'] 178 | # if r.ok: 179 | # if r.text.find(selfip)>0: 180 | # return protocol, types, speed 181 | # else: 182 | # return protocol,types,speed 183 | # 184 | # 185 | # except Exception as e: 186 | # return protocol, types, speed 187 | try: 188 | start = time.time() 189 | r = requests.get(url='https://www.baidu.com', headers=config.get_header(), timeout=config.TIMEOUT, proxies=proxies) 190 | r.encoding = chardet.detect(r.content)['encoding'] 191 | if r.ok: 192 | speed = round(time.time() - start, 2) 193 | protocol= 0 194 | types=0 195 | 196 | else: 197 | speed = -1 198 | protocol= -1 199 | types=-1 200 | except Exception as e: 201 | speed = -1 202 | protocol = -1 203 | types = -1 204 | return protocol, types, speed 205 | 206 | def getMyIP(): 207 | try: 208 | r = requests.get(url=config.TEST_IP, headers=config.get_header(), timeout=config.TIMEOUT) 209 | ip = json.loads(r.text) 210 | return ip['origin'] 211 | except Exception as e: 212 | raise Test_URL_Fail 213 | 214 | 215 | if __name__ == '__main__': 216 | ip = '222.186.161.132' 217 | port = 3128 218 | proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)} 219 | _checkHttpProxy(None,proxies) 220 | # getMyIP() 221 | # str="{ip:'61.150.43.121',address:'陕西省西安市 西安电子科技大学'}" 222 | # j = json.dumps(str) 223 | # str = j['ip'] 224 | # print str 225 | -------------------------------------------------------------------------------- /validator/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Xaxdus' 2 | --------------------------------------------------------------------------------
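For a quick manual check of a single proxy, the validator above can also be called directly, outside the multiprocessing pipeline. A minimal sketch, assuming config.py provides TEST_IP, TEST_HTTP_HEADER, TEST_HTTPS_HEADER, TIMEOUT, get_header() and CHECK_PROXY = {'function': 'checkProxy'}, which is what the code in validator/Validator.py expects:

from validator.Validator import getMyIP, detect_proxy

if __name__ == '__main__':
    myip = getMyIP()  # our own outgoing IP (detect_proxy's first argument)
    # sample address taken from the __main__ block of Validator.py above
    candidate = {'ip': '222.186.161.132', 'port': 3128}
    result = detect_proxy(myip, candidate)
    if result is None:
        print('proxy failed both the HTTP and the HTTPS check')
    else:
        # protocol: 0 = HTTP only, 1 = HTTPS only, 2 = both
        # types:    0 = elite (high anonymity), 1 = anonymous, 2 = transparent
        print('protocol=%s types=%s speed=%ss' %
              (result['protocol'], result['types'], result['speed']))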