├── .gitignore ├── IPProxy.py ├── README.md ├── api ├── __init__.py └── apiServer.py ├── config.py ├── data └── qqwry.dat ├── db ├── DataStore.py ├── ISqlHelper.py ├── MongoHelper.py ├── RedisHelper.py ├── SqlHelper.py └── __init__.py ├── qiye2.jpg ├── requirements.txt ├── spider ├── HtmlDownloader.py ├── HtmlPraser.py ├── ProxyCrawl.py └── __init__.py ├── start.bat ├── test ├── __init__.py ├── test.py ├── testIPAddress.py ├── testIPType.py ├── testbase64.py ├── testhttpserver.py ├── testlist.py ├── testlxml.py ├── testqueue.py └── testsql.py ├── util ├── IPAddress.py ├── __init__.py ├── compatibility.py ├── exception.py └── logger.py └── validator ├── Validator.py └── __init__.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | .idea/ 91 | *.db -------------------------------------------------------------------------------- /IPProxy.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | from multiprocessing import Value, Queue, Process 4 | from api.apiServer import start_api_server 5 | from db.DataStore import store_data 6 | 7 | from validator.Validator import validator, getMyIP 8 | from spider.ProxyCrawl import startProxyCrawl 9 | 10 | from config import TASK_QUEUE_SIZE 11 | 12 | if __name__ == "__main__": 13 | myip = getMyIP() 14 | DB_PROXY_NUM = Value('i', 0) 15 | q1 = Queue(maxsize=TASK_QUEUE_SIZE) 16 | q2 = Queue() 17 | p0 = Process(target=start_api_server) 18 | p1 = Process(target=startProxyCrawl, args=(q1, DB_PROXY_NUM,myip)) 19 | p2 = Process(target=validator, args=(q1, q2, myip)) 20 | p3 = Process(target=store_data, args=(q2, DB_PROXY_NUM)) 21 | p0.start() 22 | p1.start() 23 | p2.start() 24 | p3.start() 25 | p0.join() 26 | p1.join() 27 | p2.join() 28 | p3.join() 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # IPProxyPool 2 | IPProxyPool代理池项目,提供代理ip。支持py2和py3两个版本。 3 | ### 
我的新书[《Python爬虫开发与项目实战》](https://item.jd.com/12206762.html)出版了,喜欢的话可以看一下[样章](http://pan.baidu.com/s/1hrWEOYg) 4 |
5 | 详细使用方式,请看我的博客: 6 | http://www.cnblogs.com/qiyeboy/p/5693128.html 7 |
8 | 最近正在为IPProxyPool添加二级代理,方便调度。大家可以关注我的公众号,更新我会及时通知。 9 |
10 | 11 | #### 我的微信公众号: 12 | 13 | ![](qiye2.jpg) 14 |
15 | 希望大家提供更多的代理网站,现在爬取的好用的代理ip还是太少。 16 |
17 | 同时感谢[super1-chen](https://github.com/super1-chen),[fancoo](https://github.com/fancoo),[Leibnizhu](https://github.com/Leibnizhu)对项目的贡献。 18 |
19 | 20 | ## 项目依赖 21 | 22 | #### Ubuntu,debian 23 | 24 | 1.安装sqlite数据库(一般系统内置): 25 | apt-get install sqlite3 26 |
27 | 2.安装requests,chardet,web.py,sqlalchemy,gevent,psutil: 28 | pip install requests chardet web.py sqlalchemy gevent psutil 29 |
30 | 3.安装lxml: 31 | apt-get install python-lxml 32 |
33 | 注意: 34 | 35 | * python3下的是pip3 36 | * 有时候使用的gevent版本过低会出现自动退出情况,请使用pip install gevent --upgrade进行更新 37 | * 在python3中安装web.py,不能使用pip,直接下载py3版本的[源码](https://codeload.github.com/webpy/webpy/zip/py3)进行安装 38 | 39 | #### Windows 40 | 41 | 1.下载[sqlite](http://www.sqlite.org/download.html),路径添加到环境变量 42 |
43 | 2.安装requests,chardet,web.py,sqlalchemy,gevent: 44 | pip install requests chardet web.py sqlalchemy gevent 45 |
46 | 3.安装lxml: 47 | pip install lxml或者下载[lxml windows版](https://pypi.python.org/pypi/lxml/) 48 |
49 | 注意: 50 | 51 | * python3下的是pip3 52 | * 有时候使用的gevent版本过低会出现自动退出情况,请使用pip install gevent --upgrade进行更新 53 | * 在python3中安装web.py,不能使用pip,直接下载py3版本的[源码](https://codeload.github.com/webpy/webpy/zip/py3)进行安装 54 | 55 | #### 扩展说明 56 | 57 | 本项目默认数据库是sqlite,但是采用sqlalchemy的ORM模型,通过预留接口可以拓展使用MySQL,MongoDB等数据库。 58 | 配置方法: 59 |
60 | 1.MySQL配置 61 | ``` 62 | 第一步:首先安装MySQL数据库并启动 63 | 第二步:安装MySQLdb或者pymysql(推荐) 64 | 第三步:在config.py文件中配置DB_CONFIG。如果安装的是MySQLdb模块,配置如下: 65 | DB_CONFIG={ 66 | 'DB_CONNECT_TYPE':'sqlalchemy', 67 | 'DB_CONNECT_STRING':'mysql+mysqldb://root:root@localhost/proxy?charset=utf8' 68 | } 69 | 如果安装的是pymysql模块,配置如下: 70 | DB_CONFIG={ 71 | 'DB_CONNECT_TYPE':'sqlalchemy', 72 | 'DB_CONNECT_STRING':'mysql+pymysql://root:root@localhost/proxy?charset=utf8' 73 | } 74 | ``` 75 | sqlalchemy下的DB_CONNECT_STRING参考[支持数据库](http://docs.sqlalchemy.org/en/latest/core/engines.html#supported-databases),理论上使用这种配置方式不只是适配MySQL,sqlalchemy支持的数据库都可以,但是仅仅测试过MySQL。 76 |
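配置完成后,可以先用下面这段示意代码单独验证 DB_CONNECT_STRING 能否正常连接(仅为示意,并非项目自带代码;连接串和驱动请换成自己的配置):
```
# 示意:单独验证 sqlalchemy 连接串是否可用(假设已安装 sqlalchemy 和对应驱动,如 pymysql)
from sqlalchemy import create_engine

DB_CONNECT_STRING = 'mysql+pymysql://root:root@localhost/proxy?charset=utf8'  # 换成自己的配置
engine = create_engine(DB_CONNECT_STRING, echo=False)
conn = engine.connect()  # 能成功建立连接,说明驱动和连接串都没有问题
print('connect ok')
conn.close()
```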
77 | 2.MongoDB配置 78 | ``` 79 | 第一步:首先安装MongoDB数据库并启动 80 | 第二步:安装pymongo模块 81 | 第三步:在config.py文件中配置DB_CONFIG。配置类似如下: 82 | DB_CONFIG={ 83 | 'DB_CONNECT_TYPE':'pymongo', 84 | 'DB_CONNECT_STRING':'mongodb://localhost:27017/' 85 | } 86 | ``` 87 | 由于sqlalchemy并不支持MongoDB,因此额外添加了pymongo模式,DB_CONNECT_STRING参考pymongo的连接字符串。 88 | 89 | ##### 注意 90 | 91 | 如果大家想拓展其他数据库,可以直接继承db下ISqlHelper类,实现其中的方法,具体实现参考我的代码,然后在DataStore中导入类即可。 92 | ``` 93 | try: 94 | if DB_CONFIG['DB_CONNECT_TYPE'] == 'pymongo': 95 | from db.MongoHelper import MongoHelper as SqlHelper 96 | else: 97 | from db.SqlHelper import SqlHelper as SqlHelper 98 | sqlhelper = SqlHelper() 99 | sqlhelper.init_db() 100 | except Exception,e: 101 | raise Con_DB_Fail 102 | ``` 103 | 有感兴趣的朋友,可以将Redis的实现方式添加进来。 104 | 105 | 106 | ## 如何使用 107 | 108 | 将项目目录clone到当前文件夹 109 | 110 | $ git clone 111 | 112 | 切换工程目录 113 | 114 | ``` 115 | $ cd IPProxyPool 116 | ``` 117 | 118 | 运行脚本 119 | 120 | ``` 121 | python IPProxy.py 122 | ``` 123 | 成功运行后,打印信息 124 | ``` 125 | IPProxyPool----->>>>>>>>beginning 126 | http://0.0.0.0:8000/ 127 | IPProxyPool----->>>>>>>>db exists ip:0 128 | IPProxyPool----->>>>>>>>now ip num < MINNUM,start crawling... 129 | IPProxyPool----->>>>>>>>Success ip num :134,Fail ip num:7882 130 | ``` 131 | 132 | ## API 使用方法 133 | 134 | #### 第一种模式 135 | ``` 136 | GET / 137 | ``` 138 | 这种模式用于查询代理ip数据,同时加入评分机制,返回数据的顺序是按照评分由高到低,速度由快到慢制定的。 139 | 140 | #### 参数 141 | 142 | | Name | Type | Description | 143 | | ----| ---- | ---- | 144 | | types | int | 0: 高匿,1:匿名,2 透明 | 145 | | protocol | int | 0: http, 1 https, 2 http/https | 146 | | count | int | 数量 | 147 | | country | str | 取值为 国内, 国外 | 148 | | area | str | 地区 | 149 | 150 | 151 | 152 | #### 例子 153 | 154 | ##### IPProxys默认端口为8000,端口可以在config.py中配置。 155 | 156 | ##### 如果是在本机上测试: 157 | 158 | 1.获取5个ip地址在中国的高匿代理:http://127.0.0.1:8000/?types=0&count=5&country=国内 159 |
160 | 2.响应为JSON格式,按照评分由高到低、响应速度由快到慢的顺序返回数据: 161 |
162 | ``` 163 | [["122.226.189.55", 138, 10], ["183.61.236.54", 3128, 10], ["61.132.241.109", 808, 10], ["183.61.236.53", 3128, 10], ["122.227.246.102", 808, 10]] 164 | ``` 165 |
166 | 以["122.226.189.55", 138, 10]为例,第一个元素是ip,第二个元素是port,第三个元素是分值score。 167 | 168 | ``` 169 | import requests 170 | import json 171 | r = requests.get('http://127.0.0.1:8000/?types=0&count=5&country=国内') 172 | ip_ports = json.loads(r.text) 173 | print ip_ports 174 | ip = ip_ports[0][0] 175 | port = ip_ports[0][1] 176 | proxies={ 177 | 'http':'http://%s:%s'%(ip,port), 178 | 'https':'http://%s:%s'%(ip,port) 179 | } 180 | r = requests.get('http://ip.chinaz.com/',proxies=proxies) 181 | r.encoding='utf-8' 182 | print r.text 183 | ``` 184 | #### 第二种模式 185 | ``` 186 | GET /delete 187 | ``` 188 | 这种模式用于方便用户根据自己的需求删除代理ip数据 189 | 190 | #### 参数 191 | 192 | | Name | Type | Description | 193 | | ----| ---- | ---- | 194 | | ip | str | 类似192.168.1.1 | 195 | | port | int | 类似 80 | 196 | | types | int | 0: 高匿,1:匿名,2 透明 | 197 | | protocol | int | 0: http, 1 https, 2 http/https | 198 | | count | int | 数量 | 199 | | country | str | 取值为 国内, 国外 | 200 | | area | str | 地区 | 201 | 202 | 大家可以根据指定以上一种或几种方式删除数据。 203 | 204 | #### 例子 205 | 206 | ##### 如果是在本机上测试: 207 | 208 | 1.删除ip为120.92.3.127的代理:http://127.0.0.1:8000/delete?ip=120.92.3.127 209 |
210 | 2.响应为JSON格式,返回删除的结果为成功,失败或者返回删除的个数,类似如下的效果: 211 | ["deleteNum", "ok"]或者["deleteNum", 1] 212 | ``` 213 | import requests 214 | r = requests.get('http://127.0.0.1:8000/delete?ip=120.92.3.127') 215 | print r.text 216 | ``` 217 | ## config.py参数配置 218 | ``` 219 | #parserList是网址解析规则表,大家可以将发现的代理网址,将提取规则添加到其中,方便爬虫的爬取。 220 | parserList = [ 221 | { 222 | 'urls': ['http://www.66ip.cn/%s.html' % n for n in ['index'] + list(range(2, 12))], 223 | 'type': 'xpath', 224 | 'pattern': ".//*[@id='main']/div/div[1]/table/tr[position()>1]", 225 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]', 'protocol': ''} 226 | }, 227 | 228 | ...... 229 | 230 | 231 | { 232 | 'urls': ['http://www.cnproxy.com/proxy%s.html' % i for i in range(1, 11)], 233 | 'type': 'module', 234 | 'moduleName': 'CnproxyPraser', 235 | 'pattern': r'(\d+\.\d+\.\d+\.\d+)(HTTP|SOCKS4)\s*', 236 | 'position': {'ip': 0, 'port': 1, 'type': -1, 'protocol': 2} 237 | } 238 | ] 239 | 240 | #数据库的配置 241 | 242 | DB_CONFIG = { 243 | 244 | 'DB_CONNECT_TYPE': 'sqlalchemy', # 'pymongo'sqlalchemy;redis 245 | # 'DB_CONNECT_STRING':'mongodb://localhost:27017/' 246 | 'DB_CONNECT_STRING': 'sqlite:///' + os.path.dirname(__file__) + '/data/proxy.db' 247 | # DB_CONNECT_STRING : 'mysql+mysqldb://root:root@localhost/proxy?charset=utf8' 248 | 249 | # 'DB_CONNECT_TYPE': 'redis', # 'pymongo'sqlalchemy;redis 250 | # 'DB_CONNECT_STRING': 'redis://localhost:6379/8', 251 | 252 | } 253 | #THREADNUM为gevent pool的协程数目 254 | THREADNUM = 5 255 | 256 | #API_PORT为API web服务器的端口 257 | API_PORT = 8000 258 | 259 | #爬虫爬取和检测ip的设置条件 260 | #不需要检测ip是否已经存在,因为会定时清理 261 | # UPDATE_TIME:每半个小时检测一次是否有代理ip失效 262 | UPDATE_TIME = 30 * 60 263 | 264 | # 当有效的ip值小于MINNUM时 需要启动爬虫进行爬取 265 | MINNUM = 50 266 | 267 | # socket超时 268 | TIMEOUT = 5 269 | 270 | 271 | 272 | 273 | #爬虫下载网页的重试次数 274 | RETRY_TIME = 3 275 | 276 | 277 | #USER_AGENTS 随机头信息,用来突破爬取网站的反爬虫 278 | 279 | USER_AGENTS = [ 280 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 281 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 282 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 283 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 284 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 285 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 286 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 287 | ] 288 | #默认给抓取的ip分配20分,每次连接失败,减一分,直到分数全部扣完从数据库中删除 289 | DEFAULT_SCORE=10 290 | 291 | #CHECK_PROXY变量是为了用户自定义检测代理的函数,,默认是CHECK_PROXY={'function':'checkProxy'}。 292 | #现在使用检测的网址是httpbin.org,但是即使ip通过了验证和检测 293 | #也只能说明通过此代理ip可以到达httpbin.org,但是不一定能到达用户爬取的网址 294 | #因此在这个地方用户可以自己添加检测函数,我以百度为访问网址尝试一下 295 | #大家可以看一下Validator.py文件中的baidu_check函数和detect_proxy函数就会明白 296 | 297 | CHECK_PROXY={'function':'checkProxy'}#{'function':'baidu_check'} 298 | ``` 299 | ## TODO 300 | 1.添加squid代理,简化爬虫配置 301 |
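补充:上面 config.py 中提到可以通过 CHECK_PROXY 自定义检测函数(可参考 Validator.py 中的 baidu_check 和 detect_proxy)。下面给出一个示意性的写法,函数名 my_check、参数形式和目标网址都是假设的,并非项目自带代码,实际接入时函数签名请以 Validator.py 中已有的检测函数为准:
```
# 示意:检测某个代理能否访问到自己要爬取的网站(函数名与参数均为假设)
import requests

def my_check(ip, port, timeout=5):
    proxies = {'http': 'http://%s:%s' % (ip, port),
               'https': 'http://%s:%s' % (ip, port)}
    try:
        r = requests.get('http://www.baidu.com', proxies=proxies, timeout=timeout)
        return r.status_code == 200  # 能正常返回 200 即认为该代理对目标网站可用
    except requests.RequestException:
        return False
```
接入项目时,需要把函数名写进 CHECK_PROXY={'function': '...'},并保持与 Validator.py 中现有检测函数一致的参数形式。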
302 | 303 | 304 | ## 更新进度 305 | -----------------------------2017-4-6---------------------------- 306 |
307 | 1.更新评分机制。 308 |
309 | * 之前的评分机制是刚添加进来每个代理ip为0分,每隔半个小时检测一次,检测之后依然有效则加分,无效则删除。 310 | * 现在的评分机制是每个新的代理ip分配10分,每隔半个小时检测一次,检测之后依然有效则分数不变,无效则分数减一,直至为0删除,可以避免由于检测网站不稳定导致的误删。 311 | 312 | 2.用户可以自定义检测函数,在config.py的CHECK_PROXY变量中可以配置。 313 | ``` 314 | CHECK_PROXY变量是为了用户自定义检测代理的函数,默认是CHECK_PROXY={'function':'checkProxy'} 315 | 现在使用检测的网址是httpbin.org,但是即使ip通过了验证和检测 316 | 也只能说明通过此代理ip可以到达httpbin.org,但是不一定能到达用户爬取的网址 317 | 因此在这个地方用户可以自己添加检测函数,我以百度为访问网址尝试一下 318 | 大家可以看一下Validator.py文件中的baidu_check函数和detect_proxy函数就会明白。 319 | 320 | CHECK_PROXY={'function':'baidu_check'} 321 | ``` 322 | 3.经过大家的共同努力,彻底解决了僵死进程的问题。 323 | 324 | -----------------------------2017-1-16---------------------------- 325 |
326 | 1.将py2和py3版本合并,并且兼容 327 |
328 | 2.修复pymongo查询bug 329 |
330 | -----------------------------2017-1-11---------------------------- 331 |
332 | 1.使用httpbin.org检测代理ip的高匿性 333 |
334 | 2.使用 国内 和 国外 作为country的查询条件 335 |
336 | 3.修改types和protocol参数,一定要注意protocol的使用,试试访问http://www.baidu.com和https://www.baidu.com 337 |
338 | 4.美化代码风格 339 |
340 | -----------------------------2016-12-11---------------------------- 341 | #### 大规模重构,主要包括以下几个方面: 342 | 1.使用多进程+协程的方式,将爬取和验证的效率提高了50倍以上,可以在几分钟之内获取所有的有效IP 343 |
344 | 2.使用web.py作为API服务器,重构HTTP接口 345 |
346 | 3.增加MySQL,MongoDB等数据库的适配 347 |
348 | 4.增加了三个代理网站 349 |
350 | 5.增加评分机制,评比稳定的ip 351 |
352 | 6.支持python3 353 |
354 | -----------------------------2016-11-24---------------------------- 355 |
356 | 1.增加chardet识别网页编码 357 |
358 | 2.突破66ip.cn反爬限制 359 |
360 | -----------------------------2016-10-27---------------------------- 361 |
362 | 1.增加对代理的检测,测试是否能真正访问到网址,实现代理 363 |
364 | 2.添加通过正则表达式和加载插件解析网页的方式 365 |
366 | 3.又增加一个新的代理网站 367 |
368 | 369 | -----------------------------2016-7-20---------------------------- 370 |
371 | 1.修复bug,将数据库进行压缩 372 |
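#### 附:拓展其他数据库的最小骨架(示意)
前面「扩展说明」的注意部分提到,可以继承 db/ISqlHelper.py 中的 ISqlHelper 类来接入其他数据库。下面是一个最小骨架,类名 MyHelper 为自拟、方法体为空,仅用来说明需要实现哪些接口,并非项目自带代码:
```
# 示意:自定义存储后端的最小骨架,方法名与 ISqlHelper 保持一致
from db.ISqlHelper import ISqlHelper

class MyHelper(ISqlHelper):
    def init_db(self):
        pass  # 建立连接、初始化表/集合等

    def drop_db(self):
        pass

    def insert(self, value=None):
        pass  # value 是包含 ip/port/types/protocol/country/area/speed 的字典

    def delete(self, conditions=None):
        pass

    def update(self, conditions=None, value=None):
        pass

    def select(self, count=None, conditions=None):
        return []  # 返回 (ip, port, score) 形式的列表,供 API 输出
```
实现之后,在 db/DataStore.py 中根据 DB_CONFIG['DB_CONNECT_TYPE'] 导入对应的类即可。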
373 | -------------------------------------------------------------------------------- /api/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Xaxdus' 2 | -------------------------------------------------------------------------------- /api/apiServer.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | ''' 3 | 定义几个关键字,count type,protocol,country,area, 4 | ''' 5 | import json 6 | import sys 7 | import web 8 | import config 9 | from db.DataStore import sqlhelper 10 | from db.SqlHelper import Proxy 11 | 12 | urls = ( 13 | '/', 'select', 14 | '/delete', 'delete' 15 | ) 16 | 17 | 18 | def start_api_server(): 19 | sys.argv.append('0.0.0.0:%s' % config.API_PORT) 20 | app = web.application(urls, globals()) 21 | app.run() 22 | 23 | 24 | class select(object): 25 | def GET(self): 26 | inputs = web.input() 27 | json_result = json.dumps(sqlhelper.select(inputs.get('count', None), inputs)) 28 | return json_result 29 | 30 | 31 | class delete(object): 32 | params = {} 33 | 34 | def GET(self): 35 | inputs = web.input() 36 | json_result = json.dumps(sqlhelper.delete(inputs)) 37 | return json_result 38 | 39 | 40 | if __name__ == '__main__': 41 | sys.argv.append('0.0.0.0:8000') 42 | app = web.application(urls, globals()) 43 | app.run() 44 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | ''' 3 | 定义规则 urls:url列表 4 | type:解析方式,取值 regular(正则表达式),xpath(xpath解析),module(自定义第三方模块解析) 5 | patten:可以是正则表达式,可以是xpath语句不过要和上面的相对应 6 | ''' 7 | import os 8 | import random 9 | 10 | ''' 11 | ip,端口,类型(0高匿名,1透明),protocol(0 http,1 https),country(国家),area(省市),updatetime(更新时间) 12 | speed(连接速度) 13 | ''' 14 | parserList = [ 15 | { 16 | 'urls': ['http://www.66ip.cn/%s.html' % n for n in ['index'] + list(range(2, 12))], 17 | 'type': 'xpath', 18 | 'pattern': ".//*[@id='main']/div/div[1]/table/tr[position()>1]", 19 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]', 'protocol': ''} 20 | }, 21 | { 22 | 'urls': ['http://www.66ip.cn/areaindex_%s/%s.html' % (m, n) for m in range(1, 35) for n in range(1, 10)], 23 | 'type': 'xpath', 24 | 'pattern': ".//*[@id='footer']/div/table/tr[position()>1]", 25 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[4]', 'protocol': ''} 26 | }, 27 | { 28 | 'urls': ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'], 29 | 'type': 'xpath', 30 | 'pattern': ".//table[@class='sortable']/tbody/tr", 31 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''} 32 | 33 | }, 34 | { 35 | 'urls': ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 10)], 36 | 'type': 'xpath', 37 | 'pattern': ".//table[@class='list']/tr", 38 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''} 39 | 40 | }, 41 | { 42 | 'urls': ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)], 43 | 'type': 'module', 44 | 'moduleName': 'proxy_listPraser', 45 | 'pattern': 'Proxy\(.+\)', 46 | 'position': {'ip': 0, 'port': -1, 'type': -1, 'protocol': 2} 47 | 48 | }, 49 | { 50 | 'urls': ['http://incloak.com/proxy-list/%s#list' % n for n in 51 | ([''] + ['?start=%s' % (64 * m) for m in range(1, 10)])], 52 | 'type': 'xpath', 53 | 'pattern': ".//table[@class='proxy__t']/tbody/tr", 54 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': '', 'protocol': ''} 55 | 56 | }, 57 | { 58 | 
'urls': ['http://www.kuaidaili.com/proxylist/%s/' % n for n in range(1, 11)], 59 | 'type': 'xpath', 60 | 'pattern': ".//*[@id='index_free_list']/table/tbody/tr[position()>0]", 61 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[3]', 'protocol': './td[4]'} 62 | }, 63 | { 64 | 'urls': ['http://www.kuaidaili.com/free/%s/%s/' % (m, n) for m in ['inha', 'intr', 'outha', 'outtr'] for n in 65 | range(1, 11)], 66 | 'type': 'xpath', 67 | 'pattern': ".//*[@id='list']/table/tbody/tr[position()>0]", 68 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[3]', 'protocol': './td[4]'} 69 | }, 70 | { 71 | 'urls': ['http://www.cz88.net/proxy/%s' % m for m in 72 | ['index.shtml'] + ['http_%s.shtml' % n for n in range(2, 11)]], 73 | 'type': 'xpath', 74 | 'pattern': ".//*[@id='boxright']/div/ul/li[position()>1]", 75 | 'position': {'ip': './div[1]', 'port': './div[2]', 'type': './div[3]', 'protocol': ''} 76 | 77 | }, 78 | { 79 | 'urls': ['http://www.ip181.com/daili/%s.html' % n for n in range(1, 11)], 80 | 'type': 'xpath', 81 | 'pattern': ".//div[@class='row']/div[3]/table/tbody/tr[position()>1]", 82 | 'position': {'ip': './td[1]', 'port': './td[2]', 'type': './td[3]', 'protocol': './td[4]'} 83 | 84 | }, 85 | { 86 | 'urls': ['http://www.xicidaili.com/%s/%s' % (m, n) for m in ['nn', 'nt', 'wn', 'wt'] for n in range(1, 8)], 87 | 'type': 'xpath', 88 | 'pattern': ".//*[@id='ip_list']/tr[position()>1]", 89 | 'position': {'ip': './td[2]', 'port': './td[3]', 'type': './td[5]', 'protocol': './td[6]'} 90 | }, 91 | { 92 | 'urls': ['http://www.cnproxy.com/proxy%s.html' % i for i in range(1, 11)], 93 | 'type': 'module', 94 | 'moduleName': 'CnproxyPraser', 95 | 'pattern': r'(\d+\.\d+\.\d+\.\d+)(HTTP|SOCKS4)\s*', 96 | 'position': {'ip': 0, 'port': 1, 'type': -1, 'protocol': 2} 97 | } 98 | ] 99 | ''' 100 | 数据库的配置 101 | ''' 102 | DB_CONFIG = { 103 | 104 | 'DB_CONNECT_TYPE': 'sqlalchemy', # 'pymongo'sqlalchemy;redis 105 | # 'DB_CONNECT_STRING':'mongodb://localhost:27017/' 106 | 'DB_CONNECT_STRING': 'sqlite:///' + os.path.dirname(__file__) + '/data/proxy.db' 107 | # DB_CONNECT_STRING : 'mysql+mysqldb://root:root@localhost/proxy?charset=utf8' 108 | 109 | # 'DB_CONNECT_TYPE': 'redis', # 'pymongo'sqlalchemy;redis 110 | # 'DB_CONNECT_STRING': 'redis://localhost:6379/8', 111 | 112 | } 113 | CHINA_AREA = ['河北', '山东', '辽宁', '黑龙江', '吉林' 114 | , '甘肃', '青海', '河南', '江苏', '湖北', '湖南', 115 | '江西', '浙江', '广东', '云南', '福建', 116 | '台湾', '海南', '山西', '四川', '陕西', 117 | '贵州', '安徽', '重庆', '北京', '上海', '天津', '广西', '内蒙', '西藏', '新疆', '宁夏', '香港', '澳门'] 118 | QQWRY_PATH = os.path.dirname(__file__) + "/data/qqwry.dat" 119 | THREADNUM = 5 120 | API_PORT = 8000 121 | ''' 122 | 爬虫爬取和检测ip的设置条件 123 | 不需要检测ip是否已经存在,因为会定时清理 124 | ''' 125 | UPDATE_TIME = 30 * 60 # 每半个小时检测一次是否有代理ip失效 126 | MINNUM = 50 # 当有效的ip值小于一个时 需要启动爬虫进行爬取 127 | 128 | TIMEOUT = 5 # socket延时 129 | ''' 130 | 反爬虫的设置 131 | ''' 132 | ''' 133 | 重试次数 134 | ''' 135 | RETRY_TIME = 3 136 | 137 | ''' 138 | USER_AGENTS 随机头信息 139 | ''' 140 | USER_AGENTS = [ 141 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 142 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 143 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 144 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 145 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; 
Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 146 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 147 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 148 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 149 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 150 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 151 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 152 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", 153 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", 154 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 155 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", 156 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", 157 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11", 158 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER", 159 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)", 160 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)", 161 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER", 162 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)", 163 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)", 164 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", 165 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)", 166 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", 167 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)", 168 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", 169 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", 170 | "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5", 171 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre", 172 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 
Firefox/16.0", 173 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11", 174 | "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10" 175 | ] 176 | 177 | 178 | def get_header(): 179 | return { 180 | 'User-Agent': random.choice(USER_AGENTS), 181 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 182 | 'Accept-Language': 'en-US,en;q=0.5', 183 | 'Connection': 'keep-alive', 184 | 'Accept-Encoding': 'gzip, deflate', 185 | } 186 | #默认给抓取的ip分配20分,每次连接失败,减一分,直到分数全部扣完从数据库中删除 187 | DEFAULT_SCORE=10 188 | 189 | TEST_URL = 'http://ip.chinaz.com/getip.aspx' 190 | TEST_IP = 'http://httpbin.org/ip' 191 | TEST_HTTP_HEADER = 'http://httpbin.org/get' 192 | TEST_HTTPS_HEADER = 'https://httpbin.org/get' 193 | #CHECK_PROXY变量是为了用户自定义检测代理的函数 194 | #现在使用检测的网址是httpbin.org,但是即使ip通过了验证和检测 195 | #也只能说明通过此代理ip可以到达httpbin.org,但是不一定能到达用户爬取的网址 196 | #因此在这个地方用户可以自己添加检测函数,我以百度为访问网址尝试一下 197 | #大家可以看一下Validator.py文件中的baidu_check函数和detect_proxy函数就会明白 198 | 199 | CHECK_PROXY={'function':'checkProxy'}#{'function':'baidu_check'} 200 | 201 | #下面配置squid,现在还没实现 202 | #SQUID={'path':None,'confpath':'C:/squid/etc/squid.conf'} 203 | 204 | MAX_CHECK_PROCESS = 2 # CHECK_PROXY最大进程数 205 | MAX_CHECK_CONCURRENT_PER_PROCESS = 30 # CHECK_PROXY时每个进程的最大并发 206 | TASK_QUEUE_SIZE = 50 # 任务队列SIZE 207 | MAX_DOWNLOAD_CONCURRENT = 3 # 从免费代理网站下载时的最大并发 208 | CHECK_WATI_TIME = 1#进程数达到上限时的等待时间 -------------------------------------------------------------------------------- /data/qqwry.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiyeboy/IPProxyPool/127827829535641d57a1a05e20523cb0a192170b/data/qqwry.dat -------------------------------------------------------------------------------- /db/DataStore.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import sys 3 | from config import DB_CONFIG 4 | from util.exception import Con_DB_Fail 5 | 6 | 7 | try: 8 | if DB_CONFIG['DB_CONNECT_TYPE'] == 'pymongo': 9 | from db.MongoHelper import MongoHelper as SqlHelper 10 | elif DB_CONFIG['DB_CONNECT_TYPE'] == 'redis': 11 | from db.RedisHelper import RedisHelper as SqlHelper 12 | else: 13 | from db.SqlHelper import SqlHelper as SqlHelper 14 | sqlhelper = SqlHelper() 15 | sqlhelper.init_db() 16 | except Exception as e: 17 | raise Con_DB_Fail 18 | 19 | 20 | def store_data(queue2, db_proxy_num): 21 | ''' 22 | 读取队列中的数据,写入数据库中 23 | :param queue2: 24 | :return: 25 | ''' 26 | successNum = 0 27 | failNum = 0 28 | while True: 29 | try: 30 | proxy = queue2.get(timeout=300) 31 | if proxy: 32 | 33 | sqlhelper.insert(proxy) 34 | successNum += 1 35 | else: 36 | failNum += 1 37 | str = 'IPProxyPool----->>>>>>>>Success ip num :%d,Fail ip num:%d' % (successNum, failNum) 38 | sys.stdout.write(str + "\r") 39 | sys.stdout.flush() 40 | except BaseException as e: 41 | if db_proxy_num.value != 0: 42 | successNum += db_proxy_num.value 43 | db_proxy_num.value = 0 44 | str = 'IPProxyPool----->>>>>>>>Success ip num :%d,Fail ip num:%d' % (successNum, failNum) 45 | sys.stdout.write(str + "\r") 46 | sys.stdout.flush() 47 | successNum = 0 48 | failNum = 0 49 | 50 | 51 | -------------------------------------------------------------------------------- /db/ISqlHelper.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | class ISqlHelper(object): 4 | params = {'ip': None, 
'port': None, 'types': None, 'protocol': None, 'country': None, 'area': None} 5 | 6 | def init_db(self): 7 | raise NotImplemented 8 | 9 | def drop_db(self): 10 | raise NotImplemented 11 | 12 | def insert(self, value=None): 13 | raise NotImplemented 14 | 15 | def delete(self, conditions=None): 16 | raise NotImplemented 17 | 18 | def update(self, conditions=None, value=None): 19 | raise NotImplemented 20 | 21 | def select(self, count=None, conditions=None): 22 | raise NotImplemented -------------------------------------------------------------------------------- /db/MongoHelper.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | from config import DB_CONFIG, DEFAULT_SCORE 3 | 4 | from db.ISqlHelper import ISqlHelper 5 | 6 | 7 | class MongoHelper(ISqlHelper): 8 | def __init__(self): 9 | self.client = pymongo.MongoClient(DB_CONFIG['DB_CONNECT_STRING'], connect=False) 10 | 11 | def init_db(self): 12 | self.db = self.client.proxy 13 | self.proxys = self.db.proxys 14 | 15 | def drop_db(self): 16 | self.client.drop_database(self.db) 17 | 18 | def insert(self, value=None): 19 | if value: 20 | proxy = dict(ip=value['ip'], port=value['port'], types=value['types'], protocol=value['protocol'], 21 | country=value['country'], 22 | area=value['area'], speed=value['speed'], score=DEFAULT_SCORE) 23 | self.proxys.insert(proxy) 24 | 25 | def delete(self, conditions=None): 26 | if conditions: 27 | self.proxys.remove(conditions) 28 | return ('deleteNum', 'ok') 29 | else: 30 | return ('deleteNum', 'None') 31 | 32 | def update(self, conditions=None, value=None): 33 | # update({"UserName":"libing"},{"$set":{"Email":"libing@126.com","Password":"123"}}) 34 | if conditions and value: 35 | self.proxys.update(conditions, {"$set": value}) 36 | return {'updateNum': 'ok'} 37 | else: 38 | return {'updateNum': 'fail'} 39 | 40 | def select(self, count=None, conditions=None): 41 | if count: 42 | count = int(count) 43 | else: 44 | count = 0 45 | if conditions: 46 | conditions = dict(conditions) 47 | if 'count' in conditions: 48 | del conditions['count'] 49 | conditions_name = ['types', 'protocol'] 50 | for condition_name in conditions_name: 51 | value = conditions.get(condition_name, None) 52 | if value: 53 | conditions[condition_name] = int(value) 54 | else: 55 | conditions = {} 56 | items = self.proxys.find(conditions, limit=count).sort( 57 | [("speed", pymongo.ASCENDING), ("score", pymongo.DESCENDING)]) 58 | results = [] 59 | for item in items: 60 | result = (item['ip'], item['port'], item['score']) 61 | results.append(result) 62 | return results 63 | 64 | 65 | if __name__ == '__main__': 66 | # from db.MongoHelper import MongoHelper as SqlHelper 67 | # sqlhelper = SqlHelper() 68 | # sqlhelper.init_db() 69 | # # print sqlhelper.select(None,{'types':u'1'}) 70 | # items= sqlhelper.proxys.find({'types':0}) 71 | # for item in items: 72 | # print item 73 | # # # print sqlhelper.select(None,{'types':u'0'}) 74 | pass 75 | -------------------------------------------------------------------------------- /db/RedisHelper.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from __future__ import unicode_literals 3 | 4 | from redis import Redis 5 | 6 | import config 7 | from db.ISqlHelper import ISqlHelper 8 | from db.SqlHelper import Proxy 9 | 10 | 11 | class RedisHelper(ISqlHelper): 12 | def __init__(self, url=None): 13 | self.index_names = ('types', 'protocol', 'country', 'area', 'score') 14 | self.redis_url = url or 
config.DB_CONFIG['DB_CONNECT_STRING'] 15 | 16 | def get_proxy_name(self, ip=None, port=None, protocal=None, proxy=None): 17 | ip = ip or proxy.ip 18 | port = port or proxy.port 19 | protocal = protocal or proxy.protocol 20 | return "proxy::{}:{}:{}".format(ip, port, protocal) 21 | 22 | def get_index_name(self, index_name, value=None): 23 | if index_name == 'score': 24 | return 'index::score' 25 | return "index::{}:{}".format(index_name, value) 26 | 27 | def get_proxy_by_name(self, name): 28 | pd = self.redis.hgetall(name) 29 | if pd: 30 | return Proxy(**{k.decode('utf8'): v.decode('utf8') for k, v in pd.items()}) 31 | 32 | def init_db(self, url=None): 33 | self.redis = Redis.from_url(url or self.redis_url) 34 | 35 | def drop_db(self): 36 | return self.redis.flushdb() 37 | 38 | def get_keys(self, conditions): 39 | select_keys = {self.get_index_name(key, conditions[key]) for key in conditions.keys() if 40 | key in self.index_names} 41 | if 'ip' in conditions and 'port' in conditions: 42 | return self.redis.keys(self.get_proxy_name(conditions['ip'], conditions['port'], '*')) 43 | if select_keys: 44 | return [name.decode('utf8') for name in self.redis.sinter(keys=select_keys)] 45 | return [] 46 | 47 | def insert(self, value): 48 | proxy = Proxy(ip=value['ip'], port=value['port'], types=value['types'], protocol=value['protocol'], 49 | country=value['country'], area=value['area'], 50 | speed=value['speed'], score=value.get('score', config.DEFAULT_SCORE)) 51 | mapping = proxy.__dict__ 52 | for k in list(mapping.keys()): 53 | if k.startswith('_'): 54 | mapping.pop(k) 55 | object_name = self.get_proxy_name(proxy=proxy) 56 | # 存结构 57 | insert_num = self.redis.hmset(object_name, mapping) 58 | # 创建索引 59 | if insert_num > 0: 60 | for index_name in self.index_names: 61 | self.create_index(index_name, object_name, proxy) 62 | return insert_num 63 | 64 | def create_index(self, index_name, object_name, proxy): 65 | redis_key = self.get_index_name(index_name, getattr(proxy, index_name)) 66 | if index_name == 'score': 67 | return self.redis.zadd(redis_key, object_name, int(proxy.score)) 68 | return self.redis.sadd(redis_key, object_name) 69 | 70 | def delete(self, conditions): 71 | proxy_keys = self.get_keys(conditions) 72 | index_keys = self.redis.keys(u"index::*") 73 | if not proxy_keys: 74 | return 0 75 | 76 | for iname in index_keys: 77 | if iname == b'index::score': 78 | self.redis.zrem(self.get_index_name('score'), *proxy_keys) 79 | else: 80 | self.redis.srem(iname, *proxy_keys) 81 | return self.redis.delete(*proxy_keys) if proxy_keys else 0 82 | 83 | def update(self, conditions, values): 84 | objects = self.get_keys(conditions) 85 | count = 0 86 | for name in objects: 87 | for k, v in values.items(): 88 | if k == 'score': 89 | self.redis.zrem(self.get_index_name('score'), [name]) 90 | self.redis.zadd(self.get_index_name('score'), name, int(v)) 91 | self.redis.hset(name, key=k, value=v) 92 | count += 1 93 | return count 94 | 95 | def select(self, count=None, conditions=None): 96 | count = (count and int(count)) or 1000 # 最多返回1000条数据 97 | count = 1000 if count > 1000 else count 98 | 99 | querys = {k: v for k, v in conditions.items() if k in self.index_names} if conditions else None 100 | if querys: 101 | objects = list(self.get_keys(querys))[:count] 102 | redis_name = self.get_index_name('score') 103 | objects.sort(key=lambda x: int(self.redis.zscore(redis_name, x))) 104 | else: 105 | objects = list( 106 | self.redis.zrevrangebyscore(self.get_index_name("score"), '+inf', '-inf', start=0, num=count)) 107 
| 108 | result = [] 109 | for name in objects: 110 | p = self.get_proxy_by_name(name) 111 | result.append((p.ip, p.port, p.score)) 112 | return result 113 | 114 | 115 | if __name__ == '__main__': 116 | sqlhelper = RedisHelper() 117 | sqlhelper.init_db('redis://localhost:6379/9') 118 | proxy = {'ip': '192.168.1.1', 'port': 80, 'type': 0, 'protocol': 0, 'country': '中国', 'area': '广州', 'speed': 11.123, 119 | 'types': 1} 120 | proxy2 = {'ip': 'localhost', 'port': 433, 'type': 1, 'protocol': 1, 'country': u'中国', 'area': u'广州', 'speed': 123, 121 | 'types': 0, 'score': 100} 122 | assert sqlhelper.insert(proxy) == True 123 | assert sqlhelper.insert(proxy2) == True 124 | assert sqlhelper.get_keys({'types': 1}) == ['proxy::192.168.1.1:80:0', ], sqlhelper.get_keys({'types': 1}) 125 | assert sqlhelper.select(conditions={'protocol': 0}) == [('192.168.1.1', '80', '0')] 126 | assert sqlhelper.update({'types': 1}, {'score': 888}) == 1 127 | assert sqlhelper.select() == [('192.168.1.1', '80', '888'), ('localhost', '433', '100')] 128 | # assert sqlhelper.delete({'types': 1}) == 1 129 | # sqlhelper.drop_db() 130 | print('All pass.') 131 | -------------------------------------------------------------------------------- /db/SqlHelper.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import datetime 3 | from sqlalchemy import Column, Integer, String, DateTime, Numeric, create_engine, VARCHAR 4 | from sqlalchemy.ext.declarative import declarative_base 5 | from sqlalchemy.orm import sessionmaker 6 | from config import DB_CONFIG, DEFAULT_SCORE 7 | 8 | from db.ISqlHelper import ISqlHelper 9 | 10 | ''' 11 | sql操作的基类 12 | 包括ip,端口,types类型(0高匿名,1透明),protocol(0 http,1 https http),country(国家),area(省市),updatetime(更新时间) 13 | speed(连接速度) 14 | ''' 15 | 16 | BaseModel = declarative_base() 17 | 18 | 19 | class Proxy(BaseModel): 20 | __tablename__ = 'proxys' 21 | id = Column(Integer, primary_key=True, autoincrement=True) 22 | ip = Column(VARCHAR(16), nullable=False) 23 | port = Column(Integer, nullable=False) 24 | types = Column(Integer, nullable=False) 25 | protocol = Column(Integer, nullable=False, default=0) 26 | country = Column(VARCHAR(100), nullable=False) 27 | area = Column(VARCHAR(100), nullable=False) 28 | updatetime = Column(DateTime(), default=datetime.datetime.utcnow) 29 | speed = Column(Numeric(5, 2), nullable=False) 30 | score = Column(Integer, nullable=False, default=DEFAULT_SCORE) 31 | 32 | 33 | class SqlHelper(ISqlHelper): 34 | params = {'ip': Proxy.ip, 'port': Proxy.port, 'types': Proxy.types, 'protocol': Proxy.protocol, 35 | 'country': Proxy.country, 'area': Proxy.area, 'score': Proxy.score} 36 | 37 | def __init__(self): 38 | if 'sqlite' in DB_CONFIG['DB_CONNECT_STRING']: 39 | connect_args = {'check_same_thread': False} 40 | self.engine = create_engine(DB_CONFIG['DB_CONNECT_STRING'], echo=False, connect_args=connect_args) 41 | else: 42 | self.engine = create_engine(DB_CONFIG['DB_CONNECT_STRING'], echo=False) 43 | DB_Session = sessionmaker(bind=self.engine) 44 | self.session = DB_Session() 45 | 46 | def init_db(self): 47 | BaseModel.metadata.create_all(self.engine) 48 | 49 | def drop_db(self): 50 | BaseModel.metadata.drop_all(self.engine) 51 | 52 | 53 | def insert(self, value): 54 | proxy = Proxy(ip=value['ip'], port=value['port'], types=value['types'], protocol=value['protocol'], 55 | country=value['country'], 56 | area=value['area'], speed=value['speed']) 57 | self.session.add(proxy) 58 | self.session.commit() 59 | 60 | 61 | def delete(self, 
conditions=None): 62 | if conditions: 63 | conditon_list = [] 64 | for key in list(conditions.keys()): 65 | if self.params.get(key, None): 66 | conditon_list.append(self.params.get(key) == conditions.get(key)) 67 | conditions = conditon_list 68 | query = self.session.query(Proxy) 69 | for condition in conditions: 70 | query = query.filter(condition) 71 | deleteNum = query.delete() 72 | self.session.commit() 73 | else: 74 | deleteNum = 0 75 | return ('deleteNum', deleteNum) 76 | 77 | 78 | def update(self, conditions=None, value=None): 79 | ''' 80 | conditions的格式是个字典。类似self.params 81 | :param conditions: 82 | :param value:也是个字典:{'ip':192.168.0.1} 83 | :return: 84 | ''' 85 | if conditions and value: 86 | conditon_list = [] 87 | for key in list(conditions.keys()): 88 | if self.params.get(key, None): 89 | conditon_list.append(self.params.get(key) == conditions.get(key)) 90 | conditions = conditon_list 91 | query = self.session.query(Proxy) 92 | for condition in conditions: 93 | query = query.filter(condition) 94 | updatevalue = {} 95 | for key in list(value.keys()): 96 | if self.params.get(key, None): 97 | updatevalue[self.params.get(key, None)] = value.get(key) 98 | updateNum = query.update(updatevalue) 99 | self.session.commit() 100 | else: 101 | updateNum = 0 102 | return {'updateNum': updateNum} 103 | 104 | 105 | def select(self, count=None, conditions=None): 106 | ''' 107 | conditions的格式是个字典。类似self.params 108 | :param count: 109 | :param conditions: 110 | :return: 111 | ''' 112 | if conditions: 113 | conditon_list = [] 114 | for key in list(conditions.keys()): 115 | if self.params.get(key, None): 116 | conditon_list.append(self.params.get(key) == conditions.get(key)) 117 | conditions = conditon_list 118 | else: 119 | conditions = [] 120 | 121 | query = self.session.query(Proxy.ip, Proxy.port, Proxy.score) 122 | if len(conditions) > 0 and count: 123 | for condition in conditions: 124 | query = query.filter(condition) 125 | return query.order_by(Proxy.score.desc(), Proxy.speed).limit(count).all() 126 | elif count: 127 | return query.order_by(Proxy.score.desc(), Proxy.speed).limit(count).all() 128 | elif len(conditions) > 0: 129 | for condition in conditions: 130 | query = query.filter(condition) 131 | return query.order_by(Proxy.score.desc(), Proxy.speed).all() 132 | else: 133 | return query.order_by(Proxy.score.desc(), Proxy.speed).all() 134 | 135 | 136 | def close(self): 137 | pass 138 | 139 | 140 | if __name__ == '__main__': 141 | sqlhelper = SqlHelper() 142 | sqlhelper.init_db() 143 | proxy = {'ip': '192.168.1.1', 'port': 80, 'type': 0, 'protocol': 0, 'country': '中国', 'area': '广州', 'speed': 11.123, 'types': ''} 144 | sqlhelper.insert(proxy) 145 | sqlhelper.update({'ip': '192.168.1.1', 'port': 80}, {'score': 10}) 146 | print(sqlhelper.select(1)) 147 | 148 | -------------------------------------------------------------------------------- /db/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Xaxdus' 2 | -------------------------------------------------------------------------------- /qiye2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qiyeboy/IPProxyPool/127827829535641d57a1a05e20523cb0a192170b/qiye2.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | chardet==2.3.0 2 | gevent==1.2.0 3 | greenlet==0.4.11 4 | lxml==3.7.1 5 
| requests==2.12.4 6 | SQLAlchemy==1.1.4 7 | web.py==0.38 8 | redis==2.10.5 9 | psutil 10 | -------------------------------------------------------------------------------- /spider/HtmlDownloader.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import random 4 | import config 5 | import json 6 | from db.DataStore import sqlhelper 7 | 8 | __author__ = 'qiye' 9 | 10 | import requests 11 | import chardet 12 | 13 | 14 | class Html_Downloader(object): 15 | @staticmethod 16 | def download(url): 17 | try: 18 | r = requests.get(url=url, headers=config.get_header(), timeout=config.TIMEOUT) 19 | r.encoding = chardet.detect(r.content)['encoding'] 20 | if (not r.ok) or len(r.content) < 500: 21 | raise ConnectionError 22 | else: 23 | return r.text 24 | 25 | except Exception: 26 | count = 0 # 重试次数 27 | proxylist = sqlhelper.select(10) 28 | if not proxylist: 29 | return None 30 | 31 | while count < config.RETRY_TIME: 32 | try: 33 | proxy = random.choice(proxylist) 34 | ip = proxy[0] 35 | port = proxy[1] 36 | proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)} 37 | 38 | r = requests.get(url=url, headers=config.get_header(), timeout=config.TIMEOUT, proxies=proxies) 39 | r.encoding = chardet.detect(r.content)['encoding'] 40 | if (not r.ok) or len(r.content) < 500: 41 | raise ConnectionError 42 | else: 43 | return r.text 44 | except Exception: 45 | count += 1 46 | 47 | return None 48 | -------------------------------------------------------------------------------- /spider/HtmlPraser.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import base64 3 | from config import QQWRY_PATH, CHINA_AREA 4 | from util.IPAddress import IPAddresss 5 | import re 6 | from util.compatibility import text_ 7 | 8 | __author__ = 'qiye' 9 | from lxml import etree 10 | 11 | 12 | class Html_Parser(object): 13 | def __init__(self): 14 | self.ips = IPAddresss(QQWRY_PATH) 15 | 16 | def parse(self, response, parser): 17 | ''' 18 | 19 | :param response: 响应 20 | :param type: 解析方式 21 | :return: 22 | ''' 23 | if parser['type'] == 'xpath': 24 | return self.XpathPraser(response, parser) 25 | elif parser['type'] == 'regular': 26 | return self.RegularPraser(response, parser) 27 | elif parser['type'] == 'module': 28 | return getattr(self, parser['moduleName'], None)(response, parser) 29 | else: 30 | return None 31 | 32 | def AuthCountry(self, addr): 33 | ''' 34 | 用来判断地址是哪个国家的 35 | :param addr: 36 | :return: 37 | ''' 38 | for area in CHINA_AREA: 39 | if text_(area) in addr: 40 | return True 41 | return False 42 | 43 | 44 | def XpathPraser(self, response, parser): 45 | ''' 46 | 针对xpath方式进行解析 47 | :param response: 48 | :param parser: 49 | :return: 50 | ''' 51 | proxylist = [] 52 | root = etree.HTML(response) 53 | proxys = root.xpath(parser['pattern']) 54 | for proxy in proxys: 55 | try: 56 | ip = proxy.xpath(parser['position']['ip'])[0].text 57 | port = proxy.xpath(parser['position']['port'])[0].text 58 | type = 0 59 | protocol = 0 60 | addr = self.ips.getIpAddr(self.ips.str2ip(ip)) 61 | country = text_('') 62 | area = text_('') 63 | if text_('省') in addr or self.AuthCountry(addr): 64 | country = text_('国内') 65 | area = addr 66 | else: 67 | country = text_('国外') 68 | area = addr 69 | except Exception as e: 70 | continue 71 | # updatetime = datetime.datetime.now() 72 | # ip,端口,类型(0高匿名,1透明),protocol(0 http,1 https http),country(国家),area(省市),updatetime(更新时间) 73 | 74 | # proxy 
={'ip':ip,'port':int(port),'type':int(type),'protocol':int(protocol),'country':country,'area':area,'updatetime':updatetime,'speed':100} 75 | proxy = {'ip': ip, 'port': int(port), 'types': int(type), 'protocol': int(protocol), 'country': country, 76 | 'area': area, 'speed': 100} 77 | proxylist.append(proxy) 78 | return proxylist 79 | 80 | def RegularPraser(self, response, parser): 81 | ''' 82 | 针对正则表达式进行解析 83 | :param response: 84 | :param parser: 85 | :return: 86 | ''' 87 | proxylist = [] 88 | pattern = re.compile(parser['pattern']) 89 | matchs = pattern.findall(response) 90 | if matchs != None: 91 | for match in matchs: 92 | try: 93 | ip = match[parser['position']['ip']] 94 | port = match[parser['position']['port']] 95 | # 网站的类型一直不靠谱所以还是默认,之后会检测 96 | type = 0 97 | # if parser['postion']['protocol'] > 0: 98 | # protocol = match[parser['postion']['protocol']] 99 | # if protocol.lower().find('https')!=-1: 100 | # protocol = 1 101 | # else: 102 | # protocol = 0 103 | # else: 104 | protocol = 0 105 | addr = self.ips.getIpAddr(self.ips.str2ip(ip)) 106 | country = text_('') 107 | area = text_('') 108 | # print(ip,port) 109 | if text_('省') in addr or self.AuthCountry(addr): 110 | country = text_('国内') 111 | area = addr 112 | else: 113 | country = text_('国外') 114 | area = addr 115 | except Exception as e: 116 | continue 117 | 118 | proxy = {'ip': ip, 'port': port, 'types': type, 'protocol': protocol, 'country': country, 'area': area, 119 | 'speed': 100} 120 | 121 | proxylist.append(proxy) 122 | return proxylist 123 | 124 | 125 | def CnproxyPraser(self, response, parser): 126 | proxylist = self.RegularPraser(response, parser) 127 | chardict = {'v': '3', 'm': '4', 'a': '2', 'l': '9', 'q': '0', 'b': '5', 'i': '7', 'w': '6', 'r': '8', 'c': '1'} 128 | 129 | for proxy in proxylist: 130 | port = proxy['port'] 131 | new_port = '' 132 | for i in range(len(port)): 133 | if port[i] != '+': 134 | new_port += chardict[port[i]] 135 | new_port = int(new_port) 136 | proxy['port'] = new_port 137 | return proxylist 138 | 139 | 140 | def proxy_listPraser(self, response, parser): 141 | proxylist = [] 142 | pattern = re.compile(parser['pattern']) 143 | matchs = pattern.findall(response) 144 | if matchs: 145 | for match in matchs: 146 | try: 147 | ip_port = base64.b64decode(match.replace("Proxy('", "").replace("')", "")) 148 | ip = ip_port.split(':')[0] 149 | port = ip_port.split(':')[1] 150 | type = 0 151 | protocol = 0 152 | addr = self.ips.getIpAddr(self.ips.str2ip(ip)) 153 | country = text_('') 154 | area = text_('') 155 | # print(ip,port) 156 | if text_('省') in addr or self.AuthCountry(addr): 157 | country = text_('国内') 158 | area = addr 159 | else: 160 | country = text_('国外') 161 | area = addr 162 | except Exception as e: 163 | continue 164 | proxy = {'ip': ip, 'port': int(port), 'types': type, 'protocol': protocol, 'country': country, 165 | 'area': area, 'speed': 100} 166 | proxylist.append(proxy) 167 | return proxylist 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /spider/ProxyCrawl.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from gevent import monkey 3 | monkey.patch_all() 4 | 5 | import sys 6 | import time 7 | import gevent 8 | 9 | from gevent.pool import Pool 10 | from multiprocessing import Queue, Process, Value 11 | 12 | from api.apiServer import start_api_server 13 | from config import THREADNUM, parserList, UPDATE_TIME, MINNUM, 
MAX_CHECK_CONCURRENT_PER_PROCESS, MAX_DOWNLOAD_CONCURRENT 14 | from db.DataStore import store_data, sqlhelper 15 | from spider.HtmlDownloader import Html_Downloader 16 | from spider.HtmlPraser import Html_Parser 17 | from validator.Validator import validator, getMyIP, detect_from_db 18 | 19 | ''' 20 | 这个类的作用是描述爬虫的逻辑 21 | ''' 22 | 23 | 24 | def startProxyCrawl(queue, db_proxy_num,myip): 25 | crawl = ProxyCrawl(queue, db_proxy_num,myip) 26 | crawl.run() 27 | 28 | 29 | class ProxyCrawl(object): 30 | proxies = set() 31 | 32 | def __init__(self, queue, db_proxy_num,myip): 33 | self.crawl_pool = Pool(THREADNUM) 34 | self.queue = queue 35 | self.db_proxy_num = db_proxy_num 36 | self.myip = myip 37 | 38 | 39 | def run(self): 40 | while True: 41 | self.proxies.clear() 42 | str = 'IPProxyPool----->>>>>>>>beginning' 43 | sys.stdout.write(str + "\r\n") 44 | sys.stdout.flush() 45 | proxylist = sqlhelper.select() 46 | 47 | spawns = [] 48 | for proxy in proxylist: 49 | spawns.append(gevent.spawn(detect_from_db, self.myip, proxy, self.proxies)) 50 | if len(spawns) >= MAX_CHECK_CONCURRENT_PER_PROCESS: 51 | gevent.joinall(spawns) 52 | spawns= [] 53 | gevent.joinall(spawns) 54 | self.db_proxy_num.value = len(self.proxies) 55 | str = 'IPProxyPool----->>>>>>>>db exists ip:%d' % len(self.proxies) 56 | 57 | if len(self.proxies) < MINNUM: 58 | str += '\r\nIPProxyPool----->>>>>>>>now ip num < MINNUM,start crawling...' 59 | sys.stdout.write(str + "\r\n") 60 | sys.stdout.flush() 61 | spawns = [] 62 | for p in parserList: 63 | spawns.append(gevent.spawn(self.crawl, p)) 64 | if len(spawns) >= MAX_DOWNLOAD_CONCURRENT: 65 | gevent.joinall(spawns) 66 | spawns= [] 67 | gevent.joinall(spawns) 68 | else: 69 | str += '\r\nIPProxyPool----->>>>>>>>now ip num meet the requirement,wait UPDATE_TIME...' 
70 | sys.stdout.write(str + "\r\n") 71 | sys.stdout.flush() 72 | 73 | time.sleep(UPDATE_TIME) 74 | 75 | def crawl(self, parser): 76 | html_parser = Html_Parser() 77 | for url in parser['urls']: 78 | response = Html_Downloader.download(url) 79 | if response is not None: 80 | proxylist = html_parser.parse(response, parser) 81 | if proxylist is not None: 82 | for proxy in proxylist: 83 | proxy_str = '%s:%s' % (proxy['ip'], proxy['port']) 84 | if proxy_str not in self.proxies: 85 | self.proxies.add(proxy_str) 86 | while True: 87 | if self.queue.full(): 88 | time.sleep(0.1) 89 | else: 90 | self.queue.put(proxy) 91 | break 92 | 93 | 94 | if __name__ == "__main__": 95 | DB_PROXY_NUM = Value('i', 0) 96 | q1 = Queue() 97 | q2 = Queue() 98 | p0 = Process(target=start_api_server) 99 | p1 = Process(target=startProxyCrawl, args=(q1, DB_PROXY_NUM)) 100 | p2 = Process(target=validator, args=(q1, q2)) 101 | p3 = Process(target=store_data, args=(q2, DB_PROXY_NUM)) 102 | 103 | p0.start() 104 | p1.start() 105 | p2.start() 106 | p3.start() 107 | 108 | # spider = ProxyCrawl() 109 | # spider.run() 110 | -------------------------------------------------------------------------------- /spider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Xaxdus' 2 | -------------------------------------------------------------------------------- /start.bat: -------------------------------------------------------------------------------- 1 | py -2 IPProxy.py -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Xaxdus' 2 | -------------------------------------------------------------------------------- /test/test.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import requests 3 | import json 4 | 5 | r = requests.get('http://127.0.0.1:8000/?types=0&count=5&country=中国') 6 | ip_ports = json.loads(r.text) 7 | print(ip_ports) 8 | ip = ip_ports[0][0] 9 | port = ip_ports[0][1] 10 | proxies = { 11 | 'http': 'http://%s:%s' % (ip, port), 12 | 'https': 'http://%s:%s' % (ip, port) 13 | } 14 | r = requests.get('http://www.baidu.com', proxies=proxies) 15 | r.encoding = 'utf-8' 16 | print(r.text) 17 | -------------------------------------------------------------------------------- /test/testIPAddress.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | 5 | import socket 6 | import struct 7 | 8 | import logging 9 | 10 | 11 | logger = logging.getLogger('util') 12 | 13 | 14 | class IPAddresss: 15 | def __init__(self, ipdbFile): 16 | self.ipdb = open(ipdbFile, "rb") 17 | str = self.ipdb.read(8) 18 | (self.firstIndex, self.lastIndex) = struct.unpack('II', str) 19 | self.indexCount = int((self.lastIndex - self.firstIndex) / 7 + 1) 20 | # print self.getVersion(), u" 纪录总数: %d 条 "%(self.indexCount) 21 | 22 | def getVersion(self): 23 | s = self.getIpAddr(0xffffff00) 24 | return s 25 | 26 | def getAreaAddr(self, offset=0): 27 | if offset: 28 | self.ipdb.seek(offset) 29 | str = self.ipdb.read(1) 30 | (byte,) = struct.unpack('B', str) 31 | if byte == 0x01 or byte == 0x02: 32 | p = self.getLong3() 33 | if p: 34 | return self.getString(p) 35 | else: 36 | return "" 37 | else: 38 | self.ipdb.seek(-1, 1) 39 | return self.getString(offset) 40 | 41 | def getAddr(self, offset, ip=0): 42 | self.ipdb.seek(offset + 4) 43 | countryAddr = "" 44 | areaAddr = "" 45 | str = self.ipdb.read(1) 46 | (byte,) = struct.unpack('B', str) 47 | if byte == 0x01: 48 | countryOffset = self.getLong3() 49 | self.ipdb.seek(countryOffset) 50 | str = self.ipdb.read(1) 51 | (b,) = struct.unpack('B', str) 52 | if b == 0x02: 53 | countryAddr = self.getString(self.getLong3()) 54 | self.ipdb.seek(countryOffset + 4) 55 | else: 56 | countryAddr = self.getString(countryOffset) 57 | areaAddr = self.getAreaAddr() 58 | elif byte == 0x02: 59 | countryAddr = self.getString(self.getLong3()) 60 | areaAddr = self.getAreaAddr(offset + 8) 61 | else: 62 | countryAddr = self.getString(offset + 4) 63 | areaAddr = self.getAreaAddr() 64 | return countryAddr + " " + areaAddr 65 | 66 | def dump(self, first, last): 67 | if last > self.indexCount: 68 | last = self.indexCount 69 | for index in range(first, last): 70 | offset = self.firstIndex + index * 7 71 | self.ipdb.seek(offset) 72 | buf = self.ipdb.read(7) 73 | (ip, of1, of2) = struct.unpack("IHB", buf) 74 | address = self.getAddr(of1 + (of2 << 16)) 75 | # 把GBK转为utf-8 76 | address = str(address, 'gbk').encode("utf-8") 77 | logger.info("%d %s %s" % (index, self.ip2str(ip), address)) 78 | 79 | def setIpRange(self, index): 80 | offset = self.firstIndex + index * 7 81 | self.ipdb.seek(offset) 82 | buf = self.ipdb.read(7) 83 | (self.curStartIp, of1, of2) = struct.unpack("IHB", buf) 84 | self.curEndIpOffset = of1 + (of2 << 16) 85 | self.ipdb.seek(self.curEndIpOffset) 86 | buf = self.ipdb.read(4) 87 | (self.curEndIp,) = struct.unpack("I", buf) 88 | 89 | def getIpAddr(self, ip): 90 | L = 0 91 | R = self.indexCount - 1 92 | while L < R - 1: 93 | M = int((L + R) / 2) 94 | self.setIpRange(M) 95 | if ip == self.curStartIp: 96 | L = M 97 | break 98 | if ip > self.curStartIp: 99 | L = M 100 | else: 101 | R = M 102 | self.setIpRange(L) 103 | # version information, 255.255.255.X, urgy but useful 104 | if ip & 0xffffff00 == 0xffffff00: 105 | self.setIpRange(R) 106 | if self.curStartIp <= ip <= self.curEndIp: 107 | address = self.getAddr(self.curEndIpOffset) 108 | # 把GBK转为utf-8 109 | address = str(address) 110 | else: 111 | address = "未找到该IP的地址" 112 | return address 113 | 114 | def getIpRange(self, ip): 115 | self.getIpAddr(ip) 116 | range = self.ip2str(self.curStartIp) + ' - ' \ 117 | + self.ip2str(self.curEndIp) 118 | return range 119 | 120 | def getString(self, offset=0): 121 | if offset: 122 | self.ipdb.seek(offset) 123 | str = b'' 124 | ch = self.ipdb.read(1) 125 | (byte,) = struct.unpack('B', ch) 126 | 
while byte != 0: 127 | str += ch 128 | ch = self.ipdb.read(1) 129 | (byte,) = struct.unpack('B', ch) 130 | return str.decode('gbk') 131 | 132 | def ip2str(self, ip): 133 | return str(ip >> 24) + '.' + str((ip >> 16) & 0xff) + '.' + str((ip >> 8) & 0xff) + '.' + str(ip & 0xff) 134 | 135 | def str2ip(self, s): 136 | (ip,) = struct.unpack('I', socket.inet_aton(s)) 137 | return ((ip >> 24) & 0xff) | ((ip & 0xff) << 24) | ((ip >> 8) & 0xff00) | ((ip & 0xff00) << 8) 138 | 139 | def getLong3(self, offset=0): 140 | if offset: 141 | self.ipdb.seek(offset) 142 | str = self.ipdb.read(3) 143 | (a, b) = struct.unpack('HB', str) 144 | return (b << 16) + a 145 | 146 | 147 | QQWRY_PATH = os.path.dirname(__file__) + "/../data/qqwry.dat" 148 | ips = IPAddresss(QQWRY_PATH) 149 | addr = ips.getIpAddr(ips.str2ip('183.61.236.53')) 150 | print(addr) 151 |
-------------------------------------------------------------------------------- /test/testIPType.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from lxml import etree 3 | import requests 4 | import config 5 | 6 | 7 | def checkProxyType(selfip, proxies): 8 | ''' 9 | 用来检测代理的类型,突然发现,免费网站写的信息不靠谱,还是要自己检测代理的类型 10 | :param proxies: 代理(0 高匿,1 匿名,2 透明 3 无效代理 11 | :return: 12 | ''' 13 | 14 | try: 15 | r = requests.get(url='https://incloak.com/ip/', headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies) 16 | print(r.text) 17 | 18 | # if r.ok: 19 | # root = etree.HTML(r.text) 20 | # ip = root.xpath('.//center[2]/table/tr[3]/td[2]')[0].text 21 | # http_x_forwared_for = root.xpath('.//center[2]/table/tr[8]/td[2]')[0].text 22 | # http_via = root.xpath('.//center[2]/table/tr[9]/td[2]')[0].text 23 | # # print ip,http_x_forwared_for,http_via,type(http_via),type(http_x_forwared_for) 24 | # if ip==selfip: 25 | # return 3 26 | # if http_x_forwared_for is None and http_via is None: 27 | # return 0 28 | # if http_via != None and http_x_forwared_for.find(selfip)== -1: 29 | # return 1 30 | # 31 | # if http_via != None and http_x_forwared_for.find(selfip)!= -1: 32 | # return 2 33 | # return 3 34 | 35 | 36 | except Exception as e: 37 | print(str(e)) 38 | return 3 39 | 40 | 41 | 42 | if __name__ == '__main__': 43 | ip = '61.132.241.109' 44 | port = '808' 45 | proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)} 46 | checkProxyType(None, proxies)
-------------------------------------------------------------------------------- /test/testbase64.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import base64 3 | import re 4 | 5 | str = ''' 6 | 7 | ''' 8 | match = re.search('Proxy\(.+\)', str) 9 | print(match.group()) 10 | 11 | ip_port = base64.b64decode(match.group().replace("Proxy('", "").replace("')", "")) 12 | print(ip_port) 13 | 14 |
-------------------------------------------------------------------------------- /test/testhttpserver.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import BaseHTTPServer 3 | import json 4 | import urlparse 5 | 6 | 7 | class WebRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler): 8 | def do_GET(self): 9 | """ 10 | """ 11 | print(self.path) 12 | 13 | parsed_path = urlparse.urlparse(self.path) 14 | print(parsed_path) 15 | 16 | print(parsed_path.query) 17 | 18 | # message_parts = [ 19 | # 'CLIENT VALUES:', 20 | # 'client_address=%s (%s)' % (self.client_address, 21 | # self.address_string()), 22 | # 'command=%s' % self.command, 23 | # 'path=%s' % self.path, 24 | # 'real path=%s' % parsed_path.path, 25 | # 'query=%s' % parsed_path.query, 26 | # 'request_version=%s' % self.request_version, 27 | # '', 28 | # 'SERVER VALUES:', 29 | # 'server_version=%s' % self.server_version, 30 | # 'sys_version=%s' % self.sys_version, 31 | # 'protocol_version=%s' % self.protocol_version, 32 | # '', 33 | # 'HEADERS RECEIVED:', 34 | # ] 35 | # for name, value in sorted(self.headers.items()): 36 | # message_parts.append('%s=%s' % (name, value.rstrip())) 37 | # message_parts.append('') 38 | # message = '\r\n'.join(message_parts) 39 | data1 = [{'ip': '192.168.0.0', 'port': 456}] * 10 40 | d1 = json.dumps(data1, sort_keys=True, indent=4) 41 | message = ('192.168.1.1', 80) 42 | self.send_response(200) 43 | self.end_headers() 44 | self.wfile.write(d1) 45 | 46 | 47 | server = BaseHTTPServer.HTTPServer(('0.0.0.0', 8000), WebRequestHandler) 48 | server.serve_forever()
-------------------------------------------------------------------------------- /test/testlist.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from decimal import Decimal 3 | 4 | __author__ = 'Xaxdus' 5 | 6 | 7 | # list = ["www.baidu.com/%s" %m for m in ['index']+range(1,5)] 8 | # 9 | # list = [(1,10)]*10 10 | # 11 | # for m,n in list: 12 | # print m,n 13 | # 14 | # 15 | # list2 = ["www.baidu.com/%s/%s"%(i[0],i[1]) for i in list] 16 | # print list2 17 | 18 | # x=Decimal('0.998531571219').quantize(Decimal('0.00')) 19 | # a= 0.998531571219 20 | # value = round(a, 3) 21 | # print x,type(x),value 22 | # proxys=[] 23 | # proxy=[123,1234] 24 | # proxys.append(proxy) 25 | # 26 | # proxy=[123,1234] 27 | # proxys.append(proxy) 28 | # 29 | # print proxys 30 | # l = [{'ip':'123.1.1.1','port':80},{'ip':'123.1.1.1','port':80},{'ip':'123.1.2.1','port':80},{'ip':'123.1.1.1','port':81}] 31 | # 32 | # # for d in l: 33 | # # print [tuple(d.items())] 34 | # print [tuple(d.items()) for d in l] 35 | # 36 | # print [dict(t) for t in set([tuple(d.items()) for d in l])] 37 | import requests 38 | 39 | r = requests.get('http://127.0.0.1:8000/delete?ip=120.92.3.127') 40 | print(r.text) 41 |
-------------------------------------------------------------------------------- /test/testlxml.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from lxml import etree 3 | 4 | __author__ = 'Xaxdus' 5 | 6 | html = ''' 7 | 北京http代理ip_66免费代理ip提取网 8 | <!-- the body of this sample page (a proxy-list table under id="footer") was stripped when this dump was generated; only the page title survives above -->
165 | ''' 166 | 167 | root = etree.HTML(html) 168 | proxys = root.xpath(".//*[@id='footer']/div/table/tr[position()>1]") 169 | 170 | for proxy in proxys: 171 | print(proxy.xpath('./td[1]')[0].text) 172 |
-------------------------------------------------------------------------------- /test/testqueue.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from multiprocessing import Queue 3 | 4 | try: 5 | q = Queue() 6 | q.get(timeout=5) 7 | except BaseException as e: 8 | print('--' + str(e)) 9 | 10 | 11 |
-------------------------------------------------------------------------------- /test/testsql.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from db.SqlHelper import SqlHelper 3 | from util.exception import Con_DB_Fail 4 | 5 | try: 6 | sqlhelper = SqlHelper() 7 | sqlhelper.init_db() 8 | except Exception: 9 | raise Con_DB_Fail 10 | 11 | proxy = {'ip': '192.168.1.1', 'port': int('80'), 'type': 0, 'protocol': 0, 'country': u'中国', 'area': u'四川', 'speed': 0} 12 | sqlhelper.insert(proxy)
-------------------------------------------------------------------------------- /util/IPAddress.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | 6 | import socket 7 | import struct 8 | 9 | import logging 10 | from util.compatibility import text_ 11 | 12 | logger = logging.getLogger('util') 13 | 14 | 15 | class IPAddresss: 16 | def __init__(self, ipdbFile): 17 | self.ipdb = open(ipdbFile, "rb") 18 | str = self.ipdb.read(8) 19 | (self.firstIndex, self.lastIndex) = struct.unpack('II', str) 20 | self.indexCount = int((self.lastIndex - self.firstIndex) / 7 + 1) 21 | # print self.getVersion(), u" 纪录总数: %d 条 "%(self.indexCount) 22 | 23 | def getVersion(self): 24 | s = self.getIpAddr(0xffffff00) 25 | return s 26 | 27 | def getAreaAddr(self, offset=0): 28 | if offset: 29 | self.ipdb.seek(offset) 30 | str = self.ipdb.read(1) 31 | (byte,) = struct.unpack('B', str) 32 | if byte == 0x01 or byte == 0x02: 33 | p = self.getLong3() 34 | if p: 35 | return self.getString(p) 36 | else: 37 | return "" 38 | else: 39 | self.ipdb.seek(-1, 1) 40 | return self.getString(offset) 41 | 42 | def getAddr(self, offset, ip=0): 43 | self.ipdb.seek(offset + 4) 44 | countryAddr = text_("") 45 | areaAddr = text_("") 46 | str = self.ipdb.read(1) 47 | (byte,) = struct.unpack('B', str) 48 | if byte == 0x01: 49 | countryOffset = self.getLong3() 50 | self.ipdb.seek(countryOffset) 51 | str = self.ipdb.read(1) 52 | (b,) = struct.unpack('B', str) 53 | if b == 0x02: 54 | countryAddr = self.getString(self.getLong3()) 55 | self.ipdb.seek(countryOffset + 4) 56 | else: 57 | countryAddr = self.getString(countryOffset) 58 | areaAddr = self.getAreaAddr() 59 | elif byte == 0x02: 60 | countryAddr = self.getString(self.getLong3()) 61 | areaAddr = self.getAreaAddr(offset + 8) 62 | else: 63 | countryAddr = self.getString(offset + 4) 64 | areaAddr = self.getAreaAddr() 65 | return countryAddr + text_(" ") + areaAddr 66 | 67 | def dump(self, first, last): 68 | if last > self.indexCount: 69 | last = self.indexCount 70 | for index in range(first, last): 71 | offset = self.firstIndex + index * 7 72 | self.ipdb.seek(offset) 73 | buf = self.ipdb.read(7) 74 | (ip, of1, of2) = struct.unpack("IHB", buf) 75 | address = self.getAddr(of1 + (of2 << 16)) 76 | # 把GBK转为utf-8 77 | address = text_(address, 'gbk').encode("utf-8") 78 | logger.info("%d %s
%s" % (index, self.ip2str(ip), address)) 79 | 80 | def setIpRange(self, index): 81 | offset = self.firstIndex + index * 7 82 | self.ipdb.seek(offset) 83 | buf = self.ipdb.read(7) 84 | (self.curStartIp, of1, of2) = struct.unpack("IHB", buf) 85 | self.curEndIpOffset = of1 + (of2 << 16) 86 | self.ipdb.seek(self.curEndIpOffset) 87 | buf = self.ipdb.read(4) 88 | (self.curEndIp,) = struct.unpack("I", buf) 89 | 90 | def getIpAddr(self, ip): 91 | L = 0 92 | R = self.indexCount - 1 93 | while L < R - 1: 94 | M = int((L + R) / 2) 95 | self.setIpRange(M) 96 | if ip == self.curStartIp: 97 | L = M 98 | break 99 | if ip > self.curStartIp: 100 | L = M 101 | else: 102 | R = M 103 | self.setIpRange(L) 104 | # version information, 255.255.255.X, urgy but useful 105 | if ip & 0xffffff00 == 0xffffff00: 106 | self.setIpRange(R) 107 | if self.curStartIp <= ip <= self.curEndIp: 108 | address = self.getAddr(self.curEndIpOffset) 109 | # 把GBK转为utf-8 110 | address = text_(address) 111 | else: 112 | address = text_("未找到该IP的地址") 113 | return address 114 | 115 | def getIpRange(self, ip): 116 | self.getIpAddr(ip) 117 | range = self.ip2str(self.curStartIp) + ' - ' \ 118 | + self.ip2str(self.curEndIp) 119 | return range 120 | 121 | def getString(self, offset=0): 122 | if offset: 123 | self.ipdb.seek(offset) 124 | str = b'' 125 | ch = self.ipdb.read(1) 126 | (byte,) = struct.unpack('B', ch) 127 | while byte != 0: 128 | str += ch 129 | ch = self.ipdb.read(1) 130 | (byte,) = struct.unpack('B', ch) 131 | return str.decode('gbk') 132 | 133 | def ip2str(self, ip): 134 | return str(ip >> 24) + '.' + str((ip >> 16) & 0xff) + '.' + str((ip >> 8) & 0xff) + '.' + str(ip & 0xff) 135 | 136 | def str2ip(self, s): 137 | (ip,) = struct.unpack('I', socket.inet_aton(s)) 138 | return ((ip >> 24) & 0xff) | ((ip & 0xff) << 24) | ((ip >> 8) & 0xff00) | ((ip & 0xff00) << 8) 139 | 140 | def getLong3(self, offset=0): 141 | if offset: 142 | self.ipdb.seek(offset) 143 | str = self.ipdb.read(3) 144 | (a, b) = struct.unpack('HB', str) 145 | return (b << 16) + a 146 | 147 | 148 | 149 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Xaxdus' 2 | 3 | -------------------------------------------------------------------------------- /util/compatibility.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import sys 3 | 4 | PY3 = sys.version_info[0] == 3 5 | if PY3: 6 | text_type = str 7 | binary_type = bytes 8 | else: 9 | text_type = unicode 10 | binary_type = str 11 | 12 | 13 | def text_(s, encoding='utf-8', errors='strict'): 14 | if isinstance(s, binary_type): 15 | return s.decode(encoding, errors) 16 | return s 17 | 18 | 19 | def bytes_(s, encoding='utf-8', errors='strict'): 20 | if isinstance(s, text_type): 21 | return s.encode(encoding, errors) 22 | return s 23 | -------------------------------------------------------------------------------- /util/exception.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import config 3 | 4 | 5 | class Test_URL_Fail(Exception): 6 | def __str__(self): 7 | str = "访问%s失败,请检查网络连接" % config.TEST_IP 8 | return str 9 | 10 | 11 | class Con_DB_Fail(Exception): 12 | def __str__(self): 13 | str = "使用DB_CONNECT_STRING:%s--连接数据库失败" % config.DB_CONNECT_STRING 14 | return str 15 | -------------------------------------------------------------------------------- 
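The Test_URL_Fail and Con_DB_Fail classes above build their message in __str__ from values in config, so callers raise them bare and the details (TEST_IP, DB_CONNECT_STRING) are filled in only when the error is printed. A minimal usage sketch, mirroring the pattern in test/testsql.py and validator/Validator.py; SqlHelper and the config values are assumed to be set up as in the rest of the project:

from db.SqlHelper import SqlHelper
from util.exception import Con_DB_Fail


def open_db():
    # init_db() can fail for many reasons (bad DB_CONNECT_STRING, server not running);
    # re-raising the project's own exception keeps the error message uniform.
    try:
        sqlhelper = SqlHelper()
        sqlhelper.init_db()
        return sqlhelper
    except Exception:
        raise Con_DB_Fail


if __name__ == '__main__':
    try:
        open_db()
    except Con_DB_Fail as e:
        print(e)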
/util/logger.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import logging 3 | 4 | __author__ = 'qiye' 5 | 6 | logger = logging.getLogger() 7 | 8 | 9 | def logger_proxy(proxy): 10 | logger.setLevel(logging.INFO) 11 | logger.info(proxy) 12 | -------------------------------------------------------------------------------- /validator/Validator.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import sys 3 | 4 | import chardet 5 | from gevent import monkey 6 | monkey.patch_all() 7 | 8 | import json 9 | import os 10 | import gevent 11 | import requests 12 | import time 13 | import psutil 14 | from multiprocessing import Process, Queue 15 | 16 | import config 17 | from db.DataStore import sqlhelper 18 | from util.exception import Test_URL_Fail 19 | 20 | 21 | def detect_from_db(myip, proxy, proxies_set): 22 | proxy_dict = {'ip': proxy[0], 'port': proxy[1]} 23 | result = detect_proxy(myip, proxy_dict) 24 | if result: 25 | proxy_str = '%s:%s' % (proxy[0], proxy[1]) 26 | proxies_set.add(proxy_str) 27 | 28 | else: 29 | if proxy[2] < 1: 30 | sqlhelper.delete({'ip': proxy[0], 'port': proxy[1]}) 31 | else: 32 | score = proxy[2]-1 33 | sqlhelper.update({'ip': proxy[0], 'port': proxy[1]}, {'score': score}) 34 | proxy_str = '%s:%s' % (proxy[0], proxy[1]) 35 | proxies_set.add(proxy_str) 36 | 37 | 38 | 39 | def validator(queue1, queue2, myip): 40 | tasklist = [] 41 | proc_pool = {} # 所有进程列表 42 | cntl_q = Queue() # 控制信息队列 43 | while True: 44 | if not cntl_q.empty(): 45 | # 处理已结束的进程 46 | try: 47 | pid = cntl_q.get() 48 | proc = proc_pool.pop(pid) 49 | proc_ps = psutil.Process(pid) 50 | proc_ps.kill() 51 | proc_ps.wait() 52 | except Exception as e: 53 | pass 54 | # print(e) 55 | # print(" we are unable to kill pid:%s" % (pid)) 56 | try: 57 | # proxy_dict = {'source':'crawl','data':proxy} 58 | if len(proc_pool) >= config.MAX_CHECK_PROCESS: 59 | time.sleep(config.CHECK_WATI_TIME) 60 | continue 61 | proxy = queue1.get() 62 | tasklist.append(proxy) 63 | if len(tasklist) >= config.MAX_CHECK_CONCURRENT_PER_PROCESS: 64 | p = Process(target=process_start, args=(tasklist, myip, queue2, cntl_q)) 65 | p.start() 66 | proc_pool[p.pid] = p 67 | tasklist = [] 68 | 69 | except Exception as e: 70 | if len(tasklist) > 0: 71 | p = Process(target=process_start, args=(tasklist, myip, queue2, cntl_q)) 72 | p.start() 73 | proc_pool[p.pid] = p 74 | tasklist = [] 75 | 76 | def process_start(tasks, myip, queue2, cntl): 77 | spawns = [] 78 | for task in tasks: 79 | spawns.append(gevent.spawn(detect_proxy, myip, task, queue2)) 80 | gevent.joinall(spawns) 81 | cntl.put(os.getpid()) # 子进程退出是加入控制队列 82 | 83 | 84 | def detect_proxy(selfip, proxy, queue2=None): 85 | ''' 86 | :param proxy: ip字典 87 | :return: 88 | ''' 89 | ip = proxy['ip'] 90 | port = proxy['port'] 91 | proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)} 92 | protocol, types, speed = getattr(sys.modules[__name__],config.CHECK_PROXY['function'])(selfip, proxies)#checkProxy(selfip, proxies) 93 | if protocol >= 0: 94 | proxy['protocol'] = protocol 95 | proxy['types'] = types 96 | proxy['speed'] = speed 97 | else: 98 | proxy = None 99 | if queue2: 100 | queue2.put(proxy) 101 | return proxy 102 | 103 | 104 | def checkProxy(selfip, proxies): 105 | ''' 106 | 用来检测代理的类型,突然发现,免费网站写的信息不靠谱,还是要自己检测代理的类型 107 | :param 108 | :return: 109 | ''' 110 | protocol = -1 111 | types = -1 112 | speed = -1 113 | http, http_types, http_speed = 
_checkHttpProxy(selfip, proxies) 114 | https, https_types, https_speed = _checkHttpProxy(selfip, proxies, False) 115 | if http and https: 116 | protocol = 2 117 | types = http_types 118 | speed = http_speed 119 | elif http: 120 | types = http_types 121 | protocol = 0 122 | speed = http_speed 123 | elif https: 124 | types = https_types 125 | protocol = 1 126 | speed = https_speed 127 | else: 128 | types = -1 129 | protocol = -1 130 | speed = -1 131 | return protocol, types, speed 132 | 133 | 134 | def _checkHttpProxy(selfip, proxies, isHttp=True): 135 | types = -1 136 | speed = -1 137 | if isHttp: 138 | test_url = config.TEST_HTTP_HEADER 139 | else: 140 | test_url = config.TEST_HTTPS_HEADER 141 | try: 142 | start = time.time() 143 | r = requests.get(url=test_url, headers=config.get_header(), timeout=config.TIMEOUT, proxies=proxies) 144 | if r.ok: 145 | speed = round(time.time() - start, 2) 146 | content = json.loads(r.text) 147 | headers = content['headers'] 148 | ip = content['origin'] 149 | proxy_connection = headers.get('Proxy-Connection', None) 150 | if ',' in ip: 151 | types = 2 152 | elif proxy_connection: 153 | types = 1 154 | else: 155 | types = 0 156 | 157 | return True, types, speed 158 | else: 159 | return False, types, speed 160 | except Exception as e: 161 | return False, types, speed 162 | 163 | 164 | def baidu_check(selfip, proxies): 165 | ''' 166 | 用来检测代理的类型,突然发现,免费网站写的信息不靠谱,还是要自己检测代理的类型 167 | :param 168 | :return: 169 | ''' 170 | protocol = -1 171 | types = -1 172 | speed = -1 173 | # try: 174 | # #http://ip.chinaz.com/getip.aspx挺稳定,可以用来检测ip 175 | # r = requests.get(url=config.TEST_URL, headers=config.get_header(), timeout=config.TIMEOUT, 176 | # proxies=proxies) 177 | # r.encoding = chardet.detect(r.content)['encoding'] 178 | # if r.ok: 179 | # if r.text.find(selfip)>0: 180 | # return protocol, types, speed 181 | # else: 182 | # return protocol,types,speed 183 | # 184 | # 185 | # except Exception as e: 186 | # return protocol, types, speed 187 | try: 188 | start = time.time() 189 | r = requests.get(url='https://www.baidu.com', headers=config.get_header(), timeout=config.TIMEOUT, proxies=proxies) 190 | r.encoding = chardet.detect(r.content)['encoding'] 191 | if r.ok: 192 | speed = round(time.time() - start, 2) 193 | protocol= 0 194 | types=0 195 | 196 | else: 197 | speed = -1 198 | protocol= -1 199 | types=-1 200 | except Exception as e: 201 | speed = -1 202 | protocol = -1 203 | types = -1 204 | return protocol, types, speed 205 | 206 | def getMyIP(): 207 | try: 208 | r = requests.get(url=config.TEST_IP, headers=config.get_header(), timeout=config.TIMEOUT) 209 | ip = json.loads(r.text) 210 | return ip['origin'] 211 | except Exception as e: 212 | raise Test_URL_Fail 213 | 214 | 215 | if __name__ == '__main__': 216 | ip = '222.186.161.132' 217 | port = 3128 218 | proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)} 219 | _checkHttpProxy(None,proxies) 220 | # getMyIP() 221 | # str="{ip:'61.150.43.121',address:'陕西省西安市 西安电子科技大学'}" 222 | # j = json.dumps(str) 223 | # str = j['ip'] 224 | # print str 225 | -------------------------------------------------------------------------------- /validator/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Xaxdus' 2 | --------------------------------------------------------------------------------
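For a quick manual check of a single proxy, the validator above can also be called directly, outside the multiprocessing pipeline. A minimal sketch, assuming config.py provides TEST_IP, TEST_HTTP_HEADER, TEST_HTTPS_HEADER, TIMEOUT, get_header() and CHECK_PROXY = {'function': 'checkProxy'}, which is what the code in validator/Validator.py expects:

from validator.Validator import getMyIP, detect_proxy

if __name__ == '__main__':
    myip = getMyIP()  # our own outgoing IP (detect_proxy's first argument)
    # sample address taken from the __main__ block of Validator.py above
    candidate = {'ip': '222.186.161.132', 'port': 3128}
    result = detect_proxy(myip, candidate)
    if result is None:
        print('proxy failed both the HTTP and the HTTPS check')
    else:
        # protocol: 0 = HTTP only, 1 = HTTPS only, 2 = both
        # types:    0 = elite (high anonymity), 1 = anonymous, 2 = transparent
        print('protocol=%s types=%s speed=%ss' %
              (result['protocol'], result['types'], result['speed']))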