├── .gitignore ├── IPProxys.py ├── README.md ├── api ├── __init__.py └── apiServer.py ├── config.py ├── data ├── proxy.db └── qqwry.dat ├── db ├── SQLiteHelper.py ├── SqlHelper.py └── __init__.py ├── logging.conf ├── qiye2.jpg ├── requirements.txt ├── spider ├── HtmlDownLoader.py ├── HtmlPraser.py ├── ProxySpider.py └── __init__.py ├── start.bat ├── test ├── __init__.py ├── test.py ├── testhttpserver.py ├── testlist.py └── testlxml.py ├── util ├── IPAddress.py ├── __init__.py └── logger.py └── validator ├── Validator.py └── __init__.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | .idea/ -------------------------------------------------------------------------------- /IPProxys.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import BaseHTTPServer 3 | import threading 4 | import logging 5 | import logging.config 6 | 7 | from api.apiServer import WebRequestHandler 8 | from config import API_PORT 9 | from db.SQLiteHelper import SqliteHelper 10 | from spider.ProxySpider import ProxySpider 11 | import sys 12 | reload(sys) 13 | sys.setdefaultencoding('utf8') 14 | logging.config.fileConfig('logging.conf') 15 | 16 | class IPProxys(object): 17 | 18 | def startApiServer(self): 19 | ''' 20 | 启动api服务器 21 | :return: 22 | ''' 23 | logging.info('Start server @ %s:%s' %('0.0.0.0',API_PORT)) 24 | server = BaseHTTPServer.HTTPServer(('0.0.0.0',API_PORT), WebRequestHandler) 25 | server.serve_forever() 26 | 27 | def startSpider(self): 28 | logging.info('Start Spider') 29 | spider = ProxySpider() 30 | spider.run() 31 | 32 | if __name__=="__main__": 33 | 34 | proxys = IPProxys() 35 | 36 | apiServer = threading.Thread(target=proxys.startApiServer) 37 | spider = threading.Thread(target=proxys.startSpider) 38 | apiServer.start() 39 | spider.start() 40 | 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # IPProxys 2 | 
IPProxys is a proxy pool project that provides proxy IPs. It is developed with Python 2.7.x. 3 |
 4 | For detailed usage instructions, see my blog post: 5 | http://www.cnblogs.com/qiyeboy/p/5693128.html 6 |
 7 | My WeChat official account: 8 |
9 | ![](qiye2.jpg) 10 |
 11 | Suggestions for additional proxy websites are welcome; the number of good proxy IPs currently being crawled is still too small. 12 | 13 |
 14 | Thanks also to [super1-chen](https://github.com/super1-chen) for contributing to the project. 15 |
 16 | ## Dependencies 17 | #### On Ubuntu/Debian 18 |
 19 | Install the SQLite database (usually bundled with the system): 20 | apt-get install sqlite3 21 |
 22 | Install the requests library: 23 | pip install requests 24 |
 25 | Install the chardet library: 26 | pip install chardet 27 |
 28 | Install lxml: 29 | apt-get install python-lxml 30 |
 31 | Install the gevent library: 32 | pip install gevent 33 | ###### (An outdated gevent version can make the program exit unexpectedly; run pip install gevent --upgrade to update.) 34 |
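If pip is available, all four Python libraries (requests, chardet, lxml, gevent) can also be installed in a single command: pip install requests chardet lxml gevent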
 35 | #### On Windows 36 | Download [sqlite](http://www.sqlite.org/download.html) and add its directory to the PATH environment variable. 37 |
 38 | Install the requests library: 39 | pip install requests 40 |
 41 | Install the chardet library: 42 | pip install chardet 43 |
 44 | Install lxml: 45 | pip install lxml, or download the [lxml Windows build](https://pypi.python.org/pypi/lxml/) 46 |
 47 | Install the gevent library: 48 | pip install gevent 49 | ###### (An outdated gevent version can make the program exit unexpectedly; run pip install gevent --upgrade to update.) 50 | ## How to use 51 | 52 | Clone the project into the current directory 53 | 54 | $ git clone 55 | 56 | Change into the project directory 57 | 58 | ``` 59 | $ cd IPProxys 60 | ``` 61 | 62 | Run the script 63 | 64 | ``` 65 | python IPProxys.py 66 | ``` 67 | 68 | ## API usage 69 | 70 | #### Endpoint 71 | ``` 72 | GET / 73 | ``` 74 | 75 | #### Parameters 76 | 77 | 78 | | Name | Type | Description | 79 | | ----| ---- | ---- | 80 | | types | int | 0: high-anonymity proxy, 1: transparent | 81 | | protocol | int | 0: http, 1: https | 82 | | count | int | number of proxies to return | 83 | | country | str | country | 84 | | area | str | province/region | 85 | 86 | 87 | 88 | #### Example 89 | ##### IPProxys listens on port 8000 by default. 90 | ##### When testing on the local machine: 91 | 1. Fetch 5 high-anonymity proxies located in China: http://127.0.0.1:8000/?types=0&count=5&country=中国 92 |
 93 | 2. The response is JSON, sorted from fastest to slowest: 94 |
95 | [{"ip": "220.160.22.115", "port": 80}, {"ip": "183.129.151.130", "port": 80}, {"ip": "59.52.243.88", "port": 80}, {"ip": "112.228.35.24", "port": 8888}, {"ip": "106.75.176.4", "port": 80}] 96 |
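3. The API can also drop a proxy that has stopped working: a request carrying a delete parameter (handled in api/apiServer.py) removes the given ip/port from the pool and returns a confirmation message, e.g. http://127.0.0.1:8000/?delete=1&ip=220.160.22.115&port=80
4. Fetching a proxy and using it from Python: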
 97 | ``` 98 | import requests 99 | import json 100 | r = requests.get('http://127.0.0.1:8000/?types=0&count=5&country=中国') 101 | ip_ports = json.loads(r.text) 102 | print ip_ports 103 | ip = ip_ports[0]['ip'] 104 | port = ip_ports[0]['port'] 105 | proxies={ 106 | 'http':'http://%s:%s'%(ip,port), 107 | 'https':'http://%s:%s'%(ip,port) 108 | } 109 | r = requests.get('http://ip.chinaz.com/',proxies=proxies) 110 | r.encoding='utf-8' 111 | print r.text 112 | ``` 113 | ## TODO 114 | 1. Add support for Python 3.x 115 |
 116 | 2. Optionally add a squid reverse proxy in front of the pool to simplify crawler configuration 117 |
 118 | 3. Refactor the HTTP API 119 |
 120 | 4. Add more proxy websites and database adapters 121 | ## Changelog 122 | -----------------------------2016-11-24---------------------------- 123 |
 124 | 1. Added chardet for detecting page encodings 125 |
 126 | 2. Worked around the anti-crawling restrictions on 66ip.cn 127 |
128 | -----------------------------2016-10-27---------------------------- 129 |
 130 | 1. Added proxy validation: each proxy is now tested against a real URL to confirm it actually works 131 |
 132 | 2. Added page parsing via regular expressions and loadable parser plugins 133 |
 134 | 3. Added another proxy website 135 |
136 | 137 | -----------------------------2016-7-20---------------------------- 138 |
 139 | 1. Fixed bugs and added database compaction 140 |
141 | -------------------------------------------------------------------------------- /api/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Xaxdus' 2 | -------------------------------------------------------------------------------- /api/apiServer.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | ''' 3 | 定义几个关键字,count type,protocol,country,area, 4 | ''' 5 | import urllib 6 | from config import API_PORT 7 | from db.SQLiteHelper import SqliteHelper 8 | 9 | __author__ = 'Xaxdus' 10 | 11 | import BaseHTTPServer 12 | import json 13 | import urlparse 14 | import logging 15 | logger = logging.getLogger('api') 16 | 17 | # keylist=['count', 'types','protocol','country','area'] 18 | class WebRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler): 19 | 20 | def do_GET(self): 21 | """ 22 | """ 23 | dict={} 24 | 25 | parsed_path = urlparse.urlparse(self.path) 26 | try: 27 | query = urllib.unquote(parsed_path.query) 28 | logger.info("query %s" %query) 29 | if query.find('&')!=-1: 30 | params = query.split('&') 31 | for param in params: 32 | dict[param.split('=')[0]]=param.split('=')[1] 33 | else: 34 | dict[query.split('=')[0]]=query.split('=')[1] 35 | 36 | sqlHelper = SqliteHelper() 37 | # 处理删除代理的请求 38 | if dict.has_key('delete'): 39 | condition="ip='" + dict['ip'] + "' AND port=" + dict['port'] 40 | sqlHelper.delete(SqliteHelper.tableName, condition) 41 | self.send_response(200) 42 | self.end_headers() 43 | self.wfile.write("Success delete proxy: " + dict['ip'] + ":" + dict['port']) 44 | else: 45 | str_count='' 46 | conditions=[] 47 | for key in dict: 48 | if key =='count': 49 | str_count = 'LIMIT 0,%s'% dict[key] 50 | if key =='country' or key =='area': 51 | conditions .append(key+" LIKE '"+dict[key]+"%'") 52 | elif key =='types' or key =='protocol' or key =='country' or key =='area': 53 | conditions .append(key+"="+dict[key]) 54 | if len(conditions)>1: 55 | conditions = ' AND '.join(conditions) 56 | else: 57 | conditions =conditions[0] 58 | result = sqlHelper.select(sqlHelper.tableName,conditions,str_count) 59 | # print type(result) 60 | # for r in result: 61 | # print r 62 | data = [{'ip':item[0], 'port': item[1]} for item in result] 63 | data = json.dumps(data) 64 | self.send_response(200) 65 | self.end_headers() 66 | self.wfile.write(data) 67 | except Exception,e: 68 | logger.warning(str(e)) 69 | self.send_response(404) 70 | 71 | if __name__=='__main__': 72 | server = BaseHTTPServer.HTTPServer(('0.0.0.0',API_PORT), WebRequestHandler) 73 | server.serve_forever() 74 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | ''' 3 | 定义规则 urls:url列表 4 | type:解析方式,取值 regular(正则表达式),xpath(xpath解析),module(自定义第三方模块解析) 5 | patten:可以是正则表达式,可以是xpath语句不过要和上面的相对应 6 | ''' 7 | import random 8 | 9 | ''' 10 | ip,端口,类型(0高匿名,1透明),protocol(0 http,1 https),country(国家),area(省市),updatetime(更新时间) 11 | speed(连接速度) 12 | ''' 13 | parserList = [ 14 | { 15 | 'urls': ['http://m.66ip.cn/%s.html'% n for n in ['index']+range(2,12)], 16 | 'type':'xpath', 17 | 'pattern': ".//*[@class='profit-c']/table/tr[position()>1]", 18 | 'postion':{'ip':'./td[1]','port':'./td[2]','type':'./td[4]','protocol':''} 19 | }, 20 | { 21 | 'urls': ['http://m.66ip.cn/areaindex_%s/%s.html'%(m,n) for m in range(1,35) for n in range(1,10)], 22 | 'type':'xpath', 23 | 'pattern': 
".//*[@id='footer']/div/table/tr[position()>1]", 24 | 'postion':{'ip':'./td[1]','port':'./td[2]','type':'./td[4]','protocol':''} 25 | }, 26 | { 27 | 'urls': ['http://www.kuaidaili.com/proxylist/%s/'% n for n in range(1,11)], 28 | 'type': 'xpath', 29 | 'pattern': ".//*[@id='index_free_list']/table/tbody/tr[position()>0]", 30 | 'postion':{'ip':'./td[1]','port':'./td[2]','type':'./td[3]','protocol':'./td[4]'} 31 | }, 32 | { 33 | 'urls': ['http://www.kuaidaili.com/free/%s/%s/'% (m,n) for m in ['inha', 'intr', 'outha', 'outtr'] for n in range(1,11)], 34 | 'type':'xpath', 35 | 'pattern': ".//*[@id='list']/table/tbody/tr[position()>0]", 36 | 'postion':{'ip':'./td[1]','port':'./td[2]','type':'./td[3]','protocol':'./td[4]'} 37 | }, 38 | { 39 | 'urls': ['http://www.cz88.net/proxy/%s'% m for m in ['index.shtml']+['http_%s.shtml' % n for n in range(2, 11)]], 40 | 'type':'xpath', 41 | 'pattern':".//*[@id='boxright']/div/ul/li[position()>1]", 42 | 'postion':{'ip':'./div[1]','port':'./div[2]','type':'./div[3]','protocol':''} 43 | 44 | }, 45 | { 46 | 'urls': ['http://www.ip181.com/daili/%s.html'% n for n in range(1, 11)], 47 | 'type':'xpath', 48 | 'pattern': ".//div[@class='row']/div[3]/table/tbody/tr[position()>1]", 49 | 'postion':{'ip':'./td[1]','port':'./td[2]','type':'./td[3]','protocol':'./td[4]'} 50 | 51 | }, 52 | { 53 | 'urls': ['http://www.xicidaili.com/%s/%s'%(m,n) for m in ['nn', 'nt', 'wn', 'wt'] for n in range(1, 8) ], 54 | 'type':'xpath', 55 | 'pattern': ".//*[@id='ip_list']/tr[position()>1]", 56 | 'postion':{'ip':'./td[2]','port':'./td[3]','type':'./td[5]','protocol':'./td[6]'} 57 | }, 58 | { 59 | 'urls':['http://www.cnproxy.com/proxy%s.html'% i for i in range(1,11)], 60 | 'type':'module', 61 | 'moduleName':'CnproxyPraser', 62 | 'pattern':r'(\d+\.\d+\.\d+\.\d+)(HTTP|SOCKS4)\s*', 63 | 'postion':{'ip':0,'port':1,'type':-1,'protocol':2} 64 | } 65 | ] 66 | ''' 67 | 数据库的配置 68 | ''' 69 | DB_CONFIG={ 70 | 'dbType':'sqlite',#sqlite,mysql,mongodb 71 | 'dbPath':'./data/proxy.db',#这个仅仅对sqlite有效 72 | 'dbUser':'',#用户名 73 | 'dbPass':'',#密码 74 | 'dbName':''#数据库名称 75 | 76 | } 77 | 78 | CHINA_AREA=[u'河北',u'山东',u'辽宁',u'黑龙江',u'吉林' 79 | ,u'甘肃',u'青海',u'河南',u'江苏',u'湖北',u'湖南', 80 | u'江西',u'浙江',u'广东',u'云南',u'福建', 81 | u'台湾',u'海南',u'山西',u'四川',u'陕西', 82 | u'贵州',u'安徽',u'重庆',u'北京',u'上海',u'天津',u'广西',u'内蒙',u'西藏',u'新疆',u'宁夏',u'香港',u'澳门'] 83 | QQWRY_PATH="./data/qqwry.dat" 84 | 85 | THREADNUM = 20 86 | API_PORT=8000 87 | ''' 88 | 爬虫爬取和检测ip的设置条件 89 | 不需要检测ip是否已经存在,因为会定时清理 90 | ''' 91 | UPDATE_TIME=20*60#每半个小时检测一次是否有代理ip失效 92 | MINNUM = 50 #当有效的ip值小于一个时 需要启动爬虫进行爬取 93 | MAXTIME = 3*24*60 #当爬取存储开始一直使用的最大时间,如果超过这个时间,都删除 94 | 95 | TIMEOUT = 5#socket延时 96 | 97 | 98 | 99 | ''' 100 | 反爬虫的设置 101 | ''' 102 | ''' 103 | 重试次数 104 | ''' 105 | RETRY_TIME=3 106 | 107 | 108 | ''' 109 | USER_AGENTS 随机头信息 110 | ''' 111 | USER_AGENTS = [ 112 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 113 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 114 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 115 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 116 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 117 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; 
.NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 118 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 119 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 120 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 121 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 122 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 123 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", 124 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", 125 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 126 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", 127 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", 128 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11", 129 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER", 130 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)", 131 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)", 132 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER", 133 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)", 134 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)", 135 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", 136 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)", 137 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", 138 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)", 139 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", 140 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", 141 | "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5", 142 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre", 143 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0", 144 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11", 145 | "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) 
Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10" 146 | ] 147 | 148 | HEADER = { 149 | 'User-Agent': random.choice(USER_AGENTS), 150 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 151 | 'Accept-Language': 'en-US,en;q=0.5', 152 | 'Connection': 'keep-alive', 153 | 'Accept-Encoding': 'gzip, deflate', 154 | } 155 | 156 | TEST_URL='http://ip.chinaz.com/getip.aspx' 157 | # #添加的检测关键字,修复测试的代理是否能真正的访问到目的网址 158 | # TEST_KEY = '站长工具' 159 | TEST_PROXY='http://www.stilllistener.com/checkpoint1/test11/' -------------------------------------------------------------------------------- /data/proxy.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TimoGroom/IPProxys/8de2efb23db1d96c12b82053519795b5ef7231e8/data/proxy.db -------------------------------------------------------------------------------- /data/qqwry.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TimoGroom/IPProxys/8de2efb23db1d96c12b82053519795b5ef7231e8/data/qqwry.dat -------------------------------------------------------------------------------- /db/SQLiteHelper.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | from config import DB_CONFIG 3 | from db.SqlHelper import SqlHelper 4 | 5 | __author__ = 'Xaxdus' 6 | import sqlite3 7 | class SqliteHelper(SqlHelper): 8 | 9 | tableName='proxys' 10 | def __init__(self): 11 | ''' 12 | 建立数据库的链接 13 | :return: 14 | ''' 15 | self.database = sqlite3.connect(DB_CONFIG['dbPath'],check_same_thread=False) 16 | self.cursor = self.database.cursor() 17 | #创建表结构 18 | self.createTable() 19 | def compress(self): 20 | ''' 21 | 数据库进行压缩 22 | :return: 23 | ''' 24 | self.database.execute('VACUUM') 25 | 26 | def createTable(self): 27 | self.cursor.execute("create TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY ,ip VARCHAR(16) NOT NULL," 28 | "port INTEGER NOT NULL ,types INTEGER NOT NULL ,protocol INTEGER NOT NULL DEFAULT 0," 29 | "country VARCHAR (20) NOT NULL,area VARCHAR (20) NOT NULL,updatetime TimeStamp NOT NULL DEFAULT (datetime('now','localtime')) ,speed DECIMAL(3,2) NOT NULL DEFAULT 100)"% self.tableName) 30 | 31 | self.database.commit() 32 | 33 | def select(self,tableName,condition,count): 34 | ''' 35 | 36 | :param tableName: 表名 37 | :param condition: 条件包含占位符 38 | :param value: 占位符所对应的值(主要是为了防注入) 39 | :return: 40 | ''' 41 | command = 'SELECT DISTINCT ip,port FROM %s WHERE %s ORDER BY speed ASC %s '%(tableName,condition,count) 42 | 43 | self.cursor.execute(command) 44 | result = self.cursor.fetchall() 45 | return result 46 | 47 | def selectAll(self): 48 | self.cursor.execute('SELECT DISTINCT ip,port FROM %s ORDER BY speed ASC '%self.tableName) 49 | result = self.cursor.fetchall() 50 | return result 51 | 52 | def selectCount(self): 53 | self.cursor.execute('SELECT COUNT( DISTINCT ip) FROM %s'%self.tableName) 54 | count = self.cursor.fetchone() 55 | return count 56 | 57 | def selectOne(self,tableName,condition,value): 58 | ''' 59 | 60 | :param tableName: 表名 61 | :param condition: 条件包含占位符 62 | :param value: 占位符所对应的值(主要是为了防注入) 63 | :return: 64 | ''' 65 | self.cursor.execute('SELECT DISTINCT ip,port FROM %s WHERE %s ORDER BY speed ASC'%(tableName,condition),value) 66 | result = self.cursor.fetchone() 67 | return result 68 | 69 | def update(self,tableName,condition,value): 70 | self.cursor.execute('UPDATE %s %s'%(tableName,condition),value) 71 | self.database.commit() 72 | 73 | def 
delete(self,tableName,condition): 74 | ''' 75 | 76 | :param tableName: 表名 77 | :param condition: 条件 78 | :return: 79 | ''' 80 | deleCommand = 'DELETE FROM %s WHERE %s'%(tableName,condition) 81 | # print deleCommand 82 | self.cursor.execute(deleCommand) 83 | self.commit() 84 | 85 | def commit(self): 86 | self.database.commit() 87 | 88 | 89 | def insert(self,tableName,value): 90 | 91 | proxy = [value['ip'],value['port'],value['type'],value['protocol'],value['country'],value['area'],value['speed']] 92 | # print proxy 93 | self.cursor.execute("INSERT INTO %s (ip,port,types,protocol,country,area,speed)VALUES (?,?,?,?,?,?,?)"% tableName 94 | ,proxy) 95 | 96 | 97 | def batch_insert(self,tableName,values): 98 | 99 | for value in values: 100 | if value!=None: 101 | self.insert(self.tableName,value) 102 | self.database.commit() 103 | 104 | 105 | def close(self): 106 | self.cursor.close() 107 | self.database.close() 108 | 109 | 110 | 111 | if __name__=="__main__": 112 | s = SqliteHelper() 113 | print s.selectCount()[0] 114 | # print s.selectAll() -------------------------------------------------------------------------------- /db/SqlHelper.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | __author__ = 'Xaxdus' 4 | ''' 5 | sql操作的基类 6 | 包括ip,端口,types类型(0高匿名,1透明),protocol(0 http,1 https http),country(国家),area(省市),updatetime(更新时间) 7 | speed(连接速度) 8 | ''' 9 | class SqlHelper(object): 10 | 11 | 12 | 13 | def __init__(self): 14 | pass 15 | 16 | def insert(self, tableName,value): 17 | pass 18 | 19 | def batch_insert(self,values): 20 | pass 21 | 22 | def delete(self, tableName, condition): 23 | pass 24 | 25 | def batch_delete(self, tableName,values): 26 | pass 27 | 28 | def update(self, tableName,condition,value): 29 | pass 30 | def select(self, tableName,condition,count): 31 | pass 32 | def selectOne(self,tableName,condition,value): 33 | pass 34 | def close(self): 35 | pass 36 | -------------------------------------------------------------------------------- /db/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Xaxdus' 2 | -------------------------------------------------------------------------------- /logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root,api,data,db,spider,validator,download 3 | 4 | [logger_root] 5 | level=INFO 6 | handlers=screen 7 | 8 | [logger_api] 9 | level=INFO 10 | handlers=screen 11 | qualname=api 12 | propagate=0 13 | 14 | [logger_util] 15 | level=INFO 16 | handlers=screen 17 | qualname=util 18 | propagate=0 19 | 20 | [logger_download] 21 | level=INFO 22 | handlers=screen 23 | qualname=download 24 | propagate=0 25 | 26 | [logger_data] 27 | level=DEBUG 28 | handlers=screen 29 | qualname=data 30 | propagate=0 31 | 32 | [logger_db] 33 | level=DEBUG 34 | handlers=screen 35 | qualname=db 36 | propagate=0 37 | 38 | [logger_spider] 39 | level=INFO 40 | handlers=screen 41 | qualname=spider 42 | propagate=0 43 | 44 | [logger_validator] 45 | level=INFO 46 | handlers=screen 47 | qualname=validator 48 | propagate=0 49 | 50 | [handlers] 51 | keys=screen 52 | 53 | [handler_screen] 54 | class=logging.StreamHandler 55 | formatter=pretty 56 | level=DEBUG 57 | args=(sys.stderr, ) 58 | 59 | [formatters] 60 | keys=pretty 61 | 62 | [formatter_pretty] 63 | format= %(module)s %(asctime)s %(levelname)s %(lineno)d %(message)s 64 | datefmt= %Y-%m-%d %H:%M:%S 65 | class=logging.Formatter 
-------------------------------------------------------------------------------- /qiye2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TimoGroom/IPProxys/8de2efb23db1d96c12b82053519795b5ef7231e8/qiye2.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | 3 | -------------------------------------------------------------------------------- /spider/HtmlDownLoader.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import random 4 | import config 5 | import json 6 | __author__ = 'Xaxdus' 7 | 8 | import requests 9 | import logging 10 | import chardet 11 | logger = logging.getLogger('download') 12 | class Html_Downloader(object): 13 | 14 | @classmethod 15 | def download(self,url): 16 | count = 0#重试次数 17 | r='' 18 | logger.info("downloading url: %s",url) 19 | try: 20 | r = requests.get(url=url,headers=config.HEADER,timeout=config.TIMEOUT) 21 | r.encoding =chardet.detect(r.content)['encoding'] 22 | while count< config.RETRY_TIME: 23 | if (not r.ok) or len(r.content)<500 : 24 | response = requests.get("http://127.0.0.1:%s/?types=0&count=10"%config.API_PORT) 25 | if response.ok: 26 | content = response.text 27 | choose = random.choice(json.loads(content)) 28 | proxies={"https": "http://%s:%s"%(choose[0],choose[1])} 29 | try: 30 | r = requests.get(url=url,headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies) 31 | r.encoding =chardet.detect(r.content)['encoding'] 32 | count += 1 33 | except Exception,e: 34 | count += 1 35 | else: 36 | return None 37 | 38 | else: 39 | return r.text 40 | 41 | return None 42 | 43 | 44 | except Exception,e: 45 | while count< config.RETRY_TIME: 46 | if r==''or (not r.ok) or len(r.content)<500 : 47 | try: 48 | response = requests.get("http://127.0.0.1:%s/?types=0&count=10"%config.API_PORT) 49 | if response.ok: 50 | content = response.text 51 | choose = random.choice(json.loads(content)) 52 | proxies={"https": "http://%s:%s"%(choose[0],choose[1])} 53 | try: 54 | r = requests.get(url=url,headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies) 55 | r.encoding =chardet.detect(r.content)['encoding'] 56 | count += 1 57 | except Exception,e: 58 | count += 1 59 | else: 60 | return None 61 | except Exception,e: 62 | return None 63 | 64 | else: 65 | return r.text 66 | 67 | return None 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /spider/HtmlPraser.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import datetime 3 | from config import QQWRY_PATH, CHINA_AREA 4 | 5 | from util.IPAddress import IPAddresss 6 | import re 7 | import logging 8 | logger = logging.getLogger('spider') 9 | 10 | __author__ = 'Xaxdus' 11 | from lxml import etree 12 | class Html_Parser(object): 13 | 14 | def __init__(self): 15 | self.ips = IPAddresss(QQWRY_PATH) 16 | def parse(self,response,parser): 17 | ''' 18 | 19 | :param response: 响应 20 | :param type: 解析方式 21 | :return: 22 | ''' 23 | if parser['type']=='xpath': 24 | return self.XpathPraser(response,parser) 25 | elif parser['type']=='regular': 26 | return self.RegularPraser(response,parser) 27 | elif parser['type']=='module': 28 | return getattr(self,parser['moduleName'],None)(response,parser) 29 | else: 30 | return None 31 | 32 | 
def AuthCountry(self,addr): 33 | ''' 34 | 用来判断地址是哪个国家的 35 | :param addr: 36 | :return: 37 | ''' 38 | for area in CHINA_AREA: 39 | if addr.find(area)!=-1: 40 | return True 41 | return False 42 | 43 | 44 | 45 | def XpathPraser(self,response,parser): 46 | ''' 47 | 针对xpath方式进行解析 48 | :param response: 49 | :param parser: 50 | :return: 51 | ''' 52 | # print response 53 | proxylist=[] 54 | root = etree.HTML(response) 55 | proxys = root.xpath(parser['pattern']) 56 | # print proxys 57 | for proxy in proxys: 58 | # print parser['postion']['ip'] 59 | try: 60 | ip = proxy.xpath(parser['postion']['ip'])[0].text 61 | port = proxy.xpath(parser['postion']['port'])[0].text 62 | type = proxy.xpath(parser['postion']['type'])[0].text 63 | # print ip,port,type 64 | if type.find(u'高匿')!=-1: 65 | type = 0 66 | else: 67 | type = 1 68 | protocol='' 69 | if len(parser['postion']['protocol']) > 0: 70 | protocol = proxy.xpath(parser['postion']['protocol'])[0].text 71 | if protocol.lower().find('https')!=-1: 72 | protocol = 1 73 | else: 74 | protocol = 0 75 | else: 76 | protocol = 0 77 | addr = self.ips.getIpAddr(self.ips.str2ip(ip)) 78 | country = '' 79 | area = '' 80 | if addr.find(u'省')!=-1 or self.AuthCountry(addr): 81 | country = u'中国' 82 | area = addr 83 | else: 84 | country = addr 85 | area = '' 86 | except Exception,e: 87 | logger.warning(str(e)) 88 | continue 89 | # updatetime = datetime.datetime.now() 90 | # ip,端口,类型(0高匿名,1透明),protocol(0 http,1 https http),country(国家),area(省市),updatetime(更新时间) 91 | 92 | # proxy ={'ip':ip,'port':int(port),'type':int(type),'protocol':int(protocol),'country':country,'area':area,'updatetime':updatetime,'speed':100} 93 | proxy ={'ip':ip,'port':int(port),'type':int(type),'protocol':int(protocol),'country':country,'area':area,'speed':100} 94 | logger.info("Fetch proxy %s" %str(proxy)) 95 | proxylist.append(proxy) 96 | 97 | return proxylist 98 | 99 | def RegularPraser(self,response,parser): 100 | ''' 101 | 针对正则表达式进行解析 102 | :param response: 103 | :param parser: 104 | :return: 105 | ''' 106 | proxylist=[] 107 | pattern = re.compile(parser['pattern']) 108 | matchs = pattern.findall(response) 109 | if matchs !=None: 110 | for match in matchs: 111 | logging.info(str(match)) 112 | ip = match[parser['postion']['ip']] 113 | port = match[parser['postion']['port']] 114 | #网站的类型一直不靠谱所以还是默认,之后会检测 115 | type =0 116 | if parser['postion']['protocol'] > 0: 117 | protocol = match[parser['postion']['protocol']] 118 | if protocol.lower().find('https')!=-1: 119 | protocol = 1 120 | else: 121 | protocol = 0 122 | else: 123 | protocol = 0 124 | addr = self.ips.getIpAddr(self.ips.str2ip(ip)) 125 | country = '' 126 | area = '' 127 | if addr.find(u'省')!=-1 or self.AuthCountry(addr): 128 | country = u'中国' 129 | area = addr 130 | else: 131 | country = addr 132 | area = '' 133 | proxy ={'ip':ip,'port':port,'type':type,'protocol':protocol,'country':country,'area':area,'speed':100} 134 | logger.info("Fetch proxy %s" % str(proxy)) 135 | proxylist.append(proxy) 136 | return proxylist 137 | 138 | 139 | def CnproxyPraser(self,response,parser): 140 | proxylist = self.RegularPraser(response,parser) 141 | chardict ={'v':'3','m':'4','a':'2','l':'9','q':'0','b':'5','i':'7','w':'6','r':'8','c':'1'} 142 | 143 | for proxy in proxylist: 144 | port = proxy['port'] 145 | new_port = '' 146 | for i in range(len(port)): 147 | if port[i]!='+': 148 | new_port += chardict[port[i]] 149 | new_port = int(new_port) 150 | proxy['port'] =new_port 151 | return proxylist 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 
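A new source site can be wired in the same way CnproxyPraser is: parse() above dispatches on parser['type'], and for 'module' rules it looks up a method of Html_Parser named by parser['moduleName']. A hypothetical rule for config.parserList and its companion method might look like the sketch below (ExampleSitePraser, the URL, and the regex are illustrative only, not part of the project; the key is spelled 'postion' because that is the spelling used throughout the codebase):

```
# hypothetical parserList entry (site URL and regex are made up for illustration)
EXAMPLE_RULE = {
    'urls': ['http://proxy.example.com/list/%s.html' % n for n in range(1, 3)],
    'type': 'module',
    'moduleName': 'ExampleSitePraser',
    'pattern': r'(\d+\.\d+\.\d+\.\d+):(\d+)',
    'postion': {'ip': 0, 'port': 1, 'type': -1, 'protocol': -1}
}

# companion method to add to Html_Parser, mirroring CnproxyPraser above
def ExampleSitePraser(self, response, parser):
    # reuse the regular-expression parser, then post-process here if the site needs it
    return self.RegularPraser(response, parser)
```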
-------------------------------------------------------------------------------- /spider/ProxySpider.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | from gevent.pool import Pool 3 | import requests 4 | import time 5 | from config import THREADNUM, parserList, MINNUM, UPDATE_TIME 6 | from db.SQLiteHelper import SqliteHelper 7 | from spider.HtmlDownLoader import Html_Downloader 8 | from spider.HtmlPraser import Html_Parser 9 | from validator.Validator import Validator 10 | import logging 11 | logger = logging.getLogger('spider') 12 | 13 | __author__ = 'Xaxdus' 14 | from gevent import monkey 15 | monkey.patch_all() 16 | ''' 17 | 这个类的作用是描述爬虫的逻辑 18 | ''' 19 | 20 | class ProxySpider(object): 21 | 22 | def __init__(self): 23 | self.crawl_pool = Pool(THREADNUM) 24 | # self.sqlHelper = sqlHelper 25 | 26 | def run(self): 27 | while True: 28 | logger.info("Start to run spider") 29 | sqlHelper = SqliteHelper() 30 | logger.info('Start to run validator') 31 | validator = Validator(sqlHelper) 32 | count = validator.run_db() 33 | logger.info('Finished to run validator, count=%s'%count) 34 | if count[0]< MINNUM: 35 | proxys = self.crawl_pool.map(self.crawl,parserList) 36 | #这个时候proxys的格式是[[{},{},{}],[{},{},{}]] 37 | # print proxys 38 | #这个时候应该去重: 39 | 40 | proxys_tmp = [] 41 | for proxy in proxys: 42 | proxys_tmp.extend(proxy) 43 | 44 | proxys = proxys_tmp 45 | logger.info('first_proxys: %s'%len(proxys)) 46 | #这个时候proxys的格式是[{},{},{},{},{},{}] 47 | proxys_tmp=None 48 | #这个时候开始去重: 49 | proxys = [dict(t) for t in set([tuple(proxy.items()) for proxy in proxys])] 50 | logger.info('end_proxy: %s'%len(proxys)) 51 | logger.info('spider proxys: %s'%type(proxys)) 52 | proxys = validator.run_list(proxys)#这个是检测后的ip地址 53 | 54 | sqlHelper.batch_insert(sqlHelper.tableName,proxys) 55 | 56 | logger.info('success ip: %s'%sqlHelper.selectCount()) 57 | sqlHelper.close() 58 | logger.info('Finished to run spider') 59 | time.sleep(UPDATE_TIME) 60 | 61 | 62 | def crawl(self,parser): 63 | proxys = [] 64 | html_parser = Html_Parser() 65 | for url in parser['urls']: 66 | response = Html_Downloader.download(url) 67 | if response!=None: 68 | proxylist= html_parser.parse(response,parser) 69 | if proxylist != None: 70 | proxys.extend(proxylist) 71 | return proxys 72 | 73 | 74 | if __name__=="__main__": 75 | spider = ProxySpider() 76 | spider.run() -------------------------------------------------------------------------------- /spider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Xaxdus' 2 | -------------------------------------------------------------------------------- /start.bat: -------------------------------------------------------------------------------- 1 | python IPProxys.py -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Xaxdus' 2 | -------------------------------------------------------------------------------- /test/test.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import requests 3 | import json 4 | r = requests.get('http://127.0.0.1:8000/?types=0&count=5&country=中国') 5 | ip_ports = json.loads(r.text) 6 | print ip_ports 7 | ip = ip_ports[0]['ip'] 8 | port = ip_ports[0]['port'] 9 | proxies={ 10 | 'http':'http://%s:%s'%(ip,port), 11 | 'https':'http://%s:%s'%(ip,port) 12 | } 13 | r = 
requests.get('http://ip.chinaz.com/',proxies=proxies) 14 | r.encoding='utf-8' 15 | print r.text 16 | -------------------------------------------------------------------------------- /test/testhttpserver.py: -------------------------------------------------------------------------------- 1 | 2 | #coding:utf-8 3 | import BaseHTTPServer 4 | import json 5 | import urlparse 6 | class WebRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler): 7 | def do_GET(self): 8 | """ 9 | """ 10 | print self.path 11 | parsed_path = urlparse.urlparse(self.path) 12 | print parsed_path 13 | print parsed_path.query 14 | # message_parts = [ 15 | # 'CLIENT VALUES:', 16 | # 'client_address=%s (%s)' % (self.client_address, 17 | # self.address_string()), 18 | # 'command=%s' % self.command, 19 | # 'path=%s' % self.path, 20 | # 'real path=%s' % parsed_path.path, 21 | # 'query=%s' % parsed_path.query, 22 | # 'request_version=%s' % self.request_version, 23 | # '', 24 | # 'SERVER VALUES:', 25 | # 'server_version=%s' % self.server_version, 26 | # 'sys_version=%s' % self.sys_version, 27 | # 'protocol_version=%s' % self.protocol_version, 28 | # '', 29 | # 'HEADERS RECEIVED:', 30 | # ] 31 | # for name, value in sorted(self.headers.items()): 32 | # message_parts.append('%s=%s' % (name, value.rstrip())) 33 | # message_parts.append('') 34 | # message = '\r\n'.join(message_parts) 35 | data1 = [{'ip':'192.168.0.0','port':456}]*10 36 | d1 = json.dumps(data1,sort_keys=True,indent=4) 37 | message=('192.168.1.1',80) 38 | self.send_response(200) 39 | self.end_headers() 40 | self.wfile.write(d1) 41 | 42 | server = BaseHTTPServer.HTTPServer(('0.0.0.0',8000), WebRequestHandler) 43 | server.serve_forever() -------------------------------------------------------------------------------- /test/testlist.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | from decimal import Decimal 3 | 4 | __author__ = 'Xaxdus' 5 | 6 | 7 | # list = ["www.baidu.com/%s" %m for m in ['index']+range(1,5)] 8 | # 9 | # list = [(1,10)]*10 10 | # 11 | # for m,n in list: 12 | # print m,n 13 | # 14 | # 15 | # list2 = ["www.baidu.com/%s/%s"%(i[0],i[1]) for i in list] 16 | # print list2 17 | 18 | # x=Decimal('0.998531571219').quantize(Decimal('0.00')) 19 | # a= 0.998531571219 20 | # value = round(a, 3) 21 | # print x,type(x),value 22 | # proxys=[] 23 | # proxy=[123,1234] 24 | # proxys.append(proxy) 25 | # 26 | # proxy=[123,1234] 27 | # proxys.append(proxy) 28 | # 29 | # print proxys 30 | l = [{'ip':'123.1.1.1','port':80},{'ip':'123.1.1.1','port':80},{'ip':'123.1.2.1','port':80},{'ip':'123.1.1.1','port':81}] 31 | 32 | # for d in l: 33 | # print [tuple(d.items())] 34 | print [tuple(d.items()) for d in l] 35 | 36 | print [dict(t) for t in set([tuple(d.items()) for d in l])] -------------------------------------------------------------------------------- /test/testlxml.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | from lxml import etree 3 | 4 | __author__ = 'Xaxdus' 5 | 6 | html=''' 7 | 8 | 9 | 10 | 11 | 北京http代理ip_66免费代理ip提取网 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 48 |
<!-- sample page body from m.66ip.cn not preserved here; it held the proxy table under div#footer that the xpath below selects -->
137 | 162 | 163 | 164 | 165 | ''' 166 | 167 | root = etree.HTML(html) 168 | proxys = root.xpath(".//*[@id='footer']/div/table/tr[position()>1]") 169 | 170 | for proxy in proxys: 171 | print proxy.xpath('./td[1]')[0].text -------------------------------------------------------------------------------- /util/IPAddress.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | 6 | import socket 7 | import struct 8 | 9 | import logging 10 | logger = logging.getLogger('util') 11 | 12 | class IPAddresss: 13 | def __init__(self, ipdbFile): 14 | self.ipdb = open(ipdbFile, "rb") 15 | str = self.ipdb.read(8) 16 | (self.firstIndex, self.lastIndex) = struct.unpack('II', str) 17 | self.indexCount = (self.lastIndex - self.firstIndex)/7+1 18 | # print self.getVersion(), u" 纪录总数: %d 条 "%(self.indexCount) 19 | 20 | def getVersion(self): 21 | s = self.getIpAddr(0xffffff00L) 22 | return s 23 | 24 | def getAreaAddr(self, offset=0): 25 | if offset: 26 | self.ipdb.seek(offset) 27 | str = self.ipdb.read(1) 28 | (byte,) = struct.unpack('B', str) 29 | if byte == 0x01 or byte == 0x02: 30 | p = self.getLong3() 31 | if p: 32 | return self.getString(p) 33 | else: 34 | return "" 35 | else: 36 | self.ipdb.seek(-1, 1) 37 | return self.getString(offset) 38 | 39 | def getAddr(self, offset, ip=0): 40 | self.ipdb.seek(offset + 4) 41 | countryAddr = "" 42 | areaAddr = "" 43 | str = self.ipdb.read(1) 44 | (byte,) = struct.unpack('B', str) 45 | if byte == 0x01: 46 | countryOffset = self.getLong3() 47 | self.ipdb.seek(countryOffset) 48 | str = self.ipdb.read(1) 49 | (b,) = struct.unpack('B', str) 50 | if b == 0x02: 51 | countryAddr = self.getString(self.getLong3()) 52 | self.ipdb.seek(countryOffset + 4) 53 | else: 54 | countryAddr = self.getString(countryOffset) 55 | areaAddr = self.getAreaAddr() 56 | elif byte == 0x02: 57 | countryAddr = self.getString(self.getLong3()) 58 | areaAddr = self.getAreaAddr(offset + 8) 59 | else: 60 | countryAddr = self.getString(offset + 4) 61 | areaAddr = self.getAreaAddr() 62 | return countryAddr + " " + areaAddr 63 | 64 | def dump(self, first , last): 65 | if last > self.indexCount : 66 | last = self.indexCount 67 | for index in range(first, last): 68 | offset = self.firstIndex + index * 7 69 | self.ipdb.seek(offset) 70 | buf = self.ipdb.read(7) 71 | (ip, of1, of2) = struct.unpack("IHB", buf) 72 | address = self.getAddr(of1 + (of2 << 16)) 73 | # 把GBK转为utf-8 74 | address = unicode(address, 'gbk').encode("utf-8") 75 | logger.info("%d %s %s" % (index, self.ip2str(ip), address)) 76 | 77 | def setIpRange(self, index): 78 | offset = self.firstIndex + index * 7 79 | self.ipdb.seek(offset) 80 | buf = self.ipdb.read(7) 81 | (self.curStartIp, of1, of2) = struct.unpack("IHB", buf) 82 | self.curEndIpOffset = of1 + (of2 << 16) 83 | self.ipdb.seek(self.curEndIpOffset) 84 | buf = self.ipdb.read(4) 85 | (self.curEndIp,) = struct.unpack("I", buf) 86 | 87 | def getIpAddr(self, ip): 88 | L = 0 89 | R = self.indexCount - 1 90 | while L < R-1: 91 | M = (L + R) / 2 92 | self.setIpRange(M) 93 | if ip == self.curStartIp: 94 | L = M 95 | break 96 | if ip > self.curStartIp: 97 | L = M 98 | else: 99 | R = M 100 | self.setIpRange(L) 101 | # version information, 255.255.255.X, urgy but useful 102 | if ip & 0xffffff00L == 0xffffff00L: 103 | self.setIpRange(R) 104 | if self.curStartIp <= ip <= self.curEndIp: 105 | address = self.getAddr(self.curEndIpOffset) 106 | # 把GBK转为utf-8 107 | address = unicode(address, 'gbk') 108 | else: 109 | 
address = u"未找到该IP的地址" 110 | return address 111 | 112 | def getIpRange(self, ip): 113 | self.getIpAddr(ip) 114 | range = self.ip2str(self.curStartIp) + ' - ' \ 115 | + self.ip2str(self.curEndIp) 116 | return range 117 | 118 | def getString(self, offset = 0): 119 | if offset : 120 | self.ipdb.seek(offset) 121 | str = "" 122 | ch = self.ipdb.read(1) 123 | (byte,) = struct.unpack('B', ch) 124 | while byte != 0: 125 | str += ch 126 | ch = self.ipdb.read(1) 127 | (byte,) = struct.unpack('B', ch) 128 | return str 129 | 130 | def ip2str(self, ip): 131 | return str(ip >> 24)+'.'+str((ip >> 16) & 0xffL)+'.'+str((ip >> 8) & 0xffL)+'.'+str(ip & 0xffL) 132 | 133 | def str2ip(self, s): 134 | (ip,) = struct.unpack('I', socket.inet_aton(s)) 135 | return ((ip >> 24) & 0xffL) | ((ip & 0xffL) << 24) | ((ip >> 8) & 0xff00L) | ((ip & 0xff00L) << 8) 136 | 137 | def getLong3(self, offset=0): 138 | if offset: 139 | self.ipdb.seek(offset) 140 | str = self.ipdb.read(3) 141 | (a, b) = struct.unpack('HB', str) 142 | return (b << 16) + a 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | __author__ = 'Xaxdus' 4 | 5 | -------------------------------------------------------------------------------- /util/logger.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import logging 3 | 4 | __author__ = 'Xaxdus' 5 | 6 | 7 | logger = logging.getLogger() 8 | def logger_proxy(proxy): 9 | logger.setLevel(logging.INFO) 10 | logger.info(proxy) 11 | -------------------------------------------------------------------------------- /validator/Validator.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import datetime 3 | 4 | from lxml import etree 5 | from gevent.pool import Pool 6 | import requests 7 | import time 8 | from config import TEST_URL 9 | import config 10 | from db.SQLiteHelper import SqliteHelper 11 | import logging 12 | logger = logging.getLogger("validator") 13 | 14 | from gevent import monkey 15 | monkey.patch_all() 16 | 17 | 18 | __author__ = 'Xaxdus' 19 | 20 | class Validator(object): 21 | 22 | def __init__(self,sqlHelper): 23 | 24 | self.detect_pool = Pool(config.THREADNUM) 25 | self.sqlHelper =sqlHelper 26 | self.selfip = self.getMyIP() 27 | self.detect_pool = Pool(config.THREADNUM) 28 | 29 | def run_db(self): 30 | ''' 31 | 从数据库中检测 32 | :return: 33 | ''' 34 | try: 35 | #首先将超时的全部删除 36 | self.deleteOld() 37 | #接着检测剩余的ip,是否可用 38 | results = self.sqlHelper.selectAll() 39 | self.detect_pool.map(self.detect_db,results) 40 | #将数据库进行压缩 41 | self.sqlHelper.compress() 42 | 43 | return self.sqlHelper.selectCount()#返回最终的数量 44 | except Exception,e: 45 | logger.warning(str(e)) 46 | return 0 47 | 48 | 49 | 50 | def run_list(self,results): 51 | ''' 52 | 这个是先不进入数据库,直接从集合中删除 53 | :param results: 54 | :return: 55 | ''' 56 | # proxys=[] 57 | # for result in results: 58 | proxys = self.detect_pool.map(self.detect_list,results) 59 | #这个时候proxys的格式是[{},{},{},{},{}] 60 | return proxys 61 | 62 | def deleteOld(self): 63 | ''' 64 | 删除旧的数据 65 | :return: 66 | ''' 67 | condition = "updatetime<'%s'"%((datetime.datetime.now() - datetime.timedelta(minutes=config.MAXTIME)).strftime('%Y-%m-%d %H:%M:%S')) 68 | self.sqlHelper.delete(SqliteHelper.tableName,condition) 69 | 70 | 71 | def detect_db(self,result): 72 | ''' 73 | 74 | :param result: 从数据库中检测 75 | :return: 76 | ''' 77 | 
ip = result[0] 78 | port = str(result[1]) 79 | proxies={"http": "http://%s:%s"%(ip,port),"https": "http://%s:%s"%(ip,port)} 80 | 81 | start = time.time() 82 | try: 83 | r = requests.get(url=TEST_URL,headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies) 84 | 85 | if not r.ok or r.text.find(ip)==-1: 86 | condition = "ip='"+ip+"' AND "+'port='+port 87 | logger.info('failed %s:%s'%(ip,port)) 88 | self.sqlHelper.delete(SqliteHelper.tableName,condition) 89 | else: 90 | logger.info(r.text) 91 | speed = round(time.time()-start, 2) 92 | self.sqlHelper.update(SqliteHelper.tableName,'SET speed=? WHERE ip=? AND port=?',(speed,ip,port)) 93 | logger.info('success %s:%s, speed=%s'%(ip,port,speed)) 94 | except Exception,e: 95 | condition = "ip='"+ip+"' AND "+'port='+port 96 | logger.info('failed %s:%s'%(ip,port)) 97 | self.sqlHelper.delete(SqliteHelper.tableName,condition) 98 | 99 | 100 | 101 | def detect_list(self,proxy): 102 | ''' 103 | :param proxy: ip字典 104 | :return: 105 | ''' 106 | # for proxy in proxys: 107 | 108 | ip = proxy['ip'] 109 | port = proxy['port'] 110 | proxies={"http": "http://%s:%s"%(ip,port),"https": "http://%s:%s"%(ip,port)} 111 | proxyType = self.checkProxyType(proxies) 112 | if proxyType==3: 113 | logger.info('failed %s:%s'%(ip,port)) 114 | 115 | proxy = None 116 | return proxy 117 | else: 118 | proxy['type']=proxyType 119 | start = time.time() 120 | try: 121 | r = requests.get(url=TEST_URL,headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies) 122 | 123 | if not r.ok or r.text.find(ip)==-1: 124 | logger.info('failed %s:%s'%(ip,port)) 125 | proxy = None 126 | else: 127 | speed = round(time.time()-start,2) 128 | logger.info('success %s:%s, speed=%s'%(ip,port,speed)) 129 | proxy['speed']=speed 130 | # return proxy 131 | except Exception,e: 132 | logger.info('failed %s:%s'%(ip,port)) 133 | proxy = None 134 | return proxy 135 | # return proxys 136 | 137 | def checkProxyType(self,proxies): 138 | ''' 139 | 用来检测代理的类型,突然发现,免费网站写的信息不靠谱,还是要自己检测代理的类型 140 | :param proxies: 代理(0 高匿,1 匿名,2 透明 3 无效代理 141 | :return: 142 | ''' 143 | 144 | try: 145 | 146 | r = requests.get(url=config.TEST_PROXY,headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies) 147 | if r.ok: 148 | root = etree.HTML(r.text) 149 | ip = root.xpath('.//center[2]/table/tr[3]/td[2]')[0].text 150 | http_x_forwared_for = root.xpath('.//center[2]/table/tr[8]/td[2]')[0].text 151 | http_via = root.xpath('.//center[2]/table/tr[9]/td[2]')[0].text 152 | # print ip,http_x_forwared_for,http_via,type(http_via),type(http_x_forwared_for) 153 | if ip==self.selfip: 154 | return 3 155 | if http_x_forwared_for is None and http_via is None: 156 | return 0 157 | if http_via != None and http_x_forwared_for.find(self.selfip)== -1: 158 | return 1 159 | 160 | if http_via != None and http_x_forwared_for.find(self.selfip)!= -1: 161 | return 2 162 | return 3 163 | 164 | 165 | 166 | except Exception,e: 167 | logger.warning(str(e)) 168 | return 3 169 | 170 | 171 | 172 | def getMyIP(self): 173 | try: 174 | r = requests.get(url=config.TEST_PROXY,headers=config.HEADER,timeout=config.TIMEOUT) 175 | # print r.text 176 | root = etree.HTML(r.text) 177 | ip = root.xpath('.//center[2]/table/tr[3]/td[2]')[0].text 178 | 179 | logger.info('ip %s' %ip) 180 | return ip 181 | except Exception,e: 182 | logger.info(str(e)) 183 | return None 184 | 185 | if __name__=='__main__': 186 | v = Validator(None) 187 | v.getMyIP() 188 | v.selfip 189 | # results=[{'ip':'192.168.1.1','port':80}]*10 190 | # results = v.run(results) 191 | # print results 192 | pass 193 | 
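Validator is normally driven from ProxySpider.run(), but it can also be pointed at the proxies already stored in data/proxy.db on its own; a minimal usage sketch (not a script shipped with the project):

```
#coding:utf-8
# one-off validation of the stored pool, reusing the classes defined above
from db.SQLiteHelper import SqliteHelper
from validator.Validator import Validator

if __name__ == '__main__':
    sqlHelper = SqliteHelper()
    validator = Validator(sqlHelper)
    # run_db() deletes stale/unreachable proxies and re-measures speed,
    # then returns the row from selectCount(), e.g. (42,)
    count = validator.run_db()
    print 'usable proxies left:', count[0]
    sqlHelper.close()
```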
-------------------------------------------------------------------------------- /validator/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Xaxdus' 2 | --------------------------------------------------------------------------------