├── .gitignore
├── IPProxys.py
├── README.md
├── api
│   ├── __init__.py
│   └── apiServer.py
├── config.py
├── data
│   ├── proxy.db
│   └── qqwry.dat
├── db
│   ├── SQLiteHelper.py
│   ├── SqlHelper.py
│   └── __init__.py
├── logging.conf
├── qiye2.jpg
├── requirements.txt
├── spider
│   ├── HtmlDownLoader.py
│   ├── HtmlPraser.py
│   ├── ProxySpider.py
│   └── __init__.py
├── start.bat
├── test
│   ├── __init__.py
│   ├── test.py
│   ├── testhttpserver.py
│   ├── testlist.py
│   └── testlxml.py
├── util
│   ├── IPAddress.py
│   ├── __init__.py
│   └── logger.py
└── validator
    ├── Validator.py
    └── __init__.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 | .idea/
--------------------------------------------------------------------------------
/IPProxys.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 | import BaseHTTPServer
3 | import threading
4 | import logging
5 | import logging.config
6 |
7 | from api.apiServer import WebRequestHandler
8 | from config import API_PORT
9 | from db.SQLiteHelper import SqliteHelper
10 | from spider.ProxySpider import ProxySpider
11 | import sys
12 | reload(sys)
13 | sys.setdefaultencoding('utf8')
14 | logging.config.fileConfig('logging.conf')
15 |
16 | class IPProxys(object):
17 |
18 | def startApiServer(self):
19 | '''
20 |         Start the API server.
21 | :return:
22 | '''
23 | logging.info('Start server @ %s:%s' %('0.0.0.0',API_PORT))
24 | server = BaseHTTPServer.HTTPServer(('0.0.0.0',API_PORT), WebRequestHandler)
25 | server.serve_forever()
26 |
27 | def startSpider(self):
28 | logging.info('Start Spider')
29 | spider = ProxySpider()
30 | spider.run()
31 |
32 | if __name__=="__main__":
33 |
34 | proxys = IPProxys()
35 |
36 | apiServer = threading.Thread(target=proxys.startApiServer)
37 | spider = threading.Thread(target=proxys.startSpider)
38 | apiServer.start()
39 | spider.start()
40 |
41 |
42 |
43 |
44 |
45 |
46 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # IPProxys
2 | IPProxys is a proxy pool project that provides proxy IPs. Developed with Python 2.7.x.
3 |
4 | For detailed usage notes, see my blog post:
5 | http://www.cnblogs.com/qiyeboy/p/5693128.html
6 |
7 | My WeChat official account:
8 |
9 | ![WeChat official account](qiye2.jpg)
10 |
11 | Suggestions for more proxy websites are welcome; the number of usable proxy IPs crawled so far is still small.
12 |
13 |
14 | Thanks to [super1-chen](https://github.com/super1-chen) for contributing to the project.
15 |
16 | ## Requirements
17 | #### On Ubuntu / Debian
18 |
19 | Install the SQLite database (usually preinstalled):
20 |     apt-get install sqlite3
21 |
22 | Install the requests library:
23 |     pip install requests
24 |
25 | Install the chardet library:
26 |     pip install chardet
27 |
28 | Install lxml:
29 |     apt-get install python-lxml
30 |
31 | Install the gevent library:
32 |     pip install gevent
33 | ###### (If the installed gevent version is too old the program may exit on its own; upgrade it with pip install gevent --upgrade)
34 |
35 | #### On Windows
36 | Download [sqlite](http://www.sqlite.org/download.html) and add it to the PATH environment variable.
37 |
38 | Install the requests library:
39 |     pip install requests
40 |
41 | Install the chardet library:
42 |     pip install chardet
43 |
44 | Install lxml:
45 |     pip install lxml, or download a [prebuilt lxml for Windows](https://pypi.python.org/pypi/lxml/)
46 |
47 | Install the gevent library:
48 |     pip install gevent
49 | ###### (If the installed gevent version is too old the program may exit on its own; upgrade it with pip install gevent --upgrade)
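
Equivalently, all of the Python dependencies listed above can be installed in one step with pip (assuming pip itself is already available):

    pip install requests chardet lxml gevent
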
50 | ## How to Use
51 |
52 | Clone the project into the current directory:
53 |
54 | $ git clone
55 |
56 | Change into the project directory:
57 |
58 | ```
59 | $ cd IPProxys
60 | ```
61 |
62 | Run the script:
63 |
64 | ```
65 | python IPProxys.py
66 | ```
67 |
68 | ## API Usage
69 |
70 | #### Endpoint
71 | ```
72 | GET /
73 | ```
74 |
75 | #### Parameters
76 |
77 |
78 | | Name | Type | Description |
79 | | ----| ---- | ---- |
80 | | types | int | 0: elite (high-anonymity) proxy, 1: transparent proxy |
81 | | protocol | int | 0: http, 1: https |
82 | | count | int | number of proxies to return |
83 | | country | str | country |
84 | | area | str | province / region |
85 |
86 |
87 |
88 | #### Examples
89 | ##### IPProxys listens on port 8000 by default.
90 | ##### When testing on the local machine:
91 | 1. Get 5 elite proxies located in China: http://127.0.0.1:8000/?types=0&count=5&country=中国
92 |
93 | 2. The response is JSON, sorted from the fastest proxy to the slowest:
94 |
95 | [{"ip": "220.160.22.115", "port": 80}, {"ip": "183.129.151.130", "port": 80}, {"ip": "59.52.243.88", "port": 80}, {"ip": "112.228.35.24", "port": 8888}, {"ip": "106.75.176.4", "port": 80}]
96 |
97 | ```
98 | import requests
99 | import json
100 | r = requests.get('http://127.0.0.1:8000/?types=0&count=5&country=中国')
101 | ip_ports = json.loads(r.text)
102 | print ip_ports
103 | ip = ip_ports[0]['ip']
104 | port = ip_ports[0]['port']
105 | proxies={
106 | 'http':'http://%s:%s'%(ip,port),
107 | 'https':'http://%s:%s'%(ip,port)
108 | }
109 | r = requests.get('http://ip.chinaz.com/',proxies=proxies)
110 | r.encoding='utf-8'
111 | print r.text
112 | ```
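
The handler in api/apiServer.py also accepts a delete request for dropping a proxy that has stopped working. A minimal sketch (the ip and port values are taken from the sample response above):

```
import requests
# Ask IPProxys to remove the given proxy from its pool; apiServer.py deletes the
# matching ip/port row and answers with a short confirmation message.
r = requests.get('http://127.0.0.1:8000/?delete=1&ip=220.160.22.115&port=80')
print r.text
```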
113 | ## TODO
114 | 1. Add Python 3.x support
115 |
116 | 2. Optionally front the pool with a squid reverse proxy to simplify crawler configuration
117 |
118 | 3. Refactor the HTTP API
119 |
120 | 4. Add more proxy websites and database backends
121 | ## Changelog
122 | -----------------------------2016-11-24----------------------------
123 |
124 | 1. Use chardet to detect page encodings
125 |
126 | 2. Work around the anti-crawling limits of 66ip.cn
127 |
128 | -----------------------------2016-10-27----------------------------
129 |
130 | 1. Validate proxies by checking whether they can really reach the target URL
131 |
132 | 2. Add regular-expression and plugin-module page parsing
133 |
134 | 3. Add another proxy website
135 |
136 |
137 | -----------------------------2016-7-20----------------------------
138 |
139 | 1. Fix a bug and compact the database (VACUUM)
140 |
141 |
--------------------------------------------------------------------------------
/api/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Xaxdus'
2 |
--------------------------------------------------------------------------------
/api/apiServer.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 | '''
3 | Recognized query keywords: count, types, protocol, country, area
4 | '''
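# Illustrative query combining these keywords (example values only):
#   GET /?types=0&protocol=0&count=10&country=中国&area=福建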
5 | import urllib
6 | from config import API_PORT
7 | from db.SQLiteHelper import SqliteHelper
8 |
9 | __author__ = 'Xaxdus'
10 |
11 | import BaseHTTPServer
12 | import json
13 | import urlparse
14 | import logging
15 | logger = logging.getLogger('api')
16 |
17 | # keylist=['count', 'types','protocol','country','area']
18 | class WebRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
19 |
20 | def do_GET(self):
21 | """
22 | """
23 | dict={}
24 |
25 | parsed_path = urlparse.urlparse(self.path)
26 | try:
27 | query = urllib.unquote(parsed_path.query)
28 | logger.info("query %s" %query)
29 | if query.find('&')!=-1:
30 | params = query.split('&')
31 | for param in params:
32 | dict[param.split('=')[0]]=param.split('=')[1]
33 | else:
34 | dict[query.split('=')[0]]=query.split('=')[1]
35 |
36 | sqlHelper = SqliteHelper()
37 |             # handle a request to delete a proxy
38 | if dict.has_key('delete'):
39 | condition="ip='" + dict['ip'] + "' AND port=" + dict['port']
40 | sqlHelper.delete(SqliteHelper.tableName, condition)
41 | self.send_response(200)
42 | self.end_headers()
43 | self.wfile.write("Success delete proxy: " + dict['ip'] + ":" + dict['port'])
44 | else:
45 | str_count=''
46 | conditions=[]
47 | for key in dict:
48 | if key =='count':
49 | str_count = 'LIMIT 0,%s'% dict[key]
50 | if key =='country' or key =='area':
51 | conditions .append(key+" LIKE '"+dict[key]+"%'")
52 | elif key =='types' or key =='protocol' or key =='country' or key =='area':
53 | conditions .append(key+"="+dict[key])
54 | if len(conditions)>1:
55 | conditions = ' AND '.join(conditions)
56 | else:
57 |                     conditions = conditions[0] if conditions else '1=1'  # no filter params given: match all rows
58 | result = sqlHelper.select(sqlHelper.tableName,conditions,str_count)
59 | # print type(result)
60 | # for r in result:
61 | # print r
62 | data = [{'ip':item[0], 'port': item[1]} for item in result]
63 | data = json.dumps(data)
64 | self.send_response(200)
65 | self.end_headers()
66 | self.wfile.write(data)
67 | except Exception,e:
68 | logger.warning(str(e))
69 | self.send_response(404)
70 |
71 | if __name__=='__main__':
72 | server = BaseHTTPServer.HTTPServer(('0.0.0.0',API_PORT), WebRequestHandler)
73 | server.serve_forever()
74 |
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 | '''
3 | Parser rule definitions. urls: a list of URLs to crawl
4 | type: parse method, one of regular (regular expression), xpath (XPath), or module (custom third-party parser module)
5 | pattern: a regular expression or an XPath expression, matching the chosen type
6 | '''
7 | import random
8 |
9 | '''
10 | Proxy fields: ip, port, type (0 elite/high-anonymity, 1 transparent), protocol (0 http, 1 https), country, area (province/city), updatetime (last update time)
11 | speed (connection speed)
12 | '''
13 | parserList = [
14 | {
15 | 'urls': ['http://m.66ip.cn/%s.html'% n for n in ['index']+range(2,12)],
16 | 'type':'xpath',
17 | 'pattern': ".//*[@class='profit-c']/table/tr[position()>1]",
18 | 'postion':{'ip':'./td[1]','port':'./td[2]','type':'./td[4]','protocol':''}
19 | },
20 | {
21 | 'urls': ['http://m.66ip.cn/areaindex_%s/%s.html'%(m,n) for m in range(1,35) for n in range(1,10)],
22 | 'type':'xpath',
23 | 'pattern': ".//*[@id='footer']/div/table/tr[position()>1]",
24 | 'postion':{'ip':'./td[1]','port':'./td[2]','type':'./td[4]','protocol':''}
25 | },
26 | {
27 | 'urls': ['http://www.kuaidaili.com/proxylist/%s/'% n for n in range(1,11)],
28 | 'type': 'xpath',
29 | 'pattern': ".//*[@id='index_free_list']/table/tbody/tr[position()>0]",
30 | 'postion':{'ip':'./td[1]','port':'./td[2]','type':'./td[3]','protocol':'./td[4]'}
31 | },
32 | {
33 | 'urls': ['http://www.kuaidaili.com/free/%s/%s/'% (m,n) for m in ['inha', 'intr', 'outha', 'outtr'] for n in range(1,11)],
34 | 'type':'xpath',
35 | 'pattern': ".//*[@id='list']/table/tbody/tr[position()>0]",
36 | 'postion':{'ip':'./td[1]','port':'./td[2]','type':'./td[3]','protocol':'./td[4]'}
37 | },
38 | {
39 | 'urls': ['http://www.cz88.net/proxy/%s'% m for m in ['index.shtml']+['http_%s.shtml' % n for n in range(2, 11)]],
40 | 'type':'xpath',
41 | 'pattern':".//*[@id='boxright']/div/ul/li[position()>1]",
42 | 'postion':{'ip':'./div[1]','port':'./div[2]','type':'./div[3]','protocol':''}
43 |
44 | },
45 | {
46 | 'urls': ['http://www.ip181.com/daili/%s.html'% n for n in range(1, 11)],
47 | 'type':'xpath',
48 | 'pattern': ".//div[@class='row']/div[3]/table/tbody/tr[position()>1]",
49 | 'postion':{'ip':'./td[1]','port':'./td[2]','type':'./td[3]','protocol':'./td[4]'}
50 |
51 | },
52 | {
53 | 'urls': ['http://www.xicidaili.com/%s/%s'%(m,n) for m in ['nn', 'nt', 'wn', 'wt'] for n in range(1, 8) ],
54 | 'type':'xpath',
55 | 'pattern': ".//*[@id='ip_list']/tr[position()>1]",
56 | 'postion':{'ip':'./td[2]','port':'./td[3]','type':'./td[5]','protocol':'./td[6]'}
57 | },
58 | {
59 | 'urls':['http://www.cnproxy.com/proxy%s.html'% i for i in range(1,11)],
60 | 'type':'module',
61 | 'moduleName':'CnproxyPraser',
62 |         'pattern':r'<tr><td>(\d+\.\d+\.\d+\.\d+)<SCRIPT type=text/javascript>document.write\(":"\+(.+)\)</SCRIPT></td><td>(HTTP|SOCKS4)\s*',
63 | 'postion':{'ip':0,'port':1,'type':-1,'protocol':2}
64 | }
65 | ]
66 | '''
67 | Database configuration
68 | '''
69 | DB_CONFIG={
70 | 'dbType':'sqlite',#sqlite,mysql,mongodb
71 |     'dbPath':'./data/proxy.db',#only used when dbType is sqlite
72 |     'dbUser':'',#user name
73 |     'dbPass':'',#password
74 |     'dbName':''#database name
75 |
76 | }
77 |
78 | CHINA_AREA=[u'河北',u'山东',u'辽宁',u'黑龙江',u'吉林'
79 | ,u'甘肃',u'青海',u'河南',u'江苏',u'湖北',u'湖南',
80 | u'江西',u'浙江',u'广东',u'云南',u'福建',
81 | u'台湾',u'海南',u'山西',u'四川',u'陕西',
82 | u'贵州',u'安徽',u'重庆',u'北京',u'上海',u'天津',u'广西',u'内蒙',u'西藏',u'新疆',u'宁夏',u'香港',u'澳门']
83 | QQWRY_PATH="./data/qqwry.dat"
84 |
85 | THREADNUM = 20
86 | API_PORT=8000
87 | '''
88 | Settings for crawling and validating proxy IPs.
89 | There is no need to check whether an IP already exists, because stale entries are cleaned up periodically.
90 | '''
91 | UPDATE_TIME=20*60#re-validate the stored proxy IPs every 20 minutes (value in seconds)
92 | MINNUM = 50 #when the number of valid IPs drops below this value, the crawler is started
93 | MAXTIME = 3*24*60 #maximum age (in minutes) a stored proxy may reach before it is deleted
94 |
95 | TIMEOUT = 5#socket timeout in seconds
96 |
97 |
98 |
99 | '''
100 | Settings for coping with anti-crawler measures on the target sites
101 | '''
102 | '''
103 | Number of download retries
104 | '''
105 | RETRY_TIME=3
106 |
107 |
108 | '''
109 | USER_AGENTS: pool of User-Agent strings picked from at random
110 | '''
111 | USER_AGENTS = [
112 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
113 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
114 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
115 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
116 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
117 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
118 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
119 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
120 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
121 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
122 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
123 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
124 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
125 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
126 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
127 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
128 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
129 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
130 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
131 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
132 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
133 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
134 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
135 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
136 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
137 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
138 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
139 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
140 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
141 | "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
142 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
143 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
144 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
145 | "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
146 | ]
147 |
148 | HEADER = {
149 | 'User-Agent': random.choice(USER_AGENTS),
150 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
151 | 'Accept-Language': 'en-US,en;q=0.5',
152 | 'Connection': 'keep-alive',
153 | 'Accept-Encoding': 'gzip, deflate',
154 | }
155 |
156 | TEST_URL='http://ip.chinaz.com/getip.aspx'
157 | # # detection keyword once added to verify that the proxy under test can really reach the target site
158 | # TEST_KEY = '站长工具'
159 | TEST_PROXY='http://www.stilllistener.com/checkpoint1/test11/'
--------------------------------------------------------------------------------
/data/proxy.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TimoGroom/IPProxys/8de2efb23db1d96c12b82053519795b5ef7231e8/data/proxy.db
--------------------------------------------------------------------------------
/data/qqwry.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TimoGroom/IPProxys/8de2efb23db1d96c12b82053519795b5ef7231e8/data/qqwry.dat
--------------------------------------------------------------------------------
/db/SQLiteHelper.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 | from config import DB_CONFIG
3 | from db.SqlHelper import SqlHelper
4 |
5 | __author__ = 'Xaxdus'
6 | import sqlite3
7 | class SqliteHelper(SqlHelper):
8 |
9 | tableName='proxys'
10 | def __init__(self):
11 | '''
12 |         Open the database connection
13 | :return:
14 | '''
15 | self.database = sqlite3.connect(DB_CONFIG['dbPath'],check_same_thread=False)
16 | self.cursor = self.database.cursor()
17 |         #create the table schema
18 | self.createTable()
19 | def compress(self):
20 | '''
21 |         Compact the database file (VACUUM)
22 | :return:
23 | '''
24 | self.database.execute('VACUUM')
25 |
26 | def createTable(self):
27 | self.cursor.execute("create TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY ,ip VARCHAR(16) NOT NULL,"
28 | "port INTEGER NOT NULL ,types INTEGER NOT NULL ,protocol INTEGER NOT NULL DEFAULT 0,"
29 | "country VARCHAR (20) NOT NULL,area VARCHAR (20) NOT NULL,updatetime TimeStamp NOT NULL DEFAULT (datetime('now','localtime')) ,speed DECIMAL(3,2) NOT NULL DEFAULT 100)"% self.tableName)
30 |
31 | self.database.commit()
32 |
33 | def select(self,tableName,condition,count):
34 | '''
35 |
36 |         :param tableName: table name
37 |         :param condition: WHERE clause contents, e.g. "country LIKE '中国%' AND types=0"
38 |         :param count: LIMIT clause, e.g. 'LIMIT 0,5' (empty string for no limit)
39 | :return:
40 | '''
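        # Example call as issued by api/apiServer.py:
        #   select('proxys', "country LIKE '中国%' AND types=0", 'LIMIT 0,5')
        #   -> SELECT DISTINCT ip,port FROM proxys WHERE country LIKE '中国%' AND types=0 ORDER BY speed ASC LIMIT 0,5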
41 | command = 'SELECT DISTINCT ip,port FROM %s WHERE %s ORDER BY speed ASC %s '%(tableName,condition,count)
42 |
43 | self.cursor.execute(command)
44 | result = self.cursor.fetchall()
45 | return result
46 |
47 | def selectAll(self):
48 | self.cursor.execute('SELECT DISTINCT ip,port FROM %s ORDER BY speed ASC '%self.tableName)
49 | result = self.cursor.fetchall()
50 | return result
51 |
52 | def selectCount(self):
53 | self.cursor.execute('SELECT COUNT( DISTINCT ip) FROM %s'%self.tableName)
54 | count = self.cursor.fetchone()
55 | return count
56 |
57 | def selectOne(self,tableName,condition,value):
58 | '''
59 |
60 |         :param tableName: table name
61 |         :param condition: WHERE clause containing ? placeholders
62 |         :param value: values bound to the placeholders (mainly to guard against SQL injection)
63 | :return:
64 | '''
65 | self.cursor.execute('SELECT DISTINCT ip,port FROM %s WHERE %s ORDER BY speed ASC'%(tableName,condition),value)
66 | result = self.cursor.fetchone()
67 | return result
68 |
69 | def update(self,tableName,condition,value):
70 | self.cursor.execute('UPDATE %s %s'%(tableName,condition),value)
71 | self.database.commit()
72 |
73 | def delete(self,tableName,condition):
74 | '''
75 |
76 |         :param tableName: table name
77 |         :param condition: WHERE clause contents
78 | :return:
79 | '''
80 | deleCommand = 'DELETE FROM %s WHERE %s'%(tableName,condition)
81 | # print deleCommand
82 | self.cursor.execute(deleCommand)
83 | self.commit()
84 |
85 | def commit(self):
86 | self.database.commit()
87 |
88 |
89 | def insert(self,tableName,value):
90 |
91 | proxy = [value['ip'],value['port'],value['type'],value['protocol'],value['country'],value['area'],value['speed']]
92 | # print proxy
93 | self.cursor.execute("INSERT INTO %s (ip,port,types,protocol,country,area,speed)VALUES (?,?,?,?,?,?,?)"% tableName
94 | ,proxy)
95 |
96 |
97 | def batch_insert(self,tableName,values):
98 |
99 | for value in values:
100 | if value!=None:
101 | self.insert(self.tableName,value)
102 | self.database.commit()
103 |
104 |
105 | def close(self):
106 | self.cursor.close()
107 | self.database.close()
108 |
109 |
110 |
111 | if __name__=="__main__":
112 | s = SqliteHelper()
113 | print s.selectCount()[0]
114 | # print s.selectAll()
--------------------------------------------------------------------------------
/db/SqlHelper.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | __author__ = 'Xaxdus'
4 | '''
5 | Base class for SQL operations.
6 | Fields: ip, port, types (0 elite/high-anonymity, 1 transparent), protocol (0 http, 1 https), country, area (province/city), updatetime (last update time)
7 | speed (connection speed)
8 | '''
9 | class SqlHelper(object):
10 |
11 |
12 |
13 | def __init__(self):
14 | pass
15 |
16 | def insert(self, tableName,value):
17 | pass
18 |
19 | def batch_insert(self,values):
20 | pass
21 |
22 | def delete(self, tableName, condition):
23 | pass
24 |
25 | def batch_delete(self, tableName,values):
26 | pass
27 |
28 | def update(self, tableName,condition,value):
29 | pass
30 | def select(self, tableName,condition,count):
31 | pass
32 | def selectOne(self,tableName,condition,value):
33 | pass
34 | def close(self):
35 | pass
36 |
--------------------------------------------------------------------------------
/db/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Xaxdus'
2 |
--------------------------------------------------------------------------------
/logging.conf:
--------------------------------------------------------------------------------
1 | [loggers]
2 | keys=root,api,util,data,db,spider,validator,download
3 |
4 | [logger_root]
5 | level=INFO
6 | handlers=screen
7 |
8 | [logger_api]
9 | level=INFO
10 | handlers=screen
11 | qualname=api
12 | propagate=0
13 |
14 | [logger_util]
15 | level=INFO
16 | handlers=screen
17 | qualname=util
18 | propagate=0
19 |
20 | [logger_download]
21 | level=INFO
22 | handlers=screen
23 | qualname=download
24 | propagate=0
25 |
26 | [logger_data]
27 | level=DEBUG
28 | handlers=screen
29 | qualname=data
30 | propagate=0
31 |
32 | [logger_db]
33 | level=DEBUG
34 | handlers=screen
35 | qualname=db
36 | propagate=0
37 |
38 | [logger_spider]
39 | level=INFO
40 | handlers=screen
41 | qualname=spider
42 | propagate=0
43 |
44 | [logger_validator]
45 | level=INFO
46 | handlers=screen
47 | qualname=validator
48 | propagate=0
49 |
50 | [handlers]
51 | keys=screen
52 |
53 | [handler_screen]
54 | class=logging.StreamHandler
55 | formatter=pretty
56 | level=DEBUG
57 | args=(sys.stderr, )
58 |
59 | [formatters]
60 | keys=pretty
61 |
62 | [formatter_pretty]
63 | format= %(module)s %(asctime)s %(levelname)s %(lineno)d %(message)s
64 | datefmt= %Y-%m-%d %H:%M:%S
65 | class=logging.Formatter
--------------------------------------------------------------------------------
/qiye2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TimoGroom/IPProxys/8de2efb23db1d96c12b82053519795b5ef7231e8/qiye2.jpg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 |
3 |
--------------------------------------------------------------------------------
/spider/HtmlDownLoader.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | import random
4 | import config
5 | import json
6 | __author__ = 'Xaxdus'
7 |
8 | import requests
9 | import logging
10 | import chardet
11 | logger = logging.getLogger('download')
12 | class Html_Downloader(object):
13 |
14 | @classmethod
15 | def download(self,url):
16 |         count = 0#retry counter
17 | r=''
18 | logger.info("downloading url: %s",url)
19 | try:
20 | r = requests.get(url=url,headers=config.HEADER,timeout=config.TIMEOUT)
21 | r.encoding =chardet.detect(r.content)['encoding']
22 | while count< config.RETRY_TIME:
23 | if (not r.ok) or len(r.content)<500 :
24 | response = requests.get("http://127.0.0.1:%s/?types=0&count=10"%config.API_PORT)
25 | if response.ok:
26 | content = response.text
27 | choose = random.choice(json.loads(content))
28 |                     proxies={"https": "http://%s:%s"%(choose['ip'],choose['port'])}
29 | try:
30 | r = requests.get(url=url,headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies)
31 | r.encoding =chardet.detect(r.content)['encoding']
32 | count += 1
33 | except Exception,e:
34 | count += 1
35 | else:
36 | return None
37 |
38 | else:
39 | return r.text
40 |
41 | return None
42 |
43 |
44 | except Exception,e:
45 | while count< config.RETRY_TIME:
46 | if r==''or (not r.ok) or len(r.content)<500 :
47 | try:
48 | response = requests.get("http://127.0.0.1:%s/?types=0&count=10"%config.API_PORT)
49 | if response.ok:
50 | content = response.text
51 | choose = random.choice(json.loads(content))
52 |                         proxies={"https": "http://%s:%s"%(choose['ip'],choose['port'])}
53 | try:
54 | r = requests.get(url=url,headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies)
55 | r.encoding =chardet.detect(r.content)['encoding']
56 | count += 1
57 | except Exception,e:
58 | count += 1
59 | else:
60 | return None
61 | except Exception,e:
62 | return None
63 |
64 | else:
65 | return r.text
66 |
67 | return None
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
--------------------------------------------------------------------------------
/spider/HtmlPraser.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 | import datetime
3 | from config import QQWRY_PATH, CHINA_AREA
4 |
5 | from util.IPAddress import IPAddresss
6 | import re
7 | import logging
8 | logger = logging.getLogger('spider')
9 |
10 | __author__ = 'Xaxdus'
11 | from lxml import etree
12 | class Html_Parser(object):
13 |
14 | def __init__(self):
15 | self.ips = IPAddresss(QQWRY_PATH)
16 | def parse(self,response,parser):
17 | '''
18 |
19 |         :param response: downloaded page content
20 |         :param parser: parser rule from config.parserList (determines the parse method)
21 | :return:
22 | '''
23 | if parser['type']=='xpath':
24 | return self.XpathPraser(response,parser)
25 | elif parser['type']=='regular':
26 | return self.RegularPraser(response,parser)
27 | elif parser['type']=='module':
28 | return getattr(self,parser['moduleName'],None)(response,parser)
29 | else:
30 | return None
31 |
32 | def AuthCountry(self,addr):
33 | '''
34 |         Determine whether the address lies in China (it matches an entry of CHINA_AREA).
35 | :param addr:
36 | :return:
37 | '''
38 | for area in CHINA_AREA:
39 | if addr.find(area)!=-1:
40 | return True
41 | return False
42 |
43 |
44 |
45 | def XpathPraser(self,response,parser):
46 | '''
47 |         Parse the page using XPath rules.
48 | :param response:
49 | :param parser:
50 | :return:
51 | '''
52 | # print response
53 | proxylist=[]
54 | root = etree.HTML(response)
55 | proxys = root.xpath(parser['pattern'])
56 | # print proxys
57 | for proxy in proxys:
58 | # print parser['postion']['ip']
59 | try:
60 | ip = proxy.xpath(parser['postion']['ip'])[0].text
61 | port = proxy.xpath(parser['postion']['port'])[0].text
62 | type = proxy.xpath(parser['postion']['type'])[0].text
63 | # print ip,port,type
64 | if type.find(u'高匿')!=-1:
65 | type = 0
66 | else:
67 | type = 1
68 | protocol=''
69 | if len(parser['postion']['protocol']) > 0:
70 | protocol = proxy.xpath(parser['postion']['protocol'])[0].text
71 | if protocol.lower().find('https')!=-1:
72 | protocol = 1
73 | else:
74 | protocol = 0
75 | else:
76 | protocol = 0
77 | addr = self.ips.getIpAddr(self.ips.str2ip(ip))
78 | country = ''
79 | area = ''
80 | if addr.find(u'省')!=-1 or self.AuthCountry(addr):
81 | country = u'中国'
82 | area = addr
83 | else:
84 | country = addr
85 | area = ''
86 | except Exception,e:
87 | logger.warning(str(e))
88 | continue
89 | # updatetime = datetime.datetime.now()
90 |             # ip, port, type (0 elite/high-anonymity, 1 transparent), protocol (0 http, 1 https), country, area (province/city), updatetime (last update time)
91 |
92 | # proxy ={'ip':ip,'port':int(port),'type':int(type),'protocol':int(protocol),'country':country,'area':area,'updatetime':updatetime,'speed':100}
93 | proxy ={'ip':ip,'port':int(port),'type':int(type),'protocol':int(protocol),'country':country,'area':area,'speed':100}
94 | logger.info("Fetch proxy %s" %str(proxy))
95 | proxylist.append(proxy)
96 |
97 | return proxylist
98 |
99 | def RegularPraser(self,response,parser):
100 | '''
101 |         Parse the page using a regular expression.
102 | :param response:
103 | :param parser:
104 | :return:
105 | '''
106 | proxylist=[]
107 | pattern = re.compile(parser['pattern'])
108 | matchs = pattern.findall(response)
109 | if matchs !=None:
110 | for match in matchs:
111 | logging.info(str(match))
112 | ip = match[parser['postion']['ip']]
113 | port = match[parser['postion']['port']]
114 |                 #the type these sites report is unreliable, so use the default here; it is re-checked later
115 | type =0
116 | if parser['postion']['protocol'] > 0:
117 | protocol = match[parser['postion']['protocol']]
118 | if protocol.lower().find('https')!=-1:
119 | protocol = 1
120 | else:
121 | protocol = 0
122 | else:
123 | protocol = 0
124 | addr = self.ips.getIpAddr(self.ips.str2ip(ip))
125 | country = ''
126 | area = ''
127 | if addr.find(u'省')!=-1 or self.AuthCountry(addr):
128 | country = u'中国'
129 | area = addr
130 | else:
131 | country = addr
132 | area = ''
133 | proxy ={'ip':ip,'port':port,'type':type,'protocol':protocol,'country':country,'area':area,'speed':100}
134 | logger.info("Fetch proxy %s" % str(proxy))
135 | proxylist.append(proxy)
136 | return proxylist
137 |
138 |
139 | def CnproxyPraser(self,response,parser):
140 | proxylist = self.RegularPraser(response,parser)
141 | chardict ={'v':'3','m':'4','a':'2','l':'9','q':'0','b':'5','i':'7','w':'6','r':'8','c':'1'}
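        # cnproxy.com obfuscates ports as strings of letters; chardict maps each letter back to a digit.
        # For example, a captured string "+r+q" decodes to 80 ('r' -> '8', 'q' -> '0'; '+' separators are skipped).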
142 |
143 | for proxy in proxylist:
144 | port = proxy['port']
145 | new_port = ''
146 | for i in range(len(port)):
147 | if port[i]!='+':
148 | new_port += chardict[port[i]]
149 | new_port = int(new_port)
150 | proxy['port'] =new_port
151 | return proxylist
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
--------------------------------------------------------------------------------
/spider/ProxySpider.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 | from gevent.pool import Pool
3 | import requests
4 | import time
5 | from config import THREADNUM, parserList, MINNUM, UPDATE_TIME
6 | from db.SQLiteHelper import SqliteHelper
7 | from spider.HtmlDownLoader import Html_Downloader
8 | from spider.HtmlPraser import Html_Parser
9 | from validator.Validator import Validator
10 | import logging
11 | logger = logging.getLogger('spider')
12 |
13 | __author__ = 'Xaxdus'
14 | from gevent import monkey
15 | monkey.patch_all()
16 | '''
17 | This class implements the crawler's overall logic.
18 | '''
19 |
20 | class ProxySpider(object):
21 |
22 | def __init__(self):
23 | self.crawl_pool = Pool(THREADNUM)
24 | # self.sqlHelper = sqlHelper
25 |
26 | def run(self):
27 | while True:
28 | logger.info("Start to run spider")
29 | sqlHelper = SqliteHelper()
30 | logger.info('Start to run validator')
31 | validator = Validator(sqlHelper)
32 | count = validator.run_db()
33 | logger.info('Finished to run validator, count=%s'%count)
34 | if count[0]< MINNUM:
35 | proxys = self.crawl_pool.map(self.crawl,parserList)
36 |                 #at this point proxys has the shape [[{},{},{}],[{},{},{}]] (one list per parser rule)
37 | # print proxys
38 |                 #flatten the nested lists before de-duplicating:
39 |
40 | proxys_tmp = []
41 | for proxy in proxys:
42 | proxys_tmp.extend(proxy)
43 |
44 | proxys = proxys_tmp
45 | logger.info('first_proxys: %s'%len(proxys))
46 |                 #now proxys has the flat shape [{},{},{},{},{},{}]
47 | proxys_tmp=None
48 |                 #de-duplicate the proxies:
49 | proxys = [dict(t) for t in set([tuple(proxy.items()) for proxy in proxys])]
50 | logger.info('end_proxy: %s'%len(proxys))
51 | logger.info('spider proxys: %s'%type(proxys))
52 |                 proxys = validator.run_list(proxys)#the validated proxies (failed ones come back as None)
53 |
54 | sqlHelper.batch_insert(sqlHelper.tableName,proxys)
55 |
56 | logger.info('success ip: %s'%sqlHelper.selectCount())
57 | sqlHelper.close()
58 | logger.info('Finished to run spider')
59 | time.sleep(UPDATE_TIME)
60 |
61 |
62 | def crawl(self,parser):
63 | proxys = []
64 | html_parser = Html_Parser()
65 | for url in parser['urls']:
66 | response = Html_Downloader.download(url)
67 | if response!=None:
68 | proxylist= html_parser.parse(response,parser)
69 | if proxylist != None:
70 | proxys.extend(proxylist)
71 | return proxys
72 |
73 |
74 | if __name__=="__main__":
75 | spider = ProxySpider()
76 | spider.run()
--------------------------------------------------------------------------------
/spider/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Xaxdus'
2 |
--------------------------------------------------------------------------------
/start.bat:
--------------------------------------------------------------------------------
1 | python IPProxys.py
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Xaxdus'
2 |
--------------------------------------------------------------------------------
/test/test.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 | import requests
3 | import json
4 | r = requests.get('http://127.0.0.1:8000/?types=0&count=5&country=中国')
5 | ip_ports = json.loads(r.text)
6 | print ip_ports
7 | ip = ip_ports[0]['ip']
8 | port = ip_ports[0]['port']
9 | proxies={
10 | 'http':'http://%s:%s'%(ip,port),
11 | 'https':'http://%s:%s'%(ip,port)
12 | }
13 | r = requests.get('http://ip.chinaz.com/',proxies=proxies)
14 | r.encoding='utf-8'
15 | print r.text
16 |
--------------------------------------------------------------------------------
/test/testhttpserver.py:
--------------------------------------------------------------------------------
1 |
2 | #coding:utf-8
3 | import BaseHTTPServer
4 | import json
5 | import urlparse
6 | class WebRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
7 | def do_GET(self):
8 | """
9 | """
10 | print self.path
11 | parsed_path = urlparse.urlparse(self.path)
12 | print parsed_path
13 | print parsed_path.query
14 | # message_parts = [
15 | # 'CLIENT VALUES:',
16 | # 'client_address=%s (%s)' % (self.client_address,
17 | # self.address_string()),
18 | # 'command=%s' % self.command,
19 | # 'path=%s' % self.path,
20 | # 'real path=%s' % parsed_path.path,
21 | # 'query=%s' % parsed_path.query,
22 | # 'request_version=%s' % self.request_version,
23 | # '',
24 | # 'SERVER VALUES:',
25 | # 'server_version=%s' % self.server_version,
26 | # 'sys_version=%s' % self.sys_version,
27 | # 'protocol_version=%s' % self.protocol_version,
28 | # '',
29 | # 'HEADERS RECEIVED:',
30 | # ]
31 | # for name, value in sorted(self.headers.items()):
32 | # message_parts.append('%s=%s' % (name, value.rstrip()))
33 | # message_parts.append('')
34 | # message = '\r\n'.join(message_parts)
35 | data1 = [{'ip':'192.168.0.0','port':456}]*10
36 | d1 = json.dumps(data1,sort_keys=True,indent=4)
37 | message=('192.168.1.1',80)
38 | self.send_response(200)
39 | self.end_headers()
40 | self.wfile.write(d1)
41 |
42 | server = BaseHTTPServer.HTTPServer(('0.0.0.0',8000), WebRequestHandler)
43 | server.serve_forever()
--------------------------------------------------------------------------------
/test/testlist.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 | from decimal import Decimal
3 |
4 | __author__ = 'Xaxdus'
5 |
6 |
7 | # list = ["www.baidu.com/%s" %m for m in ['index']+range(1,5)]
8 | #
9 | # list = [(1,10)]*10
10 | #
11 | # for m,n in list:
12 | # print m,n
13 | #
14 | #
15 | # list2 = ["www.baidu.com/%s/%s"%(i[0],i[1]) for i in list]
16 | # print list2
17 |
18 | # x=Decimal('0.998531571219').quantize(Decimal('0.00'))
19 | # a= 0.998531571219
20 | # value = round(a, 3)
21 | # print x,type(x),value
22 | # proxys=[]
23 | # proxy=[123,1234]
24 | # proxys.append(proxy)
25 | #
26 | # proxy=[123,1234]
27 | # proxys.append(proxy)
28 | #
29 | # print proxys
30 | l = [{'ip':'123.1.1.1','port':80},{'ip':'123.1.1.1','port':80},{'ip':'123.1.2.1','port':80},{'ip':'123.1.1.1','port':81}]
31 |
32 | # for d in l:
33 | # print [tuple(d.items())]
34 | print [tuple(d.items()) for d in l]
35 |
36 | print [dict(t) for t in set([tuple(d.items()) for d in l])]
--------------------------------------------------------------------------------
/test/testlxml.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 | from lxml import etree
3 |
4 | __author__ = 'Xaxdus'
5 |
6 | html='''
7 | <!-- The original file embeds a saved copy of an m.66ip.cn page
8 | ("北京http代理ip_66免费代理ip提取网") as a test fixture here; its HTML markup
9 | was stripped in this dump and is not recoverable. -->
10 | '''
166 |
167 | root = etree.HTML(html)
168 | proxys = root.xpath(".//*[@id='footer']/div/table/tr[position()>1]")
169 |
170 | for proxy in proxys:
171 | print proxy.xpath('./td[1]')[0].text
--------------------------------------------------------------------------------
/util/IPAddress.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 |
6 | import socket
7 | import struct
8 |
9 | import logging
10 | logger = logging.getLogger('util')
11 |
12 | class IPAddresss:
13 | def __init__(self, ipdbFile):
14 | self.ipdb = open(ipdbFile, "rb")
15 | str = self.ipdb.read(8)
16 | (self.firstIndex, self.lastIndex) = struct.unpack('II', str)
17 | self.indexCount = (self.lastIndex - self.firstIndex)/7+1
18 | # print self.getVersion(), u" 纪录总数: %d 条 "%(self.indexCount)
19 |
20 | def getVersion(self):
21 | s = self.getIpAddr(0xffffff00L)
22 | return s
23 |
24 | def getAreaAddr(self, offset=0):
25 | if offset:
26 | self.ipdb.seek(offset)
27 | str = self.ipdb.read(1)
28 | (byte,) = struct.unpack('B', str)
29 | if byte == 0x01 or byte == 0x02:
30 | p = self.getLong3()
31 | if p:
32 | return self.getString(p)
33 | else:
34 | return ""
35 | else:
36 | self.ipdb.seek(-1, 1)
37 | return self.getString(offset)
38 |
39 | def getAddr(self, offset, ip=0):
40 | self.ipdb.seek(offset + 4)
41 | countryAddr = ""
42 | areaAddr = ""
43 | str = self.ipdb.read(1)
44 | (byte,) = struct.unpack('B', str)
45 | if byte == 0x01:
46 | countryOffset = self.getLong3()
47 | self.ipdb.seek(countryOffset)
48 | str = self.ipdb.read(1)
49 | (b,) = struct.unpack('B', str)
50 | if b == 0x02:
51 | countryAddr = self.getString(self.getLong3())
52 | self.ipdb.seek(countryOffset + 4)
53 | else:
54 | countryAddr = self.getString(countryOffset)
55 | areaAddr = self.getAreaAddr()
56 | elif byte == 0x02:
57 | countryAddr = self.getString(self.getLong3())
58 | areaAddr = self.getAreaAddr(offset + 8)
59 | else:
60 | countryAddr = self.getString(offset + 4)
61 | areaAddr = self.getAreaAddr()
62 | return countryAddr + " " + areaAddr
63 |
64 | def dump(self, first , last):
65 | if last > self.indexCount :
66 | last = self.indexCount
67 | for index in range(first, last):
68 | offset = self.firstIndex + index * 7
69 | self.ipdb.seek(offset)
70 | buf = self.ipdb.read(7)
71 | (ip, of1, of2) = struct.unpack("IHB", buf)
72 | address = self.getAddr(of1 + (of2 << 16))
73 |             # convert the GBK-encoded address to UTF-8
74 | address = unicode(address, 'gbk').encode("utf-8")
75 | logger.info("%d %s %s" % (index, self.ip2str(ip), address))
76 |
77 | def setIpRange(self, index):
78 | offset = self.firstIndex + index * 7
79 | self.ipdb.seek(offset)
80 | buf = self.ipdb.read(7)
81 | (self.curStartIp, of1, of2) = struct.unpack("IHB", buf)
82 | self.curEndIpOffset = of1 + (of2 << 16)
83 | self.ipdb.seek(self.curEndIpOffset)
84 | buf = self.ipdb.read(4)
85 | (self.curEndIp,) = struct.unpack("I", buf)
86 |
87 | def getIpAddr(self, ip):
88 | L = 0
89 | R = self.indexCount - 1
90 | while L < R-1:
91 | M = (L + R) / 2
92 | self.setIpRange(M)
93 | if ip == self.curStartIp:
94 | L = M
95 | break
96 | if ip > self.curStartIp:
97 | L = M
98 | else:
99 | R = M
100 | self.setIpRange(L)
101 | # version information, 255.255.255.X, urgy but useful
102 | if ip & 0xffffff00L == 0xffffff00L:
103 | self.setIpRange(R)
104 | if self.curStartIp <= ip <= self.curEndIp:
105 | address = self.getAddr(self.curEndIpOffset)
106 |             # decode the GBK-encoded address
107 | address = unicode(address, 'gbk')
108 | else:
109 | address = u"未找到该IP的地址"
110 | return address
111 |
112 | def getIpRange(self, ip):
113 | self.getIpAddr(ip)
114 | range = self.ip2str(self.curStartIp) + ' - ' \
115 | + self.ip2str(self.curEndIp)
116 | return range
117 |
118 | def getString(self, offset = 0):
119 | if offset :
120 | self.ipdb.seek(offset)
121 | str = ""
122 | ch = self.ipdb.read(1)
123 | (byte,) = struct.unpack('B', ch)
124 | while byte != 0:
125 | str += ch
126 | ch = self.ipdb.read(1)
127 | (byte,) = struct.unpack('B', ch)
128 | return str
129 |
130 | def ip2str(self, ip):
131 | return str(ip >> 24)+'.'+str((ip >> 16) & 0xffL)+'.'+str((ip >> 8) & 0xffL)+'.'+str(ip & 0xffL)
132 |
133 | def str2ip(self, s):
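        # Convert a dotted-quad string into the integer form used by getIpAddr();
        # on a little-endian host, e.g. str2ip('1.2.3.4') == 0x01020304.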
134 | (ip,) = struct.unpack('I', socket.inet_aton(s))
135 | return ((ip >> 24) & 0xffL) | ((ip & 0xffL) << 24) | ((ip >> 8) & 0xff00L) | ((ip & 0xff00L) << 8)
136 |
137 | def getLong3(self, offset=0):
138 | if offset:
139 | self.ipdb.seek(offset)
140 | str = self.ipdb.read(3)
141 | (a, b) = struct.unpack('HB', str)
142 | return (b << 16) + a
143 |
144 |
145 |
146 |
--------------------------------------------------------------------------------
/util/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | __author__ = 'Xaxdus'
4 |
5 |
--------------------------------------------------------------------------------
/util/logger.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 | import logging
3 |
4 | __author__ = 'Xaxdus'
5 |
6 |
7 | logger = logging.getLogger()
8 | def logger_proxy(proxy):
9 | logger.setLevel(logging.INFO)
10 | logger.info(proxy)
11 |
--------------------------------------------------------------------------------
/validator/Validator.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 | import datetime
3 |
4 | from lxml import etree
5 | from gevent.pool import Pool
6 | import requests
7 | import time
8 | from config import TEST_URL
9 | import config
10 | from db.SQLiteHelper import SqliteHelper
11 | import logging
12 | logger = logging.getLogger("validator")
13 |
14 | from gevent import monkey
15 | monkey.patch_all()
16 |
17 |
18 | __author__ = 'Xaxdus'
19 |
20 | class Validator(object):
21 |
22 | def __init__(self,sqlHelper):
23 |
24 | self.detect_pool = Pool(config.THREADNUM)
25 | self.sqlHelper =sqlHelper
26 | self.selfip = self.getMyIP()
27 | self.detect_pool = Pool(config.THREADNUM)
28 |
29 | def run_db(self):
30 | '''
31 |         Validate the proxies already stored in the database.
32 | :return:
33 | '''
34 | try:
35 |             #first delete all entries that are too old
36 | self.deleteOld()
37 |             #then check whether the remaining ips still work
38 | results = self.sqlHelper.selectAll()
39 | self.detect_pool.map(self.detect_db,results)
40 |             #compact the database file
41 | self.sqlHelper.compress()
42 |
43 |             return self.sqlHelper.selectCount()#return the final number of proxies
44 | except Exception,e:
45 | logger.warning(str(e))
46 |             return (0,)  # keep the same tuple shape as selectCount() so the caller can index [0]
47 |
48 |
49 |
50 | def run_list(self,results):
51 | '''
52 |         Validate proxies before they are stored; invalid ones are dropped from the list.
53 | :param results:
54 | :return:
55 | '''
56 | # proxys=[]
57 | # for result in results:
58 | proxys = self.detect_pool.map(self.detect_list,results)
59 |         #at this point proxys has the shape [{},{},{},{},{}] (failed entries are None)
60 | return proxys
61 |
62 | def deleteOld(self):
63 | '''
64 |         Delete entries older than MAXTIME minutes.
65 | :return:
66 | '''
67 | condition = "updatetime<'%s'"%((datetime.datetime.now() - datetime.timedelta(minutes=config.MAXTIME)).strftime('%Y-%m-%d %H:%M:%S'))
68 | self.sqlHelper.delete(SqliteHelper.tableName,condition)
69 |
70 |
71 | def detect_db(self,result):
72 | '''
73 |
74 |         :param result: an (ip, port) row from the database to re-check
75 | :return:
76 | '''
77 | ip = result[0]
78 | port = str(result[1])
79 | proxies={"http": "http://%s:%s"%(ip,port),"https": "http://%s:%s"%(ip,port)}
80 |
81 | start = time.time()
82 | try:
83 | r = requests.get(url=TEST_URL,headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies)
84 |
85 | if not r.ok or r.text.find(ip)==-1:
86 | condition = "ip='"+ip+"' AND "+'port='+port
87 | logger.info('failed %s:%s'%(ip,port))
88 | self.sqlHelper.delete(SqliteHelper.tableName,condition)
89 | else:
90 | logger.info(r.text)
91 | speed = round(time.time()-start, 2)
92 | self.sqlHelper.update(SqliteHelper.tableName,'SET speed=? WHERE ip=? AND port=?',(speed,ip,port))
93 | logger.info('success %s:%s, speed=%s'%(ip,port,speed))
94 | except Exception,e:
95 | condition = "ip='"+ip+"' AND "+'port='+port
96 | logger.info('failed %s:%s'%(ip,port))
97 | self.sqlHelper.delete(SqliteHelper.tableName,condition)
98 |
99 |
100 |
101 | def detect_list(self,proxy):
102 | '''
103 |         :param proxy: proxy dict (ip, port, ...)
104 | :return:
105 | '''
106 | # for proxy in proxys:
107 |
108 | ip = proxy['ip']
109 | port = proxy['port']
110 | proxies={"http": "http://%s:%s"%(ip,port),"https": "http://%s:%s"%(ip,port)}
111 | proxyType = self.checkProxyType(proxies)
112 | if proxyType==3:
113 | logger.info('failed %s:%s'%(ip,port))
114 |
115 | proxy = None
116 | return proxy
117 | else:
118 | proxy['type']=proxyType
119 | start = time.time()
120 | try:
121 | r = requests.get(url=TEST_URL,headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies)
122 |
123 | if not r.ok or r.text.find(ip)==-1:
124 | logger.info('failed %s:%s'%(ip,port))
125 | proxy = None
126 | else:
127 | speed = round(time.time()-start,2)
128 | logger.info('success %s:%s, speed=%s'%(ip,port,speed))
129 | proxy['speed']=speed
130 | # return proxy
131 | except Exception,e:
132 | logger.info('failed %s:%s'%(ip,port))
133 | proxy = None
134 | return proxy
135 | # return proxys
136 |
137 | def checkProxyType(self,proxies):
138 | '''
139 |         Detect the proxy type ourselves, because the information published by the free proxy sites is unreliable.
140 |         :param proxies: the proxy to test; returns 0 elite (high-anonymity), 1 anonymous, 2 transparent, 3 invalid proxy
141 | :return:
142 | '''
143 |
144 | try:
145 |
146 | r = requests.get(url=config.TEST_PROXY,headers=config.HEADER,timeout=config.TIMEOUT,proxies=proxies)
147 | if r.ok:
148 | root = etree.HTML(r.text)
149 | ip = root.xpath('.//center[2]/table/tr[3]/td[2]')[0].text
150 | http_x_forwared_for = root.xpath('.//center[2]/table/tr[8]/td[2]')[0].text
151 | http_via = root.xpath('.//center[2]/table/tr[9]/td[2]')[0].text
152 | # print ip,http_x_forwared_for,http_via,type(http_via),type(http_x_forwared_for)
153 | if ip==self.selfip:
154 | return 3
155 | if http_x_forwared_for is None and http_via is None:
156 | return 0
157 | if http_via != None and http_x_forwared_for.find(self.selfip)== -1:
158 | return 1
159 |
160 | if http_via != None and http_x_forwared_for.find(self.selfip)!= -1:
161 | return 2
162 | return 3
163 |
164 |
165 |
166 | except Exception,e:
167 | logger.warning(str(e))
168 | return 3
169 |
170 |
171 |
172 | def getMyIP(self):
173 | try:
174 | r = requests.get(url=config.TEST_PROXY,headers=config.HEADER,timeout=config.TIMEOUT)
175 | # print r.text
176 | root = etree.HTML(r.text)
177 | ip = root.xpath('.//center[2]/table/tr[3]/td[2]')[0].text
178 |
179 | logger.info('ip %s' %ip)
180 | return ip
181 | except Exception,e:
182 | logger.info(str(e))
183 | return None
184 |
185 | if __name__=='__main__':
186 | v = Validator(None)
187 | v.getMyIP()
188 | v.selfip
189 | # results=[{'ip':'192.168.1.1','port':80}]*10
190 | # results = v.run(results)
191 | # print results
192 | pass
193 |
--------------------------------------------------------------------------------
/validator/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'Xaxdus'
2 |
--------------------------------------------------------------------------------