├── .gitignore
├── README.md
├── proxy.db
├── proxy.cfg
└── main.py
/.gitignore:
--------------------------------------------------------------------------------
1 | /.idea
2 | *.pyc
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # getProxy
2 | Fetch proxy IPs from inside and outside China, refresh them periodically, and drop the ones that have stopped working.
3 |
--------------------------------------------------------------------------------
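A note on the "refresh periodically" behaviour described above: nothing in the repo actually schedules the refresh; main.py runs once and exits. Below is a minimal sketch of one way to drive it on a timer, assuming a small driver file placed next to main.py (the six-hour interval is an arbitrary choice, not something the project specifies):

    # refresh_loop.py -- hypothetical driver, not part of the repo
    import time
    from main import getProxy

    if __name__ == '__main__':
        obj = getProxy()
        while True:
            obj.loop(5)          # crawl a few listing pages and test each proxy
            obj.check_db_pool()  # drop database records whose proxies have died
            time.sleep(6 * 60 * 60)

Both loop() and check_db_pool() exist on the getProxy class in main.py below; only the scheduling wrapper here is new.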
/proxy.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rockyzsu/getProxy/master/proxy.db
--------------------------------------------------------------------------------
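proxy.db is the SQLite file maintained by insert_db() and check_db_pool() in main.py; the link above points at the committed binary. A quick way to peek at its contents, assuming only the PROXY table that insert_db() creates (DATE, IP, PORT columns):

    # inspect_db.py -- hypothetical helper, not part of the repo
    import sqlite3

    conn = sqlite3.connect('proxy.db')
    for date, ip, port in conn.execute('SELECT DATE, IP, PORT FROM PROXY'):
        print '%s  %s:%s' % (date, ip, port)
    conn.close()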
/proxy.cfg:
--------------------------------------------------------------------------------
1 | {'https': 'https://221.4.133.67:53281'}
2 | {'https': 'https://125.67.245.3:9728'}
3 | {'https': 'https://219.130.39.55:53281'}
4 | {'https': 'https://116.23.138.33:9999'}
5 | {'https': 'https://114.235.30.178:808'}
6 | {'https': 'https://119.41.193.65:53281'}
7 | {'https': 'https://180.137.232.81:53281'}
8 | {'https': 'https://219.130.39.55:53281'}
9 | {'https': 'https://123.121.85.51:9000'}
10 | {'https': 'https://119.41.193.65:53281'}
11 | {'https': 'https://61.160.208.222:8080'}
12 | {'https': 'https://125.67.245.3:9728'}
13 | {'https': 'https://180.137.232.81:53281'}
14 | {'https': 'https://223.151.83.221:53281'}
15 | {'https': 'https://180.137.232.81:53281'}
16 | {'https': 'https://180.137.232.81:53281'}
17 | {'https': 'https://112.85.73.86:9131'}
18 | {'https': 'https://223.151.83.221:53281'}
19 | {'https': 'https://180.137.232.81:53281'}
20 | {'https': 'https://125.67.245.3:9728'}
21 | {'https': 'https://61.181.10.174:9999'}
22 | {'https': 'https://175.4.37.140:808'}
23 | {'https': 'https://183.66.64.120:3128'}
24 | {'https': 'https://113.128.90.171:48888'}
25 | {'https': 'https://61.160.208.222:8080'}
26 | {'https': 'https://113.128.91.84:48888'}
27 | {'https': 'https://120.76.55.49:8088'}
28 | {'https': 'https://119.39.68.200:808'}
29 | {'https': 'https://113.128.91.122:48888'}
30 | {'https': 'https://113.128.90.81:48888'}
31 | {'https': 'https://42.202.130.246:3128'}
32 | {'https': 'https://222.196.33.254:3128'}
33 | {'https': 'https://139.224.24.26:8888'}
34 | {'https': 'https://124.237.83.14:53281'}
35 | {'https': 'https://14.221.165.65:808'}
36 | {'https': 'https://113.128.91.158:48888'}
37 |
38 |
39 | {'http': 'http://121.101.129.33:3128'}
40 | {'http': 'http://36.66.76.181:3128'}
41 | {'http': 'http://202.79.52.8:53281'}
42 | {'http': 'http://181.40.115.186:3128'}
43 | {'http': 'http://200.192.214.138:8080'}
44 | {'http': 'http://181.40.115.186:3128'}
45 | {'http': 'http://94.114.149.236:3128'}
46 | {'http': 'http://85.159.2.171:8080'}
47 |
--------------------------------------------------------------------------------
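Each non-blank line of proxy.cfg is a Python dict literal in exactly the shape requests expects for its proxies argument; check_Proxy_IP() in main.py appends them and validation() reads them back with eval(). A small sketch of reading the file a little more defensively with ast.literal_eval (my substitution, not what the repo uses), skipping the blank separator lines:

    # load_proxies.py -- hypothetical reader, not part of the repo
    import ast
    import requests

    pool = []
    with open('proxy.cfg') as fp:
        for line in fp:
            line = line.strip()
            if not line:
                continue                        # the file contains blank separator lines
            pool.append(ast.literal_eval(line))  # e.g. {'https': 'https://221.4.133.67:53281'}

    # try the first entry against the same test URL main.py uses
    if pool:
        r = requests.get('http://members.3322.org/dyndns/getip',
                         proxies=pool[0], timeout=10)
        print r.status_code, r.text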
/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding=utf-8 -*-
2 | __author__ = 'Rocky'
3 | import re
4 | import requests
5 | from lxml import etree
6 | import urllib2, time, datetime
7 | import sqlite3
8 | 
9 |
10 | class getProxy():
11 |
12 | def __init__(self):
13 | self.user_agent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
14 | self.header = {"User-Agent": self.user_agent}
15 | self.dbname="proxy.db"
16 | self.now = time.strftime("%Y-%m-%d")
17 |
18 | def getContent(self, num):
19 | nn_url = "http://www.xicidaili.com/nn/" + str(num)
20 |         #domestic high-anonymity proxy listing
21 | req = urllib2.Request(nn_url, headers=self.header)
22 | resp = urllib2.urlopen(req, timeout=10)
23 | content = resp.read()
24 | et = etree.HTML(content)
25 | result_even = et.xpath('//tr[@class=""]')
26 | result_odd = et.xpath('//tr[@class="odd"]')
27 |         #The page source splits rows into odd and even classes, so the easiest lxml approach is to grab the two groups separately.
28 |         #Grabbing everything with a single xpath gave mismatched results at first; the site probably changes its markup regularly to shake off crawlers.
29 |         #With the approach above, the ip and port can still be grabbed no matter how the page changes.
30 | for i in result_even:
31 | t1 = i.xpath("./td/text()")[:2]
32 | #print "IP:%s\tPort:%s" % (t1[0], t1[1])
33 | if self.isAlive(t1[0], t1[1]):
34 | proxies = {'https': 'https://' + t1[0] + ':' + t1[1]}
35 | self.check_Proxy_IP(proxies)
36 | #pass
37 | #self.insert_db(self.now,t1[0],t1[1])
38 | for i in result_odd:
39 | t2 = i.xpath("./td/text()")[:2]
40 | #print "IP:%s\tPort:%s" % (t2[0], t2[1])
41 | if self.isAlive(t2[0], t2[1]):
42 | #pass
43 | #self.insert_db(self.now,t2[0],t2[1])
44 | proxies = {'https': 'https://' + t2[0] + ':' + t2[1]}
45 | self.check_Proxy_IP(proxies)
46 |
47 | def insert_db(self,date,ip,port):
48 | dbname=self.dbname
49 | try:
50 | conn=sqlite3.connect(dbname)
51 |         except sqlite3.Error:
52 |             print "Error opening database %s" % self.dbname
53 | create_tb='''
54 | CREATE TABLE IF NOT EXISTS PROXY
55 | (DATE TEXT,
56 | IP TEXT,
57 | PORT TEXT
58 | );
59 | '''
60 | conn.execute(create_tb)
61 |         insert_db_cmd='''
62 |         INSERT INTO PROXY (DATE,IP,PORT) VALUES (?,?,?);
63 |         '''
64 |         conn.execute(insert_db_cmd,(date,ip,port))
65 | conn.commit()
66 | conn.close()
67 |
68 | def loop(self,page=5):
69 | for i in range(1,page):
70 | self.getContent(i)
71 |
72 |     #check whether a crawled proxy IP is still usable
73 | def isAlive(self,ip,port):
74 |         proxy={'http':'http://'+ip+':'+port}
75 | #print proxy
76 |
77 |         #installing a global opener this way affects every request in the process; not recommended
78 | '''
79 | proxy_support=urllib2.ProxyHandler(proxy)
80 | opener=urllib2.build_opener(proxy_support)
81 | urllib2.install_opener(opener)
82 |         #use the proxy to visit the Tencent homepage to verify that the proxy still works
83 | test_url="http://www.qq.com"
84 | req=urllib2.Request(test_url,headers=self.header)
85 | try:
86 |             #timeout is set to 10; if you cannot tolerate a proxy latency above 10 seconds, change the timeout value
87 | resp=urllib2.urlopen(req,timeout=10)
88 |
89 | if resp.code==200:
90 | print proxy
91 | #print "work"
92 | return True
93 | else:
94 | #print "not work"
95 | return False
96 | except :
97 | #print "Not work"
98 | return False
99 | '''
100 |         test_url='http://members.3322.org/dyndns/getip'
101 |         try:
102 |             r=requests.get(url=test_url,headers=self.header,proxies=proxy,timeout=10)
103 |         except requests.RequestException:
104 |             return False  # the proxy did not answer, treat it as dead
105 |         print r.text
106 |         print r.status_code
107 |         if r.status_code==200:
108 |             print "Proxy ", proxy,'works'
109 |             return True
110 |         return False
111 |
112 |     #check whether the proxies stored in the database are still valid and delete the records of those that are not
113 | def check_db_pool(self):
114 | conn=sqlite3.connect(self.dbname)
115 | query_cmd='''
116 | select IP,PORT from PROXY;
117 | '''
118 | cursor=conn.execute(query_cmd)
119 | for row in cursor:
120 | if not self.isAlive(row[0],row[1]):
121 |                 #the proxy no longer works, so remove it from the database
122 |                 delete_cmd='''
123 |                 delete from PROXY where IP=?
124 |                 '''
125 |                 print "delete IP %s in db" % row[0]
126 |                 conn.execute(delete_cmd,(row[0],))
127 | conn.commit()
128 |
129 | conn.close()
130 |
131 | def getHTTPS(self):
132 | for i in range(1,5):
133 | url='http://www.xicidaili.com/wn/%s' %i
134 | s=requests.get(url,headers=self.header)
135 | print s
136 | content = s.text
137 | et = etree.HTML(content)
138 | result_even = et.xpath('//tr[@class=""]')
139 | result_odd = et.xpath('//tr[@class="odd"]')
140 |             # The page source splits rows into odd and even classes, so the easiest lxml approach is to grab the two groups separately.
141 |             # Grabbing everything with a single xpath gave mismatched results at first; the site probably changes its markup regularly to shake off crawlers.
142 |             # With the approach above, the ip and port can still be grabbed no matter how the page changes.
143 | for i in result_even:
144 | t1 = i.xpath("./td/text()")[:2]
145 | # print "IP:%s\tPort:%s" % (t1[0], t1[1])
146 | if self.isAlive(t1[0], t1[1]):
147 | # pass
148 | #self.insert_db(self.now, t1[0], t1[1])
149 | proxies={'https':'https://'+t1[0]+':'+t1[1]}
150 | self.check_Proxy_IP(proxies)
151 | for i in result_odd:
152 | t2 = i.xpath("./td/text()")[:2]
153 | # print "IP:%s\tPort:%s" % (t2[0], t2[1])
154 | if self.isAlive(t2[0], t2[1]):
155 | # pass
156 | #self.insert_db(self.now, t2[0], t2[1])
157 | proxies={'https':'https://'+t2[0]+':'+t2[1]}
158 | self.check_Proxy_IP(proxies)
159 |
160 | print "*"*10
161 |
162 |
163 | def getFrom_89vip(self):
164 | url='http://www.89ip.cn/tiqv.php?sxb=&tqsl=30&ports=&ktip=&xl=on&submit=%CC%E1++%C8%A1'
165 | s=requests.get(url,headers=self.header)
166 | #print s.status_code
167 | #print s.text
168 |         res=re.findall(r'\d+\.\d+\.\d+\.\d+:\d+',s.text)  # the page lists one "ip:port" per entry
169 |
170 | for i in res:
171 |
172 | #print i
173 | proxies={'http':'http://'+str(i)}
174 | #print proxies
175 | self.check_Proxy_IP(proxies)
176 |
177 | def apiDemo(self):
178 | url='http://api.xicidaili.com'
179 | s=requests.get(url,headers=self.header)
180 | print s.status_code
181 | #print s.text
182 |
183 | def validation(self):
184 | fp = open('proxy.cfg', 'r')
185 | lines=fp.readlines()
186 | print lines
187 | new_lines=[]
188 |         for i in lines:
189 |             if not i.strip(): continue  # proxy.cfg contains blank separator lines
190 |             x=eval(i.strip())
191 |             print x
192 |             try:
193 |                 s = requests.get(url='https://guyuan.anjuke.com/community/p1/', headers=self.header, proxies=x, timeout=10)
194 |             except requests.RequestException: continue  # the proxy is dead, drop this line
195 |             print s.status_code
196 |             if s.status_code == 200:
197 |                 new_lines.append(i)
198 |         fp.close()
199 |         new_lines=list(set(new_lines))
200 |         with open('proxy.cfg','w') as fp:
201 |             for i in new_lines:
202 |                 fp.write(i)
203 | 
204 |     def check_Proxy_IP(self,proxies):
205 |         #proxies={'http': '180.105.126.75:8118'}
206 |         #proxies={'https': 'https://112.246.37.48:8118',}
207 |         fp=open('proxy.cfg','a')
208 |         #print proxies
209 |         try:
210 |             s=requests.get(url='http://members.3322.org/dyndns/getip',headers=self.header,proxies=proxies,timeout=10)
211 |             #s=requests.get(url='http://members.3322.org/dyndns/getip',headers=self.header,timeout=10)
212 |             print s.text
213 |             print s.status_code
214 |             if s.status_code==200:
215 |                 #print str(proxies)
216 |                 print proxies
217 |                 print 'work'
218 |                 fp.write(str(proxies))
219 |                 fp.write('\n')
220 |         except Exception as e:
221 |             print e
222 |         finally:
223 |             fp.close()  # close the file whether or not the proxy worked
224 | 
225 | 
226 |
227 | if __name__ == "__main__":
228 | now = datetime.datetime.now()
229 | print "Start at %s" % now
230 |
231 | obj=getProxy()
232 | #obj.check_Proxy_IP('')
233 | obj.getFrom_89vip()
234 | #obj.getHTTPS()
235 | #obj.validation()
236 | #obj.apiDemo()
237 | #obj.loop()
238 | #obj.getHTTPS()
239 | #obj.loop(5)
240 | #obj.check_db_pool()
241 | #obj.check_Proxy_IP()
242 |
--------------------------------------------------------------------------------