├── .gitignore
├── README.md
├── proxy.db
├── proxy.cfg
└── main.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
/.idea
*.pyc

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# getProxy
Fetch proxy IPs from sources inside and outside China, refresh the pool periodically, and drop the entries that no longer work.

--------------------------------------------------------------------------------
/proxy.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rockyzsu/getProxy/master/proxy.db

--------------------------------------------------------------------------------
/proxy.cfg:
--------------------------------------------------------------------------------
{'https': 'https://221.4.133.67:53281'}
{'https': 'https://125.67.245.3:9728'}
{'https': 'https://219.130.39.55:53281'}
{'https': 'https://116.23.138.33:9999'}
{'https': 'https://114.235.30.178:808'}
{'https': 'https://119.41.193.65:53281'}
{'https': 'https://180.137.232.81:53281'}
{'https': 'https://219.130.39.55:53281'}
{'https': 'https://123.121.85.51:9000'}
{'https': 'https://119.41.193.65:53281'}
{'https': 'https://61.160.208.222:8080'}
{'https': 'https://125.67.245.3:9728'}
{'https': 'https://180.137.232.81:53281'}
{'https': 'https://223.151.83.221:53281'}
{'https': 'https://180.137.232.81:53281'}
{'https': 'https://180.137.232.81:53281'}
{'https': 'https://112.85.73.86:9131'}
{'https': 'https://223.151.83.221:53281'}
{'https': 'https://180.137.232.81:53281'}
{'https': 'https://125.67.245.3:9728'}
{'https': 'https://61.181.10.174:9999'}
{'https': 'https://175.4.37.140:808'}
{'https': 'https://183.66.64.120:3128'}
{'https': 'https://113.128.90.171:48888'}
{'https': 'https://61.160.208.222:8080'}
{'https': 'https://113.128.91.84:48888'}
{'https': 'https://120.76.55.49:8088'}
{'https': 'https://119.39.68.200:808'}
{'https': 'https://113.128.91.122:48888'}
{'https': 'https://113.128.90.81:48888'}
{'https': 'https://42.202.130.246:3128'}
{'https': 'https://222.196.33.254:3128'}
{'https': 'https://139.224.24.26:8888'}
{'https': 'https://124.237.83.14:53281'}
{'https': 'https://14.221.165.65:808'}
{'https': 'https://113.128.91.158:48888'}

{'http': 'http://121.101.129.33:3128'}
{'http': 'http://36.66.76.181:3128'}
{'http': 'http://202.79.52.8:53281'}
{'http': 'http://181.40.115.186:3128'}
{'http': 'http://200.192.214.138:8080'}
{'http': 'http://181.40.115.186:3128'}
{'http': 'http://94.114.149.236:3128'}
{'http': 'http://85.159.2.171:8080'}
--------------------------------------------------------------------------------
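proxy.cfg stores one Python dict literal per line, already in the shape that requests expects for its proxies argument; the validation() method in main.py reloads these lines with eval(). A rough sketch of how a consumer could reuse the file, assuming that one-dict-per-line format and using the safer ast.literal_eval() instead of eval() (the helper names load_proxies and fetch_through_any are invented for this example):

# -*- coding: utf-8 -*-
# Sketch: load proxies from proxy.cfg and try them against a target URL.
# Assumes one dict literal per line, e.g. {'https': 'https://1.2.3.4:8080'}.
import ast
import requests

def load_proxies(path='proxy.cfg'):
    proxies = []
    with open(path) as fp:
        for line in fp:
            line = line.strip()
            if not line:
                continue  # skip blank separator lines
            proxies.append(ast.literal_eval(line))  # safer than eval()
    return proxies

def fetch_through_any(url, proxy_list):
    # Try each proxy in turn and return the first successful response.
    for proxy in proxy_list:
        try:
            r = requests.get(url, proxies=proxy, timeout=10)
            if r.status_code == 200:
                return r
        except requests.RequestException:
            continue  # dead proxy, move on
    return None

if __name__ == '__main__':
    resp = fetch_through_any('http://members.3322.org/dyndns/getip', load_proxies())
    if resp is not None:
        print resp.text
    else:
        print 'no working proxy'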
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
__author__ = 'Rocky'
import re
import datetime
import sqlite3
import time
import urllib2
import requests
from lxml import etree


class getProxy():

    def __init__(self):
        self.user_agent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
        self.header = {"User-Agent": self.user_agent}
        self.dbname = "proxy.db"
        self.now = time.strftime("%Y-%m-%d")

    def getContent(self, num):
        # domestic (China) high-anonymity proxy list
        nn_url = "http://www.xicidaili.com/nn/" + str(num)
        req = urllib2.Request(nn_url, headers=self.header)
        resp = urllib2.urlopen(req, timeout=10)
        content = resp.read()
        et = etree.HTML(content)
        result_even = et.xpath('//tr[@class=""]')
        result_odd = et.xpath('//tr[@class="odd"]')
        # The table alternates between two row classes, so the simplest lxml approach is to
        # fetch the even and odd rows separately. Scraping them with a single expression kept
        # producing mismatched results, probably because the site tweaks its markup regularly
        # to deter crawlers; grabbing the two classes separately still yields the IP and port
        # no matter how the page changes.
        for i in result_even:
            t1 = i.xpath("./td/text()")[:2]
            if self.isAlive(t1[0], t1[1]):
                proxies = {'https': 'https://' + t1[0] + ':' + t1[1]}
                self.check_Proxy_IP(proxies)
                # self.insert_db(self.now, t1[0], t1[1])
        for i in result_odd:
            t2 = i.xpath("./td/text()")[:2]
            if self.isAlive(t2[0], t2[1]):
                proxies = {'https': 'https://' + t2[0] + ':' + t2[1]}
                self.check_Proxy_IP(proxies)
                # self.insert_db(self.now, t2[0], t2[1])

    def insert_db(self, date, ip, port):
        dbname = self.dbname
        try:
            conn = sqlite3.connect(dbname)
        except sqlite3.Error:
            print "Failed to open database %s" % dbname
            return
        create_tb = '''
        CREATE TABLE IF NOT EXISTS PROXY
        (DATE TEXT,
        IP TEXT,
        PORT TEXT
        );
        '''
        conn.execute(create_tb)
        insert_db_cmd = '''
        INSERT INTO PROXY (DATE,IP,PORT) VALUES ('%s','%s','%s');
        ''' % (date, ip, port)
        conn.execute(insert_db_cmd)
        conn.commit()
        conn.close()

    def loop(self, page=5):
        for i in range(1, page):
            self.getContent(i)

    # Check whether a scraped proxy still responds
    def isAlive(self, ip, port):
        proxy = {'http': ip + ':' + port}

        # Installing the proxy globally through urllib2 also works, but it affects every
        # request in the process, so it is not recommended:
        '''
        proxy_support = urllib2.ProxyHandler(proxy)
        opener = urllib2.build_opener(proxy_support)
        urllib2.install_opener(opener)
        # verify the proxy by fetching qq.com through it
        test_url = "http://www.qq.com"
        req = urllib2.Request(test_url, headers=self.header)
        try:
            # timeout is 10 seconds; lower it if you cannot tolerate that much proxy latency
            resp = urllib2.urlopen(req, timeout=10)
            if resp.code == 200:
                print proxy
                return True
            else:
                return False
        except:
            return False
        '''
        testUrl = 'http://members.3322.org/dyndns/getip'
        try:
            r = requests.get(url=testUrl, headers=self.header, proxies=proxy, timeout=10)
        except requests.RequestException:
            return False
        code = r.status_code
        print r.text
        print code
        if code == 200:
            print "Proxy", proxy, 'works'
            return True
        else:
            return False

    # Re-check the proxies stored in the database and delete the records that no longer work
    def check_db_pool(self):
        conn = sqlite3.connect(self.dbname)
        query_cmd = '''
        select IP,PORT from PROXY;
        '''
        rows = conn.execute(query_cmd).fetchall()
        for row in rows:
            if not self.isAlive(row[0], row[1]):
                # the proxy is dead, remove it from the database
                delete_cmd = '''
                delete from PROXY where IP='%s'
                ''' % row[0]
                print "delete IP %s in db" % row[0]
                conn.execute(delete_cmd)
                conn.commit()

        conn.close()
    def getHTTPS(self):
        for i in range(1, 5):
            url = 'http://www.xicidaili.com/wn/%s' % i
            s = requests.get(url, headers=self.header)
            print s
            content = s.text
            et = etree.HTML(content)
            result_even = et.xpath('//tr[@class=""]')
            result_odd = et.xpath('//tr[@class="odd"]')
            # same even/odd row split as in getContent, so the scrape keeps working when the markup changes
            for row in result_even:
                t1 = row.xpath("./td/text()")[:2]
                if self.isAlive(t1[0], t1[1]):
                    proxies = {'https': 'https://' + t1[0] + ':' + t1[1]}
                    self.check_Proxy_IP(proxies)
                    # self.insert_db(self.now, t1[0], t1[1])
            for row in result_odd:
                t2 = row.xpath("./td/text()")[:2]
                if self.isAlive(t2[0], t2[1]):
                    proxies = {'https': 'https://' + t2[0] + ':' + t2[1]}
                    self.check_Proxy_IP(proxies)
                    # self.insert_db(self.now, t2[0], t2[1])

            print "*" * 10

    def getFrom_89vip(self):
        url = 'http://www.89ip.cn/tiqv.php?sxb=&tqsl=30&ports=&ktip=&xl=on&submit=%CC%E1++%C8%A1'
        s = requests.get(url, headers=self.header)
        # pull the ip:port entries out of the returned page
        res = re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', s.text)
        for i in res:
            proxies = {'http': 'http://' + str(i)}
            self.check_Proxy_IP(proxies)

    def apiDemo(self):
        url = 'http://api.xicidaili.com'
        s = requests.get(url, headers=self.header)
        print s.status_code

    # Re-test every entry in proxy.cfg and rewrite the file with only the working, de-duplicated ones
    def validation(self):
        fp = open('proxy.cfg', 'r')
        lines = fp.readlines()
        fp.close()
        new_lines = []
        for i in lines:
            if not i.strip():
                continue
            x = eval(i.strip())
            print x
            try:
                s = requests.get(url='https://guyuan.anjuke.com/community/p1/', headers=self.header,
                                 proxies=x, timeout=10)
            except requests.RequestException:
                continue
            print s.status_code
            if s.status_code == 200:
                new_lines.append(i)

        new_lines = list(set(new_lines))
        with open('proxy.cfg', 'w') as fp:
            for i in new_lines:
                fp.write(i)

    # Probe a proxy against an IP-echo service and append it to proxy.cfg if it works
    def check_Proxy_IP(self, proxies):
        try:
            s = requests.get(url='http://members.3322.org/dyndns/getip', headers=self.header,
                             proxies=proxies, timeout=10)
            print s.text
            print s.status_code
            if s.status_code == 200:
                print proxies
                print 'work'
                with open('proxy.cfg', 'a') as fp:
                    fp.write(str(proxies))
                    fp.write('\n')
        except Exception, e:
            print e


if __name__ == "__main__":
    now = datetime.datetime.now()
    print "Start at %s" % now

    obj = getProxy()
    obj.getFrom_89vip()
    # obj.getHTTPS()
    # obj.validation()
    # obj.apiDemo()
    # obj.loop(5)
    # obj.check_db_pool()
--------------------------------------------------------------------------------
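The README promises a periodically refreshed pool, while main.py only runs getFrom_89vip() once from its __main__ block. A minimal sketch of one way to wire the existing methods into that cycle (the 30-minute interval and the choice of getFrom_89vip() plus validation() are assumptions, not something the repo fixes):

# -*- coding: utf-8 -*-
# Sketch: keep proxy.cfg fresh by re-scraping and re-validating on a fixed interval.
# REFRESH_SECONDS and the chosen methods are assumptions; adjust to taste.
import time
from main import getProxy

REFRESH_SECONDS = 30 * 60  # re-check every 30 minutes


def refresh_forever():
    obj = getProxy()
    while True:
        obj.getFrom_89vip()   # pull fresh candidates and append the working ones to proxy.cfg
        obj.validation()      # re-test proxy.cfg and drop the entries that stopped responding
        time.sleep(REFRESH_SECONDS)


if __name__ == '__main__':
    refresh_forever()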