├── 逻辑图.png
├── Author.py
├── Util.py
├── demo.py
├── README.md
├── ProxiesDataBase.py
├── GetIP.py
└── Config.py

/逻辑图.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZKeeer/IPProxy/HEAD/逻辑图.png
--------------------------------------------------------------------------------
/Author.py:
--------------------------------------------------------------------------------
Nick = "ZKeeer"
Email = "zangker@foxmail.com"
Blog = "http://zkeeer.space"
Github = "https://github.com/ZKeeer"
Zhihu = "https://www.zhihu.com/people/ZKeeer/activities"
--------------------------------------------------------------------------------
/Util.py:
--------------------------------------------------------------------------------
import random

import ProxiesDataBase
import GetIP


def Refresh():
    # Re-verify the stored proxies, then scrape and add fresh ones.
    GetIP.RefreshDB()
    GetIP.GetIP()


def Get():
    # Return one randomly chosen proxy as a requests-style proxies dict,
    # or an empty dict when the database has no entries.
    proxies_dict = {}
    result = ProxiesDataBase.GetItems()
    if result:
        tmp = random.choice(result)
        proxies_dict['http'] = 'http://{}'.format(tmp)
        proxies_dict['https'] = 'https://{}'.format(tmp)
    return proxies_dict
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
import sqlite3

import Config
import ProxiesDataBase
import Util


def main():
    # Initialize the database and table
    ProxiesDataBase.InitDB()
    # Refresh the database and add new entries
    Util.Refresh()
    # Get one proxy to use
    proxies = Util.Get()
    print(proxies)

    # Count the rows now in the database
    conn = sqlite3.connect(Config.DBName)
    cu = conn.cursor()
    print(len(cu.execute("""SELECT * FROM {};""".format(Config.TabelName)).fetchall()))
    cu.close()
    conn.close()


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# IPProxy
Proxy IPs for crawlers: scrapes proxy IPs from eight sites, then verifies, cleans, stores, and refreshes them, and exposes a simple calling interface.
Tested so far only on Win10 64-bit with Python 3.5 and on Ubuntu Server 16.04.1 LTS 64-bit with Python 3.5.
On machines with different specs, adjust the maximum thread count in Config.py; see the Config.py section below for details.

## How to use

See demo.py.

Util.Refresh(): the database is refreshed and new proxies are added only when you call this function yourself.

Util.Get(): returns one usable proxy, for example:
{'http': 'http://115.159.152.130:81', 'https': 'https://115.159.152.130:81'}
The result can be passed straight to requests: requests.get(url, proxies=Util.Get(), headers={}); a fuller sketch follows.
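
A minimal end-to-end sketch (the target URL, empty headers, and timeout below are illustrative assumptions, not part of the project):

```python
import requests

import ProxiesDataBase
import Util

# One-time setup: create the SQLite database and table.
ProxiesDataBase.InitDB()

# Re-verify stored proxies and scrape fresh ones; this can take a while.
Util.Refresh()

# Fetch a page through a randomly chosen proxy.
# "http://example.com/" is a placeholder target.
resp = requests.get("http://example.com/",
                    proxies=Util.Get(),
                    headers={},
                    timeout=10)
print(resp.status_code)
```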

## Config.py

MaxThreads caps the number of worker threads. If your machine is low-spec, set it to 16 or 32 and let it run slowly; if you trust your hardware (a fast i7 or Xeon, plenty of RAM, good bandwidth), you can set it as high as 1024.
If you know other proxy sites, add them to the Url_Regular dict.
Each entry maps a proxy site URL to its regex; the regex must capture IP and port as separate groups, so findall returns pairs like [('192.168.1.1', '80'), ('192.168.1.1', '90'), ...].
Only each site's front page is scraped. To scrape pages beyond the front page, add each page's link and its regex as separate Url_Regular entries (for example, the links for pages 1, 2, ... of a site, each with a matching regex).
Before adding a regex, test it first in an online regex tester (for example the 站长工具 regex tester); a hypothetical entry is sketched below.
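
A hypothetical entry (the site URL and HTML layout here are invented for illustration):

```python
import re

# Suppose a site lists each proxy as <td>1.2.3.4</td><td>8080</td>.
# The Url_Regular value is then a regex with two groups:
# group 1 captures the IP, group 2 captures the port.
entry = {
    "http://proxy-list.example.com/": r"<td>([\d\.]+)</td>\s*<td>(\d+)</td>",
}

# findall yields (ip, port) pairs, which is what GetIP.GetIP() expects:
sample_html = "<tr><td>1.2.3.4</td><td>8080</td></tr>"
print(re.findall(entry["http://proxy-list.example.com/"], sample_html))
# -> [('1.2.3.4', '8080')]
```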

## Data sources

- http://www.kuaidaili.com/free/
- http://www.66ip.cn/
- http://www.xicidaili.com/nn/
- http://www.ip3366.net/free/
- http://www.proxy360.cn/Region/China
- http://www.mimiip.com/
- http://www.data5u.com/free/index.shtml
- http://www.ip181.com/
- http://www.kxdaili.com/

Feel free to add proxy sites you know about, so everyone can share the resources.

## Logic structure

![逻辑图](逻辑图.png)

Issues and pull requests are welcome; the code is rough around the edges, so please go easy on it.
--------------------------------------------------------------------------------
/ProxiesDataBase.py:
--------------------------------------------------------------------------------
# table IPPORT
# ip_port TEXT NOT NULL
import sqlite3
import traceback

import Config


def InitDB():
    # Create the database file and the IP_PORT table if they do not exist.
    db_conn = sqlite3.connect(Config.DBName)
    try:
        db_conn.execute(
            """CREATE TABLE IF NOT EXISTS {} (IP_PORT TEXT NOT NULL);""".format(Config.TabelName))
        db_conn.commit()
        return True
    except BaseException:
        db_conn.rollback()
        return False
    finally:
        db_conn.close()


def AddItem(ip_port):
    db_conn = sqlite3.connect(Config.DBName)
    try:
        # A parameterized value keeps quotes in ip_port from breaking the SQL.
        db_conn.execute("""INSERT INTO {} VALUES (?);""".format(Config.TabelName), (ip_port,))
        db_conn.commit()
    except BaseException:
        db_conn.rollback()
        traceback.print_exc()
    finally:
        db_conn.close()


def AddItems(ip_list):
    if len(ip_list) < 1:
        return

    db_conn = sqlite3.connect(Config.DBName)
    try:
        # Insert the whole batch in one transaction.
        db_conn.executemany("""INSERT INTO {} VALUES (?);""".format(Config.TabelName),
                            [(item,) for item in ip_list])
        db_conn.commit()
    except BaseException:
        db_conn.rollback()
        traceback.print_exc()
    finally:
        db_conn.close()


def DelItem(item):
    db_conn = sqlite3.connect(Config.DBName)
    try:
        db_conn.execute("""DELETE FROM {} WHERE IP_PORT = ?;""".format(Config.TabelName), (item,))
        db_conn.commit()
    except BaseException:
        db_conn.rollback()
        traceback.print_exc()
    finally:
        db_conn.close()


def ClearItems():
    db_conn = sqlite3.connect(Config.DBName)
    try:
        db_conn.execute("""DELETE FROM {};""".format(Config.TabelName))
        db_conn.commit()
    except BaseException:
        db_conn.rollback()
        traceback.print_exc()
    finally:
        db_conn.close()


def GetItems():
    # Return every stored "ip:port" string as a list.
    ip_list = []
    db_conn = sqlite3.connect(Config.DBName)
    db_cur = db_conn.cursor()
    try:
        for item in db_cur.execute("""SELECT * FROM {};""".format(Config.TabelName)).fetchall():
            ip_list.append(item[0])
    except BaseException:
        traceback.print_exc()
    finally:
        db_conn.close()
    return ip_list
--------------------------------------------------------------------------------
/GetIP.py:
--------------------------------------------------------------------------------
from random import choice
from re import findall
from threading import Thread

from requests import get

import Config
import ProxiesDataBase

d = {}  # verified proxies: "ip:port" -> 0
ip_list = []  # candidates waiting to be verified, shared by the worker threads


def GetPageContent(tar_url):
    # Fetch one proxy-list page; return "" on any error.
    url_content = ""
    try:
        url_content = get(tar_url,
                          headers={
                              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                              'Accept-Encoding': 'gzip, deflate, compress',
                              'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ru;q=0.4',
                              'Cache-Control': 'no-cache',
                              'Connection': 'keep-alive',
                              'Upgrade-Insecure-Requests': "1",
                              'User-Agent': choice(Config.UserAgents)
                          }).text
    except BaseException:
        pass
    return url_content


def GetIP():
    # Scrape every configured page, extract (ip, port) pairs, verify them in
    # parallel, and store the survivors.
    global d
    global ip_list
    thread_list = []
    ips = []

    for tar_url in Config.Url_Regular.keys():
        url_content = GetPageContent(tar_url)
        regular = Config.Url_Regular.get(tar_url, "")
        tmp_ip_list = findall(regular, url_content)
        for item in tmp_ip_list:
            ip_list.append("{}:{}".format(item[0], item[1]))

    for index in range(0, Config.MaxThreads):
        thread_list.append(Thread(target=VerifyIp))
    for item in thread_list:
        item.start()
    for item in thread_list:
        item.join()

    for item in d.keys():
        ips.append(item)
    d.clear()
    ProxiesDataBase.AddItems(ips)


def RefreshDB():
    # Re-verify every proxy already in the database and keep only the live ones.
    global d
    global ip_list
    ip_list = ProxiesDataBase.GetItems()
    thread_list = []
    ips = []

    if len(ip_list) < 1:
        return

    for index in range(0, Config.MaxThreads):
        thread_list.append(Thread(target=VerifyIp))
    for item in thread_list:
        item.start()
    for item in thread_list:
        item.join()

    ProxiesDataBase.ClearItems()

    for item in d.keys():
        ips.append(item)
    d.clear()
    ProxiesDataBase.AddItems(ips)


def VerifyIp():
    # Worker thread: take candidates from the shared ip_list and record the
    # ones that can fetch Config.TestUrl within the timeout.
    global d
    while True:
        try:
            # pop() can race with other workers; catching IndexError keeps a
            # thread from crashing when the list runs dry between iterations.
            tmp_ip_port = ip_list.pop(0)
        except IndexError:
            break
        print("verify ip: {}".format(tmp_ip_port))
        proxies = {"http": "http://{}".format(tmp_ip_port), "https": "https://{}".format(tmp_ip_port)}
        try:
            response = get(Config.TestUrl,
                           proxies=proxies,
                           timeout=Config.TestTimeOut,
                           headers={
                               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                               'Accept-Encoding': 'gzip, deflate, compress',
                               'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ru;q=0.4',
                               'Cache-Control': 'max-age=0',
                               'Connection': 'keep-alive',
                               'User-Agent': choice(Config.UserAgents)
                           })

            if response.status_code == 200:
                d.update({"{}".format(tmp_ip_port): 0})
        except BaseException:
            continue
--------------------------------------------------------------------------------
/Config.py:
--------------------------------------------------------------------------------
DBName = "PROXIES.db"  # database file name
TabelName = "IPPORT"  # table name
Column1 = "IP_PORT"  # column 1

TestTimeOut = 20  # timeout (seconds) when testing whether a proxy works;
# if you don't need high-quality proxies, raise it and more IPs will pass
MaxThreads = 64  # maximum number of threads; raise it on a capable machine
# conventionally set to a power of two, though nothing in the code requires it

TestUrl = "https://www.baidu.com/"  # site used for the availability test

# User-Agent pool for request headers
UserAgents = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12",
    "Opera/9.27 (Windows NT 5.2; U; zh-cn)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13",
    "Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/4A93 ",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 ",
    "Mozilla/5.0 (Linux; U; Android 3.2; ja-jp; F-01D Build/F0001) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13 ",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_1 like Mac OS X; ja-jp) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8B117 Safari/6531.22.7",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_2_1 like Mac OS X; da-dk) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5 ",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.9 (KHTML, like Gecko) Chrome/ Safari/530.9 ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Ubuntu/11.10 Chromium/27.0.1453.93 Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36"
]

# Proxy site URLs and their matching regexes; each regex must capture IP and
# port as separate groups, e.g. [('192.168.1.1', '80'), ('192.168.1.1', '90'), ...]
# Add your own as needed
# Only front pages are scraped; to scrape later pages, add their links and regexes here too
Url_Regular = {
    "http://www.kuaidaili.com/free/": "IP\">([\d\.]+)\s*(\d+)",
    "http://www.66ip.cn/nmtq.php?getnum=512&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=0&proxytype=2&api=66ip": "([\d\.]+):(\d+)",
    "http://www.xicidaili.com/nn/": "([\d\.]+)\s*(\d+)",
    "http://www.ip3366.net/free/": "([\d\.]+)\s*(\d+)",
    "http://www.proxy360.cn/Region/China": ">\s*([\d\.]+)\s*</span>\s*.*width:50px;\">\s*(\d+)\s*",
    "http://www.mimiip.com/": "\s+([\d\.]+)\s+(\d+)",
    "http://www.data5u.com/free/index.shtml": "<li>([\d\.]+)</li>\s+<li>(\d+)</li>",
    "http://www.ip181.com/": "\s+([\d\.]+)\s+([\d]+)",
    "http://www.kxdaili.com/": "\s+([\d\.]+)\s+([\d]+)",
}
--------------------------------------------------------------------------------