├── 逻辑图.png
├── Author.py
├── Util.py
├── demo.py
├── README.md
├── ProxiesDataBase.py
├── GetIP.py
└── Config.py
/逻辑图.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZKeeer/IPProxy/HEAD/逻辑图.png
--------------------------------------------------------------------------------
/Author.py:
--------------------------------------------------------------------------------
Nick = "ZKeeer"
Email = "zangker@foxmail.com"
Blog = "http://zkeeer.space"
Github = "https://github.com/ZKeeer"
Zhihu = "https://www.zhihu.com/people/ZKeeer/activities"
--------------------------------------------------------------------------------
/Util.py:
--------------------------------------------------------------------------------
import random

import GetIP
import ProxiesDataBase


def Refresh():
    """Re-verify the proxies already stored, then scrape and add new ones."""
    GetIP.RefreshDB()
    GetIP.GetIP()


def Get():
    """Return a requests-style proxies dict for a randomly chosen stored proxy.

    Returns an empty dict when the database holds no proxies.
    """
    proxies_dict = {}
    result = ProxiesDataBase.GetItems()
    if result:
        tmp = random.choice(result)
        proxies_dict['http'] = 'http://{}'.format(tmp)
        proxies_dict['https'] = 'https://{}'.format(tmp)
    return proxies_dict
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
import sqlite3

import Config
import ProxiesDataBase
import Util


def main():
    # Initialize the database and its table
    ProxiesDataBase.InitDB()
    # Refresh the database, adding newly scraped proxies
    Util.Refresh()
    # Fetch one proxy and use it
    proxies = Util.Get()
    print(proxies)

    # Count how many rows the database holds
    conn = sqlite3.connect(Config.DBName)
    cu = conn.cursor()
    print(len(cu.execute("""SELECT * FROM {};""".format(Config.TabelName)).fetchall()))
    cu.close()
    conn.close()


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# IPProxy
Proxy IPs for web crawlers: scrapes proxy IPs from nine sites, then verifies, cleans, stores, and refreshes them, and exposes a simple interface for callers.

So far this has only been tested on Windows 10 64-bit with Python 3.5 and on Ubuntu Server 16.04.1 LTS 64-bit with Python 3.5.
On machines with different specs, adjust the maximum thread count in Config.py. See the Config.py section below for details.


How to use
See demo.py, or the sketch below.
Util.Refresh(): call this to update the database; stored proxies are re-verified and new ones are scraped and added.
Util.Get(): returns one usable proxy, for example:
{'http': 'http://115.159.152.130:81', 'https': 'https://115.159.152.130:81'}
requests can use the result directly: requests.get(url, proxies=Util.Get(), headers={})
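
A minimal end-to-end sketch, mirroring demo.py (the target URL and timeout below are illustrative; requests must be installed):

```python
import requests

import ProxiesDataBase
import Util

ProxiesDataBase.InitDB()  # create PROXIES.db and its table if missing
Util.Refresh()            # verify stored proxies and scrape new ones (may take a while)

proxies = Util.Get()      # {} if the database is empty
if proxies:
    resp = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=10)
    print(resp.status_code, resp.text)
```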

Config.py section:
MaxThreads caps the number of worker threads. If your machine is low-end, set it to 16 or 32 and let it run slowly; if you trust your hardware (fast CPU, lots of RAM, good bandwidth), you can push it to 1024.
If you know other proxy sites, add them to the Url_Regular dict (an example entry is sketched below).
Each entry maps a proxy-list URL to a regular expression. The regex must capture IP and port separately, so matches come out as pairs like [('192.168.1.1', '80'), ('192.168.1.1', '90')].
Only each site's first page is scraped. To scrape later pages, add each page's URL and its regex as separate Url_Regular entries (e.g. the links for page 1, page 2, ... of a site, each with a matching regex).
Before adding a regex, test it with an online regex tester (e.g. the 站长工具 (Chinaz) regex testing tool) and add it only once it passes.
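
A sketch of adding a new source (the URL and regex here are placeholders for a hypothetical site, not a real one):

```python
# In Config.py -- hypothetical entry
Url_Regular = {
    # ... existing entries ...
    "http://proxies.example.com/free/": r"(\d+\.\d+\.\d+\.\d+)\D+?(\d+)",  # captures IP and port separately
}
```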

Data sources:
http://www.kuaidaili.com/free/
http://www.66ip.cn/
http://www.xicidaili.com/nn/
http://www.ip3366.net/free/
http://www.proxy360.cn/Region/China
http://www.mimiip.com/
http://www.data5u.com/free/index.shtml
http://www.ip181.com/
http://www.kxdaili.com/
If you know other proxy sites, please contribute them; let's share resources.

Logic diagram:

![逻辑图](逻辑图.png)

Issues and pull requests are welcome. The code is rough, so please go easy on it.
--------------------------------------------------------------------------------
/ProxiesDataBase.py:
--------------------------------------------------------------------------------
# table IPPORT
# ip_port TEXT NOT NULL
import sqlite3
import traceback

import Config


def InitDB():
    """Create the database file and the proxy table if they do not exist."""
    db_conn = sqlite3.connect(Config.DBName)
    try:
        db_conn.execute(
            """CREATE TABLE IF NOT EXISTS {} (IP_PORT TEXT NOT NULL);""".format(Config.TabelName))
        db_conn.commit()
        return True
    except Exception:
        db_conn.rollback()
        return False
    finally:
        db_conn.close()


def AddItem(ip_port):
    """Insert a single ip:port string; the value is bound as a parameter."""
    db_conn = sqlite3.connect(Config.DBName)
    try:
        db_conn.execute("""INSERT INTO {} VALUES (?);""".format(Config.TabelName), (ip_port,))
        db_conn.commit()
    except Exception:
        db_conn.rollback()
        traceback.print_exc()
    finally:
        db_conn.close()


def AddItems(ip_list):
    """Insert a list of ip:port strings in one transaction via executemany."""
    if len(ip_list) < 1:
        return

    db_conn = sqlite3.connect(Config.DBName)
    try:
        db_conn.executemany("""INSERT INTO {} VALUES (?);""".format(Config.TabelName),
                            [(item,) for item in ip_list])
        db_conn.commit()
    except Exception:
        db_conn.rollback()
        traceback.print_exc()
    finally:
        db_conn.close()


def DelItem(item):
    """Delete one ip:port entry."""
    db_conn = sqlite3.connect(Config.DBName)
    try:
        db_conn.execute("""DELETE FROM {} WHERE IP_PORT = ?;""".format(Config.TabelName), (item,))
        db_conn.commit()
    except Exception:
        db_conn.rollback()
        traceback.print_exc()
    finally:
        db_conn.close()


def ClearItems():
    """Remove all rows from the proxy table."""
    db_conn = sqlite3.connect(Config.DBName)
    try:
        db_conn.execute("""DELETE FROM {};""".format(Config.TabelName))
        db_conn.commit()
    except Exception:
        db_conn.rollback()
        traceback.print_exc()
    finally:
        db_conn.close()


def GetItems():
    """Return every stored ip:port string as a list."""
    ip_list = []
    db_conn = sqlite3.connect(Config.DBName)
    db_cur = db_conn.cursor()
    try:
        for item in db_cur.execute("""SELECT * FROM {};""".format(Config.TabelName)).fetchall():
            ip_list.append(item[0])
    except Exception:
        traceback.print_exc()
    finally:
        db_conn.close()
    return ip_list
--------------------------------------------------------------------------------
/GetIP.py:
--------------------------------------------------------------------------------
from random import choice
from re import findall
from threading import Thread

from requests import get

import Config
import ProxiesDataBase

# Shared state for the worker threads: ip_list is the work queue of
# "ip:port" strings, d collects the ones that pass verification.
d = {}
ip_list = []


def GetPageContent(tar_url):
    """Download a proxy-list page and return its HTML, or "" on any failure."""
    url_content = ""
    try:
        url_content = get(tar_url,
                          headers={
                              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                              'Accept-Encoding': 'gzip, deflate, compress',
                              'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ru;q=0.4',
                              'Cache-Control': 'no-cache',
                              'Connection': 'keep-alive',
                              'Upgrade-Insecure-Requests': "1",
                              'User-Agent': choice(Config.UserAgents)
                          }).text
    except Exception:
        pass
    return url_content


def GetIP():
    """Scrape each source page, extract ip:port pairs, verify them
    concurrently, and store the survivors in the database."""
    global d
    global ip_list
    thread_list = []
    ips = []

    for tar_url in Config.Url_Regular.keys():
        url_content = GetPageContent(tar_url)
        regular = Config.Url_Regular.get(tar_url, "")
        tmp_ip_list = findall(regular, url_content)
        for item in tmp_ip_list:
            ip_list.append("{}:{}".format(item[0], item[1]))

    for index in range(0, Config.MaxThreads):
        thread_list.append(Thread(target=VerifyIp))
    for item in thread_list:
        item.start()
    for item in thread_list:
        item.join()

    for item in d.keys():
        ips.append(item)
    d.clear()
    ProxiesDataBase.AddItems(ips)


def RefreshDB():
    """Re-verify every stored proxy, clear the table, and re-insert only
    the ones that still work."""
    global d
    global ip_list
    ip_list = ProxiesDataBase.GetItems()
    thread_list = []
    ips = []

    if len(ip_list) < 1:
        return

    for index in range(0, Config.MaxThreads):
        thread_list.append(Thread(target=VerifyIp))
    for item in thread_list:
        item.start()
    for item in thread_list:
        item.join()

    ProxiesDataBase.ClearItems()

    for item in d.keys():
        ips.append(item)
    d.clear()
    ProxiesDataBase.AddItems(ips)


def VerifyIp():
    """Worker: pop proxies off the shared queue and record the ones that can
    fetch Config.TestUrl within Config.TestTimeOut seconds."""
    global d
    while ip_list:
        try:
            # pop can race with other workers once the queue is nearly empty
            tmp_ip_port = ip_list.pop(0)
        except IndexError:
            break
        print("verify ip: {}".format(tmp_ip_port))
        proxies = {"http": "http://{}".format(tmp_ip_port), "https": "https://{}".format(tmp_ip_port)}
        try:
            response = get(Config.TestUrl,
                           proxies=proxies,
                           timeout=Config.TestTimeOut,
                           headers={
                               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                               'Accept-Encoding': 'gzip, deflate, compress',
                               'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ru;q=0.4',
                               'Cache-Control': 'max-age=0',
                               'Connection': 'keep-alive',
                               'User-Agent': choice(Config.UserAgents)
                           })

            if response.status_code == 200:
                d[tmp_ip_port] = 0
        except Exception:
            continue
--------------------------------------------------------------------------------
/Config.py:
--------------------------------------------------------------------------------
DBName = "PROXIES.db"  # database file name
TabelName = "IPPORT"  # table name
Column1 = "IP_PORT"  # column 1

TestTimeOut = 20  # timeout (seconds) when testing whether a proxy works;
# if you don't need high-quality proxies, set it higher to keep more usable IPs
MaxThreads = 64  # maximum thread count; raise it on a machine with good specs
# powers of two seem to work best; don't ask why, it's black magic

TestUrl = "https://www.baidu.com/"  # site used to test proxies

# User-Agent headers
UserAgents = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12",
    "Opera/9.27 (Windows NT 5.2; U; zh-cn)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13",
    "Mozilla/5.0 (iPhone; U; CPU like Mac OS X) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/4A93 ",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 ",
    "Mozilla/5.0 (Linux; U; Android 3.2; ja-jp; F-01D Build/F0001) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13 ",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_1 like Mac OS X; ja-jp) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8B117 Safari/6531.22.7",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_2_1 like Mac OS X; da-dk) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5 ",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.9 (KHTML, like Gecko) Chrome/ Safari/530.9 ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Ubuntu/11.10 Chromium/27.0.1453.93 Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36"
]

# Proxy-list URLs and their matching regexes; each regex must capture IP and port
# separately, yielding pairs like [('192.168.1.1', '80'), ('192.168.1.1', '90')]
# Feel free to add your own entries
# Only each site's first page is scraped; to scrape later pages, add their URLs and regexes here too
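# A quick illustration of the expected capture shape (hypothetical input text):
#     re.findall(r"([\d\.]+):(\d+)", "1.2.3.4:80 5.6.7.8:3128")
#     -> [('1.2.3.4', '80'), ('5.6.7.8', '3128')]
# GetIP.GetIP() then joins each pair back into an "ip:port" string.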
Url_Regular = {
    "http://www.kuaidaili.com/free/": "IP\">([\d\.]+)\s*(\d+) | ",
    "http://www.66ip.cn/nmtq.php?getnum=512&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=0&proxytype=2&api=66ip": "([\d\.]+):(\d+)",
    "http://www.xicidaili.com/nn/": "([\d\.]+) | \s*(\d+) | ",
    "http://www.ip3366.net/free/": "([\d\.]+) | \s*(\d+) | ",
    "http://www.proxy360.cn/Region/China": ">\s*([\d\.]+)\s*\s*.*width:50px;\">\s*(\d+)\s*",
    "http://www.mimiip.com/": "\s+| ([\d\.]+) | \s+(\d+) | ",
    "http://www.data5u.com/free/index.shtml": "([\d\.]+)\s+(\d+)",
    "http://www.ip181.com/": "\s+([\d\.]+) | \s+([\d]+) | ",
    "http://www.kxdaili.com/": "\s+([\d\.]+) | \s+([\d]+) | ",
}
--------------------------------------------------------------------------------