├── Modules
    ├── db.py
    ├── infopool.py
    ├── interface.py
    ├── proxy.py
    ├── scraper_manager.py
    └── web.py
└── README.md


/Modules/db.py:
--------------------------------------------------------------------------------
 1 | #-*- coding:UTF-8 -*-
 2 | #数据库模块
 3 | #Author: 苍冥 e0t3rx
 4 | 
 5 | import sqlite3
 6 | 
 7 | 
 8 | class Database:
 9 | 	def __init__(self, DB_NAME="ProxyPoolDB.db"):
10 | 		try:
11 | 			#Create DB Cursor
12 | 			self.cursor = sqlite3.connect(DB_NAME, isolation_level=None).cursor()
13 | 			self.cursor.execute("CREATE TABLE IF NOT EXISTS TB_ProxyPool(ip TEXT, port INTEGER, protocol TEXT)")
14 | 		except sqlite3.OperationalError as e:
15 | 			#数据库繁忙，同时写入会发生错误
16 | 			#print("Error: Database Busy")
17 | 			pass
18 | 	
19 | 	def add(self, ip, port, protocol):
20 | 		try:
21 | 			self.cursor.execute("INSERT INTO TB_ProxyPool(ip, port, protocol) SELECT ?,?,? WHERE NOT EXISTS (SELECT * FROM TB_ProxyPool WHERE TB_ProxyPool.ip=? AND TB_ProxyPool.port=? AND TB_ProxyPool.protocol=?)", [ip,port,protocol,ip,port,protocol])
22 | 		except sqlite3.OperationalError as e:
23 | 			#数据库繁忙，同时写入会发生错误
24 | 			#print("Error: Database Busy")
25 | 			pass
26 | 
27 | 	def modify():
28 | 		#后期更新再开发此功能
29 | 		pass
30 | 
31 | 	def delete(self, ip, port, protocol):
32 | 		try:
33 | 			self.cursor.execute("DELETE FROM TB_ProxyPool WHERE ip=? AND port=? AND protocol=?",(ip, port, protocol))
34 | 		except sqlite3.OperationalError as e:
35 | 			#数据库繁忙，同时写入会发生错误
36 | 			#print("Error: Database Busy")
37 | 			pass
38 | 
39 | 
40 | 	#Sub-functions
41 | 	def fetch_all(self):
42 | 		try:
43 | 			#Returns a list of tuple objects, each stands for a proxy record
44 | 			return self.cursor.execute("SELECT * FROM TB_ProxyPool").fetchall()
45 | 		except sqlite3.OperationalError as e:
46 | 			#数据库繁忙，同时写入会发生错误
47 | 			#print("Error: Database Busy")
48 | 			pass
49 | 
50 | 
51 | '''
52 | db = Database()
53 | a = db.fetch_all()
54 | for i in a:
55 | 	print(i[0])
56 | 	print(i[1])
57 | 	print(i[2])
58 | 	'''


--------------------------------------------------------------------------------
/Modules/infopool.py:
--------------------------------------------------------------------------------
 1 | #-*- coding:UTF-8 -*-
 2 | #信息池模块
 3 | #Author: 苍冥 e0t3rx
 4 | 
 5 | import scraper_manager
 6 | import proxy
 7 | import db
 8 | from time import sleep
 9 | import threading
10 | from os import system as cmd
11 | import msvcrt
12 | 
13 | #字符串常量
14 | LOGO = 6*"_"+"                   "+6*"_"+"           _ \n| ___ \                  | ___ \         | |\n| |_/ _ __ _____  ___   _| |_/ ___   ___ | |\n\
15 | |  __| '__/ _ \ \/ | | | |  __/ _ \ / _ \| |\n| |  | | | (_) >  <| |_| | | | (_) | (_) | |\n\_|  |_|  \___/_/\_\\__,  \_|  \___/ \___/|_|\n\
16 |                      __/ |                  \n                    |___/                   \n               _____ _   _____              \n\
17 |               |  _  | | |____ |             \n "+6*"_"+"    ___| |/' | |_    / /_ ____  __   \n|"+6*"_"+"|  / _ |  /| | __|   \ | '__\ \/ /   \n\
18 |          |  __\ |_/ | |_.___/ | |   >  <    \n          \___|\___/ \__\____/|_|  /_/\_\ \n"
19 | 
20 | MENU = "\n功能菜单："+"\n\t"+"[W]启动本地WEB服务器"+"\n\t"+"[T]修改验证线程数量"+"\n\t"+"[I]查看使用说明"+"\n\t"+"[M]功能菜单"
21 | INSTRUCTION = "高匿代理池说明："
22 | 
23 | #选项变量
24 | OnOffSwitcher = ["ON", "OFF"]
25 | modeChoice = "MENU"
26 | modeMapping = {"M":"MENU","T":"THREAD","I":"INSTRUCTION","W":"WEB"}
27 | modeWebServer = "OFF"
28 | 
29 | 
30 | def Input():
31 | 	#如果忘记加Global关键字，就不会修改modeChoice的值
32 | 	global modeChoice, modeWebServer
33 | 	while True:
34 | 		#忽略特殊按键引起的异常
35 | 		try:
36 | 			#读取用户输入的按键，并对功能菜单的选项进行映射
37 | 			getch = str(msvcrt.getch(),"utf-8")
38 | 			if getch.upper() in modeMapping:
39 | 				modeChoice = modeMapping[getch.upper()]
40 | 			if modeChoice == "WEB" and getch.upper() == "S":
41 | 				if modeWebServer == "ON":
42 | 					modeWebServer = "OFF"
43 | 				else:
44 | 					modeWebServer = "ON"
45 | 		except:
46 | 			pass
47 | 
48 | 
49 | def Output():
50 | 	#实时获取各模块信息并提供命令行交互
51 | 	while True:
52 | 		TotalProxies = db.Database().fetch_all()
53 | 		NumProxies = len(TotalProxies) if TotalProxies is not None else "Read Error Database Locked"
54 | 		NumProxyCheckThreads = threading.activeCount()-proxy.InitialThreadNum
55 | 		cmd("cls")
56 | 		def showInfo():
57 | 			print("[!] 目前数据库中有%s个代理"%NumProxies)
58 | 			print("[!] 当前代理验证线程数量为: %s\n" %(NumProxyCheckThreads))
59 | 		#下面输出界面交互部分
60 | 		if modeChoice == "MENU":
61 | 			print(LOGO)
62 | 			showInfo()
63 | 			print(MENU)
64 | 		elif modeChoice == "INSTRUCTION":
65 | 			showInfo()
66 | 			print(INSTRUCTION)
67 | 			print(MENU)
68 | 		elif modeChoice == "WEB":
69 | 			showInfo()
70 | 			print("[S]Web服务器状态："+modeWebServer)
71 | 			print("\n[!] Web功能目前还在开发中，敬请期待："+"\n\t"+"http://github.com/eastrd/HighAnonProxyPool")
72 | 			print(MENU)
73 | 		elif modeChoice == "THREAD":
74 | 			showInfo()
75 | 			print("[!] 线程设置功能目前还在开发中，敬请期待："+"\n\t"+"http://github.com/eastrd/HighAnonProxyPool")
76 | 			print(MENU)
77 | 
78 | def Initialise():
79 | 	#启动爬虫与代理验证模块
80 | 	print("[!] 启动爬虫模块...")
81 | 	threading.Thread(target=scraper_manager.start, name='Scraper Manager').start()
82 | 	print("[!] 启动验证模块...")
83 | 	threading.Thread(target=proxy.start, name='Proxy Manager').start()
84 | 	#启动命令行界面交互模块
85 | 	threading.Thread(target=Input).start()
86 | 	threading.Thread(target=Output).start()
87 | 	while True:
88 | 		sleep(300)
89 | 
90 | Initialise()


--------------------------------------------------------------------------------
/Modules/interface.py:
--------------------------------------------------------------------------------
1 | #-*- coding:UTF-8 -*-
2 | #对外接口模块
3 | #Author: 苍冥 e0t3rx
4 | 
5 | 


--------------------------------------------------------------------------------
/Modules/proxy.py:
--------------------------------------------------------------------------------
 1 | #-*- coding:UTF-8 -*-
 2 | #代理模块
 3 | #Author: 苍冥 e0t3rx
 4 | 
 5 | import requests
 6 | import db
 7 | from time import sleep
 8 | import threading
 9 | 
10 | #ProxyCheckerThread用来多线程对每一个代理进行检测
11 | class ProxyCheckerThread(threading.Thread):
12 | 	def __init__(self, DirtyProxy):
13 | 		#DirtyProxy is a list containing ip(str), port(int), protocol(str)
14 | 		threading.Thread.__init__(self)
15 | 		self.DirtyProxy = DirtyProxy
16 | 	def run(self):
17 | 		#print("Starting Thread for checking %s:%s" %(self.DirtyProxy[0], self.DirtyProxy[1]))
18 | 		Proxy().check_ConnAnon(self.DirtyProxy)
19 | 		
20 | 
21 | class Proxy:
22 | 	def __init__(self):
23 | 		self.REQ_TIMEOUT = 8
24 | 
25 | 	def check_ConnAnon(self, DirtyProxy):
26 | 		ip, port, protocol = DirtyProxy[0], DirtyProxy[1], DirtyProxy[2].lower()
27 | 		#如果代理is down，则访问的时候是会默认使用源IP的
28 | 		proxies = { protocol: ip+":"+str(port) }
29 | 
30 | 		#暂时先从icanhazip获取IP，后期会进行更改
31 | 		try:
32 | 			MaskedIP = str(requests.get("http://icanhazip.com", timeout=self.REQ_TIMEOUT, proxies=proxies).content, "utf-8").replace("\n","")
33 | 		except Exception as e:
34 | 			#代理超时了、解码错误（例如：“错误：您所请求的网址（URL）无法获取” - 代理对访问的url存在限制）、代理挂掉了、连接被重置，全部删除
35 | 			db.Database().delete(ip,port,protocol)
36 | 			return
37 | 
38 | 		if MaskedIP != ip:
39 | 			#如果返回的IP和代理ip不一样，则调用数据库接口删除此条代理记录
40 | 			db.Database().delete(ip,port,protocol)
41 | 		else:
42 | 			#print("[!] "+ip+":"+str(port)+" is Good!")
43 | 			pass
44 | 
45 | 	def fetch_info(self):
46 | 		#后期更新再开发此功能
47 | 		pass
48 | 
49 | 	def ProxyWash(self):
50 | 		#调用数据库模块的接口，获取全部代理，并启动多线程验证其有效性
51 | 		DirtyProxyList = db.Database().fetch_all()
52 | 		for ProxyRecord_tuples in DirtyProxyList:
53 | 			#如果同时存在多于100个线程，则等待10秒再开新线程
54 | 			while threading.activeCount() > 100:
55 | 				sleep(1)
56 | 			ProxyCheckerThread(ProxyRecord_tuples).start()
57 | 		#print("[!] 当前代理循环验证任务线程发布完毕\n\n")
58 | 
59 | #初始线程数量 = 本线程 + 爬虫管理模块 + 命令界面交互模块(输入+输出) + 爬虫数  =  5 + 爬虫数
60 | InitialThreadNum = 7
61 | 
62 | def start():
63 | 	while True:
64 | 		#仅当所有子线程都运行完毕的时候再开始新一轮的验证
65 | 
66 | 		if threading.activeCount() <= InitialThreadNum:
67 | 			print("[!] 开始新一轮代理验证...")
68 | 			Proxy().ProxyWash()
69 | 		#else:
70 | 			#print("[!] 代理验证程序启动失败，等待线程数量为%s"%(threading.activeCount()-5))
71 | 		sleep(2)


--------------------------------------------------------------------------------
/Modules/scraper_manager.py:
--------------------------------------------------------------------------------
 1 | #-*- coding:UTF-8 -*-
 2 | #爬虫管理模块
 3 | #Author: 苍冥 e0t3rx
 4 | 
 5 | import re
 6 | import requests
 7 | import base64
 8 | import time
 9 | from bs4 import BeautifulSoup as bs
10 | import db
11 | from time import sleep
12 | import threading
13 | 
14 | '''
15 | #ScraperManagerThread用来启动所有爬虫线程，以便同时爬取所有代理
16 | class ScraperManagerThread(threading.Thread):
17 | 	def __init__(self, IntervalDelay):
18 | 		#IntervalDelay是每轮爬取页面后的冷却时间（秒）
19 | 		threading.Thread.__init__(self)
20 | 		self.IntervalDelay = IntervalDelay
21 | 	def run(self):
22 | 		A()
23 | 		B()
24 | 
25 | '''
26 | 
27 | IntervalDelay = 20
28 | 
29 | def proxy_list_org():
30 | 	#http://proxy-list.org
31 | 	print("[!] Starting proxy-list.org thread...")
32 | 	BASE_URL = "https://proxy-list.org/english/index.php?p=" 
33 | 	Re_Pattern_IP = re.compile("(.*):")
34 | 	Re_Pattern_PORT = re.compile(":(.*)")
35 | 	while True:
36 | 		#print("[!] Scraping proxy-list.org...")
37 | 		for startingURL_Param in range(1,11):
38 | 			while True:
39 | 				try:
40 | 					#If there's an error duing the request, it will try to reconnect until succeed
41 | 					while True:
42 | 						try:
43 | 							HTML_ProxyPage = requests.get(BASE_URL+str(startingURL_Param)).content
44 | 							break
45 | 						except Exception as e:
46 | 							print("An Error occurred: "+str(e))
47 | 					soup = bs(HTML_ProxyPage,"html.parser")
48 | 					for Raw_ProxyInfo in soup.find_all("ul",{"class":None}):
49 | 						ip_port = str(base64.b64decode(Raw_ProxyInfo.find("li",{"class":"proxy"}).text.replace("Proxy('","").replace("')","")), "utf-8")
50 | 						IP = re.findall(Re_Pattern_IP, ip_port)[0]
51 | 						PORT = re.findall(Re_Pattern_PORT, ip_port)[0]
52 | 						PROTOCOL = Raw_ProxyInfo.find("li",{"class":"https"}).text
53 | 						if PROTOCOL != "-":
54 | 							db.Database().add(IP,PORT,PROTOCOL.lower())
55 | 					break
56 | 				except Exception as e:
57 | 					print("An error occurred with proxy_list_org: "+str(e))
58 | 		sleep(IntervalDelay)
59 | 
60 | 
61 | def incloak_com():
62 | 	#http://inclock.com
63 | 	print("[!] Starting incloak.com thread...")
64 | 	RE_Pattern_IPaddr = re.compile("[0-9\.].*")
65 | 	while True:
66 | 		try:
67 | 			#print("[!] Scraping incloak.com...")
68 | 			soup = bs(requests.get("https://incloak.com/proxy-list/?anon=234#list").content,"html.parser")
69 | 			for RAW_ProxyInfo in soup.find_all("tr"):
70 | 				#Length is checked so not to include the skeleton frame <td>
71 | 				if len(RAW_ProxyInfo.find_all("td")) == 7:
72 | 					IP = str(RAW_ProxyInfo.find("td",{"class":"tdl"})).replace("<td class=\"tdl\">","").replace("</td>","")
73 | 					PORT = str(RAW_ProxyInfo.find("td",None)).replace("<td>","").replace("</td>","")
74 | 					PROTOCOL = RAW_ProxyInfo.find_all("td")[4].text
75 | 					if "SOCK" not in PROTOCOL:
76 | 						if "HTTPS" in PROTOCOL:
77 | 							db.Database().add(IP,PORT,"https")
78 | 						elif "HTTP" in PROTOCOL.replace("HTTPS",""):
79 | 							db.Database().add(IP,PORT,"http")
80 | 		except Exception as e:
81 | 			print("An error occurred with incloak_com: "+str(e))
82 | 		sleep(IntervalDelay)
83 | 
84 | def start():
85 | 	threading.Thread(target=proxy_list_org, name='proxy-list.org').start()
86 | 	threading.Thread(target=incloak_com, name='incloak.com').start()
87 | 	while True:
88 | 		#print("Current Amount of Threads: "+str(threading.activeCount()))
89 | 		sleep(100)
90 | 


--------------------------------------------------------------------------------
/Modules/web.py:
--------------------------------------------------------------------------------
1 | #-*- coding:UTF-8 -*-
2 | #服务器模块
3 | #Author: 苍冥 e0t3rx
4 | 
5 | import infopool
6 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 高匿代理池 ProxyPool
 2 | 
 3 | 
 4 | **待做：**
 5 | - 实现Web页面交互、以及REST API交互（GET参数）
 6 | - CUI交互界面上实现代理验证线程数设置
 7 | - 填写工具说明
 8 | - 将爬虫做成可拓展的config模式，并开发更多代理爬虫
 9 | - 多步网络连接测试，仅一次连接失败就删除的实现较不妥
10 | - 检测X-Forwarded-For的HTTP头
11 | - 数据库在同步操作的时候会上锁，影响效率，应想办法优化
12 | 
13 | **12月23日**
14 | - 将Web服务器开/关选项添加到了CUI交互界面上
15 | 
16 | 
17 | **12月18日**
18 | - 实现命令行交互核心框架
19 | - 在命令行界面添加了LOGO
20 | - 修复了线程共享变量的Bug
21 | 
22 | 
23 | **12月17日**
24 | - 将爬虫模块整合为多线程，一发制动全部
25 | - 修复了requests代理全部失败的Bug（大小写）
26 | - 增加并优化了验证代理时的各类异常捕捉机制
27 | - 实现爬虫与验证程序的一体化方式(直接通过信息池模块启动)。
28 | - 新增信息池模块，负责启动代理&爬虫模块，以及实时获取各模块状态并提供命令行交互
29 | - 优化了等待时间以提高验证效率
30 | 
31 | 
32 | **12月16日**
33 | - 将数据库记录定义为ip、端口、协议，必须满足全部三项才可指定（比如删除）一条记录
34 | - 实现了代理验证的多线程特性
35 | - 优化了代理模块的部分逻辑关系
36 | - 设计了爬虫管理模块的运行模式：单文件多函数，每个函数为一个爬虫，运行时进行多线程爬取
37 | 
38 | -
39 | （中间的这段时间我去折腾机器学习了。。。）
40 | -
41 | 
42 | **12月3日**
43 | - 重新设计了整个高匿代理池项目的组成模块。
44 | - 完成数据库模块的Add功能，改进Sqlite命令的去重
45 | - 完成数据库模块的Delete及Fetch_all功能
46 | - 完成代理库模块的check功能，暂时先使用icanhazip进行匿名检测，去除了多余的异常处理。
47 | - 待学习：模块之间沟通的具体代码实现，是直接在模块内调用其他模块的代码还是在主程序中为各模块进行交互？
48 | 
49 | 


--------------------------------------------------------------------------------