├── Modules
│   ├── db.py
│   ├── infopool.py
│   ├── interface.py
│   ├── proxy.py
│   ├── scraper_manager.py
│   └── web.py
└── README.md

/Modules/db.py:
--------------------------------------------------------------------------------
#-*- coding:UTF-8 -*-
# Database module
# Author: 苍冥 e0t3rx

import sqlite3


class Database:
    def __init__(self, DB_NAME="ProxyPoolDB.db"):
        try:
            # Create the DB cursor (isolation_level=None puts the connection in autocommit mode)
            self.cursor = sqlite3.connect(DB_NAME, isolation_level=None).cursor()
            self.cursor.execute("CREATE TABLE IF NOT EXISTS TB_ProxyPool(ip TEXT, port INTEGER, protocol TEXT)")
        except sqlite3.OperationalError as e:
            # The database is busy; concurrent writes raise this error
            #print("Error: Database Busy")
            pass

    def add(self, ip, port, protocol):
        try:
            self.cursor.execute("INSERT INTO TB_ProxyPool(ip, port, protocol) SELECT ?,?,? WHERE NOT EXISTS (SELECT * FROM TB_ProxyPool WHERE TB_ProxyPool.ip=? AND TB_ProxyPool.port=? AND TB_ProxyPool.protocol=?)", [ip, port, protocol, ip, port, protocol])
        except sqlite3.OperationalError as e:
            # The database is busy; concurrent writes raise this error
            #print("Error: Database Busy")
            pass

    def modify(self):
        # To be implemented in a later release
        pass

    def delete(self, ip, port, protocol):
        try:
            self.cursor.execute("DELETE FROM TB_ProxyPool WHERE ip=? AND port=? AND protocol=?", (ip, port, protocol))
        except sqlite3.OperationalError as e:
            # The database is busy; concurrent writes raise this error
            #print("Error: Database Busy")
            pass


    # Sub-functions
    def fetch_all(self):
        try:
            # Returns a list of tuples, each representing one proxy record
            return self.cursor.execute("SELECT * FROM TB_ProxyPool").fetchall()
        except sqlite3.OperationalError as e:
            # The database is busy; concurrent writes raise this error
            #print("Error: Database Busy")
            pass


'''
db = Database()
a = db.fetch_all()
for i in a:
    print(i[0])
    print(i[1])
    print(i[2])
'''
--------------------------------------------------------------------------------
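For reference, a minimal usage sketch of the Database class above (hypothetical code, not part of the repository; the address 127.0.0.1:8080 is a placeholder). It exercises the add / fetch_all / delete cycle; add() skips records that are already present:

# Hypothetical smoke test for db.py
from db import Database

db = Database()                          # opens/creates ProxyPoolDB.db in autocommit mode
db.add("127.0.0.1", 8080, "http")        # duplicate (ip, port, protocol) rows are not inserted twice
for ip, port, protocol in db.fetch_all():
    print(ip, port, protocol)
db.delete("127.0.0.1", 8080, "http")     # remove the record again
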
/Modules/infopool.py:
--------------------------------------------------------------------------------
#-*- coding:UTF-8 -*-
# Info pool module (command-line dashboard)
# Author: 苍冥 e0t3rx

import scraper_manager
import proxy
import db
from time import sleep
import threading
from os import system as cmd
import msvcrt

# String constants
LOGO = 6*"_"+" "+6*"_"+" _ \n| ___ \ | ___ \ | |\n| |_/ _ __ _____ ___ _| |_/ ___ ___ | |\n\
| __| '__/ _ \ \/ | | | | __/ _ \ / _ \| |\n| | | | | (_) > <| |_| | | | (_) | (_) | |\n\_| |_| \___/_/\_\\__, \_| \___/ \___/|_|\n\
 __/ | \n |___/ \n _____ _ _____ \n\
| _ | | |____ | \n "+6*"_"+" ___| |/' | |_ / /_ ____ __ \n|"+6*"_"+"| / _ | /| | __| \ | '__\ \/ / \n\
| __\ |_/ | |_.___/ | | > < \n \___|\___/ \__\____/|_| /_/\_\ \n"

MENU = "\n功能菜单:"+"\n\t"+"[W]启动本地WEB服务器"+"\n\t"+"[T]修改验证线程数量"+"\n\t"+"[I]查看使用说明"+"\n\t"+"[M]功能菜单"
INSTRUCTION = "高匿代理池说明:"

# Mode/option variables
OnOffSwitcher = ["ON", "OFF"]
modeChoice = "MENU"
modeMapping = {"M":"MENU","T":"THREAD","I":"INSTRUCTION","W":"WEB"}
modeWebServer = "OFF"


def Input():
    # Without the global keyword, the assignments below would not update the module-level variables
    global modeChoice, modeWebServer
    while True:
        # Ignore exceptions raised by special keys
        try:
            # Read the pressed key and map it to a menu mode
            getch = str(msvcrt.getch(), "utf-8")
            if getch.upper() in modeMapping:
                modeChoice = modeMapping[getch.upper()]
            if modeChoice == "WEB" and getch.upper() == "S":
                if modeWebServer == "ON":
                    modeWebServer = "OFF"
                else:
                    modeWebServer = "ON"
        except:
            pass


def Output():
    # Continuously gather status from the other modules and drive the command-line interaction
    while True:
        TotalProxies = db.Database().fetch_all()
        NumProxies = len(TotalProxies) if TotalProxies is not None else "Read Error Database Locked"
        NumProxyCheckThreads = threading.activeCount() - proxy.InitialThreadNum
        cmd("cls")
        def showInfo():
            print("[!] 目前数据库中有%s个代理"%NumProxies)
            print("[!] 当前代理验证线程数量为: %s\n" %(NumProxyCheckThreads))
        # Render the interactive part of the interface below
        if modeChoice == "MENU":
            print(LOGO)
            showInfo()
            print(MENU)
        elif modeChoice == "INSTRUCTION":
            showInfo()
            print(INSTRUCTION)
            print(MENU)
        elif modeChoice == "WEB":
            showInfo()
            print("[S]Web服务器状态:"+modeWebServer)
            print("\n[!] Web功能目前还在开发中,敬请期待:"+"\n\t"+"http://github.com/eastrd/HighAnonProxyPool")
            print(MENU)
        elif modeChoice == "THREAD":
            showInfo()
            print("[!] 线程设置功能目前还在开发中,敬请期待:"+"\n\t"+"http://github.com/eastrd/HighAnonProxyPool")
            print(MENU)

def Initialise():
    # Start the scraper and proxy-verification modules
    print("[!] 启动爬虫模块...")
    threading.Thread(target=scraper_manager.start, name='Scraper Manager').start()
    print("[!] 启动验证模块...")
    threading.Thread(target=proxy.start, name='Proxy Manager').start()
    # Start the command-line interaction threads (input + output)
    threading.Thread(target=Input).start()
    threading.Thread(target=Output).start()
    while True:
        sleep(300)

Initialise()
--------------------------------------------------------------------------------
/Modules/interface.py:
--------------------------------------------------------------------------------
#-*- coding:UTF-8 -*-
# External interface module
# Author: 苍冥 e0t3rx

--------------------------------------------------------------------------------
/Modules/proxy.py:
--------------------------------------------------------------------------------
#-*- coding:UTF-8 -*-
# Proxy module
# Author: 苍冥 e0t3rx

import requests
import db
from time import sleep
import threading

# ProxyCheckerThread checks a single proxy so that many proxies can be verified concurrently
class ProxyCheckerThread(threading.Thread):
    def __init__(self, DirtyProxy):
        # DirtyProxy is a list containing ip (str), port (int), protocol (str)
        threading.Thread.__init__(self)
        self.DirtyProxy = DirtyProxy
    def run(self):
        #print("Starting Thread for checking %s:%s" %(self.DirtyProxy[0], self.DirtyProxy[1]))
        Proxy().check_ConnAnon(self.DirtyProxy)


class Proxy:
    def __init__(self):
        self.REQ_TIMEOUT = 8

    def check_ConnAnon(self, DirtyProxy):
        ip, port, protocol = DirtyProxy[0], DirtyProxy[1], DirtyProxy[2].lower()
        # If the proxy is not usable, the request ends up going out from the source IP by default
        proxies = { protocol: ip+":"+str(port) }

        # For now the visible IP is fetched from icanhazip; this may change in a later release
        try:
            MaskedIP = str(requests.get("http://icanhazip.com", timeout=self.REQ_TIMEOUT, proxies=proxies).content, "utf-8").replace("\n","")
        except Exception as e:
            # Delete the proxy on timeout, decode errors (e.g. "Error: the requested URL could not be retrieved",
            # i.e. the proxy restricts the target URL), dead proxies, or reset connections
            db.Database().delete(ip, port, protocol)
            return

        if MaskedIP != ip:
            # The reported IP differs from the proxy IP, so the proxy is not anonymous: delete the record
            db.Database().delete(ip, port, protocol)
        else:
            #print("[!] "+ip+":"+str(port)+" is Good!")
            pass

    def fetch_info(self):
        # To be implemented in a later release
        pass

    def ProxyWash(self):
        # Fetch all proxies via the database module and verify them with worker threads
        DirtyProxyList = db.Database().fetch_all()
        for ProxyRecord_tuples in DirtyProxyList:
            # If more than 100 threads are alive, wait (1 second at a time) before starting a new one
            while threading.activeCount() > 100:
                sleep(1)
            ProxyCheckerThread(ProxyRecord_tuples).start()
        #print("[!] 当前代理循环验证任务线程发布完毕\n\n")

# Initial thread count = main thread + this thread + scraper manager + CLI interaction threads (input + output) + number of scraper threads = 5 + scrapers
InitialThreadNum = 7

def start():
    while True:
        # Only start a new verification round once all checker threads have finished

        if threading.activeCount() <= InitialThreadNum:
            print("[!] 开始新一轮代理验证...")
            Proxy().ProxyWash()
        #else:
            #print("[!] 代理验证程序启动失败,等待线程数量为%s"%(threading.activeCount()-5))
        sleep(2)
--------------------------------------------------------------------------------
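For reference, a minimal stand-alone sketch of the anonymity check performed by Proxy.check_ConnAnon above (hypothetical code, not part of the repository; the proxy address 1.2.3.4:8080 is a placeholder). The proxies argument of requests maps a URL scheme to a proxy address, which is the shape check_ConnAnon builds; a proxy that cannot be used either raises an exception or lets the request go out from the real IP, and in both cases the record is dropped:

# Hypothetical stand-alone version of the check in Proxy.check_ConnAnon
import requests

ip, port, protocol = "1.2.3.4", 8080, "http"     # placeholder values
proxies = {protocol: ip + ":" + str(port)}       # scheme -> proxy address, as in check_ConnAnon
try:
    seen_ip = requests.get("http://icanhazip.com", timeout=8, proxies=proxies).text.strip()
    print("anonymous" if seen_ip == ip else "not anonymous (real IP leaked)")
except requests.RequestException:
    print("proxy dead, too slow, or refusing the request")
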
"+ip+":"+str(port)+" is Good!") 43 | pass 44 | 45 | def fetch_info(self): 46 | #后期更新再开发此功能 47 | pass 48 | 49 | def ProxyWash(self): 50 | #调用数据库模块的接口,获取全部代理,并启动多线程验证其有效性 51 | DirtyProxyList = db.Database().fetch_all() 52 | for ProxyRecord_tuples in DirtyProxyList: 53 | #如果同时存在多于100个线程,则等待10秒再开新线程 54 | while threading.activeCount() > 100: 55 | sleep(1) 56 | ProxyCheckerThread(ProxyRecord_tuples).start() 57 | #print("[!] 当前代理循环验证任务线程发布完毕\n\n") 58 | 59 | #初始线程数量 = 本线程 + 爬虫管理模块 + 命令界面交互模块(输入+输出) + 爬虫数 = 5 + 爬虫数 60 | InitialThreadNum = 7 61 | 62 | def start(): 63 | while True: 64 | #仅当所有子线程都运行完毕的时候再开始新一轮的验证 65 | 66 | if threading.activeCount() <= InitialThreadNum: 67 | print("[!] 开始新一轮代理验证...") 68 | Proxy().ProxyWash() 69 | #else: 70 | #print("[!] 代理验证程序启动失败,等待线程数量为%s"%(threading.activeCount()-5)) 71 | sleep(2) -------------------------------------------------------------------------------- /Modules/scraper_manager.py: -------------------------------------------------------------------------------- 1 | #-*- coding:UTF-8 -*- 2 | #爬虫管理模块 3 | #Author: 苍冥 e0t3rx 4 | 5 | import re 6 | import requests 7 | import base64 8 | import time 9 | from bs4 import BeautifulSoup as bs 10 | import db 11 | from time import sleep 12 | import threading 13 | 14 | ''' 15 | #ScraperManagerThread用来启动所有爬虫线程,以便同时爬取所有代理 16 | class ScraperManagerThread(threading.Thread): 17 | def __init__(self, IntervalDelay): 18 | #IntervalDelay是每轮爬取页面后的冷却时间(秒) 19 | threading.Thread.__init__(self) 20 | self.IntervalDelay = IntervalDelay 21 | def run(self): 22 | A() 23 | B() 24 | 25 | ''' 26 | 27 | IntervalDelay = 20 28 | 29 | def proxy_list_org(): 30 | #http://proxy-list.org 31 | print("[!] Starting proxy-list.org thread...") 32 | BASE_URL = "https://proxy-list.org/english/index.php?p=" 33 | Re_Pattern_IP = re.compile("(.*):") 34 | Re_Pattern_PORT = re.compile(":(.*)") 35 | while True: 36 | #print("[!] Scraping proxy-list.org...") 37 | for startingURL_Param in range(1,11): 38 | while True: 39 | try: 40 | #If there's an error duing the request, it will try to reconnect until succeed 41 | while True: 42 | try: 43 | HTML_ProxyPage = requests.get(BASE_URL+str(startingURL_Param)).content 44 | break 45 | except Exception as e: 46 | print("An Error occurred: "+str(e)) 47 | soup = bs(HTML_ProxyPage,"html.parser") 48 | for Raw_ProxyInfo in soup.find_all("ul",{"class":None}): 49 | ip_port = str(base64.b64decode(Raw_ProxyInfo.find("li",{"class":"proxy"}).text.replace("Proxy('","").replace("')","")), "utf-8") 50 | IP = re.findall(Re_Pattern_IP, ip_port)[0] 51 | PORT = re.findall(Re_Pattern_PORT, ip_port)[0] 52 | PROTOCOL = Raw_ProxyInfo.find("li",{"class":"https"}).text 53 | if PROTOCOL != "-": 54 | db.Database().add(IP,PORT,PROTOCOL.lower()) 55 | break 56 | except Exception as e: 57 | print("An error occurred with proxy_list_org: "+str(e)) 58 | sleep(IntervalDelay) 59 | 60 | 61 | def incloak_com(): 62 | #http://inclock.com 63 | print("[!] Starting incloak.com thread...") 64 | RE_Pattern_IPaddr = re.compile("[0-9\.].*") 65 | while True: 66 | try: 67 | #print("[!] Scraping incloak.com...") 68 | soup = bs(requests.get("https://incloak.com/proxy-list/?anon=234#list").content,"html.parser") 69 | for RAW_ProxyInfo in soup.find_all("tr"): 70 | #Length is checked so not to include the skeleton frame