├── .gitignore ├── Cores ├── ProxyManage │ ├── ProxyManage.py │ └── __init__.py ├── ProxySpider │ ├── ProxySpider.py │ ├── __init__.py │ └── spiders │ │ ├── KDLHASpider.py │ │ ├── KPSpider.py │ │ ├── _ExampleSpider.py │ │ └── __init__.py ├── WebSpider │ ├── WebClicker.py │ ├── WebSpider.py │ └── __init__.py └── __init__.py ├── CreateTables.py ├── LICENSE ├── Pansidong.py ├── README.md ├── ThirdParty └── phantomjs │ ├── phantomjs │ ├── phantomjs-mac │ └── phantomjs.exe ├── config.ini.sample ├── mytest.py ├── requirements.txt └── utils ├── ArgParser ├── ArgParse.py ├── Messages.py ├── ParseCommandArgs.py └── __init__.py ├── AutoLoad.py ├── DBConnection ├── DBConnection.py └── __init__.py ├── Data ├── Enum.py ├── LoggerHelp.py ├── SaveData.py ├── Tables.py └── __init__.py ├── SpiderBase.py ├── ThreadPool.py ├── ThreadPool2.py └── __init__.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | .idea/ 92 | proxy-ip-list.csv 93 | config.ini 94 | 95 | -------------------------------------------------------------------------------- /Cores/ProxyManage/ProxyManage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import sys 5 | import datetime 6 | import time 7 | import Queue 8 | import platform 9 | 10 | import requests 11 | import prettytable 12 | from sqlalchemy.exc import SQLAlchemyError 13 | 14 | from utils.Data.LoggerHelp import logger 15 | from utils.DBConnection import DBConnection 16 | from utils.Data.Tables import Proxy 17 | from utils.ThreadPool2 import ThreadPool 18 | 19 | __author__ = "lightless" 20 | __email__ = "root@lightless.me" 21 | 22 | 23 | class ProxyManage(object): 24 | 25 | def __init__(self, **kwargs): 26 | """ 27 | 初始化代理地址检查 28 | :param kwargs: 29 | all: True: 检测数据库中全部的IP, False:检测指定的IP列表,以逗号分隔 30 | ips: 待检测的IP,"1.1.1.1:80, 2.2.2.2:3128" 31 | """ 32 | super(ProxyManage, self).__init__() 33 | 34 | # request headers 35 | self.headers = { 36 | 
def check(self):
    """
    Check proxy liveness according to the constructor keyword arguments.

    When ``all`` is truthy, every proxy stored in the database is checked.
    Otherwise, when ``ips`` was supplied, that comma-separated
    ``"ip:port"`` string is checked. With neither option nothing happens.

    :rtype: None
    """
    # Test truthiness instead of "is not None": the constructor documents
    # all=False as "check the given IP list", so a False value must not
    # trigger a full database scan and shadow an explicit ``ips`` argument.
    if self.kwargs.get("all"):
        self._check_ip_all()
        return

    ips = self.kwargs.get("ips")
    if ips is not None:
        self._check_ip_list(ips)
def _check_ip_list(self, raw_ips):
    """
    Check every proxy listed in a comma-separated ``"ip:port"`` string.

    Each well-formed entry is probed with ``self._check``; the outcome is
    logged and written back through ``self._update_db``. Entries that are
    not of the form ``ip:port`` are reported and skipped.

    :param raw_ips: string such as ``"1.1.1.1:80, 2.2.2.2:3128"``
    :rtype: None
    """
    if not raw_ips:
        logger.fatal("No IP provide.")
        sys.exit(1)

    for raw_entry in raw_ips.split(","):
        entry = raw_entry.strip()
        parts = entry.split(":")
        # A missing port used to raise an uncaught IndexError (the old
        # handler only caught KeyError); validate the shape up front.
        if len(parts) != 2 or not parts[0] or not parts[1]:
            logger.error("Invalid proxy address '{0}', expected ip:port. Skipping...".format(entry))
            continue

        ip, port = parts
        alive, delay = self._check(ip, port)
        if alive:
            logger.info("IP {0} Connect {1}, time: {2:.2f}s".format(entry, "success", delay))
        else:
            logger.error("IP {0} Connect failed.".format(entry))
        self._update_db(ip, port, delay, alive)
def _update_db(self, ip, port, delay, alive):
    """
    Store the result of one proxy check in the database.

    An existing row matching ``ip`` and ``port`` is updated in place;
    otherwise a new ``Proxy`` row is inserted. The process exits with
    status 1 if the commit fails.

    :param ip: proxy IP address
    :param port: proxy port
    :param delay: measured response time, stored in ``times``
    :param alive: truthy when the proxy answered, stored as 1/0 in ``is_alive``
    :rtype: None
    """
    now = datetime.datetime.now()
    is_alive = 1 if alive else 0

    matches = self.session.query(Proxy).filter(Proxy.ip == ip, Proxy.port == port).all()
    if matches:
        # The proxy is already known: refresh its status.
        record = matches[0]
        record.updated_time = now
        record.times = delay
        record.is_alive = is_alive
    else:
        # Unknown proxy: insert it. The liveness flag must reflect the
        # check result (it used to be hard-coded to 1 for new rows).
        record = Proxy(
            ip=ip, port=port, proxy_type=None, location=None, protocol=None, times=delay, is_alive=is_alive,
            created_time=now, updated_time=now
        )

    try:
        self.session.add(record)
        self.session.commit()
    except SQLAlchemyError as e:
        logger.error("Error while update proxy information to database.")
        # str(e) is always populated, unlike the deprecated e.message.
        logger.error(str(e))
        sys.exit(1)
class ProxySpider(object):
    """
    Run the loaded proxy spiders in a thread pool and save what they find.

    Spiders are discovered through ``AutoLoad``; their results are handed to
    ``SaveData``, which writes them to a file and/or the database.
    """

    def __init__(self, output_file=True, output_db=True, output_filename="proxy-ip-list.csv"):
        """
        :param output_file: passed to ``SaveData`` as ``use_file``
        :param output_db: passed to ``SaveData`` as ``use_database``
        :param output_filename: passed to ``SaveData`` as ``filename``
        """
        # Spider plugin loader.
        self.al = AutoLoad()
        # Created lazily in start().
        self.tp = None
        self.sd = None
        self.write_file_tp = None
        # Optional pool sizes; None means "use the ThreadPool default".
        self.spider_threads = None
        self.save_data_threads = None
        # Output options.
        self.output_file = output_file
        self.output_db = output_db
        self.output_filename = output_filename

    def load(self, *spiders):
        """Load the named spiders through the ``AutoLoad`` instance."""
        self.al.load(*spiders)

    def set_threads(self, spider_threads=0, save_data_threads=0):
        """
        Override the pool sizes; non-positive values leave a setting unchanged.

        :param spider_threads: size of the pool that runs the spiders
        :param save_data_threads: size of the pool that writes results
        """
        if spider_threads > 0:
            self.spider_threads = spider_threads
        if save_data_threads > 0:
            self.save_data_threads = save_data_threads

    def start(self):
        """
        Run every loaded spider, then write out the collected results.

        Exits the process with status 1 when no spider is loaded.
        """
        if not len(self.al.spiders):
            logger.error("No Spiders loaded. exit.")
            sys.exit(1)

        names = [s.__class__.__name__ for s in self.al.spiders]
        logger.info("Loaded spiders: " + ", ".join(names))

        # Spider pool: queue each spider's run method and start without
        # blocking so the writer below can consume results concurrently.
        if self.spider_threads:
            self.tp = ThreadPool(self.spider_threads)
        else:
            self.tp = ThreadPool()
        for sp in self.al.spiders:
            self.tp.add_function(sp.run)
        self.tp.run(join=False)

        # Result writer.
        self.sd = SaveData(self.al.results, self.tp, use_file=self.output_file, use_database=self.output_db,
                           filename=self.output_filename)
        # Honour the configured writer pool size; a second unconditional
        # ThreadPool() assignment used to discard it.
        if self.save_data_threads:
            self.write_file_tp = ThreadPool(self.save_data_threads)
        else:
            self.write_file_tp = ThreadPool()
        self.write_file_tp.add_function(self.sd.write)
        self.write_file_tp.run()
class KDLHASpider(SpiderBase):
    """Proxy spider for the kuaidaili.com proxy list pages."""

    def __init__(self):
        SpiderBase.__init__(self)
        self.url = "http://www.kuaidaili.com/free/inha/"
        self.tag = "快代理-每日更新"
        self.type = "HTTP"

    def run(self):
        """
        Scrape list pages 1-10 concurrently and queue the merged result.

        The queued value is ``[info, items]`` where ``info`` holds this
        spider's ``url``/``type``/``tag`` and ``items`` is the list of
        proxy dictionaries produced by ``my_run``.
        """
        # e.g. http://www.kuaidaili.com/proxylist/1/
        pool = Pool()
        try:
            pages = pool.map(self.my_run, range(1, 11))
        finally:
            # Release the worker threads even if a page failed.
            pool.close()
            pool.join()

        t_result = []
        for page_items in pages:
            t_result += page_items

        # Assemble the result set.
        info = {
            'url': self.url,
            'type': self.type,
            'tag': self.tag,
        }
        self.result_queue.put([info, t_result])

    def my_run(self, page):
        """
        Scrape one list page.

        :param page: page number substituted into the list URL
        :return: list of dictionaries with the keys ``ip``, ``port``,
                 ``type``, ``protocol``, ``location`` and ``time``
        """
        raw_url = "http://www.kuaidaili.com/proxylist/{page}/"
        url = raw_url.replace("{page}", str(page))
        logger.debug(url)

        driver = webdriver.PhantomJS(executable_path=self.phantomjs_path)
        try:
            driver.get(url)
            raw_html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
        finally:
            # Always shut the browser down; every call used to leave a
            # PhantomJS process behind.
            driver.quit()

        soup = BeautifulSoup(raw_html, "html5lib")
        t_result = list()
        # The first <tr> is skipped (presumably the table header).
        for tr in soup.find_all("tr")[1:]:
            td = tr.find_all("td")
            # Cells 0-6 are read below; skip rows that are too short
            # instead of aborting the whole page.
            if len(td) < 7:
                continue

            each_item = {}
            each_item['ip'] = td[0].get_text()
            each_item['port'] = td[1].get_text()
            each_item['type'] = td[2].get_text()
            each_item['protocol'] = td[3].get_text().replace(", ", "-")
            each_item['location'] = td[5].get_text()
            # Keep only digits and the decimal point of the response time.
            each_item['time'] = filter(lambda ch: ch in '0123456789.', td[6].get_text().encode("utf8"))
            t_result.append(each_item)
        return t_result
class KPSpider(SpiderBase):
    """Proxy spider for the site-digger.com proxy list page."""

    def __init__(self):
        SpiderBase.__init__(self)
        self.url = "http://www.site-digger.com/html/articles/20110516/proxieslist.html"
        self.tag = "鲲鹏-全球-每日更新"
        self.type = "HTTP"

    def run(self):
        """
        Scrape the proxy table and queue the result.

        The queued value is ``[info, items]`` where ``info`` holds this
        spider's ``url``/``type``/``tag`` and ``items`` is a list of
        dictionaries with the keys ``ip``, ``port``, ``type``,
        ``location`` and ``time``.
        """
        url = self.url
        logger.debug(url)

        driver = webdriver.PhantomJS(executable_path=self.phantomjs_path)
        try:
            driver.get(url)
            raw_html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
        finally:
            # Always shut the browser down; it used to be left running.
            driver.quit()

        soup = BeautifulSoup(raw_html, "html5lib")
        table = soup.find("tbody")
        # A page without <tbody> yields an empty result instead of an
        # AttributeError that would prevent anything from being queued.
        rows = table.find_all("tr") if table is not None else []

        t_result = []
        for tr in rows:
            td = tr.find_all("td")
            th = tr.find_all("th")
            # The first cell's text is split on ";" and the second piece
            # is read as "ip:port".
            address = td[0].get_text().split(";")[1].split(":")

            each_item = dict()
            each_item['ip'] = address[0]
            each_item['port'] = address[1]
            each_item['type'] = td[1].get_text()
            each_item['location'] = td[2].get_text().strip()
            each_item['time'] = th[0].get_text()
            t_result.append(each_item)

        info = {
            'url': self.url,
            'type': self.type,
            'tag': self.tag,
        }
        self.result_queue.put([info, t_result])
class ExampleSpider:
    """Template spider showing the result format a plugin must produce."""

    def __init__(self):
        # URL to crawl.
        self.url = "Your url here."
        # Proxy kind: HTTP, shadowsocks or VPN.
        self.type = "HTTP"
        # Free-form remark; filling it in is recommended to tell sources apart.
        self.tag = "鲲鹏-全球-高匿代理"
        # Result queue, injected through set_result_queue().
        self.result_queue = None

    def set_result_queue(self, result_queue):
        """Attach the queue that run() puts its result on."""
        self.result_queue = result_queue

    def run(self):
        """
        Put one example result on the queue.

        The result is ``[info, items]``: ``info`` carries ``url``, ``type``
        and ``tag``; each item carries ``ip``, ``port``, ``type``,
        ``protocol``, ``location`` and ``time``.
        """
        # TODO: Add your process here...
        # TODO: delete these lines below, just an example...
        items = [
            {"ip": "11.22.33.44", "port": "8080", "type": u"透明", "protocol": "HTTP", "location": u"Taiwan", "time": "2.6"},
            {"ip": "22.33.44.55", "port": "3128", "type": u"高匿", "protocol": "HTTPS/HTTP", "location": u"江苏省南京市 联通", "time": "5"},
        ]
        info = {
            "url": self.url,
            "type": self.type,
            # "tag" is part of the documented result format and was missing.
            "tag": self.tag,
        }
        self.result_queue.put([info, items])
def __init__(self, target, deep=1, limit_domain=None, thread_count=cpu_count()*2,
             phantomjs_count=cpu_count(), filter_similar=False):
    """
    Set up crawl state and start the PhantomJS driver pool.

    :param target: URL the crawl starts from
    :param deep: maximum depth a discovered link may have to be queued
    :param limit_domain: list of domains the crawl is restricted to;
                         defaults to the host extracted from ``target``
    :param thread_count: size of the spider thread pool
    :param phantomjs_count: number of PhantomJS drivers to start
    :param filter_similar: whether structurally similar URLs are dropped
    """
    # Set up the phantomjs path.
    SpiderBase.__init__(self)
    SpiderBase.set_phantomjs_path(self)

    # Parameters.
    self.target = target
    self.deep = deep
    if limit_domain:
        self.limit_domain = limit_domain
    else:
        # Must be a list: check_domain_limit iterates over it, and a bare
        # string would be walked one character at a time.
        self.limit_domain = [".".join(tldextract.extract(self.target))]
    self.thread_count = thread_count
    self.phantomjs_count = phantomjs_count
    self.filter_similar = filter_similar

    # Sets used for de-duplication.
    self.url_set = set()
    self.url_param_set = set()
    # Crawl results.
    self.links = list()
    # Queue of (url, depth) tuples waiting to be crawled.
    self.task_queue = Queue.Queue()
    self.spider_pool = None

    # Seed the queue with the initial target at depth 0.
    self.task_queue.put((self.target, 0))

    # Statistics.
    self.raw_links_num = 0
    self.filter_links_num = 0
    self.links_num = 0

    # webdriver capabilities (the original author notes these seem to
    # have no effect).
    self.dcap = dict(DesiredCapabilities.PHANTOMJS)
    self.dcap["phantomjs.page.settings.resourceTimeout"] = 10
    self.dcap["phantomjs.page.settings.loadImages"] = False

    # One flag per element; a missing comma used to fuse the log-file and
    # load-images flags into a single bogus argument.
    self.service_args = [
        "--webdriver-loglevel=DEBUG",
        "--webdriver-logfile=phantomjs.log",
        "--load-images=no",
        "--disk-cache=true",
    ]

    # Pool of webdriver instances, each paired with its own lock.
    logger.info("initial web spider phantomjs process pool...")
    self.driver_pool = list()
    self.driver_pool_lock = list()
    for i in range(self.phantomjs_count):
        self.driver_pool.append(
            webdriver.PhantomJS(executable_path=self.phantomjs_path, desired_capabilities=self.dcap,
                                service_args=self.service_args)
        )
        self.driver_pool_lock.append(threading.Lock())
        logger.info("%.2f%% finished." % ((float(i + 1) * 100) / float(self.phantomjs_count)))
    logger.info("initial finished.")
_start(self, target): 131 | logger.debug("start spider " + target[0]) 132 | deep = target[1] 133 | target = target[0] 134 | 135 | # 随机取一个phantomjs进程 136 | phantomjs_tag = random.randint(0, self.phantomjs_count-1) 137 | 138 | self.driver_pool_lock[phantomjs_tag].acquire() 139 | retry_times = 2 140 | while retry_times: 141 | try: 142 | self.driver_pool[phantomjs_tag].get(target) 143 | break 144 | except: 145 | # driver.close() 146 | logger.error("retry %d" % retry_times) 147 | retry_times -= 1 148 | if not retry_times: 149 | logger.warn("Time out when get %s HTML" % target) 150 | self.driver_pool_lock[phantomjs_tag].release() 151 | return 152 | else: 153 | continue 154 | 155 | # 获取网页HTML 156 | raw_html = self.driver_pool[phantomjs_tag].execute_script( 157 | "return document.getElementsByTagName('html')[0].innerHTML" 158 | ) 159 | # 获取网页加载过程中发生的HTTP请求 160 | http_log = json.loads(self.driver_pool[phantomjs_tag].get_log("har")[0]["message"])["log"]["entries"] 161 | # 获取当前的页面URL 162 | base_url = self.driver_pool[phantomjs_tag].current_url 163 | # 释放锁 164 | self.driver_pool_lock[phantomjs_tag].release() 165 | 166 | soup = BeautifulSoup(raw_html, "html5lib") 167 | logger.debug("Get %s HTML done. 
@staticmethod
def format_url_param(url):
    """
    Normalise the query string of ``url`` for similarity de-duplication.

    Purely numeric parameter values are blanked, so URLs that differ only
    in numeric values (``?id=1`` and ``?id=2``) map to the same string.
    Parameters without ``=`` are kept as they are, and empty segments
    (``a=1&&b=2``) are dropped.

    :param url: URL to normalise
    :return: the rewritten URL, or None when ``url`` has no query string
    """
    url_st = urlparse.urlparse(url)
    if not url_st.query:
        return None

    normalised = []
    for pair in url_st.query.split("&"):
        if not pair:
            continue
        # partition() tolerates a missing "="; indexing the result of
        # split("=") used to raise IndexError for such segments.
        key, sep, value = pair.partition("=")
        if value.isdigit():
            value = ""
        normalised.append(key + sep + value)

    return urlparse.urlunparse((
        url_st.scheme,
        url_st.netloc,
        url_st.path,
        url_st.params,
        "&".join(normalised),
        url_st.fragment,
    ))
def check_domain_limit(self, url):
    """
    Tell whether ``url`` belongs to one of the allowed domains.

    An allowed domain whose subdomain part is ``*`` or empty matches any
    URL with the same domain label; any other entry must equal the URL's
    dot-joined extraction exactly.

    :param url: URL to test
    :return: True when the URL is inside ``self.limit_domain``, else False
    """
    allowed = self.limit_domain
    if not allowed:
        return False
    # Accept a single domain given as a plain string; iterating a string
    # would compare the URL against individual characters.
    if not isinstance(allowed, (list, tuple, set, frozenset)):
        allowed = [allowed]

    # The URL does not change inside the loop, so extract it once.
    url_parts = tldextract.extract(url)
    url_host = ".".join(url_parts)

    for domain in allowed:
        ext = tldextract.extract(domain)
        # "*" (or no subdomain) matches every subdomain of that domain.
        if ext[0] in ("*", "") and url_parts[1] == ext[1]:
            return True
        # Otherwise only the exact host matches.
        if url_host == domain:
            return True

    return False
if __name__ == "__main__":
    # Read the database settings: the [Pansidong] section names the section
    # that holds the connection parameters.
    cf = ConfigParser.ConfigParser()
    if not cf.read("config.ini"):
        # read() returns the list of files it parsed; an empty list means
        # config.ini is missing or unreadable.
        logger.error("Can not read config.ini.")
        raise SystemExit(1)
    db_name = cf.get("Pansidong", "database")
    username = cf.get(db_name, "username")
    password = cf.get(db_name, "password")
    host = cf.get(db_name, "host")
    database = cf.get(db_name, "database")

    engine = create_engine("mysql://" + username + ":" + password + "@" + host + "/" + database)
    try:
        # Create every table registered on Proxy's metadata.
        Proxy.metadata.create_all(engine)
        logger.debug("Tables create success.")
    except Exception as e:
        # str(e) is always populated, unlike the deprecated e.message.
        logger.error(str(e))
Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 
39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 
76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 
113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. 
For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 
174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 
209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. 
Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. 
If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 
309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 
336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 
360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. 
If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 
428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | {one line to give the program's name and a brief idea of what it does.} 635 | Copyright (C) {year} {name of author} 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <http://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail.
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | {project} Copyright (C) {year} {fullname} 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <http://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <http://www.gnu.org/philosophy/why-not-lgpl.html>.
675 | -------------------------------------------------------------------------------- /Pansidong.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # coding: utf-8 3 | # file: Pansidong.py 4 | # time: 2016/8/5 23:48 5 | 6 | from utils.ArgParser import ParseCommandArgs 7 | 8 | __author__ = "lightless" 9 | __email__ = "root@lightless.me" 10 | 11 | 12 | """ 13 | 项目入口文件 14 | """ 15 | 16 | 17 | def main(): 18 | parse = ParseCommandArgs.ParseCommandArgs() 19 | parse.start_parse() 20 | 21 | 22 | if __name__ == '__main__': 23 | main() 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pansidong 2 | 自动化WEB漏洞扫描器 3 | 4 | ## 别 star 了,烂尾了 5 | ## 别 star 了,烂尾了 6 | ## 别 star 了,烂尾了 7 | 8 | 9 | ## TODO 10 | 11 | ### 线程池相关 12 | * 使用gevent改进进程池 -> 编写了新的线程池 13 | * 使用新的线程池替换旧的线程池 14 | 15 | ### 代理池管理部分 16 | * 检查代理存活时改用多线程 17 | 18 | ### Web爬虫相关 19 | * Web爬虫深度控制 20 | * Web爬虫关键字控制 21 | * Web爬虫多线程控制 22 | * Web爬虫的目标域限制 23 | * 去重算法优化 24 | * 增加爬取form的action部分,src属性,以javascript:开头的href,有onclick属性,form的action,自动提交。 25 | * 获取打开页面时加载的ajax请求 26 | * 增加Web代理功能,让浏览器代理到爬虫上,手工点击增加链接数量 27 | * 从代理池中获取可用的代理并自动利用代理爬取 28 | * 增加cookie支持 29 | * 增加UA支持 30 | 31 | ### 杂项 32 | * 增加log的彩色输出 33 | 34 | ## 更新日志 35 | * 2016-7-24 36 | * Pansidong v0.1.1 版本完成。 37 | * 增加 phantomjs的进程池,保证稳定性。 38 | * 修复 线程锁的释放位置,保证线程间数据完整性。 39 | * 增加 记录HTTP子请求的功能,增加了爬取到的线程数量。 40 | * 2016-7-23 41 | * Pansidong v0.1.0 版本完成。 42 | * 增加 爬虫深度控制。 43 | * 增加 爬虫多线程控制。 44 | * 增加 新的线程池,比以前更有效率。 45 | * 优化 爬虫爬取时消耗的资源更少,时间更短。 46 | * 2016-7-20 47 | * 增加 爬虫爬取范围的控制 48 | * 2016-7-17 49 | * 增加 第一版Web爬虫。可以初步过滤URL相似以及重复。 50 | * 增加 对Mac系统的支持。 51 | * 2016-7-16 52 | * 修改proxy表字段,增加是否存活的字段。 53 | * 2016-7-10 54 | * ProxySpider v1.0.4版本完成。 55 | * ProxySpider封装完成,已经作为模块导入到盘丝洞中:https://github.com/LiGhT1EsS/Pansidong 56 |
-------------------------------------------------------------------------------- /ThirdParty/phantomjs/phantomjs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightless233/Pansidong/dd8691d356a50d522d68dbaa519b77b890bd0de3/ThirdParty/phantomjs/phantomjs -------------------------------------------------------------------------------- /ThirdParty/phantomjs/phantomjs-mac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightless233/Pansidong/dd8691d356a50d522d68dbaa519b77b890bd0de3/ThirdParty/phantomjs/phantomjs-mac -------------------------------------------------------------------------------- /ThirdParty/phantomjs/phantomjs.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightless233/Pansidong/dd8691d356a50d522d68dbaa519b77b890bd0de3/ThirdParty/phantomjs/phantomjs.exe -------------------------------------------------------------------------------- /config.ini.sample: -------------------------------------------------------------------------------- 1 | # ProxySpider config file. 2 | # This is only an example file. 3 | 4 | # ProxySpider configuration 5 | [Pansidong] 6 | # Whether debug logging is enabled. The value of this option is "On" or "Off". 7 | # On - enable debug log 8 | # Off - disable debug log. 9 | debug = On 10 | 11 | # Which database to use. 12 | # Currently only MySQL is supported.
13 | database = MySQL 14 | 15 | [MySQL] 16 | username = root 17 | password = root 18 | database = proxyspider 19 | host = localhost 20 | -------------------------------------------------------------------------------- /mytest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # coding: utf-8 3 | # file: mytest.py 4 | # time: 2016/7/13 21:34 5 | import json 6 | import threading 7 | import time 8 | 9 | import tldextract 10 | from selenium.webdriver import DesiredCapabilities 11 | 12 | from Cores.ProxySpider.ProxySpider import ProxySpider 13 | from Cores.WebSpider.WebSpider import WebSpider 14 | 15 | 16 | 17 | # ps = ProxySpider() 18 | # ps.load() 19 | # ps.start() 20 | # 21 | # pm = ProxyManage() 22 | # pm.check() 23 | 24 | # proxies = { 25 | # "http": "113.124.68.42:8998", 26 | # } 27 | # 28 | # headers = { 29 | # 'User-Agent': 'curl/7.49.1', 30 | # } 31 | # 32 | # start_time = time.time() 33 | # r = requests.get("http://ip.cn", headers=headers, proxies=proxies) 34 | # elapsed_time = time.time() - start_time 35 | # print elapsed_time 36 | # print r.content 37 | 38 | # for i in xrange(10): 39 | # print i 40 | # try: 41 | # x = i / 0 42 | # print "try here" 43 | # except Exception, e: 44 | # print e.message 45 | # continue 46 | # finally: 47 | # print "finally here" 48 | # print "try here 2" 49 | from utils.data.LoggerHelp import logger 50 | 51 | web_spider = WebSpider( 52 | target="http://www.yundaex.com/", 53 | limit_domain=['*.yundaex.com'], 54 | deep=5, 55 | thread_count=50 56 | ) 57 | web_spider.do_spider() 58 | # web_spider.start() 59 | # while True: 60 | # time.sleep(1) 61 | # print web_spider.links 62 | # time.sleep(1) 63 | while True: 64 | time.sleep(5) 65 | logger.debug("Alive thread: %d" % web_spider.spider_pool.working_thread_number) 66 | logger.debug("Left tasks number: %d" % web_spider.task_queue.qsize()) 67 | logger.debug("links num before filter: %d" % web_spider.raw_links_num) 68 | 
logger.debug("links num after filter: %d" % web_spider.filter_links_num) 69 | 70 | if web_spider.spider_pool.working_thread_number == 0: 71 | break 72 | # 73 | # print web_spider.links 74 | with open("urls.txt", "w") as ff: 75 | for url in web_spider.links: 76 | ff.write(url.decode('utf8') + "\n") 77 | 78 | # urls = ['www.lightless.me', 'www.baidu.com'] 79 | # jobs = [gevent.spawn(socket.gethostbyname, url) for url in urls] 80 | # # gevent.joinall(jobs) 81 | # res = [job.value for job in jobs] 82 | # print res 83 | # 84 | # def func(): 85 | # return '123' 86 | # 87 | # tp = ThreadPool() 88 | # tp.add_task_run(socket.gethostbyname, 'www.lightless.me') 89 | # tp.add_task_run(socket.gethostbyname, 'www.baidu.com') 90 | # # tp.add_task_run(func) 91 | # 92 | # print tp.value.get().value 93 | # print tp.value.get().value 94 | # # print tp.value.get().value 95 | 96 | 97 | # from selenium import webdriver 98 | # 99 | # service_args = [ 100 | # '--load-images=no', 101 | # ] 102 | # dcap = dict(DesiredCapabilities.PHANTOMJS) 103 | # dcap["phantomjs.page.settings.resourceTimeout"] = 10 104 | # dcap["phantomjs.page.settings.loadImages"] = False 105 | # driver = webdriver.PhantomJS(executable_path='ThirdParty/phantomjs/phantomjs.exe', service_args=service_args) 106 | # driver.get("http://www.china-pub.com") 107 | # x = driver.get_log('har') 108 | 109 | # d = json.loads(x[0]['message']) 110 | # print d 111 | # print type(d) 112 | # print d.keys() 113 | # print "=====" 114 | # print d['log'] 115 | # print d['log'].keys() 116 | # print "=== entries ===" 117 | # print d['log']['entries'] 118 | # print d['log']['entries'][0].keys() 119 | # print d['log']['entries'][0]['request'].keys() 120 | # print d['log']['entries'][0]['request']['url'] 121 | # print d['log']['entries'][0]['request']['cookies'] 122 | # print d['log']['entries'][0]['request']['queryString'] 123 | # print d['log']['entries'][0]['request']['method'] 124 | # print d['log']['entries'][0]['request']['headers'] 125 | 
126 | # for i in d['log']['entries']: 127 | # print i['request']['method'], i['request']['url'], i['request']['queryString'] 128 | # 129 | # for i in d['log']['pages']: 130 | # print i['id'] 131 | # reqMonitoring = json.loads(driver.get_log("har")[0]["message"])["log"]["entries"] 132 | # print reqMonitoring 133 | # for i in reqMonitoring: 134 | # print i['request']['method'], i['request']['url'], i['request']['queryString'] 135 | # 136 | # driver.quit() 137 | 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightless233/Pansidong/dd8691d356a50d522d68dbaa519b77b890bd0de3/requirements.txt -------------------------------------------------------------------------------- /utils/ArgParser/ArgParse.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # coding: utf-8 3 | # file: ArgParse.py 4 | # time: 2016/8/6 10:45 5 | 6 | import argparse 7 | 8 | from utils.ArgParser.Messages import Banner 9 | 10 | __author__ = "lightless" 11 | __email__ = "root@lightless.me" 12 | __all__ = ["pansidong_parse"] 13 | 14 | # 初始化参数 15 | pansidong_parse = argparse.ArgumentParser(description=Banner, formatter_class=argparse.RawTextHelpFormatter) 16 | 17 | # 设置命令组 18 | misc_group = pansidong_parse.add_argument_group("Misc") 19 | proxy_group = pansidong_parse.add_argument_group("Proxy") 20 | attack_group = pansidong_parse.add_argument_group("Attack") 21 | spider_group = pansidong_parse.add_argument_group("Spider") 22 | 23 | # 添加Misc组的命令 24 | misc_group.add_argument("--version", help="Show program version.", action="store_true") 25 | 26 | # 添加Proxy组的命令 27 | proxy_group.add_argument("--update-proxy-db", help="Update proxy IP Address.", action="store_true") 28 | proxy_group.add_argument("--check-proxy", metavar="IP:PORT", type=str, help="Check proxy 
availability.") 29 | proxy_group.add_argument("--check-proxy-all", help="Check ALL proxy availability. !!VERY SLOW!!", action="store_true") 30 | proxy_group.add_argument("--get-alive-proxy", help="Get all alive proxy from db. e.g. --get-alive-proxy 100, 2", 31 | type=str, metavar="[count[, delay]]") 32 | proxy_group.add_argument("--clean-db", help="Clean the dead proxy from db.", action="store_true") 33 | 34 | # 添加Attack组的命令 35 | 36 | 37 | # 添加spider组的命令 38 | 39 | -------------------------------------------------------------------------------- /utils/ArgParser/Messages.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # coding: utf-8 3 | # file: Messages.py 4 | # time: 2016/8/6 11:20 5 | 6 | __author__ = "lightless" 7 | __email__ = "root@lightless.me" 8 | 9 | Version = "0.1b Beta" 10 | 11 | Banner = u""" 12 | .______ ___ .__ __. _______. __ _______ ______ .__ __. _______ 13 | | _ \ / \ | \ | | / || | | \ / __ \ | \ | | / _____| 14 | | |_) | / ^ \ | \| | | (----`| | | .--. || | | | | \| | | | __ 15 | | ___/ / /_\ \ | . ` | \ \ | | | | | || | | | | . 
` | | | |_ | 16 | | | / _____ \ | |\ | .----) | | | | '--' || `--' | | |\ | | |__| | 17 | | _| /__/ \__\ |__| \__| |_______/ |__| |_______/ \______/ |__| \__| \______| 18 | 19 | [*] 盘丝洞 - 自动化Web漏洞挖掘系统 20 | [*] Powered By lightless 21 | [*] Version: {0} 22 | """.format(Version) 23 | 24 | -------------------------------------------------------------------------------- /utils/ArgParser/ParseCommandArgs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # coding: utf-8 3 | # file: ParseCommandArgs.py 4 | # time: 2016/8/6 11:59 5 | 6 | import sys 7 | 8 | from Cores.ProxySpider import ProxySpider 9 | from Cores.ProxyManage import ProxyManage 10 | from utils.ArgParser import ArgParse 11 | from utils.ArgParser.Messages import Version 12 | from utils.Data.LoggerHelp import logger 13 | 14 | __author__ = "lightless" 15 | __email__ = "root@lightless.me" 16 | 17 | __all__ = ["ParseCommandArgs"] 18 | 19 | 20 | # TODO: 按照不同的分类,拆分这个类 21 | class ParseCommandArgs(object): 22 | def __init__(self): 23 | super(ParseCommandArgs, self).__init__() 24 | self.command_args = ArgParse.pansidong_parse.parse_args() 25 | 26 | def start_parse(self): 27 | # --version 28 | if self.command_args.version: 29 | print Version 30 | sys.exit(0) 31 | 32 | # --update-proxy-db 33 | if self.command_args.update_proxy_db: 34 | logger.debug("Update Proxy DB selected.") 35 | ps = ProxySpider.ProxySpider() 36 | ps.load() 37 | ps.start() 38 | sys.exit(0) 39 | 40 | # --check-proxy 41 | if self.command_args.check_proxy: 42 | logger.debug("Check proxy selected.") 43 | ips = self.command_args.check_proxy 44 | logger.debug(ips) 45 | pm = ProxyManage.ProxyManage(ips=ips) 46 | pm.check() 47 | sys.exit(0) 48 | 49 | # --check-proxy-all 50 | if self.command_args.check_proxy_all: 51 | logger.debug("Check all proxy selected.") 52 | pm = ProxyManage.ProxyManage(all=True) 53 | pm.check() 54 | sys.exit(0) 55 | 56 | # --get-alive-proxy 57 | if 
self.command_args.get_alive_proxy: 58 | logger.debug("Get alive proxy selected.") 59 | logger.debug(self.command_args.get_alive_proxy) 60 | pm = ProxyManage.ProxyManage() 61 | params = self.command_args.get_alive_proxy 62 | if "," in params: 63 | amount = params.split(",")[0].strip() 64 | delay = params.split(",")[1].strip() 65 | pm.get_alive_proxy(amount, delay) 66 | else: 67 | pm.get_alive_proxy(params.strip()) 68 | 69 | # --clean-db 70 | if self.command_args.clean_db: 71 | logger.debug("Clean db selected.") 72 | pm = ProxyManage.ProxyManage() 73 | pm.clean_dead_proxy() 74 | 75 | -------------------------------------------------------------------------------- /utils/ArgParser/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # coding: utf-8 3 | # file: __init__.py 4 | # time: 2016/8/6 10:45 5 | 6 | __author__ = "lightless" 7 | __email__ = "root@lightless.me" -------------------------------------------------------------------------------- /utils/AutoLoad.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | import os 4 | import Queue 5 | import platform 6 | 7 | __author__ = "lightless" 8 | __email__ = "root@lightless.me" 9 | 10 | 11 | class AutoLoad: 12 | 13 | def __init__(self): 14 | self.spiders = [] 15 | self.results = Queue.Queue() 16 | 17 | @staticmethod 18 | def __check_filename(filename): 19 | if not filename.endswith(".py") or filename.startswith("_"): 20 | return False 21 | else: 22 | return True 23 | 24 | def load_spider(self, filename): 25 | spider_name = os.path.splitext(filename)[0] 26 | spider = __import__("Cores.ProxySpider.spiders." 
+ spider_name, fromlist=[spider_name]) 27 | spider_class = spider.get_spider_class() 28 | o = spider_class() 29 | o.set_result_queue(self.results) 30 | o.set_phantomjs_path() 31 | self.spiders.append(o) 32 | 33 | def load(self, *cls): 34 | if not cls: 35 | if "Darwin" in platform.system(): 36 | spider_path = "ProxySpider/spiders" 37 | else: 38 | spider_path = "../ProxySpider/spiders" 39 | 40 | for filename in os.listdir(spider_path): 41 | if self.__check_filename(filename): 42 | self.load_spider(filename) 43 | else: 44 | for class_name in cls: 45 | filename = class_name + ".py" 46 | if self.__check_filename(filename) and os.path.exists("spiders" + os.sep + class_name + ".py"): 47 | self.load_spider(filename) 48 | 49 | 50 | -------------------------------------------------------------------------------- /utils/DBConnection/DBConnection.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # coding: utf-8 3 | # file: DBConnection.py 4 | # time: 2016/8/6 15:16 5 | 6 | import sys 7 | import ConfigParser 8 | 9 | from sqlalchemy import create_engine 10 | from sqlalchemy.orm import sessionmaker 11 | 12 | from utils.Data import Enum 13 | from utils.Data.LoggerHelp import logger 14 | 15 | __author__ = "lightless" 16 | __email__ = "root@lightless.me" 17 | 18 | __all__ = ["DBConnection", "DB_TYPE"] 19 | 20 | 21 | DB_TYPE = Enum.Enum([ 22 | "MYSQL", 23 | ]) 24 | 25 | 26 | class DBConnection(object): 27 | 28 | def __init__(self): 29 | super(DBConnection, self).__init__() 30 | self._type, self._username, self._password, self._host, self._database = self._get_db_config() 31 | self._engine = None 32 | self._db_session = None 33 | self.session = None 34 | 35 | def __del__(self): 36 | del self._engine 37 | del self._db_session 38 | del self.session 39 | 40 | def connect(self): 41 | if self._type.upper() == DB_TYPE.MYSQL: 42 | self._engine = create_engine( 43 | "mysql://" + self._username + ":" + self._password + "@" + 44 
| self._host + "/" + self._database + "?charset=utf8" 45 | ) 46 | self._db_session = sessionmaker(bind=self._engine) 47 | self.session = self._db_session() 48 | else: 49 | logger.fatal("Unsupported database type.") 50 | sys.exit(1) 51 | 52 | @staticmethod 53 | def _get_db_config(): 54 | cf = ConfigParser.ConfigParser() 55 | cf.read("config.ini") 56 | db_type = cf.get("Pansidong", "database") 57 | username = cf.get(db_type, "username") 58 | password = cf.get(db_type, "password") 59 | host = cf.get(db_type, "host") 60 | database = cf.get(db_type, "database") 61 | return db_type, username, password, host, database 62 | -------------------------------------------------------------------------------- /utils/DBConnection/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # coding: utf-8 3 | # file: __init__.py 4 | # time: 2016/8/6 15:16 5 | 6 | __author__ = "lightless" 7 | __email__ = "root@lightless.me" -------------------------------------------------------------------------------- /utils/Data/Enum.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # coding: utf-8 3 | # file: Enum.py 4 | # time: 2016/8/6 15:19 5 | 6 | __author__ = "lightless" 7 | __email__ = "root@lightless.me" 8 | 9 | 10 | class Enum(set): 11 | def __getattr__(self, name): 12 | if name in self: 13 | return name 14 | raise AttributeError 15 | -------------------------------------------------------------------------------- /utils/Data/LoggerHelp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import colorlog 5 | import logging 6 | 7 | __author__ = "lightless" 8 | __email__ = "root@lightless.me" 9 | 10 | __all__ = ['logger'] 11 | 12 | 13 | handler = colorlog.StreamHandler() 14 | handler.setFormatter( 15 | colorlog.ColoredFormatter( 16 | fmt='%(log_color)s[%(levelname)s] 
[%(threadName)s] [%(asctime)s] [%(filename)s:%(lineno)d] %(message)s', 17 | datefmt="%H:%M:%S", 18 | log_colors={ 19 | 'DEBUG': 'cyan', 20 | 'INFO': 'green', 21 | 'WARNING': 'yellow', 22 | 'ERROR': 'red', 23 | 'CRITICAL': 'red,bg_white', 24 | }, 25 | ) 26 | ) 27 | 28 | logger = logging.getLogger(__name__) 29 | logger.addHandler(handler) 30 | logger.setLevel("DEBUG") 31 | -------------------------------------------------------------------------------- /utils/Data/SaveData.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import codecs 5 | import os 6 | import time 7 | import ConfigParser 8 | import sys 9 | import datetime 10 | 11 | from sqlalchemy import create_engine 12 | from sqlalchemy.orm import sessionmaker 13 | 14 | from utils.Data.LoggerHelp import logger 15 | from utils.Data.Tables import Proxy 16 | 17 | __author__ = "lightless" 18 | __email__ = "root@lightless.me" 19 | 20 | 21 | class SaveData(object): 22 | def __init__(self, results_queue, thread_pool, use_file=True, use_database=True, filename="proxy-ip-list.csv"): 23 | self.use_file = use_file 24 | self.use_database = use_database 25 | self.filename = filename 26 | self.results_queue = results_queue 27 | self.thread_pool = thread_pool 28 | 29 | if use_database: 30 | try: 31 | cf = ConfigParser.ConfigParser() 32 | cf.read("config.ini") 33 | db_name = cf.get("Pansidong", "database") 34 | username = cf.get(db_name, "username") 35 | password = cf.get(db_name, "password") 36 | host = cf.get(db_name, "host") 37 | database = cf.get(db_name, "database") 38 | except AttributeError, e: 39 | logger.fatal(e.message) 40 | sys.exit(1) 41 | self.engine = create_engine("mysql://" + username + ":" + password + "@" + 42 | host + "/" + database + "?charset=utf8") 43 | self.db_session = sessionmaker(bind=self.engine) 44 | self.session = self.db_session() 45 | if use_file: 46 | self.ff = open(self.filename, "w") 47 | 
self.ff.write(codecs.BOM_UTF8) 48 | 49 | def __del__(self): 50 | if self.use_file: 51 | self.ff.close() 52 | if self.use_database: 53 | self.session.close() 54 | 55 | def write(self): 56 | # wait for other threads start. 57 | time.sleep(5) 58 | 59 | while not self.thread_pool.finished: 60 | if not self.results_queue.empty(): 61 | res = self.results_queue.get(block=True) 62 | if self.use_file: 63 | self.__write_file(res) 64 | if self.use_database: 65 | self.__write_database(res) 66 | 67 | def __write_database(self, res): 68 | res = res[1] 69 | for r in res: 70 | # 先检测数据库中是否存在该IP 71 | # 如果IP和端口均相同 72 | # 则认为是重复的数据,不添加到数据库中 73 | 74 | proxy = self.session.query(Proxy).filter_by(ip=r.get("ip"), port=r.get("port")).first() 75 | if proxy: 76 | proxy.updated_time = datetime.datetime.now() 77 | try: 78 | self.session.add(proxy) 79 | self.session.commit() 80 | except Exception, e: 81 | logger.debug("Update database error. " + e.message) 82 | continue 83 | 84 | new_proxy = Proxy(ip=r.get("ip", "None"), port=r.get("port", "None"), proxy_type=r.get("type", "None"), 85 | location=r.get("location", "None"), protocol=r.get("protocol", "None"), 86 | times=r.get("time", "None"), is_alive=0, created_time=datetime.datetime.now(), 87 | updated_time=datetime.datetime.now()) 88 | try: 89 | self.session.add(new_proxy) 90 | self.session.commit() 91 | except Exception, e: 92 | logger.debug("Save database error. 
" + e.message) 93 | 94 | def __write_file(self, res): 95 | self.ff.writelines(res[0].get('url') + "\n") 96 | self.ff.writelines("ip,port,type,protocol,location,time(s)\n") 97 | logger.info("[*] url: " + res[0].get('url')) 98 | res = res[1] 99 | for r in res: 100 | line = r.get('ip', 'None') + "," + r.get('port', 'None') + "," + \ 101 | r.get('type', 'None') + "," + r.get('protocol', 'None') + "," + \ 102 | r.get('location', 'None') + "," + r.get('time', 'None') 103 | logger.info("[*] " + line) 104 | self.ff.writelines((line + "\n").encode("utf8")) 105 | 106 | -------------------------------------------------------------------------------- /utils/Data/Tables.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # coding: utf-8 3 | 4 | from sqlalchemy import Column 5 | from sqlalchemy import String, Integer, DateTime, Float 6 | from sqlalchemy.ext.declarative import declarative_base 7 | 8 | 9 | __author__ = "lightless" 10 | __email__ = "root@lightless.me" 11 | 12 | 13 | Base = declarative_base() 14 | 15 | 16 | class Proxy(Base): 17 | 18 | __tablename__ = "proxy" 19 | 20 | id = Column(Integer, primary_key=True, autoincrement=True) 21 | ip = Column(String(16), index=True, default=None, nullable=True) 22 | port = Column(String(5), default=None, nullable=True) 23 | proxy_type = Column(String(32), default=None, nullable=True) 24 | location = Column(String(128), default=None, nullable=True) 25 | protocol = Column(String(64), default=None, nullable=True) 26 | times = Column(Float, default=None, nullable=True) 27 | is_alive = Column(Integer, default=0, nullable=True) # 0-dead, 1-alive 28 | created_time = Column(DateTime, default=None, nullable=True) 29 | updated_time = Column(DateTime, default=None, nullable=True) 30 | 31 | -------------------------------------------------------------------------------- /utils/Data/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lightless233/Pansidong/dd8691d356a50d522d68dbaa519b77b890bd0de3/utils/Data/__init__.py -------------------------------------------------------------------------------- /utils/SpiderBase.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | import os 4 | import sys 5 | import platform 6 | 7 | from utils.Data.LoggerHelp import logger 8 | 9 | 10 | __author__ = "lightless" 11 | __email__ = "root@lightless.me" 12 | 13 | 14 | class SpiderBase(object): 15 | def __init__(self): 16 | self.result_queue = None 17 | self.phantomjs_path = None 18 | 19 | def set_result_queue(self, result_queue): 20 | self.result_queue = result_queue 21 | 22 | def set_phantomjs_path(self): 23 | phantomjs_path = os.getcwd() + os.sep + "ThirdParty" + os.sep + "phantomjs" + os.sep 24 | 25 | if "Windows" in platform.system(): 26 | self.phantomjs_path = phantomjs_path + "phantomjs.exe" 27 | elif "Linux" in platform.system() and "x86_64" in platform.machine(): 28 | self.phantomjs_path = phantomjs_path + "phantomjs" 29 | elif "Darwin" in platform.system(): 30 | self.phantomjs_path = phantomjs_path + "phantomjs-mac" 31 | else: 32 | logger.error("Unsupported operating system.") 33 | logger.error("Only Windows and Linux x86_64 was supported.") 34 | sys.exit(1) 35 | 36 | -------------------------------------------------------------------------------- /utils/ThreadPool.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import Queue 5 | import threading 6 | from multiprocessing import cpu_count 7 | import time 8 | 9 | from utils.Data.LoggerHelp import logger 10 | 11 | __all__ = ["ThreadPool"] 12 | 13 | __author__ = "lightless" 14 | __email__ = "root@lightless.me" 15 | 16 | 17 | class ThreadPool(object): 18 | def __init__(self, thread_count=cpu_count()*2): 19 | self.__thread_count = thread_count 20 | 
self.__function_list = Queue.Queue() 21 | self.__thread_list = [] 22 | self.__alive_thread_counts = 0 23 | self.__working_thread_list = [] 24 | self.__dead_threads = [] 25 | self.finished = False 26 | 27 | def add_function_list(self, function_list=[]): 28 | for fn in function_list: 29 | self.add_function(fn[0], **fn[1]) 30 | 31 | def add_function(self, func, **kwargs): 32 | if callable(func): 33 | self.__function_list.put((func, kwargs)) 34 | 35 | def add_thread_list(self, fn, **kwargs): 36 | # 获取thread name 37 | try: 38 | thread_name = str(fn.im_class).split(".")[-1].split("'")[0] 39 | except AttributeError: 40 | thread_name = fn.__name__ 41 | 42 | t = threading.Thread(target=fn, name=thread_name, kwargs=kwargs) 43 | self.__thread_list.append(t) 44 | 45 | def run(self, join=True): 46 | # 从队列中获取工作函数 47 | while not self.__function_list.empty(): 48 | fn = self.__function_list.get_nowait() 49 | try: 50 | thread_name = str(fn[0].im_class).split(".")[-1].split("'")[0] 51 | except AttributeError: 52 | thread_name = fn[0].__name__ 53 | t = threading.Thread(target=fn[0], name=thread_name, kwargs=fn[1]) 54 | self.__thread_list.append(t) 55 | 56 | tt = threading.Thread(target=self._run, args=(join,), name="really_run") 57 | # tt.setDaemon(True) 58 | tt.start() 59 | # tt.join() 60 | 61 | def is_all_thread_dead(self): 62 | flags = True 63 | for t in self.__thread_list: 64 | if t.is_alive(): 65 | flags = False 66 | elif t not in self.__dead_threads: 67 | logger.debug("[*] " + t.getName() + " finished.") 68 | self.__dead_threads.append(t) 69 | return flags 70 | 71 | def __get_current_alive_thread_count(self): 72 | alive_cnt = 0 73 | for t in self.__working_thread_list: 74 | if t.is_alive(): 75 | alive_cnt += 1 76 | self.__alive_thread_counts = alive_cnt 77 | return alive_cnt 78 | 79 | def _run(self, join=True): 80 | for t in self.__thread_list: 81 | # 等待线程 82 | while True: 83 | if self.__get_current_alive_thread_count() < self.__thread_count: 84 | break 85 | else: 86 | 
class ThreadPool(object):
    """Small thread pool driven by a background scheduler thread.

    Every task queued with :meth:`add_func` is run in its own
    ``threading.Thread``. The scheduler (``__loop``) starts queued tasks
    while fewer than ``thread_count`` of them are alive. Calling
    :meth:`close` and then :meth:`join` lets the scheduler stop once every
    queued task has finished; :meth:`join` only sets a flag and does not
    block the caller.
    """

    def __init__(self, thread_count=cpu_count()):
        """Initialise the pool state and start the scheduler thread.

        :param thread_count: maximum number of task threads alive at once;
            a falsy value (``0``/``None``) is replaced by ``1``.
        """
        super(ThreadPool, self).__init__()
        self.thread_count = thread_count if thread_count else 1

        # Threads currently tracked as running, and the queue of pending
        # ``(func, args, kwargs)`` tasks.
        self.working_list = list()
        self.func_list = Queue.Queue()

        # Result queue; this class creates it but never writes to it.
        self.result_queue = Queue.Queue()

        # Thread counters.
        self.dead_thread_number = 0
        self.all_thread_number = 0
        self.working_thread_number = 0

        # join/close flags.
        self.joined = False
        self.closed = False

        # Exit flags (``already_exit`` is never updated by this class).
        self.exit = False
        self.already_exit = False

        # Debug flag: enables the extra scheduler log lines.
        self.DEBUG = False

        # Start the scheduler last: it reads the attributes above, so
        # starting it earlier could let it run on a half-built object.
        loop_thread = threading.Thread(target=self.__loop, name="ThreadPool.loop")
        # If this were a daemon thread, the loop would be killed as soon as
        # the main thread exits.
        loop_thread.daemon = False
        loop_thread.start()

    @staticmethod
    def _task_name(func):
        """Return the thread name to use for the callable ``func``.

        Uses the owning class name when ``func`` exposes ``im_class``
        (Python 2 methods), otherwise ``func.__name__``; callables that
        have no ``__name__`` fall back to the name of their type.
        """
        try:
            return str(func.im_class).split(".")[-1].split("'")[0]
        except AttributeError:
            return getattr(func, "__name__", type(func).__name__)

    def add_func(self, func, *args, **kwargs):
        """Queue ``func(*args, **kwargs)`` for execution in its own thread."""
        # Count the task before publishing it, so the scheduler can never
        # see a queued task while the totals still look "all finished".
        self.all_thread_number += 1
        self.func_list.put((func, args, kwargs))

    def current_working_num(self):
        """Drop finished threads from the working list and count live ones.

        Each finished thread is logged and added to ``dead_thread_number``.

        :return: number of threads still alive (also stored in
            ``working_thread_number``).
        """
        still_running = []
        for thread in self.working_list:
            if thread.is_alive():
                still_running.append(thread)
            else:
                logger.debug("Thread %s end." % thread.name)
                self.dead_thread_number += 1
        # Rebuild the list instead of removing while iterating: removal
        # during iteration skipped the element after each finished thread,
        # under-counting live threads.
        self.working_list[:] = still_running
        self.working_thread_number = len(still_running)
        return self.working_thread_number

    def __loop(self):
        """Scheduler loop: start queued tasks while there is a free slot."""
        while True:

            if self.exit:
                logger.debug("ThreadPool loop end.")
                break

            if self.joined and self.all_thread_number == self.dead_thread_number:
                # Everything queued has finished: stop on the next check
                # instead of falling through into another one-second sleep.
                self.terminated()
                continue

            if self.current_working_num() >= self.thread_count:
                # No free slot.
                time.sleep(1)
                if self.DEBUG:
                    logger.debug("No more place.")
                continue

            if self.func_list.empty():
                # No pending task.
                time.sleep(1)
                if self.DEBUG:
                    logger.debug("No more task.")
                continue

            # Fetch a task and run it in a new thread.
            func, args, kwargs = self.func_list.get_nowait()
            thread_name = self._task_name(func)
            thread = threading.Thread(target=func, args=args, kwargs=kwargs, name=thread_name)
            thread.start()
            if self.DEBUG:
                logger.debug(thread_name + " start.")
            self.working_list.append(thread)

    def terminated(self):
        """Ask the scheduler loop to stop at its next iteration."""
        self.exit = True

    def close(self):
        """Mark the pool as closed; required before :meth:`join`."""
        self.closed = True

    def join(self):
        """Flag that the scheduler may stop once all queued tasks are done.

        Must be called after :meth:`close`. Otherwise a fatal message is
        logged and the whole process is ended via ``os._exit(1)``.
        """
        if self.closed:
            self.joined = True
        else:
            logger.fatal("Join must after closed thread pool.")
            os._exit(1)
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lightless233/Pansidong/dd8691d356a50d522d68dbaa519b77b890bd0de3/utils/__init__.py --------------------------------------------------------------------------------