├── chromedriver.exe
├── rule.json
├── RequestsHeader.py
├── config.ini
├── README.md
├── log.py
├── read_DB.py
├── config.py
├── WebpageShot.py
├── Notification.py
└── webmonitor.py

/chromedriver.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0x24bin/WebMonitor/HEAD/chromedriver.exe
--------------------------------------------------------------------------------
/rule.json:
--------------------------------------------------------------------------------
{
    "single": {
        "https://www.baidu.com/": {},
        "https://www.qq.com/": {},
        "https://www.ifeng.com/": {}
    },
    "TimeInterval": 3
}
--------------------------------------------------------------------------------
/RequestsHeader.py:
--------------------------------------------------------------------------------
user_agents = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36"

req_headers = {
    'User-Agent': user_agents
}
--------------------------------------------------------------------------------
/config.ini:
--------------------------------------------------------------------------------
[default]
TimeInterval : 3600
FailTimeInterval : 60
retriesnum : 1
recorddir : D:\webmonitor
dbfile : D:\webmonitor\webmonitor.db


[mail]
host : smtp.qq.com
port : 465
from : 123@qq.com
password : password
to : aqdwd@xxxx.com,test@awqw.com
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# WebMonitor
Website monitoring


Main functions:
Periodically poll every site listed in rule.json
Save the HTML of each monitored homepage and a screenshot of the rendered page
Compare the page content against the stored history and mail the comparison result to the configured mailbox

What each script does:

config.ini        configuration file
config.py         reads the configuration file
log.py            logging
Notification.py   mail notification
read_DB.py        SQLite read/write helpers
RequestsHeader.py request headers
rule.json         list of homepages to monitor
webmonitor.py     main entry point
WebpageShot.py    webpage screenshots
--------------------------------------------------------------------------------
/log.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

"""
log
~~~

Implements color logger

:author: Feei
:homepage: https://github.com/wufeifei/cobra
:license: MIT, see LICENSE for more details.
:copyright: Copyright (c) 2018 Feei. All rights reserved
"""
import os
import colorlog
import logging
from logging import handlers

log_path = 'logs'
if os.path.isdir(log_path) is not True:
    os.mkdir(log_path, 0o755)
logfile = os.path.join(log_path, 'webmonitor.log')

handler = colorlog.StreamHandler()
formatter = colorlog.ColoredFormatter(
    '%(log_color)s%(asctime)s [%(name)s] [%(levelname)s] %(message)s%(reset)s',
    datefmt=None,
    reset=True,
    log_colors={
        'DEBUG': 'cyan',
        'INFO': 'green',
        'WARNING': 'yellow',
        'ERROR': 'red',
        'CRITICAL': 'red,bg_white',
    },
    secondary_log_colors={},
    style='%'
)
handler.setFormatter(formatter)

file_handler = handlers.RotatingFileHandler(logfile, maxBytes=(1048576 * 5), backupCount=7)
file_handler.setFormatter(formatter)

logger = colorlog.getLogger('WebpageMonitor')
logger.addHandler(handler)
logger.addHandler(file_handler)
logger.setLevel(logging.INFO)
--------------------------------------------------------------------------------
/read_DB.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-

import sqlite3


# Create the result table
def create_table(db_name, table_name):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    cursor.execute("create table %s (id INTEGER PRIMARY KEY AUTOINCREMENT, url text not null, pagemd5 text not null, sourcefile text not null, picname text not null, date timestamp not null default (datetime('now','localtime')))" % table_name)
    cursor.close()
    conn.close()


# Return the most recent page MD5 recorded for a URL, or None
def queryUrlMd5(db_name, table_name, url, size=100):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    cursor.execute("select pagemd5 from %s where url = '%s' order by date desc limit 0,1" % (table_name, url))
    result_all = cursor.fetchmany(size)
    cursor.close()
    conn.close()
    if result_all:
        return result_all[0][0]
    return None


# Write results to the database.
# Writes are comparatively slow, so the whole batch of updates is handed to write_db at once.
def write_db(db_name, table_name, result_list):
    conn = sqlite3.connect(db_name)
    cursor = conn.cursor()
    new_list = []
    for result in result_list:
        url, pagemd5, sourfile, imgname = result
        if pagemd5 != 'null':
            sql = "insert into %s (url, pagemd5, sourcefile, picname) values ('%s', '%s', '%s', '%s')" % (table_name, url, pagemd5, sourfile, imgname)
            try:
                cursor.execute(sql)
                new_list.append(result)
                conn.commit()
                print("wrote " + sql + " successfully!")
            except Exception:
                print("{0} already exists!".format(result))
    cursor.close()
    conn.close()
    return True


if __name__ == '__main__':
    #create_table('test', 'test')
    aa = queryUrlMd5('aaa.db', 'result', 'http://www.suningestate.com/index.aspxa')
    if aa:
        print(aa)
--------------------------------------------------------------------------------
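read_DB.py interpolates the URL straight into its SQL strings, which breaks on URLs containing quotes and is injectable. A hedged sketch of a parameterized variant of queryUrlMd5 (query_url_md5_safe is a hypothetical helper, not part of the repo; the table name still has to come from trusted code, since SQLite placeholders cannot bind identifiers):

import sqlite3

def query_url_md5_safe(db_name, table_name, url):
    # the URL is bound as a placeholder; table_name must come from trusted code
    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.execute(
            "select pagemd5 from %s where url = ? order by date desc limit 1" % table_name,
            (url,))
        row = cursor.fetchone()
        return row[0] if row else None
    finally:
        conn.close()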
/config.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

"""
config
~~~~~~

Implements configuration
"""

import sys
import traceback
import configparser
import os
import json
from log import logger


#conf_name = sys.argv[1] if len(sys.argv) == 2 else 'config.ini'
# parent directory (unused)
#project_directory = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
# current directory
project_directory = os.path.abspath(os.path.join(os.path.dirname(__file__)))
config_path = os.path.join(project_directory, 'config.ini')
rules_path = os.path.join(project_directory, 'rule.json')


def get(level1=None, level2=None):
    """
    Get config value
    :param level1: section name in config.ini
    :param level2: option name inside that section
    :return: string
    """
    if level1 is None and level2 is None:
        return
    config = configparser.ConfigParser()
    config.read(config_path)
    value = None
    try:
        value = config.get(level1, level2)
    except Exception as e:
        print(level1, level2)
        traceback.print_exc()
        print("config.ini file configure failed.\nError: {0}".format(e))
    return value


# Rules Structure Design
#
# 'rule keywords': {
#     'mode': ''       // RuleMode: normal-match(default)/only-match/full-match/mail
#     'extension': ''  // search extension: (default)/txt/md/java/python/etc...
# }
#
try:
    with open(rules_path) as f:
        rules_dict = json.load(f)
except Exception:
    logger.critical('please configure rule.json!')
    logger.critical(traceback.format_exc())
    sys.exit(1)


class Rule(object):
    def __init__(self, types=None, url=None, mode=None):
        self.types = types
        self.url = url
        self.mode = mode


# Build Rule objects for every URL whose rule type matches rule_type
def get_rules(rule_type='singlepage'):
    rules_objects = []
    for types, rule_list in rules_dict.items():
        if types in rule_type:
            types = types.upper()
            for url, rule_attr in rule_list.items():
                if 'mode' in rule_attr:
                    mode = rule_attr['mode']
                else:
                    mode = None
                r = Rule(types, url, mode)
                rules_objects.append(r)
    return rules_objects


if __name__ == '__main__':
    #print(get('mail', 'host'))
    rules = get_rules()
    if len(rules) == 0:
        print('no rules loaded')
    for idx, rule_object in enumerate(rules):
        print(idx, rule_object.url)
--------------------------------------------------------------------------------
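A minimal usage sketch for config.py, assuming config.ini and rule.json sit next to it as the paths above require:

from config import get, get_rules

interval = int(get('default', 'TimeInterval'))   # '3600' from the sample config.ini
for rule in get_rules('single'):                 # 'single' is the rule type key in rule.json
    print(rule.types, rule.url, rule.mode)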
/WebpageShot.py:
--------------------------------------------------------------------------------
from selenium import webdriver
import time
import os.path
import multiprocessing as mp


def readtxt():
    '''Read urls.txt and return a list of (picture name, url) tuples;
    each line holds the picture name, an ASCII comma, then the URL.'''
    with open('urls.txt', 'r') as f:
        lines = f.readlines()
    urls = []
    for line in lines:
        try:
            thelist = line.strip().split(",")
            if len(thelist) == 2 and thelist[0] and thelist[1]:
                urls.append((thelist[0], thelist[1]))
        except Exception:
            pass
    return urls


def get_dir():
    '''Create the screenshot directory if it does not exist yet'''
    filename = "./pics"
    if not os.path.isdir(filename):
        os.makedirs(filename)
    return filename


def webshot(args):
    filename, link = args
    options = webdriver.ChromeOptions()
    chromed = './chromedriver.exe'
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    # chrome_options= is deprecated; pass options= instead
    driver = webdriver.Chrome(chromed, options=options)
    driver.maximize_window()
    # JS snippet that returns the rendered page height
    js_height = "return document.body.clientHeight"
    try:
        driver.get(link)
        k = 1
        height = driver.execute_script(js_height)
        # scroll down 500px at a time so lazy-loaded content renders before the shot
        while True:
            if k * 500 < height:
                js_move = "window.scrollTo(0,{})".format(k * 500)
                print(js_move)
                driver.execute_script(js_move)
                time.sleep(1)
                height = driver.execute_script(js_height)
                k += 1
            else:
                break
        scroll_width = driver.execute_script('return document.body.parentNode.scrollWidth')
        scroll_height = driver.execute_script('return document.body.parentNode.scrollHeight')
        driver.set_window_size(scroll_width, scroll_height)
        driver.get_screenshot_as_file(filename)
        print("Process {} got one pic!".format(os.getpid()))
        time.sleep(0.1)
    except Exception as e:
        print(filename + ".png", e)
    finally:
        # quit the browser so headless chrome processes do not pile up
        driver.quit()


if __name__ == '__main__':
    t = time.time()
    get_dir()
    urls = readtxt()
    pool = mp.Pool()
    pool.map_async(func=webshot, iterable=urls)
    pool.close()
    pool.join()
    print("Done, elapsed: {:.2f}s".format(float(time.time() - t)))
--------------------------------------------------------------------------------
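webdriver.Chrome(path, options=...) is the Selenium 3 calling convention; if the installed selenium is 4.x, the driver path moves to a Service object. A sketch under that assumption:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
# './chromedriver.exe' matches the driver shipped in this repo
driver = webdriver.Chrome(service=Service('./chromedriver.exe'), options=options)
driver.get('https://www.baidu.com/')
driver.get_screenshot_as_file('baidu.png')
driver.quit()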
/Notification.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-

from email.header import Header
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.application import MIMEApplication
import smtplib
from config import get
from log import logger
import traceback
import os


from_addr = get('mail', 'from')
password = get('mail', 'password')
to_addr = get('mail', 'to')
smtp_server = get('mail', 'host').strip()
smtp_port = get('mail', 'port').strip()


class Notification(object):
    def __init__(self, subject, to=None):
        """
        Initialize notification class
        :param subject: mail subject line
        :param to: comma-separated recipient list; falls back to config.ini
        """
        self.subject = subject
        self.mail = get('mail', 'from')
        if to is None:
            self.to = get('mail', 'to')
        else:
            self.to = to

    # Convert a byte count to megabytes; returns 0 on bad input
    def formatSize(self, bytes):
        try:
            bytes = float(bytes)
            kb = bytes / 1024
        except Exception:
            print("invalid byte count")
            return 0
        M = kb / 1024
        return M

    # Return a file's size in megabytes (0 if the path is not a file)
    def getFileSize(self, path):
        try:
            if os.path.isfile(path):
                size = os.path.getsize(path)
            else:
                size = 0
            return self.formatSize(size)
        except Exception as err:
            print(err)
            return 0

    def sendmail(self, html=None, attchfile=None):
        """
        Send notification by mail
        :param html: mail body (HTML)
        :param attchfile: optional attachment path
        :return: True on success, False on failure
        """
        msg = MIMEMultipart()
        msg['Subject'] = self.subject
        msg['From'] = '{0} <{1}>'.format(self.mail, get('mail', 'from'))
        # multiple recipients are supported via a comma-separated list
        msg['To'] = self.to

        text = MIMEText(html, 'html', 'utf-8')
        msg.attach(text)
        host = get('mail', 'host').strip()
        port = get('mail', 'port').strip()
        if attchfile:
            # only attach files smaller than 40 MB
            if self.getFileSize(attchfile) < 40:
                zipApart = MIMEApplication(open(attchfile, 'rb').read())
                zipApart.add_header('Content-Disposition', 'attachment', filename=attchfile)
                msg.attach(zipApart)

        try:
            if port == '465':
                # implicit SSL; STARTTLS must not be issued on this connection
                s = smtplib.SMTP_SSL(host, int(port))
            else:
                s = smtplib.SMTP(host, int(port))
                s.ehlo()
                s.starttls()
                s.ehlo()
            s.set_debuglevel(1)
            s.login(self.mail, get('mail', 'password'))
            s.sendmail(self.mail, self.to.split(','), msg.as_string())
            s.quit()
            return True
        except smtplib.SMTPException:
            logger.critical('Send mail failed')
            traceback.print_exc()
            return False


if __name__ == '__main__':
    aa = Notification('aaa')
    aa.sendmail(html='aaaaa', attchfile='a.py')
--------------------------------------------------------------------------------
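A hypothetical smoke test for Notification (the addresses are placeholders; config.ini's [mail] section must hold working SMTP credentials):

from Notification import Notification

n = Notification('WebMonitor test', to='ops1@example.com,ops2@example.com')
ok = n.sendmail(html='<h3>hello from WebMonitor</h3>')   # recipients are split on ','
print('sent' if ok else 'failed')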
/webmonitor.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-

import requests
from RequestsHeader import req_headers
from read_DB import *
from Notification import Notification
import random
from log import logger
import hashlib
from config import *
import multiprocessing as mp
from WebpageShot import webshot
import time
import os
import zipfile


class WebpageMonitor(object):
    def __init__(self):
        self.rules = get_rules()
        self.recorddir = get('default', 'recorddir').strip()
        self.fail_time_interval_num = int(get('default', 'FailTimeInterval').strip())
        self.timeinterval = int(get('default', 'TimeInterval'))
        self.retriesnum = int(get('default', 'retriesnum').strip())
        self.dbfile = get('default', 'dbfile')
        self.table_name = 'result'
        # create the result table on first run; ignore "table already exists"
        try:
            create_table('%s' % self.dbfile, 'result')
        except Exception:
            pass

    def md5_ncrypt(self, text):
        m = hashlib.md5()
        m.update(text.encode(encoding='utf-8'))
        str_md5 = m.hexdigest()
        return str_md5

    def md5sum(self, filename, blocksize=65536):
        hash = hashlib.md5()
        # must be opened in binary mode, otherwise two runs can hash differently
        with open(filename, "rb") as f:
            for block in iter(lambda: f.read(blocksize), b""):
                hash.update(block)
        return hash.hexdigest()

    def getNowtime(self):
        return time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))

    def createdir(self):
        dirname = self.recorddir + os.path.sep + self.getNowtime()
        if not os.path.isdir(dirname):
            os.makedirs(dirname)
        return dirname

    def url2name(self, url):
        # turn a URL into a filesystem-safe name: strip the scheme,
        # drop a trailing slash, replace the remaining slashes
        tmpurl = url
        if url.startswith('http'):
            tmpurl = url.replace('http://', '')
            tmpurl = tmpurl.replace('https://', '')
        if tmpurl.endswith('/'):
            tmpurl = tmpurl[:-1]
        tmpurl = tmpurl.replace('/', '-')
        return tmpurl

    def start(self, rule_types):
        rules = get_rules(rule_types)
        if len(rules) == 0:
            logger.critical('get rules failed, rule types not found!')
            exit(1)
        self.rulenum = len(rules)
        logger.info('rules length: {rl}'.format(rl=len(rules)))
        self.dirname = self.createdir()
        result_list = []
        webshotargs = []
        for idx, rule_object in enumerate(rules):
            logger.info('>>>>>>>>>>>>> {n} > {r} >>>>>>'.format(n=rule_object.types, r=rule_object.url))
            urlname = self.url2name(rule_object.url) + '-' + self.getNowtime()
            content = self.openWebPage(rule_object.url)

            #html = '<h3>Webpage monitor report: {rule_regex} Count: {count} Datetime: {datetime}</h3>'.format(
            #    rule_regex=self.rule_object.keyword, datetime=time.strftime("%Y-%m-%d %H:%M:%S"),
            #    count=len(self.content))

            if content:
                sourcefile = self.dirname + os.path.sep + urlname + '.txt'
                with open(sourcefile, 'w', encoding='utf-8') as f:
                    f.write(content)
                sourcemd5 = self.md5sum(sourcefile)
                picname = self.dirname + os.path.sep + urlname + '.png'
                webshotargs.append((picname, rule_object.url))
                tmp = (rule_object.url, sourcemd5, sourcefile, picname)
                result_list.append(tmp)
            else:
                tmp = (rule_object.url, 'null', 'null', 'null')
                result_list.append(tmp)
        webshotmp(webshotargs)
        self.checkdiff(result_list)

    def genratepagelist(self, pagelist):
        html = ''
        for record in pagelist:
            html += '<li>URL: {url} MD5: {md5} source file: {source} screenshot: {img}</li>'.format(
                url=record[0], md5=record[1], source=record[2], img=record[3])
        return html
    def checkdiff(self, result_list):
        # bucket each record: normal / new / changed / error
        pagenew = []
        pagenormal = []
        pageerror = []
        pagechanged = []
        for record in result_list:
            url, pagemd5, sourfile, imgname = record
            if pagemd5 == 'null':
                # the page failed to open
                pageerror.append(record)
            else:
                oldmd5 = queryUrlMd5(self.dbfile, self.table_name, url)
                if oldmd5:
                    if oldmd5 == pagemd5:
                        pagenormal.append(record)
                    else:
                        pagechanged.append(record)
                else:
                    pagenew.append(record)
        write_db('%s' % self.dbfile, self.table_name, result_list)
        reporttime = time.strftime("%Y-%m-%d %H:%M:%S")
        subject = "Website tamper-monitoring report--{ss}".format(ss=reporttime)

        html = '<html><body><h2>Monitored sites: {count} Report time: {datetime}</h2>'.format(
            count=self.rulenum, datetime=reporttime)
        # all pages normal
        print(self.rulenum, '-----------', len(pagenormal))
        if self.rulenum == len(pagenormal):
            html += '<h3>All monitored pages are running normally!</h3>'
            html += self.genratepagelist(pagenormal)
            Notification(subject).sendmail(html=html)
        else:
            if len(pageerror):
                html += '<h3>{count} sites failed to load, listed below</h3>'.format(count=len(pageerror))
                html += self.genratepagelist(pageerror)
            if len(pagechanged):
                html += '<h3>{count} pages changed, please verify manually, listed below</h3>'.format(count=len(pagechanged))
                html += self.genratepagelist(pagechanged)
            if len(pagenew):
                html += '<h3>{count} sites newly monitored, listed below</h3>'.format(count=len(pagenew))
                html += self.genratepagelist(pagenew)
            if len(pagenormal):
                html += '<h3>{count} sites running normally, listed below</h3>'.format(count=len(pagenormal))
                html += self.genratepagelist(pagenormal)
            zipfilename = 'website-tamper-monitoring-log-' + reporttime.replace(':', '').replace('-', '') + '.zip'
            if self.zipDir(self.dirname, zipfilename):
                Notification(subject).sendmail(html=html, attchfile=zipfilename)
                os.remove(zipfilename)

    def zipDir(self, dirpath, outFullName):
        """
        Zip up a directory
        :param dirpath: directory to compress
        :param outFullName: output path ending in .zip
        :return: True on success, False on failure
        """
        try:
            zip = zipfile.ZipFile(outFullName, "w")
            for path, dirnames, filenames in os.walk(dirpath):
                # strip the root prefix so only paths below dirpath end up in the archive
                fpath = path.replace(dirpath, '')
                for filename in filenames:
                    zip.write(os.path.join(path, filename), os.path.join(fpath, filename))
            zip.close()
            return True
        except Exception:
            logger.error("generate zip file error {dir} {name}".format(dir=dirpath, name=outFullName))
            return False

    def main(self):
        # poll forever: run one cycle, then sleep for the configured interval
        while 1:
            t = time.time()
            self.start('single')
            logger.info("cycle finished, elapsed: {:.2f}s".format(float(time.time() - t)))
            time.sleep(self.timeinterval)

    # Fetch a URL, retrying up to retriesnum times with a randomized back-off
    def openWebPage(self, url):
        tag = 0
        while tag < int(self.retriesnum):
            try:
                if tag != 0:
                    time.sleep(self.fail_time_interval_num + random.randint(1, 5))
                page = requests.get(url, headers=req_headers, allow_redirects=True)
                return page.text
            except Exception:
                print("network request failed!")
                logger.error('open {url} fail, fail num is {tag}'.format(url=url, tag=tag + 1))
                tag += 1
        return None


# Take the screenshots in parallel, one worker per (picture, url) pair
def webshotmp(args):
    t = time.time()
    pool = mp.Pool()
    logger.info('webshot tasks: {n}'.format(n=len(args)))
    for aa in args:
        logger.info('webshot {filename} {url}'.format(filename=aa[0], url=aa[1]))
        pool.apply_async(webshot, args=(aa,))
    pool.close()
    pool.join()
    print("screenshots finished, elapsed: {:.2f}s".format(float(time.time() - t)))


if __name__ == '__main__':
    newVisit = WebpageMonitor()
    newVisit.main()
--------------------------------------------------------------------------------
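Putting it together, a minimal sketch of one polling cycle instead of the endless main() loop, assuming Python 3 with requests, selenium, and colorlog installed, chromedriver.exe beside the scripts, and config.ini / rule.json filled in as shown above:

from webmonitor import WebpageMonitor

monitor = WebpageMonitor()   # creates the result table in the configured dbfile
monitor.start('single')      # fetch, screenshot, diff against history, mail the report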