├── CrawlInfo.py ├── FilterResult.py ├── MainNotify.py ├── ReadJSON.py ├── Readme.md ├── RequestsHeader.py ├── config.json ├── read_DB.py └── sendMail.py /CrawlInfo.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import re 4 | from lxml.html import soupparser, tostring, HtmlElement 5 | 6 | 7 | def re_find(text, exp): 8 | re_result_list = re.findall(r'%s' % exp, text, re.S) 9 | return re_result_list 10 | 11 | 12 | def xpath_find(html, exp): 13 | root = soupparser.fromstring(html) 14 | result_ele = root.xpath(exp) 15 | return elements_to_unicodes(result_ele) 16 | 17 | 18 | def elements_to_unicodes(eles): 19 | xpath_result = [] 20 | for r in eles: 21 | xpath_result.append(tostring(r, encoding='utf-8').decode('utf-8') if isinstance(r, HtmlElement) else r) 22 | return xpath_result 23 | 24 | 25 | def css_find(text, exp): 26 | pass 27 | -------------------------------------------------------------------------------- /FilterResult.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import re 3 | 4 | 5 | def filter_result(result_list, exp, num): 6 | result = [] 7 | for rl in result_list: 8 | re_result = re.search(exp, rl) 9 | try: 10 | result.append(re_result.group(num)) 11 | except: 12 | pass 13 | return result 14 | -------------------------------------------------------------------------------- /MainNotify.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import requests 4 | import time 5 | from RequestsHeader import req_headers 6 | from CrawlInfo import * 7 | from read_DB import * 8 | from SendMail import * 9 | from FilterResult import * 10 | import random 11 | 12 | METHOD_DICT = { 13 | 're': re_find, 14 | 'xpath': xpath_find, 15 | 'css': css_find 16 | } 17 | 18 | 19 | class WebVisit(object): 20 | def __init__(self): 21 | self.start_page_list = [] # 监控网址列表 22 | self.time_interval_num = 0 # 时间间隔 23 | self.fail_time_interval_num = 0 # 访问失败再次访问时间间隔 24 | self.attention_method = '' # 监控内容匹配方式(正则,xpath,css) 25 | self.attention_exp = '' # 监控内容匹配表达式 26 | self.return_method = '' # 返回内容匹配表达式 27 | self.return_exp = '' # 返回 28 | self.notify_email = '' # 提醒邮箱 29 | self.notify_message = '' # 提醒内容 30 | self.filter_exp = '' # 结果过滤正则表达式 31 | self.filter_num = 0 # 结果过滤group 32 | 33 | # 读取配置 34 | def get_config(self): 35 | json_dict = read_config() 36 | if json_dict: 37 | try: 38 | self.start_page_list = json_dict['StartPage'] 39 | self.time_interval_num = json_dict['TimeInterval'] 40 | self.fail_time_interval_num = json_dict['FailTimeInterval'] 41 | self.attention_method = json_dict['Attention']['method'] 42 | self.attention_exp = json_dict['Attention']['expression'] 43 | self.return_method = json_dict['Return']['method'] 44 | self.return_exp = json_dict['Return']['expression'] 45 | self.notify_email = json_dict['Notify']['email'] 46 | self.notify_message = json_dict['Notify']['message'] 47 | self.filter_exp = json_dict['Filter']['expression'] 48 | self.filter_num = json_dict['Filter']['num'] 49 | except: 50 | print u"配置不完整! 请修改后重试! " 51 | exit() 52 | else: 53 | print u"配置文件不存在或内容有误! 请检查后重试! " 54 | exit() 55 | 56 | # 访问url 57 | def visit_web(self, url): 58 | 59 | try: 60 | page = requests.get(url, headers=req_headers, allow_redirects=True) 61 | return page.text 62 | except: 63 | print u"网络访问失败! " 64 | time.sleep(self.fail_time_interval_num + random.randint(10, 30)) 65 | return self.visit_web(url) 66 | 67 | # 根据配置获取要访问的url及要抓取的内容 68 | def visit_config(self): 69 | result_list_all = [] 70 | for url in self.start_page_list: 71 | result_list = [] 72 | page_text = self.visit_web(url) 73 | if page_text: 74 | print u"%s访问成功! " % url 75 | if self.attention_method in METHOD_DICT: 76 | result_list = METHOD_DICT[self.attention_method](page_text, self.attention_exp) 77 | else: 78 | print u"Attention.method字段配置错误! 请检查后重试! " 79 | exit() 80 | print u"抓取成功! " 81 | result_list_all.extend(result_list) 82 | else: 83 | print u"get page %s error! " % url 84 | time.sleep(self.fail_time_interval_num + random.randint(10, 30)) 85 | yield result_list 86 | # return result_list_all 87 | 88 | # 保存抓取结果 89 | def save_result(self): 90 | try: 91 | create_table('%s' % conf_name, 'result') 92 | except: 93 | pass 94 | while 1: 95 | result_list = self.visit_config() 96 | for rl in result_list: 97 | rl = filter_result(rl, self.filter_exp, self.filter_num) 98 | new_list = write_db('%s' % conf_name, 'result', rl) 99 | if new_list: 100 | new_json = json.dumps(new_list, encoding="UTF-8", ensure_ascii=False) 101 | SendMailTo("有更新啦! ", new_json) 102 | # time.sleep(self.time_interval_num + random.randint(1, 100)) 103 | 104 | 105 | if __name__ == '__main__': 106 | newVisit = WebVisit() 107 | newVisit.get_config() 108 | newVisit.save_result() 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /ReadJSON.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import os 4 | import json 5 | import sys 6 | 7 | conf_name = sys.argv[1] if len(sys.argv) == 2 else 'config.json' 8 | print sys.argv,len(sys.argv) 9 | 10 | 11 | # 读取配置文件 12 | def read_config(): 13 | file_path = './%s' % conf_name 14 | if os.path.isfile(file_path): 15 | with open(file_path, 'r') as json_file: 16 | try: 17 | json_dict = json.load(json_file) 18 | return json_dict 19 | except: 20 | return None 21 | else: 22 | return None 23 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # Readme 2 | 3 | ## config.json 4 | 5 | ```json 6 | { 7 | "StartPage": [ 8 | "url1", 9 | "url2" 10 | ], 11 | "TimeInterval": 3, 12 | "Attention": { 13 | "method": "", 14 | "expression": "" 15 | }, 16 | "Return": { 17 | "method": "", 18 | "expression": "" 19 | }, 20 | "Notify": { 21 | "email": "", 22 | "message": "" 23 | } 24 | } 25 | 26 | ``` 27 | 28 | + "StartPage": 需要监控的页面url列表。 29 | + "TimeInterval": 间隔时间。单位为分钟。如果只需要单次运行则填0。 30 | + "Attention": 需要监控的页面片段。"method"为匹配片段的方式。支持的方式有:re(正则),xpath,css。"expression"为匹配表达式。 31 | + "Return": 监控到变化后需要返回的页面片段。 32 | + "Notify": 接收提醒的邮箱和提醒文字。 -------------------------------------------------------------------------------- /RequestsHeader.py: -------------------------------------------------------------------------------- 1 | user_agents = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36" 2 | 3 | req_headers = { 4 | 'User-Agent': user_agents 5 | } -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "StartPage": [ 3 | "", 4 | "" 5 | ], 6 | "TimeInterval": 100, 7 | "FailTimeInterval": 30, 8 | "Attention": { 9 | "method": "", 10 | "expression": "" 11 | }, 12 | "Return": { 13 | "method": "", 14 | "expression": "" 15 | }, 16 | "Notify": { 17 | "email": "", 18 | "message": "" 19 | }, 20 | "Mail": { 21 | "from_addr": "", 22 | "password": "", 23 | "to_addr": "", 24 | "smtp_server": "" 25 | }, 26 | "Filter": { 27 | "expression": "", 28 | "num": 1 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /read_DB.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import sqlite3 4 | 5 | 6 | # 创建表 7 | def create_table(db_name, table_name): 8 | conn = sqlite3.connect('%s.db' % db_name) 9 | cursor = conn.cursor() 10 | cursor.execute("create table %s (id INTEGER PRIMARY KEY AUTOINCREMENT,result text UNIQUE,date timestamp not null default (datetime('now','localtime')))"%table_name,) 11 | cursor.close() 12 | conn.close() 13 | 14 | 15 | # 读取数据库 16 | def read_db(db_name, table_name, size=100): 17 | conn = sqlite3.connect('%s.db' % db_name) 18 | cursor = conn.cursor() 19 | cursor.execute('select * from ?', table_name) 20 | result_all = cursor.fetchmany(size) 21 | cursor.close() 22 | conn.close() 23 | return result_all 24 | 25 | 26 | # 写入数据库 27 | # 由于写入数据库比较耗时,直接将更新的所有传递给write_db 28 | def write_db(db_name, table_name, result_list): 29 | conn = sqlite3.connect('%s.db' % db_name) 30 | cursor = conn.cursor() 31 | new_list = [] 32 | for result in result_list: 33 | sql = "insert into %s (result) values ('%s')" % (table_name, result) 34 | #cursor.execute(sql) 35 | #conn.commit() 36 | try: 37 | cursor.execute(sql) 38 | new_list.append(result) 39 | conn.commit() 40 | print(u"写入 "+sql+u" 成功!") 41 | except: 42 | print(result+u" 已存在!") 43 | cursor.close() 44 | conn.close() 45 | return new_list 46 | 47 | 48 | if __name__ == '__main__': 49 | create_table('test', 'test') 50 | 51 | 52 | -------------------------------------------------------------------------------- /sendMail.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from email.header import Header 4 | from email.mime.text import MIMEText 5 | from email.utils import parseaddr, formataddr 6 | import smtplib 7 | from ReadJSON import * 8 | 9 | 10 | def _format_addr(s): 11 | name, addr = parseaddr(s) 12 | return formataddr((Header(name, 'utf-8').encode(),addr.encode('utf-8') if isinstance(addr, unicode) else addr)) 13 | 14 | 15 | mail_dict = read_config() 16 | if mail_dict: 17 | from_addr = mail_dict['Mail']['from_addr'] 18 | password = mail_dict['Mail']['password'] 19 | to_addr= mail_dict['Mail']['to_addr'] 20 | smtp_server = mail_dict['Mail']['smtp_server'] 21 | else: 22 | print u"配置文件不存在或内容有误! 请检查后重试! " 23 | 24 | 25 | def SendMailTo(subject, result): 26 | msg = MIMEText('%s' % result, 'plain', 'utf-8') 27 | msg['From'] = _format_addr(u'监控爬虫 <%s>' % from_addr) 28 | msg['To'] = _format_addr(u'管理员 <%s>' % to_addr) 29 | msg['Subject'] = Header(subject, 'utf-8').encode() 30 | 31 | server = smtplib.SMTP(smtp_server, 25) 32 | server.set_debuglevel(1) 33 | server.login(from_addr, password) 34 | server.sendmail(from_addr, [to_addr], msg.as_string()) 35 | server.quit() 36 | 37 | if __name__ == '__main__': 38 | SendMailTo('test', 'test') 39 | --------------------------------------------------------------------------------