├── .gitignore
├── README.md
├── crawl.py
├── requirement.txt
└── urlfilter.py

/.gitignore:
--------------------------------------------------------------------------------
# Python:
*.pyc
*.so
*.egg
*.egg-info

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# deadurl_detector

## Requirements
Design a system that automatically checks the reliability of mobile Sohu (http://m.sohu.com/ ). Specifically:

1. On a schedule, recursively check the reachability of every page under the m.sohu.com domain and of the links on those pages, i.e. detect whether any of them have become inaccessible.

2. m.sohu.com has a very large number of pages, so consider performance optimisation from every angle.

3. Record broken links in a log that includes the link, the time, the error status, etc.

4. Consider a multithreaded implementation.

## Solution
#### Collecting links
Fetch each page with requests.

Extract the URLs on the page with an re regular expression.

#### URL filtering
URL de-duplication.

Whether a URL contains the target domain.

Whether a URL is similar to one already seen.

#### Scheduled runs
Handled by crontab.

## Current test results
When filtering on a specific domain (e.g. m.sohu.com):

with the `URL similarity` check, a total of 87 non-similar links are checked;

with plain `URL de-duplication`, a total of 4070 links are checked.
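The gap between 87 and 4070 largely comes from how aggressive the similarity fingerprint is: urlfilter.py (below) reduces a URL to (netloc, length of each path segment, sorted query-parameter names), so article pages that differ only in their numeric IDs collapse into a single entry. A minimal sketch, assuming Python 2 and that it is run from the project root so that `urlfilter` is importable (the two article URLs are made up for illustration):

```python
# coding:utf-8
# Minimal illustration of the similarity fingerprint (hypothetical URLs).
from urlfilter import urlformat, url_is_similar

a = "http://m.sohu.com/a/123456789_115565"
b = "http://m.sohu.com/a/987654321_200123"

# Both URLs reduce to the fingerprint ('m.sohu.com', (0, 1, 16), ('',)),
# because their path segments have the same lengths.
print urlformat(a) == urlformat(b)   # True
print url_is_similar(a)              # False: fingerprint seen for the first time
print url_is_similar(b)              # True: same fingerprint, treated as similar
```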
--------------------------------------------------------------------------------
/crawl.py:
--------------------------------------------------------------------------------
# coding:utf-8

import re
import socket
import Queue
import threading
import datetime
import requests
import random
import urlparse
from urlfilter import url_is_similar, url_is_repeat, url_contain_custom_focus

socket.setdefaulttimeout(3)


html = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
gzip = 'gzip, deflate, sdch'
chinese = 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4'
user_agent = ['Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
              'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
              'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A',
              'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko',
              'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
              ]

# URLs that have already been crawled
crawledurl = set()
# Links that should never be crawled
filterurl = ["javascript:void(0);", "#", "javascript:;", "javascript:window.scrollTo(0,0);"]
crawl_queue = Queue.Queue()

# Regular expression that extracts href values
url_pat = re.compile(r'(?<=href=\").*?(?=\")', re.M)

def get_all_links(page, baseurl, focuskey=()):
    """Extract every link on the page and push new URLs onto the queue."""
    for match in re.finditer(url_pat, page):
        url = match.group()
        if url in filterurl or url_is_repeat(url):  # or url_is_similar(url) to collapse similar URLs
            continue
        if not url_contain_custom_focus(url, focuskey):  # does the URL contain the target domain?
            continue
        if url.startswith("http"):
            crawl_queue.put(url)
        else:
            # Resolve relative links against the page they were found on
            crawl_queue.put(urlparse.urljoin(baseurl, url))

def write_log(error, url):
    """Append an error record (error type, time, URL) to the log."""
    ferror = open("log.txt", "a")
    ferror.write(error + " ")
    ferror.write(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + " ")
    ferror.write(url + '\n')
    ferror.close()

def random_agent(user_agent):
    """Pick a random User-Agent string."""
    return random.choice(user_agent)

def check_link(url, focuskey):
    """Request the URL; log any error, otherwise extract the links on the page."""
    crawledurl.add(url)
    try:
        response = requests.get(url,
                                headers={'Accept': html,
                                         'Accept-Encoding': gzip,
                                         'Accept-Language': chinese,
                                         'Cache-Control': 'no-cache',
                                         'Connection': 'keep-alive',
                                         'User-Agent': random_agent(user_agent)},
                                timeout=3)
        # Raise HTTPError for 4xx/5xx responses so dead pages are logged too
        response.raise_for_status()
    except requests.exceptions.ConnectionError:
        write_log('ConnectionError', url)
    except requests.exceptions.HTTPError:
        write_log('HTTPError', url)
    except (requests.exceptions.Timeout, socket.timeout):
        write_log('Timeout', url)
    except requests.exceptions.TooManyRedirects:
        write_log('TooManyRedirects', url)
    except requests.exceptions.InvalidURL:
        write_log('InvalidURL', url)
    except Exception:
        write_log('UnknownError', url)
    else:
        page = response.text
        get_all_links(page, url, focuskey)  # the requested URL is the base URL

class CrawlUrl(threading.Thread):
    def __init__(self, crawl_queue, focuskey):
        threading.Thread.__init__(self)
        self.crawl_queue = crawl_queue
        self.focuskey = focuskey

    def run(self):
        while True:
            url = self.crawl_queue.get()
            if url not in crawledurl:
                check_link(url, self.focuskey)
            self.crawl_queue.task_done()
            print "left {}, has {}".format(crawl_queue.qsize(), len(crawledurl))

def main():
    for i in range(10):
        # focuskey must be a tuple; ('m.sohu.com') without the trailing comma is just a string
        crawlthread = CrawlUrl(crawl_queue, focuskey=('m.sohu.com',))
        crawlthread.setDaemon(True)
        crawlthread.start()

    url = "http://m.sohu.com/"
    crawl_queue.put(url)

    crawl_queue.join()

if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/requirement.txt:
--------------------------------------------------------------------------------
requests==2.6.0

--------------------------------------------------------------------------------
/urlfilter.py:
--------------------------------------------------------------------------------
# coding:utf-8

import urlparse

SIMILAR_SET = set()
REPEAT_SET = set()

def urlformat(url):
    '''
    Build a three-element fingerprint for the URL:
    the first element is the netloc,
    the second is the length of each segment of the path,
    the third is the name of each query parameter (sorted alphabetically, so
    that the same parameters in a different order do not look like a new URL).
    '''
    if urlparse.urlparse(url)[2] == '':
        url = url + '/'

    url_structure = urlparse.urlparse(url)
    netloc = url_structure[1]
    path = url_structure[2]
    query = url_structure[4]

    return (netloc,
            tuple([len(i) for i in path.split('/')]),
            tuple(sorted([i.split('=')[0] for i in query.split('&')])))


def url_is_similar(url):
    '''
    URL similarity control.

    Returns True if a URL with the same fingerprint has been seen before,
    False if the fingerprint is new (and records it).
    '''
    t = urlformat(url)
    if t not in SIMILAR_SET:
        SIMILAR_SET.add(t)
        return False
    return True

def url_is_repeat(url):
    '''
    URL de-duplication control.

    Returns True if exactly this URL has been seen before,
    False if it is new (and records it).
    '''
    if url not in REPEAT_SET:
        REPEAT_SET.add(url)
        return False
    return True

def url_contain_custom_focus(url, focuskey):
    '''
    Focused crawling by custom keywords.

    Returns True if the URL matches the focus policy (contains one of the
    keywords, or no keywords are given), False otherwise.
    '''
    if len(focuskey) == 0:
        return True
    for i in focuskey:
        if i in url:
            return True
    return False

--------------------------------------------------------------------------------
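One pitfall in `url_contain_custom_focus` is worth a worked example: `focuskey` is iterated, so it must be a tuple or list of domain strings. A bare string such as `('m.sohu.com')` (no trailing comma) is iterated character by character, so almost every URL passes the focus check, which is why `main()` in crawl.py builds the key as `('m.sohu.com',)`. A minimal sketch, assuming Python 2 and that it is run from the project root (the URL is made up for illustration):

```python
# coding:utf-8
# Minimal illustration of the focuskey tuple-vs-string pitfall (hypothetical URL).
from urlfilter import url_contain_custom_focus

url = "http://www.example.com/"

# Correct: a one-element tuple only matches URLs containing the domain.
print url_contain_custom_focus(url, ('m.sohu.com',))   # False

# Wrong: ('m.sohu.com') is just the string 'm.sohu.com', so the loop checks
# single characters ('m', '.', 's', ...) and almost every URL passes.
print url_contain_custom_focus(url, ('m.sohu.com'))    # True
```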