├── README.md
└── testCrawling.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# A General Approach to Web Crawling

Remember the three steps: download the data, parse the data, save the data.

![](http://i.imgur.com/MdN4msE.png)

Original article: [https://zhuanlan.zhihu.com/p/28621516](https://zhuanlan.zhihu.com/p/28621516)
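
To make the three steps concrete, here is a minimal sketch of the pattern. It is only an illustration, not the project's crawler: the seed URL, user agent, CSS selector, function name, and output file name are placeholder assumptions, and retries and error handling are left out. `testCrawling.py` below is a fuller version of the same idea.

```python
# A minimal sketch of download -> parse -> save; everything below is illustrative.
import csv
import urllib.request

import lxml.html


def crawl_once(url='https://movie.douban.com/', out_path='titles.csv'):
    # 1. Download: fetch the raw HTML bytes.
    request = urllib.request.Request(url, headers={'User-agent': 'wswp'})
    html = urllib.request.urlopen(request, timeout=5).read()

    # 2. Parse: pull out the fields you care about (here: anchor text, as a placeholder).
    tree = lxml.html.fromstring(html)
    titles = [a.text for a in tree.cssselect('a') if a.text]

    # 3. Save: persist the parsed data.
    with open(out_path, 'w', newline='') as f:
        writer = csv.writer(f)
        for title in titles:
            writer.writerow([title])


if __name__ == '__main__':
    crawl_once()
```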
--------------------------------------------------------------------------------
/testCrawling.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = "YYYY"

import csv
import re
import urllib.error
import urllib.parse
import urllib.request

import lxml.html


def download(url, user_agent='wswp', proxy=None, num_retries=2, timeout=5):
    """Download a URL and return the raw HTML, or None on failure.

    Supports retrying on 5xx errors, a configurable user agent, and an optional proxy.

    url: the link to download
    user_agent: the browser UA string (user agent)
    proxy: proxy IP, if any
    num_retries: number of retries on 5xx errors
    timeout: timeout in seconds
    """
    print('DownloadURL:', url)

    # Set the user agent.
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    # Build the opener.
    opener = urllib.request.build_opener()

    # Attach a proxy handler if a proxy was given.
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request, timeout=timeout).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            # Retry only on 5xx server errors.
            if hasattr(e, 'code') and 500 <= e.code < 600:
                html = download(url, user_agent, proxy, num_retries - 1, timeout)
    except Exception as e:
        print('error:', e)
        html = None

    return html


def get_links(html):
    """Return every href value found in the page."""
    if html:
        webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
        return webpage_regex.findall(html.decode('utf-8'))
    else:
        return []


def scrape_callback(url, html):
    """Scraping rule: extract the title, year and rating, then print them."""
    csslist = ['span[property="v:itemreviewed"]', 'span.year',
               'strong[property="v:average"]']
    try:
        tree = lxml.html.fromstring(html)
        row = [tree.cssselect(field)[0].text for field in csslist]
        print(url, row)
    except Exception as e:
        print("ScrapeCallback error:", e)


def link_crawler(seed_url, link_regex, max_depth=2, scrape_callback=None):
    """Crawl pages starting from a seed URL.

    seed_url: the seed URL
    link_regex: regular expression used to pick which links to follow
    max_depth: link-extraction depth; with the default of 2, links are no longer
        extracted once the crawler reaches the second level, i.e. the pages
        linked from the seed page
    scrape_callback: callback invoked on every downloaded page
    """
    crawl_queue = [seed_url]  # the crawl queue is simply a list of URLs
    seens = {seed_url: 1}     # map of seen URLs to their depth

    # Loop until the queue is empty.
    while crawl_queue:
        url = crawl_queue.pop()  # take the last URL off the queue
        html = download(url)     # download the page for this URL
        depth = seens[url]       # depth of this URL
        print(depth)

        # Extract the links contained in the page.
        for link in get_links(html):
            if depth != max_depth and re.search(link_regex, link):
                link = urllib.parse.urljoin(seed_url, link)  # build an absolute link

                # Add the link to the crawl queue if it has not been seen yet.
                if link not in seens:
                    seens[link] = depth + 1
                    crawl_queue.append(link)

        # If a scrape callback was provided, call it on this page.
        if scrape_callback:
            scrape_callback(url, html)


class ScrapeCallback:
    """Same scraping rule as scrape_callback, but also saves each row to a CSV file."""

    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w', newline=''))
        self.fields = ('name', 'year', 'score')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        csslist = ['span[property="v:itemreviewed"]', 'span.year',
                   'strong[property="v:average"]']
        try:
            tree = lxml.html.fromstring(html)
            row = [tree.cssselect(field)[0].text for field in csslist]
            self.writer.writerow(row)
            print(url, row)
        except Exception as e:
            print("ScrapeCallback error:", e)


if __name__ == '__main__':
    # Quick test against Douban movies.
    seed_url = "https://movie.douban.com/"

    link_regex = r'(/subject/[\d]+/)'  # rule for picking links to follow

    # The class-based callback and the plain function below give the same result,
    # except that the class also writes each row to countries.csv.
    link_crawler(seed_url, link_regex, max_depth=2, scrape_callback=ScrapeCallback())
    # link_crawler(seed_url, link_regex, max_depth=2, scrape_callback=scrape_callback)
--------------------------------------------------------------------------------