├── README.md
└── testCrawling.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# A General Approach to Web Crawling

Remember the three steps: download the data, parse the data, save the data.

![](http://i.imgur.com/MdN4msE.png)

Original article: [https://zhuanlan.zhihu.com/p/28621516](https://zhuanlan.zhihu.com/p/28621516)
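
To make the three steps concrete, here is a minimal sketch of the pattern. It is only an illustration, not the project's crawler: the seed URL, user agent, CSS selector, function name, and output file name are placeholder assumptions, and retries and error handling are left out. `testCrawling.py` below is a fuller version of the same idea.

```python
# A minimal sketch of download -> parse -> save; everything below is illustrative.
import csv
import urllib.request

import lxml.html


def crawl_once(url='https://movie.douban.com/', out_path='titles.csv'):
    # 1. Download: fetch the raw HTML bytes.
    request = urllib.request.Request(url, headers={'User-agent': 'wswp'})
    html = urllib.request.urlopen(request, timeout=5).read()

    # 2. Parse: pull out the fields you care about (here: anchor text, as a placeholder).
    tree = lxml.html.fromstring(html)
    titles = [a.text for a in tree.cssselect('a') if a.text]

    # 3. Save: persist the parsed data.
    with open(out_path, 'w', newline='') as f:
        writer = csv.writer(f)
        for title in titles:
            writer.writerow([title])


if __name__ == '__main__':
    crawl_once()
```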
--------------------------------------------------------------------------------
/testCrawling.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = "YYYY"

import csv
import re
import urllib.error
import urllib.parse
import urllib.request

import lxml.html


def download(url, user_agent='wswp', proxy=None, num_retries=2, timeout=5):
    """Download a URL and return the raw HTML, or None on failure.

    Supports retrying on 5xx errors, a configurable user agent, and an optional proxy.

    url: the link to download
    user_agent: the browser UA string (user agent)
    proxy: proxy IP, if any
    num_retries: number of retries on 5xx errors
    timeout: timeout in seconds
    """
    print('DownloadURL:', url)

    # Set the user agent.
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    # Build the opener.
    opener = urllib.request.build_opener()

    # Attach a proxy handler if a proxy was given.
    if proxy:
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(request, timeout=timeout).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            # Retry only on 5xx server errors.
            if hasattr(e, 'code') and 500 <= e.code < 600:
                html = download(url, user_agent, proxy, num_retries - 1, timeout)
    except Exception as e:
        print('error:', e)
        html = None

    return html


def get_links(html):
    """Return every href value found in the page."""
    if html:
        webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
        return webpage_regex.findall(html.decode('utf-8'))
    else:
        return []


def scrape_callback(url, html):
    """Scraping rule: extract the title, year and rating, then print them."""
    csslist = ['span[property="v:itemreviewed"]', 'span.year',
               'strong[property="v:average"]']
    try:
        tree = lxml.html.fromstring(html)
        row = [tree.cssselect(field)[0].text for field in csslist]
        print(url, row)
    except Exception as e:
        print("ScrapeCallback error:", e)


def link_crawler(seed_url, link_regex, max_depth=2, scrape_callback=None):
    """Crawl pages starting from a seed URL.

    seed_url: the seed URL
    link_regex: regular expression used to pick which links to follow
    max_depth: link-extraction depth; with the default of 2, links are no longer
        extracted once the crawler reaches the second level, i.e. the pages
        linked from the seed page
    scrape_callback: callback invoked on every downloaded page
    """
    crawl_queue = [seed_url]  # the crawl queue is simply a list of URLs
    seens = {seed_url: 1}     # map of seen URLs to their depth

    # Loop until the queue is empty.
    while crawl_queue:
        url = crawl_queue.pop()  # take the last URL off the queue
        html = download(url)     # download the page for this URL
        depth = seens[url]       # depth of this URL
        print(depth)

        # Extract the links contained in the page.
        for link in get_links(html):
            if depth != max_depth and re.search(link_regex, link):
                link = urllib.parse.urljoin(seed_url, link)  # build an absolute link

                # Add the link to the crawl queue if it has not been seen yet.
                if link not in seens:
                    seens[link] = depth + 1
                    crawl_queue.append(link)

        # If a scrape callback was provided, call it on this page.
        if scrape_callback:
            scrape_callback(url, html)


class ScrapeCallback:
    """Same scraping rule as scrape_callback, but also saves each row to a CSV file."""

    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w', newline=''))
        self.fields = ('name', 'year', 'score')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        csslist = ['span[property="v:itemreviewed"]', 'span.year',
                   'strong[property="v:average"]']
        try:
            tree = lxml.html.fromstring(html)
            row = [tree.cssselect(field)[0].text for field in csslist]
            self.writer.writerow(row)
            print(url, row)
        except Exception as e:
            print("ScrapeCallback error:", e)


if __name__ == '__main__':
    # Quick test against Douban movies.
    seed_url = "https://movie.douban.com/"

    link_regex = r'(/subject/[\d]+/)'  # rule for picking links to follow

    # The class-based callback and the plain function below give the same result,
    # except that the class also writes each row to countries.csv.
    link_crawler(seed_url, link_regex, max_depth=2, scrape_callback=ScrapeCallback())
    # link_crawler(seed_url, link_regex, max_depth=2, scrape_callback=scrape_callback)
--------------------------------------------------------------------------------