├── __init__.py
├── README.md
├── requirements.txt
├── html_downloader.py
├── url_manager.py
├── html_outputer.py
├── html_parser.py
└── spider_main.py

/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Example code for the 慕课网 (imooc.com) course "Python开发简单爬虫" (Developing a Simple Crawler with Python)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.4.1
wheel==0.24.0
--------------------------------------------------------------------------------
/html_downloader.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python2
# -*- coding: UTF-8 -*-
import urllib2


class HtmlDownloader(object):
    """Fetches a page with urllib2 and returns its raw HTML, or None on failure."""

    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()
--------------------------------------------------------------------------------
/url_manager.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python2
# -*- coding: UTF-8 -*-
class UrlManager(object):
    """Tracks which URLs still need to be crawled and which have been crawled."""

    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
--------------------------------------------------------------------------------
/html_outputer.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python2
# -*- coding: UTF-8 -*-
class HtmlOutputer(object):
    """Collects the parsed records and writes them to output.html as a table."""

    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w')
        fout.write("<html>")
        # Declare UTF-8 so the encoded title/summary text renders correctly.
        fout.write("<head><meta charset='utf-8'></head>")
        fout.write("<body>")
        fout.write("<table>")

        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
            fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()
--------------------------------------------------------------------------------
/html_parser.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python2
# -*- coding: UTF-8 -*-
import re
import urlparse

from bs4 import BeautifulSoup


class HtmlParser(object):
    """Extracts follow-up /view/ links and the title/summary from a Baike page."""

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return

        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        # Collect entry links of the form /view/<id>.htm and resolve them
        # against the current page URL.
        new_urls = set()
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}

        # url
        res_data['url'] = page_url

        # The page title sits in:
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
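        # Note: BeautifulSoup exposes the HTML "class" attribute as the keyword
        # argument `class_` (trailing underscore), since `class` is a reserved
        # word in Python.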
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()

        # The lead paragraph sits in: <div class="lemma-summary">...</div>
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data
--------------------------------------------------------------------------------
/spider_main.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python2
# -*- coding: UTF-8 -*-

# Crawler scheduler (entry point)

## URL manager
### Add a new URL to the to-be-crawled set
### Check whether a candidate URL is already in one of the containers
### Hand out the next URL to crawl
### Report whether any URLs are still waiting to be crawled
### Move a URL from the to-be-crawled set to the crawled set

## Page downloader
### urllib2
### requests

## Page parser
### regular expressions
### html.parser
### BeautifulSoup
### lxml

## Target analysis
### URL format
### data format
### page encoding


# The helper modules sit alongside spider_main.py, so import them directly.
import url_manager
import html_downloader
import html_outputer
import html_parser


class SpiderMain(object):

    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print 'craw %d : %s' % (count, new_url)
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)

                # Stop after 1000 pages.
                if count == 1000:
                    break
                count = count + 1
            except Exception:
                print 'craw failed'

        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "http://baike.baidu.com/view/21087.htm"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
--------------------------------------------------------------------------------
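A note on the downloader: the outline at the top of spider_main.py lists requests alongside urllib2 as a page-downloader option, but only the urllib2 version ships with the repo. The snippet below is a minimal sketch of that requests variant; it is not part of the original code, the class name RequestsHtmlDownloader is hypothetical, and it assumes the requests package is installed (it is not pinned in requirements.txt). It keeps the same download() interface, so it could be swapped in for html_downloader.HtmlDownloader inside SpiderMain.__init__.

#!/usr/bin/env python2
# -*- coding: UTF-8 -*-
# Hypothetical requests-based downloader; a sketch, not part of the original repo.
import requests


class RequestsHtmlDownloader(object):

    def download(self, url):
        if url is None:
            return None
        response = requests.get(url)
        if response.status_code != 200:
            return None
        # Return the raw bytes so HtmlParser can keep decoding them as UTF-8.
        return response.content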