├── __init__.py
├── README.md
├── requirements.txt
├── html_downloader.py
├── url_manager.py
├── html_outputer.py
├── html_parser.py
└── spider_main.py
/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Sample code for the 慕课网 (imooc) course "Python 开发简单爬虫" (Developing a Simple Web Crawler with Python)
2 |
3 |
4 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.4.1
2 | wheel==0.24.0
3 |
--------------------------------------------------------------------------------
/html_downloader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | # -*- coding: UTF-8 -*-
3 | import urllib2
4 |
5 |
6 | class HtmlDownloader(object):
7 |
8 |     def download(self, url):
9 |         if url is None:
10 |             return None
11 |         response = urllib2.urlopen(url)
12 |         if response.getcode() != 200:
13 |             return None
14 |         return response.read()
--------------------------------------------------------------------------------
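The outline in spider_main.py lists requests as an alternative to urllib2 for the downloader. Below is a minimal sketch of the same download() interface built on requests; it is an illustration, not part of the course code, and requests is not listed in requirements.txt.

#!/usr/bin/env python2
# -*- coding: UTF-8 -*-
# Illustrative alternative to HtmlDownloader using requests (not in requirements.txt).
import requests


class RequestsHtmlDownloader(object):

    def download(self, url):
        if url is None:
            return None
        try:
            # A browser-like User-Agent and a timeout make the request a bit more robust.
            response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
        except requests.RequestException:
            return None
        if response.status_code != 200:
            return None
        # Return raw bytes, matching what urllib2's response.read() gives the parser.
        return response.content
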
/url_manager.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | # -*- coding: UTF-8 -*-
3 | class UrlManager(object):
4 |     def __init__(self):
5 |         self.new_urls = set()
6 |         self.old_urls = set()
7 |
8 |     def add_new_url(self, url):
9 |         if url is None:
10 |             return
11 |         if url not in self.new_urls and url not in self.old_urls:
12 |             self.new_urls.add(url)
13 |
14 |     def add_new_urls(self, urls):
15 |         if urls is None or len(urls) == 0:
16 |             return
17 |         for url in urls:
18 |             self.add_new_url(url)
19 |
20 |     def has_new_url(self):
21 |         return len(self.new_urls) != 0
22 |
23 |
24 |     def get_new_url(self):
25 |         new_url = self.new_urls.pop()
26 |         self.old_urls.add(new_url)
27 |         return new_url
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
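A minimal usage sketch of UrlManager's bookkeeping: duplicates are dropped, and get_new_url() moves each URL from the new set to the old set. The URLs are placeholders and the snippet assumes it runs in the same directory as url_manager.py.

# Illustrative only; the URLs are placeholders.
from url_manager import UrlManager

manager = UrlManager()
manager.add_new_url('http://baike.baidu.com/view/21087.htm')
manager.add_new_urls(['http://baike.baidu.com/view/21087.htm',     # duplicate, ignored
                      'http://baike.baidu.com/view/10812319.htm'])

while manager.has_new_url():
    url = manager.get_new_url()    # pops from new_urls, records in old_urls
    print 'got %s' % url           # printed twice in total: the duplicate was dropped
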
/html_outputer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | # -*- coding: UTF-8 -*-
3 | class HtmlOutputer(object):
4 |
5 |     def __init__(self):
6 |         self.datas = []
7 |
8 |
9 |     def collect_data(self, data):
10 |         if data is None:
11 |             return
12 |         self.datas.append(data)
13 |
14 |     def output_html(self):
15 |         fout = open('output.html', 'w')
16 |         fout.write("<html>")
17 |         fout.write("<body>")
18 |         fout.write("<table>")
19 |
20 |         for data in self.datas:
21 |             fout.write("<tr>")
22 |             fout.write("<td>%s</td>" % data['url'])
23 |             fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
24 |             fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))
25 |             fout.write("</tr>")
26 |         fout.write("</table>")
27 |         fout.write("</body>")
28 |         fout.write("</html>")
29 |         fout.close()
--------------------------------------------------------------------------------
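HtmlOutputer can be exercised on its own by feeding it one hand-made record shaped like the dictionaries HtmlParser returns (the values below are placeholders):

# Illustrative only; the field values are made up.
from html_outputer import HtmlOutputer

outputer = HtmlOutputer()
outputer.collect_data({'url': 'http://baike.baidu.com/view/21087.htm',
                       'title': u'Python',
                       'summary': u'Example summary text'})
outputer.output_html()    # writes output.html containing a one-row HTML table
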
/html_parser.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | # -*- coding: UTF-8 -*-
3 | import re
4 | import urlparse
5 |
6 | from bs4 import BeautifulSoup
7 |
8 |
9 | class HtmlParser(object):
10 |
11 |     def parse(self, page_url, html_cont):
12 |         if page_url is None or html_cont is None:
13 |             return None, None
14 |
15 |         soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
16 |         new_urls = self._get_new_urls(page_url, soup)
17 |         new_data = self._get_new_data(page_url, soup)
18 |         return new_urls, new_data
19 |
20 |     def _get_new_urls(self, page_url, soup):
21 |         new_urls = set()
22 |         links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
23 |         for link in links:
24 |             new_url = link['href']
25 |             new_full_url = urlparse.urljoin(page_url, new_url)
26 |             new_urls.add(new_full_url)
27 |         return new_urls
28 |
29 |
30 |     def _get_new_data(self, page_url, soup):
31 |         res_data = {}
32 |
33 |         # url of the page being parsed
34 |
35 |         res_data['url'] = page_url
36 |         # title: <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
37 |         title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
38 |         res_data['title'] = title_node.get_text()
39 |
40 |         summary_node = soup.find('div', class_="lemma-summary")
41 |         res_data['summary'] = summary_node.get_text()
42 |         return res_data
43 |
--------------------------------------------------------------------------------
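HtmlParser can be smoke-tested offline by handing parse() a fabricated page fragment that contains the three things it looks for: /view/NNN.htm links, the lemmaWgt-lemmaTitle-title heading, and the lemma-summary block. Everything in the snippet below is made up for illustration.

# Fabricated page fragment; just enough markup for the parser to find links,
# a title and a summary.
from html_parser import HtmlParser

html_cont = '''
<html><body>
  <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
  <div class="lemma-summary">Python is a programming language.</div>
  <a href="/view/10812319.htm">another entry</a>
</body></html>
'''

parser = HtmlParser()
new_urls, new_data = parser.parse('http://baike.baidu.com/view/21087.htm', html_cont)
print new_urls             # set with the absolute http://baike.baidu.com/view/... URL
print new_data['title']    # Python
print new_data['summary']  # Python is a programming language.
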
/spider_main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Crawler scheduler (main entry point)
5 |
6 | ## URL manager
7 |
8 | ### add new URLs to the to-crawl set
9 | ### check whether a candidate URL is already in the container
10 | ### fetch the next URL to crawl
11 | ### check whether any URLs are left to crawl
12 | ### move a URL from the to-crawl set to the crawled set
13 |
14 | ## HTML downloader
15 | ### urllib2
16 | ### requests
17 |
18 | ## HTML parser
19 |
20 | ### regular expressions
21 | ### html.parser
22 | ### BeautifulSoup
23 | ### lxml
24 |
25 |
26 | ## Target analysis
27 | ### URL format
28 | ### data format
29 | ### page encoding
30 |
31 |
32 | from baike_spider import url_manager, html_downloader, html_outputer, html_parser
33 |
34 |
35 | class SpiderMain(object):
36 |
37 |     def __init__(self):
38 |         self.urls = url_manager.UrlManager()
39 |         self.downloader = html_downloader.HtmlDownloader()
40 |         self.parser = html_parser.HtmlParser()
41 |         self.outputer = html_outputer.HtmlOutputer()
42 |
43 |
44 |
45 |     def craw(self, root_url):
46 |         count = 1
47 |         self.urls.add_new_url(root_url)
48 |         while self.urls.has_new_url():
49 |             try:
50 |                 new_url = self.urls.get_new_url()
51 |                 print 'craw %d : %s' % (count, new_url)
52 |                 html_cont = self.downloader.download(new_url)
53 |                 new_urls, new_data = self.parser.parse(new_url, html_cont)
54 |                 self.urls.add_new_urls(new_urls)
55 |                 self.outputer.collect_data(new_data)
56 |
57 |
58 |                 if count == 1000:
59 |                     break
60 |                 count = count + 1
61 |             except Exception as e:
62 |                 print 'craw failed: %s' % e
63 |
64 |         self.outputer.output_html()
65 |
66 | if __name__ == "__main__":
67 |     root_url = "http://baike.baidu.com/view/21087.htm"
68 |     obj_spider = SpiderMain()
69 |     obj_spider.craw(root_url)
70 |
--------------------------------------------------------------------------------