├── __pycache__
│   ├── html_parser.cpython-36.pyc
│   ├── url_manager.cpython-36.pyc
│   ├── html_outputer.cpython-36.pyc
│   └── html_downloader.cpython-36.pyc
├── README.md
├── html_downloader.py
├── html_outputer.py
├── url_manager.py
├── spider_main.py
└── html_parser.py
/README.md:
--------------------------------------------------------------------------------
# python spider

---

This is a simple crawler that scrapes data from the Baidu Baike pages related to Python.
--------------------------------------------------------------------------------
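A minimal usage sketch of the crawler described above (assuming Python 3 with beautifulsoup4 installed, and that the modules in this repo are on the import path); it mirrors the entry point in spider_main.py:

    from spider_main import SpiderMain

    spider = SpiderMain()
    # Same root url as spider_main.py: the Baidu Baike entry for Python.
    spider.craw('https://baike.baidu.com/item/Python/407313?fr=aladdin')
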
/html_downloader.py:
--------------------------------------------------------------------------------
# coding=UTF-8
import urllib.request


class HtmlDownloader:
    def download(self, url):
        """Downloader interface method.
        Fetch the page at `url` and return its contents.
        """
        if url is None:
            return
        response = urllib.request.urlopen(url)

        # Only return the body when the request succeeded.
        if response.getcode() != 200:
            return
        return response.read()
--------------------------------------------------------------------------------
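urllib's default User-Agent may be rejected or redirected by some sites; a hedged variant of download() that identifies itself as a regular browser (the header value below is only illustrative, not part of the original code) could look like this:

    import urllib.request


    class HtmlDownloader:
        def download(self, url):
            """Fetch the page at `url`, sending a browser-like User-Agent header."""
            if url is None:
                return
            # Illustrative header; any browser-like User-Agent string would do.
            request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
            response = urllib.request.urlopen(request)
            if response.getcode() != 200:
                return
            return response.read()
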
/html_outputer.py:
--------------------------------------------------------------------------------
# coding=UTF-8


class HtmlOutputer:
    """Outputer module.
    Writes the data collected from the parser module out to an HTML file.
    """
    def __init__(self):
        """Initialise the outputer with an empty list for collected data."""
        self.datas = []

    def collect_data(self, data):
        """Outputer interface method.
        Collect one data record and append it to the datas list.
        """
        if data is None:
            return
        self.datas.append(data)

    def output(self):
        """Outputer interface method.
        Write the records in the datas list out as an HTML table.
        """
        # Open with an explicit UTF-8 encoding so Chinese text is written correctly.
        fout = open('F:\\workspace\\tmp\\output.html', 'w', encoding='utf-8')
        fout.write('<html>\n')
        fout.write('<body>\n')
        fout.write('<table>\n')

        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'])
            fout.write('<td>%s</td>' % data['title'])
            fout.write('<td>%s</td>' % data['summary'])
            fout.write('</tr>\n')

        fout.write('</table>\n')
        fout.write('</body>\n')
        fout.write('</html>\n')
        fout.close()
--------------------------------------------------------------------------------
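For reference, each record passed to collect_data() is a dict with 'url', 'title' and 'summary' keys, as built by html_parser.py. A small usage sketch (the summary text is made up):

    from html_outputer import HtmlOutputer

    outputer = HtmlOutputer()
    outputer.collect_data({
        'url': 'https://baike.baidu.com/item/Python/407313',
        'title': 'Python',
        'summary': 'An illustrative one-line summary.',
    })
    outputer.output()  # writes the HTML table to F:\workspace\tmp\output.html
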
/url_manager.py:
--------------------------------------------------------------------------------
# coding=UTF-8


class UrlManager:
    """Manager module.
    Uses two sets to keep track of the urls still to be crawled
    and the urls that have already been crawled.
    """
    def __init__(self):
        """Initialise the manager.
        Create the sets of pending and already-crawled urls.
        """
        self.new_urls = set()  # urls still to be crawled
        self.old_urls = set()  # urls already crawled

    def add_new_url(self, url):
        """Manager method.
        Add a url to the pending set if it is in neither the pending
        nor the already-crawled set.
        """
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Manager interface method.
        Add a batch of urls, delegating to add_new_url for each one.
        """
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        """Manager interface method.
        Return whether there are still urls waiting to be crawled.
        """
        return len(self.new_urls) != 0

    def get_new_url(self):
        """Manager interface method.
        Take one url out of the pending set, mark it as crawled,
        and return it for the downloader module to fetch.
        """
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
--------------------------------------------------------------------------------
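The two sets give the crawler simple de-duplication: a url is handed out at most once, and urls that have already been crawled are never re-queued. A quick illustration:

    from url_manager import UrlManager

    manager = UrlManager()
    manager.add_new_url('https://baike.baidu.com/item/Python/407313')
    manager.add_new_url('https://baike.baidu.com/item/Python/407313')  # duplicate, ignored
    print(manager.has_new_url())  # True
    url = manager.get_new_url()   # moved from new_urls to old_urls
    manager.add_new_url(url)      # already crawled, ignored
    print(manager.has_new_url())  # False
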
/spider_main.py:
--------------------------------------------------------------------------------
# coding=UTF-8
import url_manager
import html_downloader
import html_parser
import html_outputer


class SpiderMain:
    def __init__(self):
        """Initialisation.
        Create an instance of each of the four modules.
        """
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        """Main crawler function.
        Drives the main crawl loop of the program.
        """
        count = 1
        self.urls.add_new_url(root_url)

        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print("craw: {} : {}".format(count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)

                # Stop after crawling 10 pages.
                if count == 10:
                    break
                count += 1
            except Exception:
                print('craw failed!')
        self.outputer.output()


if __name__ == '__main__':
    # Root url: the starting point of the crawl.
    root_url = 'https://baike.baidu.com/item/Python/407313?fr=aladdin'
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
--------------------------------------------------------------------------------
/html_parser.py:
--------------------------------------------------------------------------------
# coding=UTF-8
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin


class HtmlParser:
    """Parser module.
    Analyses the page content fetched by the downloader module.
    """
    def _get_new_urls(self, page_url, soup):
        """Parser method.
        Extract new urls from the current page and return them.
        """
        new_urls = set()
        # Links to other Baike entries look like /item/<name>,
        # e.g. the entry for "计算机程序设计语言" (computer programming language).
        links = soup.find_all('a', href=re.compile(r"/item/[a-zA-Z0-9%]+"))
        for link in links:
            new_url = link['href']
            new_full_url = urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        """Parser method.
        Extract the data of the current page and return it.
        """
        res_data = {}
        # url of the current page
        res_data['url'] = page_url

        # Title, e.g. <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find('h1')
        res_data['title'] = title_node.get_text()

        # Summary, e.g. <div class="lemma-summary">...</div>
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        """Parser interface method.
        Call _get_new_urls and _get_new_data to extract the links and
        the data from the current page_url.
        """
        if page_url is None or html_cont is None:
            return

        # Baidu Baike pages are served as UTF-8, so decode them as such.
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
--------------------------------------------------------------------------------
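A small self-contained check of the parser against a hypothetical HTML fragment (the markup below is made up, but mirrors the structure the selectors expect on a Baike entry page):

    from html_parser import HtmlParser

    parser = HtmlParser()
    page = ('<html><body>'
            '<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>'
            '<div class="lemma-summary">An illustrative summary.</div>'
            '<a href="/item/Guido">Guido</a>'
            '</body></html>').encode('utf-8')
    urls, data = parser.parse('https://baike.baidu.com/item/Python/407313', page)
    print(urls)  # {'https://baike.baidu.com/item/Guido'}
    print(data)  # {'url': '...', 'title': 'Python', 'summary': 'An illustrative summary.'}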