├── __pycache__
│   ├── html_parser.cpython-36.pyc
│   ├── url_manager.cpython-36.pyc
│   ├── html_outputer.cpython-36.pyc
│   └── html_downloader.cpython-36.pyc
├── README.md
├── html_downloader.py
├── html_outputer.py
├── url_manager.py
├── spider_main.py
└── html_parser.py

/__pycache__/html_parser.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Joezeo/pythonspider/HEAD/__pycache__/html_parser.cpython-36.pyc
--------------------------------------------------------------------------------

/__pycache__/url_manager.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Joezeo/pythonspider/HEAD/__pycache__/url_manager.cpython-36.pyc
--------------------------------------------------------------------------------

/__pycache__/html_outputer.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Joezeo/pythonspider/HEAD/__pycache__/html_outputer.cpython-36.pyc
--------------------------------------------------------------------------------

/__pycache__/html_downloader.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Joezeo/pythonspider/HEAD/__pycache__/html_downloader.cpython-36.pyc
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# python spider
[![Travis](https://img.shields.io/badge/Language-Python-blue.svg)]()
[![Travis](https://img.shields.io/travis/rust-lang/rust.svg)]()
[![Travis](https://img.shields.io/badge/Lib-Beautiful'Soup-orange.svg)]()

---

This is a simple crawler that scrapes data from Baidu Baike pages related to Python.
--------------------------------------------------------------------------------

/html_downloader.py:
--------------------------------------------------------------------------------
# coding=UTF-8
import urllib.request


class HtmlDownloader:
    def download(self, url):
        """Downloader interface method.
        Fetch the page at the given url and return its content.
        """
        if url is None:
            return
        response = urllib.request.urlopen(url)

        if response.getcode() != 200:
            return
        return response.read()
--------------------------------------------------------------------------------
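
The downloader is the only part of the spider that touches the network, and download() returns the raw response bytes rather than decoded text. Below is a minimal standalone sketch, not part of the repository, reusing the root URL from spider_main.py and assuming the page decodes as UTF-8:

# Hypothetical standalone use of HtmlDownloader (illustration only).
import html_downloader

downloader = html_downloader.HtmlDownloader()
content = downloader.download('https://baike.baidu.com/item/Python/407313?fr=aladdin')

if content is not None:
    # download() returns bytes; Baidu Baike pages are assumed to be UTF-8 here.
    print(content.decode('utf-8', errors='replace')[:200])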

/html_outputer.py:
--------------------------------------------------------------------------------
# coding=UTF-8


class HtmlOutputer:
    """Outputer module.
    Writes the data collected from the parser module out to an html file.
    """
    def __init__(self):
        """Outputer initialisation.
        Create a list that holds the collected data.
        """
        self.datas = []

    def collect_data(self, data):
        """Outputer interface method.
        Collect one piece of data and append it to the datas list.
        """
        if data is None:
            return
        self.datas.append(data)

    def output(self):
        """Outputer interface method.
        Write the entries in the datas list out as an html table.
        """
        # Open the file as UTF-8 and write the title/summary strings directly;
        # calling .encode('utf-8') here would make Python 3 write b'...' literals.
        fout = open('F:\\workspace\\tmp\\output.html', 'w', encoding='utf-8')
        fout.write('<html>\n')
        fout.write('<body>\n')
        fout.write('<table>\n')

        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'])
            fout.write('<td>%s</td>' % data['title'])
            fout.write('<td>%s</td>' % data['summary'])
            fout.write('</tr>')

        fout.write('</table>\n')
        fout.write('</body>\n')
        fout.write('</html>\n')
        fout.close()
--------------------------------------------------------------------------------

/url_manager.py:
--------------------------------------------------------------------------------
# coding=UTF-8


class UrlManager:
    """Manager module.
    Uses two sets to manage the urls waiting to be crawled and the urls
    that have already been crawled.
    """
    def __init__(self):
        """Manager initialisation.
        Create the to-crawl and already-crawled url sets.
        """
        self.new_urls = set()  # urls waiting to be crawled
        self.old_urls = set()  # urls already crawled

    def add_new_url(self, url):
        """Manager method.
        If the url is in neither the to-crawl set nor the crawled set,
        add it to the to-crawl set.
        """
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Manager interface method.
        Add a batch of urls, delegating each one to add_new_url.
        """
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        """Manager interface method.
        Return whether there are still urls waiting to be crawled.
        """
        return len(self.new_urls) != 0

    def get_new_url(self):
        """Manager interface method.
        Take one url from the to-crawl set, mark it as crawled and return it
        so that the Downloader module can fetch it.
        """
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
--------------------------------------------------------------------------------

/spider_main.py:
--------------------------------------------------------------------------------
# coding=UTF-8
import url_manager
import html_downloader
import html_parser
import html_outputer


class SpiderMain:
    def __init__(self):
        """Initialisation.
        Create instances of the four modules.
        """
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        """Main function of the crawler.
        Drives the main loop: fetch a url, parse it, queue the newly found
        urls, collect the parsed data, and stop after ten pages.
        """
        count = 1
        self.urls.add_new_url(root_url)

        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print("craw: {} : {}".format(count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)

                if count == 10:
                    break
                count += 1
            except Exception:
                print('craw failed!')
        self.outputer.output()


if __name__ == '__main__':
    # Root url: the starting point of the crawl.
    root_url = 'https://baike.baidu.com/item/Python/407313?fr=aladdin'
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
--------------------------------------------------------------------------------
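
Beautiful Soup is the only third-party dependency (installed with pip install beautifulsoup4), and the spider is normally started by running python spider_main.py. It can also be driven from another script; the sketch below is a hypothetical driver, not part of the repository, and the crawl still stops after ten pages and still writes to the hard-coded output path in html_outputer.py.

# Hypothetical driver script (illustration only, not part of the repo).
import spider_main

spider = spider_main.SpiderMain()
# Any Baidu Baike entry page can serve as the root; this is the same
# starting URL that spider_main.py uses.
spider.craw('https://baike.baidu.com/item/Python/407313?fr=aladdin')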

/html_parser.py:
--------------------------------------------------------------------------------
# coding=UTF-8
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin


class HtmlParser:
    """Parser module.
    Analyses the page content fetched by the downloader module.
    """
    def _get_new_urls(self, page_url, soup):
        """Parser method.
        Extract new urls from the current page and return them.
        """
        new_urls = set()
        # Links to other entries look like
        # <a href="/item/...">计算机程序设计语言</a>
        links = soup.find_all('a', href=re.compile(r"/item/[a-zA-Z0-9%]+"))
        for link in links:
            new_url = link['href']
            new_full_url = urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        """Parser method.
        Extract the page data (url, title, summary) from the current page
        and return it.
        """
        res_data = {}
        # url:
        res_data['url'] = page_url

        # Title markup:
        # <dd class="lemmaWgt-lemmaTitle-title">
        #     <h1>Python</h1>
        # </dd>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find('h1')

        res_data['title'] = title_node.get_text()

        # Summary markup: <div class="lemma-summary"> ... </div>
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        """Parser interface method.
        Call _get_new_urls and _get_new_data to obtain the links and the
        data from the current page_url.
        """
        if page_url is None or html_cont is None:
            return

        # Baidu Baike serves its pages as UTF-8; decoding them as GB2312
        # would garble the Chinese text.
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
--------------------------------------------------------------------------------
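
The parser's class selectors are the part most likely to break if Baidu Baike changes its markup, so it helps to exercise HtmlParser offline. The sketch below is a hypothetical check, not part of the repository; the HTML snippet is made up to mirror the selectors used in html_parser.py rather than copied from a real Baike page.

# Hypothetical offline check of HtmlParser (illustration only).
import html_parser

snippet = b"""
<html><body>
<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
<div class="lemma-summary">Python is a programming language.</div>
<a href="/item/Guido%20van%20Rossum">Guido van Rossum</a>
</body></html>
"""

parser = html_parser.HtmlParser()
new_urls, new_data = parser.parse('https://baike.baidu.com/item/Python/407313', snippet)
print(new_urls)             # absolute /item/... links found in the snippet
print(new_data['title'])    # Python
print(new_data['summary'])  # Python is a programming language.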