├── README.md
└── 爬取百度文库文字.py

/README.md:
--------------------------------------------------------------------------------

# Baidu Wenku Crawler
Baidu Wenku Crawler scrapes the text content of documents hosted on Baidu Wenku.
Its main purpose is to extract the text of documents that Baidu restricts to paid-membership downloads.
Given the earlier Pandownload incident: this project makes no profit, has not been widely promoted, and provides no server-side API. It is nothing more than fully open-source code, so feel free to use it as you wish.

## Required Modules
beautifulsoup4 (imported as `bs4`)
requests

## Usage
Replace the `url` in the script with the Baidu Wenku document you want to crawl, then run the script. The extracted text is written to `baiduwenku.txt`.

--------------------------------------------------------------------------------
/爬取百度文库文字.py:
--------------------------------------------------------------------------------

# Project page: https://github.com/JackKing-defier/Baiduwenku
# @author: JackKing_defier

import requests
from bs4 import BeautifulSoup


def getHTMLText(url):
    # Fetch the page while pretending to be Baidu's own spider, so the server
    # returns the full document text instead of the member-only preview.
    kv = {'User-agent': 'Baiduspider'}
    try:
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ''


def findPList(html):
    # Collect the document title plus every line of text inside the reader container.
    plist = []
    soup = BeautifulSoup(html, "html.parser")
    plist.append(soup.title.string)
    for div in soup.find_all('div', attrs={"class": "bd doc-reader"}):
        plist.extend(div.get_text().split('\n'))

    # Strip full-width spaces and form-feed characters left over from the page layout.
    plist = [c.replace('　', '') for c in plist]
    plist = [c.replace('\x0c', '') for c in plist]
    return plist


def printPList(plist, path='baiduwenku.txt'):
    # Write the collected lines to a text file, one per line.
    with open(path, 'w', encoding='utf-8') as file:
        for line in plist:
            file.write(line)
            file.write('\n')


def main():
    url = 'https://wenku.baidu.com/view/515e88c36529647d2728529b.html'
    html = getHTMLText(url)
    plist = findPList(html)
    printPList(plist)


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
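
The script above hard-codes the target URL inside `main()`. As a possible adaptation, not part of the original project, the document URL and output filename could be read from the command line so the source does not need editing for each document. The sketch below assumes `getHTMLText`, `findPList`, and `printPList` from the script above are defined in the same file and would replace its `main()`; the fallback URL is the sample document from the original script, and the argument handling is purely illustrative.

```python
# Sketch only (not part of the original script): take the document URL and
# output path from the command line instead of editing the source each time.
# Assumes getHTMLText, findPList, and printPList are defined above.
import sys

def main():
    # Fall back to the sample document from the original script if no URL is given.
    doc_url = sys.argv[1] if len(sys.argv) > 1 else \
        'https://wenku.baidu.com/view/515e88c36529647d2728529b.html'
    out_path = sys.argv[2] if len(sys.argv) > 2 else 'baiduwenku.txt'

    page = getHTMLText(doc_url)
    if not page:
        print('Failed to download the page; check the URL or your network.')
        return
    printPList(findPList(page), path=out_path)

if __name__ == '__main__':
    main()
```

With this variant, a run might look like `python 爬取百度文库文字.py https://wenku.baidu.com/view/515e88c36529647d2728529b.html output.txt`, where `output.txt` is just an example output filename.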