├── BaiduWenkuDownloads.exe ├── README.md └── main.py /BaiduWenkuDownloads.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/7hc/BaiduWenkuDownloads/ff20ec56ecb4b9b4b762f3d165b34a6a110cbd43/BaiduWenkuDownloads.exe -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BaiduWenkuDownloads 2 |
"""BaiduWenkuDownloads: download documents from Baidu Wenku.

From the project README: this is a download tool for paid Baidu Wenku
documents.

Usage:
    Run the program (or the bundled BaiduWenkuDownloads.exe), enter the URL
    of the document to download and press Enter.

Notes:
    * DOC, TXT, PPT and PDF documents are supported.
    * DOC and TXT documents are saved in TXT format.
    * PDF and PPT documents are saved in JPG format.
"""

import json
import os
import re
import sys
import time

import requests

# Per-request timeout in seconds, so a stalled connection cannot hang the
# program forever (requests waits indefinitely when no timeout is given).
REQUEST_TIMEOUT = 30


def _first_match(pattern, text, what):
    """Return group 1 of the first match of *pattern* in *text*.

    Raises:
        ValueError: if the pattern does not occur; *what* names the missing
            piece in the error message.
    """
    found = re.search(pattern, text)
    if found is None:
        raise ValueError("could not find " + what)
    return found.group(1)


def _doc_id(url):
    """Extract the document id from a ``.../view/<id>.html`` URL."""
    return _first_match(r'view/(.*?)\.html', url, "a document id in the URL")


def _iter_text_fragments(node):
    """Yield every string stored under a ``"c"`` key, depth-first, in order.

    Walking the decoded JSON directly avoids regex-parsing its ``repr``,
    which loses fragments containing quotes and leaves escape sequences
    such as ``\\t`` in the output.
    """
    if isinstance(node, dict):
        for key, value in node.items():
            if key == 'c' and isinstance(value, str):
                yield value
            else:
                yield from _iter_text_fragments(value)
    elif isinstance(node, list):
        for item in node:
            yield from _iter_text_fragments(item)


def _save_page_images(url, label):
    """Download every page image of the document at *url* into ``<doc_id>/``.

    Shared implementation of :func:`ppt` and :func:`pdf`; *label* is the
    document kind shown in the final message.
    """
    doc_id = _doc_id(url)
    # NOTE(review): the original code requested ``type=ppt`` for PDF
    # documents as well; kept as-is - confirm against the API.
    listing_url = "https://wenku.baidu.com/browse/getbcsurl?doc_id="+doc_id+"&pn=1&rn=99999&type=ppt"
    listing = requests.get(listing_url, timeout=REQUEST_TIMEOUT).text
    # The image URLs are backslash-escaped in the response; strip them.
    image_urls = [
        escaped.replace("\\", '')
        for escaped in re.findall('{"zoom":"(.*?)","page"', listing)
    ]
    # An already existing directory is fine; any other failure is reported
    # instead of being silently swallowed.
    os.makedirs(doc_id, exist_ok=True)
    for index, image_url in enumerate(image_urls):
        image = requests.get(image_url, timeout=REQUEST_TIMEOUT).content
        # os.path.join picks the right separator on every platform.
        target = os.path.join(doc_id, 'img' + str(index) + '.jpg')
        with open(target, 'wb') as out:
            out.write(image)
    print(label + "图片保存在" + doc_id + "文件夹")


def doc(url):
    """Save the text of a DOC document to ``<doc_id>.txt`` (appending).

    Every text fragment is also echoed to stdout while it is written.
    """
    doc_id = _doc_id(url)
    html = requests.get(url, timeout=REQUEST_TIMEOUT).text
    page_urls = re.findall('(https.*?0.json.*?)\\\\x22}', html)
    # Only the first half of the matches is used.
    # NOTE(review): presumably every page URL appears twice in the HTML -
    # confirm against a live page.
    page_urls = page_urls[:len(page_urls) // 2]

    filename = doc_id + '.txt'
    last_y = None  # "y" value of the previously written fragment
    with open(filename, 'a', encoding='utf-8') as f:
        for page_url in page_urls:
            page = requests.get(
                page_url.replace('\\', ''), timeout=REQUEST_TIMEOUT
            ).text
            fragments = re.findall('"c":"(.*?)".*?"y":(.*?),', page)
            for raw_text, y_pos in fragments:
                # NOTE(review): this round trip assumes the payload is ASCII
                # with \uXXXX escapes; literal non-ASCII characters would be
                # mangled - confirm.
                text = raw_text.encode('utf-8').decode('unicode_escape', 'ignore')
                print(text)
                # A changed "y" value starts a new output line.
                if y_pos != last_y:
                    last_y = y_pos
                    prefix = '\n'
                else:
                    prefix = ''
                f.write(prefix + text.replace('\\', ''))
    print("DOC文档保存在" + filename)


def ppt(url):
    """Save every slide of a PPT document as a JPG inside ``<doc_id>/``."""
    _save_page_images(url, "PPT")


def txt(url):
    """Save the text of a TXT document to ``<doc_id>.txt`` (appending).

    Raises:
        ValueError: if the document info response lacks an expected field.
    """
    doc_id = _doc_id(url)
    info_url = "https://wenku.baidu.com/api/doc/getdocinfo?callback=cb&doc_id="+doc_id
    info = requests.get(info_url, timeout=REQUEST_TIMEOUT).text
    md5 = _first_match('"md5sum":"(.*?)"', info, "md5sum in the document info")
    pn = _first_match('"totalPageNum":"(.*?)"', info, "totalPageNum in the document info")
    rsign = _first_match('"rsign":"(.*?)"', info, "rsign in the document info")
    # NOTE(review): md5 is appended right after "type=txt" with no separator;
    # presumably the field value already starts with its own "&md5sum=..."
    # prefix - confirm.
    text_url = 'https://wkretype.bdimg.com/retype/text/'+doc_id+'?rn='+pn+'&type=txt'+md5+'&rsign='+rsign
    payload = json.loads(requests.get(text_url, timeout=REQUEST_TIMEOUT).text)
    texts = list(_iter_text_fragments(payload))
    print(texts)
    filename = doc_id + '.txt'
    with open(filename, 'a', encoding='utf-8') as f:
        f.writelines(texts)
    print("TXT文档保存在" + filename)


def pdf(url):
    """Save every page of a PDF document as a JPG inside ``<doc_id>/``."""
    _save_page_images(url, "PDF")


def get_type(url):
    """Return the ``docType`` value embedded in the document page at *url*.

    Raises:
        ValueError: if the page does not contain a ``docType`` entry.
    """
    page = requests.get(url, timeout=REQUEST_TIMEOUT)
    return _first_match("'docType': '(.*?)',", page.text, "docType in the page")


# Choose the download function according to the document type.  An explicit
# table replaces eval() so text scraped from a remote page is never executed.
_HANDLERS = {
    'doc': doc,
    'ppt': ppt,
    'txt': txt,
    'pdf': pdf,
}


def main():
    """Prompt for a document URL, download it, then exit after 3 seconds."""
    url = input("URL:")
    doc_type = get_type(url)  # fetched once and reused below
    print("文档类型:" + doc_type)
    handler = _HANDLERS.get(doc_type)
    if handler is None:
        print("不支持的文档类型:" + doc_type)
    else:
        handler(url)
    print("3秒后退出")
    time.sleep(3)


if __name__ == "__main__":
    main()