"""downloadPPT.py - save a Docin (docin.com) document as a single PDF.

The script looks up the document title and its page-image endpoint,
downloads every page as a PNG, merges the PNGs into one PDF and finally
deletes the intermediate images.

The third-party packages (requests, lxml, Pillow, fpdf) are imported inside
the functions that use them, so the module itself can be imported with only
the standard library available.
"""
import os
import re

# Prefix the script saves into. It is prepended verbatim to every file
# name, so it must end with a path separator.
DEFAULT_SAVE_PATH = 'E:\\test\\Docin\\'

# Browser-like User-Agent sent with every page-image request.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
)

# Response body that marks "page number is past the end of the document".
END_OF_DOCUMENT = b'sid error or Invalid!'

# Seconds to wait for the server before a request is abandoned.
REQUEST_TIMEOUT = 30


def _page_file(path, pagenum):
    """Return the PNG file name for page *pagenum* under the prefix *path*."""
    return path + str(pagenum) + '.png'


def _safe_filename(title):
    """Replace characters that are not allowed in file names with ``_``."""
    cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title).strip()
    return cleaned or 'docin'


def getTiltleUrl(originUrl):
    """Return ``(title, url)`` for the Docin document page at *originUrl*.

    *title* is the text of the ``doc_title`` span, or the page ``<title>``
    when that span is absent. *url* is the page-image URL ending in an empty
    ``pageno=`` parameter; append a 1-based page number to fetch one page.

    Raises:
        ValueError: if no title, no document id in *originUrl*, or no
            ``flash_param_hzq`` token in the page can be found.
    """
    import requests
    from lxml import etree

    html = etree.HTML(requests.get(originUrl, timeout=REQUEST_TIMEOUT).text)
    theHTML = etree.tostring(html).decode('utf-8')

    # xpath() returns a list of text nodes; take the first node in both the
    # primary and the fallback lookup so callers always receive a str.
    titles = (html.xpath('//span[@class="doc_title fs_c_76"]/text()')
              or html.xpath('//title/text()'))
    if not titles:
        raise ValueError('no title found in page: ' + originUrl)
    title = titles[0]

    # The document id is the number between "-" and "." (e.g. p-456842624.html).
    id_match = re.search(r'-(\d+)\.', originUrl)
    if id_match is None:
        raise ValueError('no document id found in url: ' + originUrl)

    # The page source carries the token that the image endpoint takes as sid.
    sid_match = re.search(r'flash_param_hzq:"([\w*-]+)"', theHTML)
    if sid_match is None:
        raise ValueError('flash_param_hzq not found in page: ' + originUrl)

    url = ('https://docimg1.docin.com/docinpic.jsp?file=' + id_match.group(1)
           + '&width=1000&sid=' + sid_match.group(1) + '&pcimg=1&pageno=')
    return title, url


def getPictures(theurl, path):
    """Download every page image to ``<path><n>.png`` and return the page count.

    Pages are requested from 1 upwards until the server answers with the
    end-of-document marker. Each file is re-saved through Pillow so that it
    is a standard PNG regardless of what the server sent.
    """
    import requests
    from PIL import Image

    # Create the target folder up front instead of failing on the first write.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    headers = {"User-Agent": USER_AGENT}
    pagenum = 1
    while True:
        print('Downloading picture ' + str(pagenum))
        img_req = requests.get(url=theurl + str(pagenum), headers=headers,
                               timeout=REQUEST_TIMEOUT)
        if img_req.content == END_OF_DOCUMENT:
            allNum = pagenum - 1
            print('Downloading finished, the count of all pictures is ' + str(allNum))
            return allNum

        file_name = _page_file(path, pagenum)
        with open(file_name, 'wb') as f:
            f.write(img_req.content)
        # Normalise to a standard PNG; fpdf rejects some of the raw files.
        with Image.open(file_name) as im:
            im.save(file_name)
        pagenum += 1


def combinePictures2Pdf(path, pdfName, allNum):
    """Merge ``<path>1.png`` .. ``<path><allNum>.png`` into the PDF *pdfName*.

    The page size of the PDF is taken from the first image. Merging is
    best-effort: if a page cannot be added the error is printed and the PDF
    is written with the pages collected so far.

    Raises:
        ValueError: if *allNum* is smaller than 1 (nothing was downloaded).
    """
    from fpdf import FPDF
    from PIL import Image

    if allNum < 1:
        raise ValueError('no pages were downloaded, nothing to combine')

    print('Start combining the pictures...')
    with Image.open(_page_file(path, 1)) as cover:
        width, height = cover.size
    pdf = FPDF(unit="pt", format=[width, height])
    for pagenum in range(1, allNum + 1):
        try:
            print('combining picture ' + str(pagenum))
            pdf.add_page()
            pdf.image(_page_file(path, pagenum), 0, 0)
        except Exception as e:
            print(e)
            break
    pdf.output(pdfName, "F")


def removePictures(path, allNum):
    """Delete the intermediate images ``<path>1.png`` .. ``<path><allNum>.png``.

    A file that cannot be removed is reported and skipped, so one missing
    page does not leave all later pages behind.
    """
    for pagenum in range(1, allNum + 1):
        print('deleting picture ' + str(pagenum))
        try:
            os.remove(_page_file(path, pagenum))
        except OSError as e:
            print(e)


def main():
    """Ask for a document URL and save that document under DEFAULT_SAVE_PATH."""
    path = DEFAULT_SAVE_PATH
    originUrl = input('input the url: ')
    title, url = getTiltleUrl(originUrl)
    # Keep the part before the first "." and make it usable as a file name.
    title = _safe_filename(title.split('.')[0])
    print(title, url)
    allNum = getPictures(url, path)
    pdfName = path + title + '.pdf'
    combinePictures2Pdf(path, pdfName, allNum)
    removePictures(path, allNum)


if __name__ == '__main__':
    main()
# GUI/docs_download.py - widget layout produced from docs_download.ui.
from PyQt5 import QtCore, QtGui, QtWidgets


class Ui_DocsDownload(object):
    """Widget layout of the document downloader window.

    ``setupUi`` creates the child widgets as attributes of this object and
    ``retranslateUi`` assigns their visible texts.
    """

    def setupUi(self, DocsDownload):
        """Create, position and name every child widget of *DocsDownload*."""

        def place(widget, name, x, y, w, h, style=None):
            # Geometry first, then the optional style sheet, then the
            # object name - the same order for every widget.
            widget.setGeometry(QtCore.QRect(x, y, w, h))
            if style is not None:
                widget.setStyleSheet(style)
            widget.setObjectName(name)
            return widget

        # Window itself: name, size, icon and (empty) accessibility texts.
        DocsDownload.setObjectName("DocsDownload")
        DocsDownload.resize(801, 600)
        window_icon = QtGui.QIcon()
        window_icon.addPixmap(
            QtGui.QPixmap("C:/Users/Dubito/Desktop/1634396985874.jpeg"),
            QtGui.QIcon.Normal,
            QtGui.QIcon.Off,
        )
        DocsDownload.setWindowIcon(window_icon)
        DocsDownload.setAccessibleName("")
        DocsDownload.setAccessibleDescription("")

        # Document URL row.
        self.label = place(
            QtWidgets.QLabel(DocsDownload), "label", 10, 50, 111, 30)
        self.lineEdit = place(
            QtWidgets.QLineEdit(DocsDownload), "lineEdit", 130, 50, 651, 30,
            style="")

        # Save-folder row with its folder picker button.
        self.label_2 = place(
            QtWidgets.QLabel(DocsDownload), "label_2", 10, 90, 111, 30)
        self.lineEdit_2 = place(
            QtWidgets.QLineEdit(DocsDownload), "lineEdit_2", 130, 90, 481, 30)
        self.toolButton_2 = place(
            QtWidgets.QToolButton(DocsDownload), "toolButton_2",
            630, 90, 151, 30)

        # Action buttons.
        self.toolButton_5 = place(
            QtWidgets.QToolButton(DocsDownload), "toolButton_5",
            400, 150, 130, 30)
        self.toolButton_3 = place(
            QtWidgets.QToolButton(DocsDownload), "toolButton_3",
            250, 150, 130, 30,
            style="background-color: rgb(85, 170, 255);\n"
                  "color: rgb(255, 255, 255);\n"
                  "border: 0;")

        # Log area.
        self.textBrowser = place(
            QtWidgets.QTextBrowser(DocsDownload), "textBrowser",
            0, 210, 801, 391)

        # Source-site selection row.
        self.label_3 = place(
            QtWidgets.QLabel(DocsDownload), "label_3", 10, 10, 111, 30)
        self.radioButton = place(
            QtWidgets.QRadioButton(DocsDownload), "radioButton",
            130, 10, 75, 30)
        self.radioButton_2 = place(
            QtWidgets.QRadioButton(DocsDownload), "radioButton_2",
            230, 10, 101, 30)

        self.retranslateUi(DocsDownload)
        QtCore.QMetaObject.connectSlotsByName(DocsDownload)

    def retranslateUi(self, DocsDownload):
        """Set the window title and the text of every labelled widget."""
        _translate = QtCore.QCoreApplication.translate
        DocsDownload.setWindowTitle(
            _translate("DocsDownload", "文献一键下载器 - By: Dubito"))
        self.label.setText(_translate("DocsDownload", "文献资料链接:"))
        self.label_2.setText(_translate("DocsDownload", "文件保存位置:"))
        self.toolButton_2.setText(_translate("DocsDownload", "选择文件夹"))
        self.toolButton_5.setText(_translate("DocsDownload", "查看源码"))
        self.toolButton_3.setText(_translate("DocsDownload", "开始下载"))
        self.label_3.setText(_translate("DocsDownload", "文献下载网站:"))
        self.radioButton.setText(_translate("DocsDownload", "豆丁网"))
        self.radioButton_2.setText(_translate("DocsDownload", "道客巴巴"))
"""GUI/docs_downloader.py - PyQt5 front end that saves a Docin document as PDF."""
import os
import re
import sys
import webbrowser

import requests
from fpdf import FPDF
from lxml import etree
from PIL import Image
from PyQt5 import QtWidgets
from PyQt5.QtWidgets import QFileDialog

from docs_download import Ui_DocsDownload


class Docin_download:
    """Download one Docin document into a folder, logging to a text widget."""

    # Browser-like User-Agent sent with every page-image request.
    USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
    )
    # Response body that marks "page number is past the end of the document".
    END_OF_DOCUMENT = b'sid error or Invalid!'
    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 30

    def __init__(self, path, originUrl, QTextBrowser):
        """Store the download settings.

        Args:
            path: prefix prepended verbatim to every file name, so it must
                end with a path separator.
            originUrl: address of the Docin document page.
            QTextBrowser: widget that receives the progress messages.
        """
        self.path = path
        self.originUrl = originUrl
        self.QTextBrowser = QTextBrowser

    @staticmethod
    def _page_file(path, pagenum):
        """Return the PNG file name for page *pagenum* under prefix *path*."""
        return path + str(pagenum) + '.png'

    @staticmethod
    def _safe_filename(title):
        """Replace characters that are not allowed in file names with ``_``."""
        cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title).strip()
        return cleaned or 'docin'

    def add_text(self, added_text):
        """Append a message to the log widget and keep the view at the end.

        *added_text* is converted with ``str`` so exceptions can be logged
        directly.
        """
        self.QTextBrowser.append(str(added_text))
        # Scroll the text box to the bottom.
        self.QTextBrowser.moveCursor(self.QTextBrowser.textCursor().End)
        # Let Qt repaint while the download runs in the GUI thread.
        QtWidgets.QApplication.processEvents()

    def getTiltleUrl(self, originUrl):
        """Return ``(title, url)`` for the Docin document page at *originUrl*.

        *title* is the text of the ``doc_title`` span, or the page
        ``<title>`` when that span is absent. *url* is the page-image URL
        ending in an empty ``pageno=`` parameter.

        Raises:
            ValueError: if no title, no document id in *originUrl*, or no
                ``flash_param_hzq`` token in the page can be found.
        """
        html = etree.HTML(
            requests.get(originUrl, timeout=self.REQUEST_TIMEOUT).text)
        theHTML = etree.tostring(html).decode('utf-8')

        # xpath() returns a list of text nodes; take the first node in both
        # lookups so callers always receive a str.
        titles = (html.xpath('//span[@class="doc_title fs_c_76"]/text()')
                  or html.xpath('//title/text()'))
        if not titles:
            raise ValueError('no title found in page: ' + originUrl)
        title = titles[0]

        # The document id is the number between "-" and "." in the address.
        id_match = re.search(r'-(\d+)\.', originUrl)
        if id_match is None:
            raise ValueError('no document id found in url: ' + originUrl)

        # The page source carries the token the image endpoint takes as sid.
        sid_match = re.search(r'flash_param_hzq:"([\w*-]+)"', theHTML)
        if sid_match is None:
            raise ValueError('flash_param_hzq not found in page: ' + originUrl)

        url = ('https://docimg1.docin.com/docinpic.jsp?file='
               + id_match.group(1) + '&width=1000&sid=' + sid_match.group(1)
               + '&pcimg=1&pageno=')
        return title, url

    def getPictures(self, theurl, path):
        """Download every page image to ``<path><n>.png``; return the page count.

        Pages are requested from 1 upwards until the server answers with the
        end-of-document marker. A progress line is logged before each request.
        """
        # Create the target folder up front instead of failing on first write.
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        headers = {"User-Agent": self.USER_AGENT}
        pagenum = 1
        while True:
            self.add_text('正在下载第' + str(pagenum) + '页...')
            img_req = requests.get(url=theurl + str(pagenum), headers=headers,
                                   timeout=self.REQUEST_TIMEOUT)
            if img_req.content == self.END_OF_DOCUMENT:
                allNum = pagenum - 1
                self.add_text('下载结束,文档共有' + str(allNum) + '页')
                return allNum

            file_name = self._page_file(path, pagenum)
            with open(file_name, 'wb') as f:
                f.write(img_req.content)
            # Re-save through Pillow so the file is a standard PNG.
            with Image.open(file_name) as im:
                im.save(file_name)
            pagenum += 1

    def combinePictures2Pdf(self, path, pdfName, allNum):
        """Merge ``<path>1.png`` .. ``<path><allNum>.png`` into *pdfName*.

        The PDF page size is taken from the first image. Merging is
        best-effort: if a page cannot be added the error is logged and the
        PDF is written with the pages collected so far.

        Raises:
            ValueError: if *allNum* is smaller than 1.
        """
        if allNum < 1:
            raise ValueError('no pages were downloaded, nothing to combine')

        self.add_text('开始合并页面为PDF:')
        with Image.open(self._page_file(path, 1)) as cover:
            width, height = cover.size
        pdf = FPDF(unit="pt", format=[width, height])
        for pagenum in range(1, allNum + 1):
            try:
                self.add_text('正在合并页面' + str(pagenum))
                pdf.add_page()
                pdf.image(self._page_file(path, pagenum), 0, 0)
            except Exception as e:
                self.add_text(e)
                break
        pdf.output(pdfName, "F")

    def removePictures(self, path, allNum):
        """Delete the intermediate page images.

        A file that cannot be removed is logged and skipped, so one missing
        page does not leave all later pages behind.
        """
        for pagenum in range(1, allNum + 1):
            self.add_text('删除页面源文件' + str(pagenum))
            try:
                os.remove(self._page_file(path, pagenum))
            except OSError as e:
                self.add_text(e)

    def docin_download(self):
        """Run the whole pipeline: look up, download, merge, clean up."""
        title, url = self.getTiltleUrl(self.originUrl)
        # Keep the part before the first "." and make it a valid file name.
        title = self._safe_filename(title.split('.')[0])
        self.add_text("文档名:" + str(title))
        allNum = self.getPictures(url, self.path)
        # Write the PDF into the chosen folder, next to the page images.
        pdfName = self.path + title + '.pdf'
        self.combinePictures2Pdf(self.path, pdfName, allNum)
        self.removePictures(self.path, allNum)
        self.add_text("文献下载完成!请到设定的保存目录处查看🍉")


class mywindow(QtWidgets.QWidget, Ui_DocsDownload):
    """Main window: wires the generated layout to the download logic."""

    def __init__(self):
        super().__init__()
        self.setupUi(self)
        self.radioButton.setChecked(True)
        self.toolButton_2.clicked.connect(self.choose_folder)
        self.toolButton_3.clicked.connect(self.start_download)
        self.toolButton_5.clicked.connect(self.get_source_code)

    def add_text(self, added_text):
        """Append a message to the log area and keep the view at the end."""
        self.textBrowser.append(str(added_text))
        # Scroll the text box to the bottom.
        self.textBrowser.moveCursor(self.textBrowser.textCursor().End)
        QtWidgets.QApplication.processEvents()

    def choose_folder(self):
        """Let the user pick the save folder and show it in the path field."""
        foldername = QFileDialog.getExistingDirectory(self, "选取文件夹", "C:/")
        # An empty result means the dialog was cancelled; keep the old value.
        if foldername:
            self.lineEdit_2.setText(foldername)

    def get_source_code(self):
        """Open the project's GitHub page in the default browser."""
        url = "https://github.com/you8023/downloadDocin"
        webbrowser.open_new_tab(url)

    def start_download(self):
        """Validate the form and run the download for the selected site."""
        save_folder = self.lineEdit_2.text()  # folder the result is saved in
        originUrl = self.lineEdit.text()  # document page address
        if not (save_folder and originUrl):
            self.add_text("请检查上面的参数是否填写完整!")
            return
        self.add_text("程序已开始运行,请稍等...")

        if self.radioButton.isChecked():
            if re.search(r'https://(www\.)?docin\.com', originUrl) is None:
                self.add_text("请输入正确的豆丁网文献网址!")
                return
            # The downloader prepends the folder verbatim to file names.
            save_folder = save_folder.rstrip('/') + '/'
            downloader = Docin_download(save_folder, originUrl, self.textBrowser)
            try:
                downloader.docin_download()
            except Exception as e:
                # Report the failure in the log instead of letting the
                # exception escape from the button slot.
                self.add_text("下载失败:" + str(e))
        elif self.radioButton_2.isChecked():
            self.add_text("道客巴巴文档源正在集成中,请等待软件更新😋")
        else:
            self.add_text("请选择下载资料的网站👻")


if __name__ == "__main__":
    app = QtWidgets.QApplication(sys.argv)
    ui = mywindow()
    ui.show()
    sys.exit(app.exec_())
安装Python库 28 | 键盘同时按下`win`+`R`,在弹出的对话框中输入`cmd`按回车 29 | 在弹出的黑框中输入命令安装python库: 30 | ``` 31 | pip install lxml 32 | pip install fpdf 33 | pip install requests 34 | ``` 35 | 至此,环境搭建完毕 36 | 37 | ## 分析&设计 38 | 在编写代码前,需要对需求及网页进行分析,明确我们需要的东西的位置 39 | 40 | ### 需求分析 41 | 首先明确需求,我们需要将豆丁网上我们需要的资料爬取下来,通过对页面元素进行观察,发现上面的资料,无论何种格式,均是以图片的形式进行展示,因此,考虑将其保存为pdf以方便查看。提取的输入输出如下: 42 | 43 | * 输入:所需资料的网址 44 | * 输出:资料的pdf文件,其中包含: 45 | * 资料的每张图片 46 | * 资料标题 47 | 48 | ### 页面分析 49 | #### 页面标题 50 | 打开想要的资料的网页,这里以[这个网页](https://www.docin.com/p-456842624.html)为例,首先,鼠标右键检查,找到标题元素: 51 | 52 | ![标题元素](https://upload-images.jianshu.io/upload_images/5714082-979f491a36c290e4.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 53 | 54 | 可以发现,标题所属的为`span`标签,class为`doc_title fs_c_76`,因此,可以据此定位标题所在 55 | 56 | #### 页面资料内容 57 | 再以此方法找到资料所属的标签: 58 | 59 | ![网页元素](https://upload-images.jianshu.io/upload_images/5714082-7c2b9fa4dd351d65.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 60 | 61 | 发现网页的资料均是以图片形式显示,因此,目标确定,我们仅需要找到这个链接即可 62 | 63 | 复制链接,直接用浏览器打开,发现果然就是我们要找的这张图片 64 | 65 | 接着,分析链接`https://docimg1.docin.com/docinpic.jsp?file=456842624&width=942&sid=LE-fLs-BXk4g4mtVLS2D8apgA9Z5X3NNeoZbh0mHZkW*C1Zz1LvKe8xey1BsO1BG&pageno=1&pcimg=1`,发现其中有三个关键字段,分别是: 66 | 67 | * file 68 | * sid 69 | * pageno 70 | 71 | 其中,经过观察,发现`file`字段的数字和网页链接`https://www.docin.com/p-456842624.html`的数字一致; 72 | 73 | `pageno`字段则是第几页; 74 | 75 | 而`sid`经过打开另一个资料的网页测试,发现不同的资料具有不同的`sid`,观察其编码,没发现规律,最终,经过仔细分析,在`source`页面源码中找到了一个关键字段`flash_param_hzq`: 76 | 77 | ![flash_param_hzq字段的发现](https://upload-images.jianshu.io/upload_images/5714082-6173e3a5c0281b38.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240) 78 | 79 | 虽然和`sid`字段并不完全一致,但使用该字段作为`sid`也能得到图片,因此,提取该网页的`flash_param_hzq`字段即可; 80 | 81 | ## 代码实现 82 | 83 | 页面分析完毕,即可开始编写代码。 84 | 85 | ### 获取资料标题和内容的通用url 86 | 87 | 首先,使用`requests`获取网页内容,并使用`lxml`将其封装为一个HTML对象方便后续解析: 88 | 89 | ```python 90 | html = etree.HTML(requests.get(originUrl).text) 91 | ``` 92 | 93 | 然后使用`xpath`提取标题: 94 | 95 | 
```python 96 | title = html.xpath('//span[@class="doc_title fs_c_76"]/text()')[0] 97 | ``` 98 | 99 | 其中,`//span`表示匹配任意`span`标签,使用`[@class=""]`匹配`class`属性,使用`/text()`提取标签内的内容,由于返回的内容为一个元祖,因此使用`[0]`取第一个元素 100 | 101 | 使用正则表达式匹配`file`字段: 102 | 103 | ```python 104 | fileId = re.findall('\-\d+\.',originUrl)[0][1:-1] 105 | ``` 106 | 107 | 其中,`\d`代表匹配数字,`+`表示匹配一次或多次,`[1:-1]`表示取结果的第二个字符到倒数第二个字符 108 | 109 | 将HTML对象转为字符串: 110 | 111 | ```python 112 | theHTML = etree.tostring(html).decode('utf-8') 113 | ``` 114 | 115 | 使用正则表达式匹配`flash_param_hzq`字段: 116 | 117 | ```python 118 | sid = re.findall('flash_param_hzq:\"[\w\*\-]+\"', theHTML)[0][17:-1] 119 | ``` 120 | 121 | 其中,`\w`表示匹配数字或字母 122 | 123 | 至此,该部分函数书写完毕,完整代码为: 124 | 125 | ```python 126 | def getTiltleUrl(originUrl): 127 | html = etree.HTML(requests.get(originUrl).text) 128 | theHTML = etree.tostring(html).decode('utf-8') 129 | # print(theHTML) 130 | try: 131 | title = html.xpath('//span[@class="doc_title fs_c_76"]/text()')[0] 132 | except: 133 | title = html.xpath('//title/text()') 134 | fileId = re.findall('\-\d+\.',originUrl)[0][1:-1] 135 | 136 | sid = re.findall('flash_param_hzq:\"[\w\*\-]+\"', theHTML)[0][17:-1] 137 | url = 'https://docimg1.docin.com/docinpic.jsp?file=' + fileId + '&width=1000&sid=' + sid + '&pcimg=1&pageno=' 138 | return title, url 139 | ``` 140 | 141 | ### 获取图片 142 | 143 | 通过上面的函数获取到通用的链接后,仅需要更改`pageno`字段即可获取所有图片,使用`requests`获取到图片后,直接将文件流写入到文件中即可。但在后续代码运行过程中发现图片格式报错,因此,使用`PIL`标准化图片。完整代码如下: 144 | 145 | ```python 146 | def getPictures(theurl, path): 147 | pagenum = 1 148 | headers = { 149 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36" 150 | } 151 | allNum = 0 152 | while pagenum>0: 153 | # time.sleep(3*random.random()) 154 | print('Downloading picture ' + str(pagenum)) 155 | url = theurl + str(pagenum) 156 | img_req = requests.get(url=url, headers=headers) 157 | if img_req.content==b'sid error or Invalid!': 158 | allNum = 
pagenum-1 159 | print('Downloading finished, the count of all pictures is ' + str(allNum)) 160 | pagenum = -1 161 | break; 162 | file_name = path + str(pagenum) + '.png' 163 | f = open(file_name, 'wb') 164 | f.write(img_req.content) 165 | f.close() 166 | # 将图片保存为标准png格式 167 | im = Image.open(file_name) 168 | im.save(file_name) 169 | pagenum += 1 170 | return allNum 171 | ``` 172 | 173 | ### 合并图片为pdf 174 | 175 | 这里主要使用`fpdf`库合并图片为pdf,代码如下: 176 | 177 | ```python 178 | def combinePictures2Pdf(path, pdfName, allNum): 179 | print('Start combining the pictures...') 180 | pagenum = 1 181 | file_name = path + str(pagenum) + '.png' 182 | cover = Image.open(file_name) 183 | width, height = cover.size 184 | pdf = FPDF(unit = "pt", format = [width, height]) 185 | while allNum>=pagenum: 186 | try: 187 | print('combining picture ' + str(pagenum)) 188 | file_name = path + str(pagenum) + '.png' 189 | pdf.add_page() 190 | pdf.image(file_name, 0, 0) 191 | pagenum += 1 192 | except Exception as e: 193 | print(e) 194 | break; 195 | pdf.output(pdfName, "F") 196 | ``` 197 | 198 | 其中: 199 | 200 | * `pdf = FPDF(unit = "pt", format = [width, height])`规定了pdf的尺寸 201 | * `pdf.add_page()`将为pdf添加一张空白页面 202 | * `pdf.image(file_name, 0, 0)`则是将图片绘制到该空白页面上,后两个参数为绘制的起始xy坐标 203 | * `pdf.output(pdfName, "F")`则是生成pdf文件 204 | 205 | ### 删除原图片 206 | 207 | pdf生成完毕之后,之前保存到本地的图片就没有用武之地了,这时需要删去所有图片,删去某个文件的语句为: 208 | 209 | ```python 210 | os.remove(file_name) 211 | ``` 212 | 213 | 完整代码如下: 214 | 215 | ```python 216 | def removePictures(path, allNum): 217 | pagenum = 1 218 | while allNum>=pagenum: 219 | try: 220 | print('deleting picture ' + str(pagenum)) 221 | file_name = path + str(pagenum) + '.png' 222 | os.remove(file_name) 223 | pagenum += 1 224 | except Exception as e: 225 | print(e) 226 | break; 227 | ``` 228 | 229 | ### 主函数 230 | 231 | 最后,书写语句依次调用函数,自动化下载图片,合并为pdf,并删去原文件: 232 | 233 | ```python 234 | if __name__ == '__main__': 235 | path = 'E:\\test\\Docin\\' 236 | # originUrl = 
'https://www.docin.com/p-977106193.html?docfrom=rrela' 237 | originUrl = input('input the url: ') 238 | result = getTiltleUrl(originUrl) 239 | title = result[0].split('.')[0] 240 | url = result[1] 241 | print(title, url) 242 | allNum = getPictures(url, path) 243 | pdfName = path + title + '.pdf' 244 | combinePictures2Pdf(path, pdfName, allNum) 245 | removePictures(path, allNum) 246 | ``` 247 | 248 | 其中: 249 | 250 | * `path`为保存文件的路径,注意,必须为转义`\`后的绝对路径 251 | * 原网址使用`input`函数以让用户自行输入 252 | 253 | ## 代码使用 254 | 255 | 源码在[GitHub](https://github.com/you8023/downloadDocin)上,直接下载即可使用。 256 | 257 | 1. 进入到源码所在的文件夹,使用编辑器(记事本亦可)打开源码文件,修改最下方`main`函数中的`path`路径为你想要保存文件的路径 258 | 2. 在文件所在的文件夹的地址栏输入`cmd`,按下回车,在出现的黑框中输入以下命令: 259 | ```python 260 | python downloadPPT.py 261 | ``` 262 | 3. 按下回车,当出现提示语句时输入网址,按下回车 263 | 4. 静等程序跑完即可,下载的资料在第一步输入的路径里面 264 | 265 | 如图所示: 266 | ![豆丁网资料自动下载效果展示](https://upload-images.jianshu.io/upload_images/5714082-ebcd2d95e82f1869.gif?imageMogr2/auto-orient/strip) 267 | --------------------------------------------------------------------------------