├── downloadPPT.py
├── GUI
    ├── docs_download.py
    ├── docs_download.ui
    └── docs_downloader.py
└── README.md


/downloadPPT.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from fpdf import FPDF
 3 | from PIL import Image
 4 | from lxml import etree
 5 | import re, time, random, os
 6 | 
 7 | def getTiltleUrl(originUrl):
 8 | 	# 获取资料的标题和通用的url链接
 9 | 	html = etree.HTML(requests.get(originUrl).text)
10 | 	theHTML = etree.tostring(html).decode('utf-8')
11 | 	# print(theHTML)
12 | 	try:
13 | 		title = html.xpath('//span[@class="doc_title fs_c_76"]/text()')[0]
14 | 	except:
15 | 		title = html.xpath('//title/text()')
16 | 	fileId = re.findall('\-\d+\.',originUrl)[0][1:-1]
17 | 
18 | 	sid = re.findall('flash_param_hzq:\"[\w\*\-]+\"', theHTML)[0][17:-1]
19 | 	url = 'https://docimg1.docin.com/docinpic.jsp?file=' + fileId + '&width=1000&sid=' + sid + '&pcimg=1&pageno='
20 | 	return title, url
21 | 
def getPictures(theurl, path):
    """Download every page image of a document into ``path``.

    Pages are requested one by one, starting at 1, until the server answers
    with the literal body ``sid error or Invalid!``. Each page is written to
    ``<path><n>.png`` and then re-saved through PIL so the file on disk is a
    standard PNG.

    Args:
        theurl: image URL prefix; the page number is appended to it.
        path: directory prefix for the output files. File names are appended
            directly, so it must end with a path separator.

    Returns:
        The number of pages downloaded.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
    }
    pagenum = 1
    while True:
        print('Downloading picture ' + str(pagenum))
        # A timeout keeps one stalled request from blocking the whole run.
        img_req = requests.get(url=theurl + str(pagenum), headers=headers, timeout=30)
        if img_req.content == b'sid error or Invalid!':
            # The server's marker for "no such page": the previous page was the last.
            break
        file_name = path + str(pagenum) + '.png'
        with open(file_name, 'wb') as f:
            f.write(img_req.content)
        # Re-save the picture as a standard PNG; load() reads the pixel data
        # into memory before the same file is overwritten.
        with Image.open(file_name) as im:
            im.load()
            im.save(file_name)
        pagenum += 1

    allNum = pagenum - 1
    print('Downloading finished, the count of all pictures is ' + str(allNum))
    return allNum
48 | 
def combinePictures2Pdf(path, pdfName, allNum):
    """Merge the downloaded page images into a single PDF.

    The page size of the PDF (in points) is taken from the pixel size of the
    first image, ``<path>1.png``.

    Args:
        path: directory prefix holding ``1.png`` .. ``<allNum>.png``; file
            names are appended directly, so it must end with a separator.
        pdfName: path of the PDF file to write.
        allNum: number of page images to merge.

    Returns:
        None. When ``allNum`` is less than 1 nothing is written.
    """
    if allNum < 1:
        # There is no 1.png to read the page size from, so opening it would fail.
        print('No pictures to combine, skipping PDF generation.')
        return

    print('Start combining the pictures...')
    with Image.open(path + '1.png') as cover:
        width, height = cover.size
    pdf = FPDF(unit="pt", format=[width, height])
    for pagenum in range(1, allNum + 1):
        try:
            print('combining picture ' + str(pagenum))
            file_name = path + str(pagenum) + '.png'
            pdf.add_page()
            pdf.image(file_name, 0, 0)
        except Exception as e:
            # Best effort: report the problem and keep the pages merged so far.
            print(e)
            break
    pdf.output(pdfName, "F")
68 | 
def removePictures(path, allNum):
    """Delete the temporary page images ``1.png`` .. ``<allNum>.png``.

    ``path`` is used as a plain prefix in front of each file name. Deletion
    stops at the first file that cannot be removed; the error is printed and
    the remaining files are left in place.
    """
    for index in range(1, allNum + 1):
        try:
            print(f'deleting picture {index}')
            os.remove(f'{path}{index}.png')
        except Exception as err:
            print(err)
            break
81 | 
if __name__ == '__main__':
    # Directory the PDF and the temporary page images are written to. File
    # names are appended to it directly, so keep the trailing separator.
    path = 'E:\\test\\Docin\\'
    # Create the output directory up front so the first image write cannot
    # fail just because the folder does not exist yet.
    os.makedirs(path, exist_ok=True)

    # URL of the wanted document, e.g.
    # https://www.docin.com/p-977106193.html?docfrom=rrela
    originUrl = input('input the url: ').strip()
    result = getTiltleUrl(originUrl)
    # Drop everything after the first dot (typically the original extension).
    title = result[0].split('.')[0]
    # Replace characters that are not allowed in file names so the title
    # cannot break the PDF path.
    title = re.sub(r'[\\/:*?"<>|]', '_', title).strip() or 'docin'
    url = result[1]
    print(title, url)

    allNum = getPictures(url, path)
    pdfName = path + title + '.pdf'
    combinePictures2Pdf(path, pdfName, allNum)
    removePictures(path, allNum)
96 | 


--------------------------------------------------------------------------------
/GUI/docs_download.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Form implementation generated from reading ui file 'docs_download.ui'
 4 | #
 5 | # Created by: PyQt5 UI code generator 5.15.1
 6 | #
 7 | # WARNING: Any manual changes made to this file will be lost when pyuic5 is
 8 | # run again.  Do not edit this file unless you know what you are doing.
 9 | 
10 | 
11 | from PyQt5 import QtCore, QtGui, QtWidgets
12 | 
13 | 
class Ui_DocsDownload(object):
    """Widget layout for the downloader window, generated by pyuic5.

    The code is produced from 'docs_download.ui'; per the file header, manual
    changes are lost when pyuic5 is run again. Every child widget is created
    with a fixed geometry and stored as an attribute on this object.
    """

    def setupUi(self, DocsDownload):
        """Create all child widgets on ``DocsDownload`` and apply their texts.

        ``DocsDownload`` is the top-level widget that becomes the parent of
        every widget created here.
        """
        DocsDownload.setObjectName("DocsDownload")
        DocsDownload.resize(801, 600)
        # NOTE(review): the window icon is read from an absolute path on the
        # author's desktop; presumably it will not load on other machines.
        icon = QtGui.QIcon()
        icon.addPixmap(QtGui.QPixmap("C:/Users/Dubito/Desktop/1634396985874.jpeg"), QtGui.QIcon.Normal, QtGui.QIcon.Off)
        DocsDownload.setWindowIcon(icon)
        DocsDownload.setAccessibleName("")
        DocsDownload.setAccessibleDescription("")
        # Second row: caption label and line edit (texts are set in retranslateUi).
        self.label = QtWidgets.QLabel(DocsDownload)
        self.label.setGeometry(QtCore.QRect(10, 50, 111, 30))
        self.label.setObjectName("label")
        self.lineEdit = QtWidgets.QLineEdit(DocsDownload)
        self.lineEdit.setGeometry(QtCore.QRect(130, 50, 651, 30))
        self.lineEdit.setStyleSheet("")
        self.lineEdit.setObjectName("lineEdit")
        # Third row: caption label, line edit and a tool button to its right.
        self.label_2 = QtWidgets.QLabel(DocsDownload)
        self.label_2.setGeometry(QtCore.QRect(10, 90, 111, 30))
        self.label_2.setObjectName("label_2")
        self.lineEdit_2 = QtWidgets.QLineEdit(DocsDownload)
        self.lineEdit_2.setGeometry(QtCore.QRect(130, 90, 481, 30))
        self.lineEdit_2.setObjectName("lineEdit_2")
        self.toolButton_2 = QtWidgets.QToolButton(DocsDownload)
        self.toolButton_2.setGeometry(QtCore.QRect(630, 90, 151, 30))
        self.toolButton_2.setObjectName("toolButton_2")
        # Button row: toolButton_3 (styled blue with white text) sits to the
        # left of toolButton_5.
        self.toolButton_5 = QtWidgets.QToolButton(DocsDownload)
        self.toolButton_5.setGeometry(QtCore.QRect(400, 150, 130, 30))
        self.toolButton_5.setObjectName("toolButton_5")
        self.toolButton_3 = QtWidgets.QToolButton(DocsDownload)
        self.toolButton_3.setGeometry(QtCore.QRect(250, 150, 130, 30))
        self.toolButton_3.setStyleSheet("background-color: rgb(85, 170, 255);\n"
"color: rgb(255, 255, 255);\n"
"border: 0;")
        self.toolButton_3.setObjectName("toolButton_3")
        # Text browser filling the lower part of the window.
        self.textBrowser = QtWidgets.QTextBrowser(DocsDownload)
        self.textBrowser.setGeometry(QtCore.QRect(0, 210, 801, 391))
        self.textBrowser.setObjectName("textBrowser")
        # Top row: caption label followed by two radio buttons.
        self.label_3 = QtWidgets.QLabel(DocsDownload)
        self.label_3.setGeometry(QtCore.QRect(10, 10, 111, 30))
        self.label_3.setObjectName("label_3")
        self.radioButton = QtWidgets.QRadioButton(DocsDownload)
        self.radioButton.setGeometry(QtCore.QRect(130, 10, 75, 30))
        self.radioButton.setObjectName("radioButton")
        self.radioButton_2 = QtWidgets.QRadioButton(DocsDownload)
        self.radioButton_2.setGeometry(QtCore.QRect(230, 10, 101, 30))
        self.radioButton_2.setObjectName("radioButton_2")

        # Apply the display texts, then let Qt connect slots by object name.
        self.retranslateUi(DocsDownload)
        QtCore.QMetaObject.connectSlotsByName(DocsDownload)

    def retranslateUi(self, DocsDownload):
        """Set the window title and the display text of every captioned widget.

        All strings are passed through ``QCoreApplication.translate`` with the
        context name "DocsDownload".
        """
        _translate = QtCore.QCoreApplication.translate
        DocsDownload.setWindowTitle(_translate("DocsDownload", "文献一键下载器 - By: Dubito"))
        self.label.setText(_translate("DocsDownload", "文献资料链接："))  # "Document link:"
        self.label_2.setText(_translate("DocsDownload", "文件保存位置："))  # "Save location:"
        self.toolButton_2.setText(_translate("DocsDownload", "选择文件夹"))  # "Choose folder"
        self.toolButton_5.setText(_translate("DocsDownload", "查看源码"))  # "View source code"
        self.toolButton_3.setText(_translate("DocsDownload", "开始下载"))  # "Start download"
        self.label_3.setText(_translate("DocsDownload", "文献下载网站："))  # "Download site:"
        self.radioButton.setText(_translate("DocsDownload", "豆丁网"))  # Docin
        self.radioButton_2.setText(_translate("DocsDownload", "道客巴巴"))  # Doc88
75 | 


--------------------------------------------------------------------------------
/GUI/docs_download.ui:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <ui version="4.0">
  3 |  <class>DocsDownload</class>
  4 |  <widget class="QDialog" name="DocsDownload">
  5 |   <property name="geometry">
  6 |    <rect>
  7 |     <x>0</x>
  8 |     <y>0</y>
  9 |     <width>801</width>
 10 |     <height>600</height>
 11 |    </rect>
 12 |   </property>
 13 |   <property name="windowTitle">
 14 |    <string>文献一键下载器 - By: Dubito</string>
 15 |   </property>
 16 |   <property name="windowIcon">
 17 |    <iconset>
 18 |     <normaloff>C:/Users/Dubito/Desktop/1634396985874.jpeg</normaloff>C:/Users/Dubito/Desktop/1634396985874.jpeg</iconset>
 19 |   </property>
 20 |   <property name="accessibleName">
 21 |    <string/>
 22 |   </property>
 23 |   <property name="accessibleDescription">
 24 |    <string/>
 25 |   </property>
 26 |   <widget class="QLabel" name="label">
 27 |    <property name="geometry">
 28 |     <rect>
 29 |      <x>10</x>
 30 |      <y>50</y>
 31 |      <width>111</width>
 32 |      <height>30</height>
 33 |     </rect>
 34 |    </property>
 35 |    <property name="text">
 36 |     <string>文献资料链接：</string>
 37 |    </property>
 38 |   </widget>
 39 |   <widget class="QLineEdit" name="lineEdit">
 40 |    <property name="geometry">
 41 |     <rect>
 42 |      <x>130</x>
 43 |      <y>50</y>
 44 |      <width>651</width>
 45 |      <height>30</height>
 46 |     </rect>
 47 |    </property>
 48 |    <property name="styleSheet">
 49 |     <string notr="true"/>
 50 |    </property>
 51 |   </widget>
 52 |   <widget class="QLabel" name="label_2">
 53 |    <property name="geometry">
 54 |     <rect>
 55 |      <x>10</x>
 56 |      <y>90</y>
 57 |      <width>111</width>
 58 |      <height>30</height>
 59 |     </rect>
 60 |    </property>
 61 |    <property name="text">
 62 |     <string>文件保存位置：</string>
 63 |    </property>
 64 |   </widget>
 65 |   <widget class="QLineEdit" name="lineEdit_2">
 66 |    <property name="geometry">
 67 |     <rect>
 68 |      <x>130</x>
 69 |      <y>90</y>
 70 |      <width>481</width>
 71 |      <height>30</height>
 72 |     </rect>
 73 |    </property>
 74 |   </widget>
 75 |   <widget class="QToolButton" name="toolButton_2">
 76 |    <property name="geometry">
 77 |     <rect>
 78 |      <x>630</x>
 79 |      <y>90</y>
 80 |      <width>151</width>
 81 |      <height>30</height>
 82 |     </rect>
 83 |    </property>
 84 |    <property name="text">
 85 |     <string>选择文件夹</string>
 86 |    </property>
 87 |   </widget>
 88 |   <widget class="QToolButton" name="toolButton_5">
 89 |    <property name="geometry">
 90 |     <rect>
 91 |      <x>400</x>
 92 |      <y>150</y>
 93 |      <width>130</width>
 94 |      <height>30</height>
 95 |     </rect>
 96 |    </property>
 97 |    <property name="text">
 98 |     <string>查看源码</string>
 99 |    </property>
100 |   </widget>
101 |   <widget class="QToolButton" name="toolButton_3">
102 |    <property name="geometry">
103 |     <rect>
104 |      <x>250</x>
105 |      <y>150</y>
106 |      <width>130</width>
107 |      <height>30</height>
108 |     </rect>
109 |    </property>
110 |    <property name="styleSheet">
111 |     <string notr="true">background-color: rgb(85, 170, 255);
112 | color: rgb(255, 255, 255);
113 | border: 0;</string>
114 |    </property>
115 |    <property name="text">
116 |     <string>开始下载</string>
117 |    </property>
118 |   </widget>
119 |   <widget class="QTextBrowser" name="textBrowser">
120 |    <property name="geometry">
121 |     <rect>
122 |      <x>0</x>
123 |      <y>210</y>
124 |      <width>801</width>
125 |      <height>391</height>
126 |     </rect>
127 |    </property>
128 |   </widget>
129 |   <widget class="QLabel" name="label_3">
130 |    <property name="geometry">
131 |     <rect>
132 |      <x>10</x>
133 |      <y>10</y>
134 |      <width>111</width>
135 |      <height>30</height>
136 |     </rect>
137 |    </property>
138 |    <property name="text">
139 |     <string>文献下载网站：</string>
140 |    </property>
141 |   </widget>
142 |   <widget class="QRadioButton" name="radioButton">
143 |    <property name="geometry">
144 |     <rect>
145 |      <x>130</x>
146 |      <y>10</y>
147 |      <width>75</width>
148 |      <height>30</height>
149 |     </rect>
150 |    </property>
151 |    <property name="text">
152 |     <string>豆丁网</string>
153 |    </property>
154 |   </widget>
155 |   <widget class="QRadioButton" name="radioButton_2">
156 |    <property name="geometry">
157 |     <rect>
158 |      <x>230</x>
159 |      <y>10</y>
160 |      <width>101</width>
161 |      <height>30</height>
162 |     </rect>
163 |    </property>
164 |    <property name="text">
165 |     <string>道客巴巴</string>
166 |    </property>
167 |   </widget>
168 |  </widget>
169 |  <resources/>
170 |  <connections/>
171 | </ui>
172 | 


--------------------------------------------------------------------------------
/GUI/docs_downloader.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | from PyQt5 import QtWidgets
  3 | from PyQt5.QtWidgets import QFileDialog
  4 | from docs_download import Ui_DocsDownload
  5 | import webbrowser
  6 | import requests
  7 | from fpdf import FPDF
  8 | from PIL import Image
  9 | from lxml import etree
 10 | import re, time, random, os
 11 | 
 12 | class Docin_download:
 13 |     def  __init__ (self, path, originUrl, QTextBrowser):
 14 |         self.path = path
 15 |         self.originUrl = originUrl
 16 |         self.QTextBrowser = QTextBrowser
 17 | 
 18 |     def add_text(self, added_text):
 19 |         self.QTextBrowser.append(added_text)
 20 |         self.QTextBrowser.moveCursor(self.QTextBrowser.textCursor().End)  #文本框显示到底部
 21 |         QtWidgets.QApplication.processEvents()
 22 | 
 23 |     def getTiltleUrl(self, originUrl):
 24 |         # 获取资料的标题和通用的url链接
 25 |         html = etree.HTML(requests.get(originUrl).text)
 26 |         theHTML = etree.tostring(html).decode('utf-8')
 27 |         # print(theHTML)
 28 |         try:
 29 |             title = html.xpath('//span[@class="doc_title fs_c_76"]/text()')[0]
 30 |         except:
 31 |             title = html.xpath('//title/text()')
 32 |         fileId = re.findall('\-\d+\.',originUrl)[0][1:-1]
 33 | 
 34 |         sid = re.findall('flash_param_hzq:\"[\w\*\-]+\"', theHTML)[0][17:-1]
 35 |         url = 'https://docimg1.docin.com/docinpic.jsp?file=' + fileId + '&width=1000&sid=' + sid + '&pcimg=1&pageno='
 36 |         return title, url
 37 | 
 38 |     def getPictures(self, theurl, path):
 39 |         # 获取图片
 40 |         pagenum = 1
 41 |         headers = {
 42 |             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
 43 |         }
 44 |         allNum = 0
 45 |         while pagenum>0:
 46 |             # time.sleep(3*random.random())
 47 |             url = theurl + str(pagenum)
 48 |             img_req = requests.get(url=url, headers=headers)
 49 |             if img_req.content==b'sid error or Invalid!':
 50 |                 self.add_text('正在下载第' + str(pagenum) + '页...')
 51 |                 allNum = pagenum-1
 52 |                 self.add_text('下载结束，文档共有' + str(allNum) + '页')
 53 |                 pagenum = -1
 54 |                 break;
 55 |             file_name = path + str(pagenum) + '.png'
 56 |             f = open(file_name, 'wb')
 57 |             f.write(img_req.content)
 58 |             f.close()
 59 |             # 将图片保存为标准png格式
 60 |             im = Image.open(file_name)
 61 |             im.save(file_name)
 62 |             pagenum += 1
 63 |         return allNum
 64 | 
 65 |     def combinePictures2Pdf(self, path, pdfName, allNum):
 66 |         # 合并图片为pdf
 67 |         self.add_text('开始合并页面为PDF：')
 68 |         pagenum = 1
 69 |         file_name = path + str(pagenum) + '.png'
 70 |         cover = Image.open(file_name)
 71 |         width, height = cover.size
 72 |         pdf = FPDF(unit = "pt", format = [width, height])
 73 |         while allNum>=pagenum:
 74 |             try:
 75 |                 self.add_text('正在合并页面' + str(pagenum))
 76 |                 file_name = path + str(pagenum) + '.png'
 77 |                 pdf.add_page()
 78 |                 pdf.image(file_name, 0, 0)
 79 |                 pagenum += 1
 80 |             except Exception as e:
 81 |                 self.add_text(e)
 82 |                 break;
 83 |         pdf.output(pdfName, "F")
 84 | 
 85 |     def removePictures(self, path, allNum):
 86 |         # 删除原图片
 87 |         pagenum = 1
 88 |         while allNum>=pagenum:
 89 |             try:
 90 |                 self.add_text('删除页面源文件' + str(pagenum))
 91 |                 file_name = path + str(pagenum) + '.png'
 92 |                 os.remove(file_name)
 93 |                 pagenum += 1
 94 |             except Exception as e:
 95 |                 self.add_text(e)
 96 |                 break;
 97 | 
 98 |     def docin_download(self):
 99 |         result = self.getTiltleUrl(self.originUrl)
100 |         title = result[0].split('.')[0]
101 |         url = result[1]
102 |         # print(title, url)
103 |         self.add_text("文档名：" + str(title))
104 |         allNum = self.getPictures(url, self.path)
105 |         pdfName = title + '.pdf'
106 |         self.combinePictures2Pdf(self.path, pdfName, allNum)
107 |         self.removePictures(self.path, allNum)
108 |         self.add_text("文献下载完成！请到设定的保存目录处查看🍉")
109 | 
class mywindow(QtWidgets.QWidget, Ui_DocsDownload):
    """Main window: wires the generated UI to the download logic."""

    def __init__(self):
        super(mywindow, self).__init__()
        self.setupUi(self)
        # The first radio button (Docin) is the default download source.
        self.radioButton.setChecked(True)
        self.toolButton_2.clicked.connect(self.choose_folder)
        self.toolButton_3.clicked.connect(self.start_download)
        self.toolButton_5.clicked.connect(self.get_source_code)

    def add_text(self, added_text):
        """Append one line to the log box and keep the UI responsive."""
        self.textBrowser.append(str(added_text))
        # Scroll the text box to the bottom.
        self.textBrowser.moveCursor(self.textBrowser.textCursor().End)
        QtWidgets.QApplication.processEvents()

    def choose_folder(self):
        """Let the user pick the save folder through a directory dialog."""
        foldername = QFileDialog.getExistingDirectory(self, "选取文件夹", "C:/")
        # The dialog returns an empty string when cancelled; keep the
        # previously entered folder in that case instead of wiping it.
        if foldername:
            self.lineEdit_2.setText(foldername)

    def get_source_code(self):
        """Open the project's GitHub page in the default browser."""
        url = "https://github.com/you8023/downloadDocin"
        webbrowser.open_new_tab(url)

    def start_download(self):
        """Validate the form and run the download for the selected site."""
        save_folder = self.lineEdit_2.text().strip()  # folder the result is saved to
        originUrl = self.lineEdit.text().strip()  # document url
        if not (save_folder and originUrl):
            self.add_text("请检查上面的参数是否填写完整！")
            return
        self.add_text("程序已开始运行，请稍等...")

        if self.radioButton.isChecked():
            # Accept http and https, with or without "www.", and match the
            # dots literally.
            if not re.match(r'https?://(www\.)?docin\.com(/|$)', originUrl):
                self.add_text("请输入正确的豆丁网文献网址！")
                return
            # The downloader appends file names directly, so normalise the
            # folder to end with exactly one separator.
            save_folder = save_folder.rstrip('/\\') + '/'
            downloader = Docin_download(save_folder, originUrl, self.textBrowser)
            # add_text() pumps the event loop, so the button could be clicked
            # again mid-download; disable it until the job is over.
            self.toolButton_3.setEnabled(False)
            try:
                downloader.docin_download()
            except Exception as e:
                # Report network/parsing failures in the log box instead of
                # letting the exception escape the slot.
                self.add_text("下载失败：" + str(e))
            finally:
                self.toolButton_3.setEnabled(True)
        elif self.radioButton_2.isChecked():
            self.add_text("道客巴巴文档源正在集成中，请等待软件更新😋")
        else:
            self.add_text("请选择下载资料的网站👻")
155 | 
156 | 
if __name__ == "__main__":
    # Build the Qt application, show the main window and hand control to the
    # event loop; the loop's return code becomes the process exit status.
    application = QtWidgets.QApplication(sys.argv)
    window = mywindow()
    window.show()
    exit_code = application.exec_()
    sys.exit(exit_code)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # downloadDocin
  2 | 
  3 | 免费自动下载豆丁网的资料
  4 | 
  5 | Download materials in Docin freely and automatically
  6 | 
  7 | **本工具已封装为EXE可执行程序，发布了release，可直接下载使用，无需搭建任何环境（想要了解实现细节的可以继续往下看）**
  8 | 
  9 | 如遇图片显示不全，请到我的[博客](https://www.jianshu.com/p/3817e06d6a92)观看，欢迎点赞留言！
 10 | 
 11 | 最近在查找资料时，在[豆丁网](https://www.docin.com/)上找到不少有用的资料，比如说一些课程的PPT之类的，但是只能在线看，而且还有广告，但是下载下来要钱，而且要价不菲，于是动起了歪脑筋，寻思着搞一个爬虫直接下载下来就可以离线看了，也方便资料的存储管理。本教程已完结，请放心食用，效果如下（该效果图采用[screentogif](https://www.screentogif.com/)软件录制，特此鸣谢）：
 12 | 
 13 | ![豆丁网资料自动下载效果展示](https://upload-images.jianshu.io/upload_images/5714082-ebcd2d95e82f1869.gif?imageMogr2/auto-orient/strip)
 14 | 
 15 | 本代码免费开源，不想了解原理的可以跳过到使用部分直接使用，希望能给我点个赞以支持开发，如果方便的话，github来一颗星星更好啦！
 16 | 
 17 | 开源代码地址：[https://github.com/you8023/downloadDocin](https://github.com/you8023/downloadDocin)，直接下载按照使用方法使用即可，如遇问题，欢迎在文章下方留言或在[github](https://github.com/you8023/downloadDocin/issues)上提issue。
 18 | 
 19 | ## 开发环境
 20 | * Windows 10
 21 | * Sublime Text 3
 22 | * Python 3.8
* python库：lxml、fpdf、requests、Pillow（代码中以`PIL`导入）
 24 | 
 25 | ## 环境搭建
 26 | 1. 首先安装Python，直接到官网下载安装即可
 27 | 2. 安装Python库
 28 | 键盘同时按下`win`+`R`，在弹出的对话框中输入`cmd`按回车
 29 | 在弹出的黑框中输入命令安装python库：
```
pip install lxml
pip install fpdf
pip install requests
pip install pillow
```
 35 | 至此，环境搭建完毕
 36 | 
 37 | ## 分析&设计
 38 | 在编写代码前，需要对需求及网页进行分析，明确我们需要的东西的位置
 39 | 
 40 | ### 需求分析
 41 | 首先明确需求，我们需要将豆丁网上我们需要的资料爬取下来，通过对页面元素进行观察，发现上面的资料，无论何种格式，均是以图片的形式进行展示，因此，考虑将其保存为pdf以方便查看。提取的输入输出如下：
 42 | 
 43 | * 输入：所需资料的网址
 44 | * 输出：资料的pdf文件，其中包含：
 45 |   * 资料的每张图片
 46 |   * 资料标题
 47 | 
 48 | ### 页面分析
 49 | #### 页面标题
 50 | 打开想要的资料的网页，这里以[这个网页](https://www.docin.com/p-456842624.html)为例，首先，鼠标右键检查，找到标题元素：
 51 | 
 52 | ![标题元素](https://upload-images.jianshu.io/upload_images/5714082-979f491a36c290e4.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
 53 | 
 54 | 可以发现，标题所属的为`span`标签，class为`doc_title fs_c_76`，因此，可以据此定位标题所在
 55 | 
 56 | #### 页面资料内容
 57 | 再以此方法找到资料所属的标签：
 58 | 
 59 | ![网页元素](https://upload-images.jianshu.io/upload_images/5714082-7c2b9fa4dd351d65.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
 60 | 
 61 | 发现网页的资料均是以图片形式显示，因此，目标确定，我们仅需要找到这个链接即可
 62 | 
 63 | 复制链接，直接用浏览器打开，发现果然就是我们要找的这张图片
 64 | 
 65 | 接着，分析链接`https://docimg1.docin.com/docinpic.jsp?file=456842624&width=942&sid=LE-fLs-BXk4g4mtVLS2D8apgA9Z5X3NNeoZbh0mHZkW*C1Zz1LvKe8xey1BsO1BG&pageno=1&pcimg=1`，发现其中有三个关键字段，分别是：
 66 | 
 67 | * file
 68 | * sid
 69 | * pageno
 70 | 
 71 | 其中，经过观察，发现`file`字段的数字和网页链接`https://www.docin.com/p-456842624.html`的数字一致；
 72 | 
 73 | `pageno`字段则是第几页；
 74 | 
 75 | 而`sid`经过打开另一个资料的网页测试，发现不同的资料具有不同的`sid`，观察其编码，没发现规律，最终，经过仔细分析，在`source`页面源码中找到了一个关键字段`flash_param_hzq`：
 76 | 
 77 | ![flash_param_hzq字段的发现](https://upload-images.jianshu.io/upload_images/5714082-6173e3a5c0281b38.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
 78 | 
 79 | 虽然和`sid`字段并不完全一致，但使用该字段作为`sid`也能得到图片，因此，提取该网页的`flash_param_hzq`字段即可；
 80 | 
 81 | ## 代码实现
 82 | 
 83 | 页面分析完毕，即可开始编写代码。
 84 | 
 85 | ### 获取资料标题和内容的通用url
 86 | 
 87 | 首先，使用`requests`获取网页内容，并使用`lxml`将其封装为一个HTML对象方便后续解析：
 88 | 
 89 | ```python
 90 | html = etree.HTML(requests.get(originUrl).text)
 91 | ```
 92 | 
 93 | 然后使用`xpath`提取标题：
 94 | 
 95 | ```python
 96 | title = html.xpath('//span[@class="doc_title fs_c_76"]/text()')[0]
 97 | ```
 98 | 
其中，`//span`表示匹配任意`span`标签，使用`[@class=""]`匹配`class`属性，使用`/text()`提取标签内的内容，由于返回的内容为一个列表，因此使用`[0]`取第一个元素
100 | 
101 | 使用正则表达式匹配`file`字段：
102 | 
103 | ```python
104 | fileId = re.findall('\-\d+\.',originUrl)[0][1:-1]
105 | ```
106 | 
107 | 其中，`\d`代表匹配数字，`+`表示匹配一次或多次，`[1:-1]`表示取结果的第二个字符到倒数第二个字符
108 | 
109 | 将HTML对象转为字符串：
110 | 
111 | ```python
112 | theHTML = etree.tostring(html).decode('utf-8')
113 | ```
114 | 
115 | 使用正则表达式匹配`flash_param_hzq`字段：
116 | 
117 | ```python
118 | sid = re.findall('flash_param_hzq:\"[\w\*\-]+\"', theHTML)[0][17:-1]
119 | ```
120 | 
其中，`\w`表示匹配字母、数字或下划线
122 | 
123 | 至此，该部分函数书写完毕，完整代码为：
124 | 
125 | ```python
126 | def getTiltleUrl(originUrl):
127 | 	html = etree.HTML(requests.get(originUrl).text)
128 | 	theHTML = etree.tostring(html).decode('utf-8')
129 | 	# print(theHTML)
130 | 	try:
131 | 		title = html.xpath('//span[@class="doc_title fs_c_76"]/text()')[0]
132 | 	except:
133 | 		title = html.xpath('//title/text()')
134 | 	fileId = re.findall('\-\d+\.',originUrl)[0][1:-1]
135 | 
136 | 	sid = re.findall('flash_param_hzq:\"[\w\*\-]+\"', theHTML)[0][17:-1]
137 | 	url = 'https://docimg1.docin.com/docinpic.jsp?file=' + fileId + '&width=1000&sid=' + sid + '&pcimg=1&pageno='
138 | 	return title, url
139 | ```
140 | 
141 | ### 获取图片
142 | 
143 | 通过上面的函数获取到通用的链接后，仅需要更改`pageno`字段即可获取所有图片，使用`requests`获取到图片后，直接将文件流写入到文件中即可。但在后续代码运行过程中发现图片格式报错，因此，使用`PIL`标准化图片。完整代码如下：
144 | 
145 | ```python
146 | def getPictures(theurl, path):
147 | 	pagenum = 1
148 | 	headers = {
149 |         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
150 | 	}
151 | 	allNum = 0
152 | 	while pagenum>0:
153 | 		# time.sleep(3*random.random())
154 | 		print('Downloading picture ' + str(pagenum))
155 | 		url = theurl + str(pagenum)
156 | 		img_req = requests.get(url=url, headers=headers)
157 | 		if img_req.content==b'sid error or Invalid!':
158 | 			allNum = pagenum-1
159 | 			print('Downloading finished, the count of all pictures is ' + str(allNum))
160 | 			pagenum = -1
161 | 			break;
162 | 		file_name = path + str(pagenum) + '.png'
163 | 		f = open(file_name, 'wb')
164 | 		f.write(img_req.content)
165 | 		f.close()
166 | 		# 将图片保存为标准png格式
167 | 		im = Image.open(file_name)
168 | 		im.save(file_name)
169 | 		pagenum += 1
170 | 	return allNum
171 | ```
172 | 
173 | ### 合并图片为pdf
174 | 
175 | 这里主要使用`fpdf`库合并图片为pdf，代码如下：
176 | 
177 | ```python
178 | def combinePictures2Pdf(path, pdfName, allNum):
179 | 	print('Start combining the pictures...')
180 | 	pagenum = 1
181 | 	file_name = path + str(pagenum) + '.png'
182 | 	cover = Image.open(file_name)
183 | 	width, height = cover.size
184 | 	pdf = FPDF(unit = "pt", format = [width, height])
185 | 	while allNum>=pagenum:
186 | 		try:
187 | 			print('combining picture ' + str(pagenum))
188 | 			file_name = path + str(pagenum) + '.png'
189 | 			pdf.add_page()
190 | 			pdf.image(file_name, 0, 0)
191 | 			pagenum += 1
192 | 		except Exception as e:
193 | 			print(e)
194 | 			break;
195 | 	pdf.output(pdfName, "F")
196 | ```
197 | 
198 | 其中：
199 | 
200 | * `pdf = FPDF(unit = "pt", format = [width, height])`规定了pdf的尺寸
201 | * `pdf.add_page()`将为pdf添加一张空白页面
202 | * `pdf.image(file_name, 0, 0)`则是将图片绘制到该空白页面上，后两个参数为绘制的起始xy坐标
203 | * `pdf.output(pdfName, "F")`则是生成pdf文件
204 | 
205 | ### 删除原图片
206 | 
207 | pdf生成完毕之后，之前保存到本地的图片就没有用武之地了，这时需要删去所有图片，删去某个文件的语句为：
208 | 
209 | ```python
210 | os.remove(file_name)
211 | ```
212 | 
213 | 完整代码如下：
214 | 
215 | ```python
216 | def removePictures(path, allNum):
217 | 	pagenum = 1
218 | 	while allNum>=pagenum:
219 | 		try:
220 | 			print('deleting picture ' + str(pagenum))
221 | 			file_name = path + str(pagenum) + '.png'
222 | 			os.remove(file_name)
223 | 			pagenum += 1
224 | 		except Exception as e:
225 | 			print(e)
226 | 			break;
227 | ```
228 | 
229 | ### 主函数
230 | 
231 | 最后，书写语句依次调用函数，自动化下载图片，合并为pdf，并删去原文件：
232 | 
233 | ```python
234 | if __name__ == '__main__':
235 | 	path = 'E:\\test\\Docin\\'
236 | 	# originUrl = 'https://www.docin.com/p-977106193.html?docfrom=rrela'
237 | 	originUrl = input('input the url: ')
238 | 	result = getTiltleUrl(originUrl)
239 | 	title = result[0].split('.')[0]
240 | 	url = result[1]
241 | 	print(title, url)
242 | 	allNum = getPictures(url, path)
243 | 	pdfName = path + title + '.pdf'
244 | 	combinePictures2Pdf(path, pdfName, allNum)
245 | 	removePictures(path, allNum)
246 | ```
247 | 
248 | 其中：
249 | 
* `path`为保存文件的路径，注意，必须为转义`\`后的绝对路径，且必须以路径分隔符结尾（代码会直接在其后拼接文件名）
251 | * 原网址使用`input`函数以让用户自行输入
252 | 
253 | ## 代码使用
254 | 
255 | 源码在[GitHub](https://github.com/you8023/downloadDocin)上，直接下载即可使用。
256 | 
257 | 1. 进入到源码所在的文件夹，使用编辑器（记事本亦可）打开源码文件，修改最下方`main`函数中的`path`路径为你想要保存文件的路径
258 | 2. 在文件所在的文件夹的地址栏输入`cmd`，按下回车，在出现的黑框中输入以下命令：
```bash
260 | python downloadPPT.py
261 | ```
262 | 3. 按下回车，当出现提示语句时输入网址，按下回车
263 | 4. 静等程序跑完即可，下载的资料在第一步输入的路径里面
264 | 
265 | 如图所示：
266 | ![豆丁网资料自动下载效果展示](https://upload-images.jianshu.io/upload_images/5714082-ebcd2d95e82f1869.gif?imageMogr2/auto-orient/strip)
267 | 


--------------------------------------------------------------------------------