├── GUI
    ├── review_generator.py
    ├── review_generator.ui
    └── review_generator_client.py
├── README.md
├── demo
    ├── An Ontology-Driven Approach to Automating the Process of Integrating Security Software Systems.pdf
    └── result.txt
└── pdfprocessor.py


/GUI/review_generator.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Form implementation generated from reading ui file 'review_generator.ui'
 4 | #
 5 | # Created by: PyQt5 UI code generator 5.15.1
 6 | #
 7 | # WARNING: Any manual changes made to this file will be lost when pyuic5 is
 8 | # run again.  Do not edit this file unless you know what you are doing.
 9 | 
10 | 
11 | from PyQt5 import QtCore, QtGui, QtWidgets
12 | 
13 | 
class Ui_ReviewGenerator(object):
    """Widget layout for the review-generator window.

    Produced by pyuic5 from ``review_generator.ui``; every widget is placed
    with fixed pixel geometry on the widget passed to :meth:`setupUi`.
    """

    def setupUi(self, ReviewGenerator):
        """Create all child widgets on ``ReviewGenerator`` and set their texts.

        Args:
            ReviewGenerator: The top-level widget that receives the controls;
                it is resized to 800x600.
        """
        ReviewGenerator.setObjectName("ReviewGenerator")
        ReviewGenerator.resize(800, 600)
        # NOTE(review): the window icon is loaded from an absolute path on the
        # original author's desktop; presumably it is missing on other
        # machines -- confirm and ship the image with the project if needed.
        icon = QtGui.QIcon()
        icon.addPixmap(QtGui.QPixmap("C:/Users/Dubito/Desktop/1634396985874.jpeg"), QtGui.QIcon.Normal, QtGui.QIcon.Off)
        ReviewGenerator.setWindowIcon(icon)
        ReviewGenerator.setAccessibleName("")
        ReviewGenerator.setAccessibleDescription("")
        # Row 1 (y=20): PDF folder label, path field and folder-picker button.
        self.toolButton = QtWidgets.QToolButton(ReviewGenerator)
        self.toolButton.setGeometry(QtCore.QRect(630, 20, 150, 30))
        self.toolButton.setObjectName("toolButton")
        self.label = QtWidgets.QLabel(ReviewGenerator)
        self.label.setGeometry(QtCore.QRect(10, 20, 111, 30))
        self.label.setObjectName("label")
        self.lineEdit = QtWidgets.QLineEdit(ReviewGenerator)
        self.lineEdit.setGeometry(QtCore.QRect(130, 20, 480, 30))
        self.lineEdit.setStyleSheet("")
        self.lineEdit.setObjectName("lineEdit")
        # Row 2 (y=60): output folder label, path field and folder-picker button.
        self.label_2 = QtWidgets.QLabel(ReviewGenerator)
        self.label_2.setGeometry(QtCore.QRect(10, 60, 111, 30))
        self.label_2.setObjectName("label_2")
        self.lineEdit_2 = QtWidgets.QLineEdit(ReviewGenerator)
        self.lineEdit_2.setGeometry(QtCore.QRect(130, 60, 481, 30))
        self.lineEdit_2.setObjectName("lineEdit_2")
        self.toolButton_2 = QtWidgets.QToolButton(ReviewGenerator)
        self.toolButton_2.setGeometry(QtCore.QRect(630, 60, 150, 30))
        self.toolButton_2.setObjectName("toolButton_2")
        # Row 4 (y=140): APP Key label and field.
        self.label_3 = QtWidgets.QLabel(ReviewGenerator)
        self.label_3.setGeometry(QtCore.QRect(10, 140, 111, 30))
        self.label_3.setObjectName("label_3")
        self.lineEdit_3 = QtWidgets.QLineEdit(ReviewGenerator)
        self.lineEdit_3.setGeometry(QtCore.QRect(130, 140, 331, 30))
        self.lineEdit_3.setObjectName("lineEdit_3")
        # Row 5 (y=180): APP Secret label and field.
        self.label_4 = QtWidgets.QLabel(ReviewGenerator)
        self.label_4.setGeometry(QtCore.QRect(10, 180, 111, 30))
        self.label_4.setObjectName("label_4")
        self.lineEdit_4 = QtWidgets.QLineEdit(ReviewGenerator)
        self.lineEdit_4.setGeometry(QtCore.QRect(130, 180, 331, 30))
        self.lineEdit_4.setObjectName("lineEdit_4")
        # Row 3 (y=100): review file name label and field.
        self.lineEdit_5 = QtWidgets.QLineEdit(ReviewGenerator)
        self.lineEdit_5.setGeometry(QtCore.QRect(130, 100, 331, 30))
        self.lineEdit_5.setText("")
        self.lineEdit_5.setObjectName("lineEdit_5")
        self.label_5 = QtWidgets.QLabel(ReviewGenerator)
        self.label_5.setGeometry(QtCore.QRect(10, 100, 111, 30))
        self.label_5.setObjectName("label_5")
        # Button row (y=230): "view source", "start" (styled blue) and
        # "how to get an APP Key" buttons; texts are set in retranslateUi.
        self.toolButton_5 = QtWidgets.QToolButton(ReviewGenerator)
        self.toolButton_5.setGeometry(QtCore.QRect(570, 230, 130, 30))
        self.toolButton_5.setObjectName("toolButton_5")
        self.toolButton_3 = QtWidgets.QToolButton(ReviewGenerator)
        self.toolButton_3.setGeometry(QtCore.QRect(100, 230, 130, 30))
        self.toolButton_3.setStyleSheet("background-color: rgb(85, 170, 255);\n"
"color: rgb(255, 255, 255);\n"
"border: 0;")
        self.toolButton_3.setObjectName("toolButton_3")
        self.toolButton_4 = QtWidgets.QToolButton(ReviewGenerator)
        self.toolButton_4.setGeometry(QtCore.QRect(250, 230, 300, 30))
        self.toolButton_4.setObjectName("toolButton_4")
        # Text area covering the lower part of the window (from y=280 down).
        self.textBrowser = QtWidgets.QTextBrowser(ReviewGenerator)
        self.textBrowser.setGeometry(QtCore.QRect(0, 280, 801, 321))
        self.textBrowser.setObjectName("textBrowser")

        self.retranslateUi(ReviewGenerator)
        # Auto-connect slots named on_<objectName>_<signal>, if any exist.
        QtCore.QMetaObject.connectSlotsByName(ReviewGenerator)

    def retranslateUi(self, ReviewGenerator):
        """Set the window title and the visible text of every label and button.

        Args:
            ReviewGenerator: The top-level widget whose title is set.
        """
        _translate = QtCore.QCoreApplication.translate
        ReviewGenerator.setWindowTitle(_translate("ReviewGenerator", "文献综述一键生成器 - By: Dubito"))
        self.toolButton.setText(_translate("ReviewGenerator", "选择文件夹"))
        self.label.setText(_translate("ReviewGenerator", "PDF论文位置："))
        self.label_2.setText(_translate("ReviewGenerator", "生成综述位置："))
        self.toolButton_2.setText(_translate("ReviewGenerator", "选择文件夹"))
        self.label_3.setText(_translate("ReviewGenerator", "APP Key："))
        self.label_4.setText(_translate("ReviewGenerator", "APP Secret："))
        self.label_5.setText(_translate("ReviewGenerator", "综述文件名："))
        self.toolButton_5.setText(_translate("ReviewGenerator", "查看源码"))
        self.toolButton_3.setText(_translate("ReviewGenerator", "开始生成"))
        self.toolButton_4.setText(_translate("ReviewGenerator", "如何获取APP Key"))
93 | 


--------------------------------------------------------------------------------
/GUI/review_generator.ui:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <ui version="4.0">
  3 |  <class>ReviewGenerator</class>
  4 |  <widget class="QDialog" name="ReviewGenerator">
  5 |   <property name="geometry">
  6 |    <rect>
  7 |     <x>0</x>
  8 |     <y>0</y>
  9 |     <width>800</width>
 10 |     <height>600</height>
 11 |    </rect>
 12 |   </property>
 13 |   <property name="windowTitle">
 14 |    <string>文献综述一键生成器 - By: Dubito</string>
 15 |   </property>
 16 |   <property name="windowIcon">
 17 |    <iconset>
 18 |     <normaloff>C:/Users/Dubito/Desktop/1634396985874.jpeg</normaloff>C:/Users/Dubito/Desktop/1634396985874.jpeg</iconset>
 19 |   </property>
 20 |   <property name="accessibleName">
 21 |    <string/>
 22 |   </property>
 23 |   <property name="accessibleDescription">
 24 |    <string/>
 25 |   </property>
 26 |   <widget class="QToolButton" name="toolButton">
 27 |    <property name="geometry">
 28 |     <rect>
 29 |      <x>630</x>
 30 |      <y>20</y>
 31 |      <width>150</width>
 32 |      <height>30</height>
 33 |     </rect>
 34 |    </property>
 35 |    <property name="text">
 36 |     <string>选择文件夹</string>
 37 |    </property>
 38 |   </widget>
 39 |   <widget class="QLabel" name="label">
 40 |    <property name="geometry">
 41 |     <rect>
 42 |      <x>10</x>
 43 |      <y>20</y>
 44 |      <width>111</width>
 45 |      <height>30</height>
 46 |     </rect>
 47 |    </property>
 48 |    <property name="text">
 49 |     <string>PDF论文位置：</string>
 50 |    </property>
 51 |   </widget>
 52 |   <widget class="QLineEdit" name="lineEdit">
 53 |    <property name="geometry">
 54 |     <rect>
 55 |      <x>130</x>
 56 |      <y>20</y>
 57 |      <width>480</width>
 58 |      <height>30</height>
 59 |     </rect>
 60 |    </property>
 61 |    <property name="styleSheet">
 62 |     <string notr="true"/>
 63 |    </property>
 64 |   </widget>
 65 |   <widget class="QLabel" name="label_2">
 66 |    <property name="geometry">
 67 |     <rect>
 68 |      <x>10</x>
 69 |      <y>60</y>
 70 |      <width>111</width>
 71 |      <height>30</height>
 72 |     </rect>
 73 |    </property>
 74 |    <property name="text">
 75 |     <string>生成综述位置：</string>
 76 |    </property>
 77 |   </widget>
 78 |   <widget class="QLineEdit" name="lineEdit_2">
 79 |    <property name="geometry">
 80 |     <rect>
 81 |      <x>130</x>
 82 |      <y>60</y>
 83 |      <width>481</width>
 84 |      <height>30</height>
 85 |     </rect>
 86 |    </property>
 87 |   </widget>
 88 |   <widget class="QToolButton" name="toolButton_2">
 89 |    <property name="geometry">
 90 |     <rect>
 91 |      <x>630</x>
 92 |      <y>60</y>
 93 |      <width>150</width>
 94 |      <height>30</height>
 95 |     </rect>
 96 |    </property>
 97 |    <property name="text">
 98 |     <string>选择文件夹</string>
 99 |    </property>
100 |   </widget>
101 |   <widget class="QLabel" name="label_3">
102 |    <property name="geometry">
103 |     <rect>
104 |      <x>10</x>
105 |      <y>140</y>
106 |      <width>111</width>
107 |      <height>30</height>
108 |     </rect>
109 |    </property>
110 |    <property name="text">
111 |     <string>APP Key：</string>
112 |    </property>
113 |   </widget>
114 |   <widget class="QLineEdit" name="lineEdit_3">
115 |    <property name="geometry">
116 |     <rect>
117 |      <x>130</x>
118 |      <y>140</y>
119 |      <width>331</width>
120 |      <height>30</height>
121 |     </rect>
122 |    </property>
123 |   </widget>
124 |   <widget class="QLabel" name="label_4">
125 |    <property name="geometry">
126 |     <rect>
127 |      <x>10</x>
128 |      <y>180</y>
129 |      <width>111</width>
130 |      <height>30</height>
131 |     </rect>
132 |    </property>
133 |    <property name="text">
134 |     <string>APP Secret：</string>
135 |    </property>
136 |   </widget>
137 |   <widget class="QLineEdit" name="lineEdit_4">
138 |    <property name="geometry">
139 |     <rect>
140 |      <x>130</x>
141 |      <y>180</y>
142 |      <width>331</width>
143 |      <height>30</height>
144 |     </rect>
145 |    </property>
146 |   </widget>
147 |   <widget class="QLineEdit" name="lineEdit_5">
148 |    <property name="geometry">
149 |     <rect>
150 |      <x>130</x>
151 |      <y>100</y>
152 |      <width>331</width>
153 |      <height>30</height>
154 |     </rect>
155 |    </property>
156 |    <property name="text">
157 |     <string/>
158 |    </property>
159 |   </widget>
160 |   <widget class="QLabel" name="label_5">
161 |    <property name="geometry">
162 |     <rect>
163 |      <x>10</x>
164 |      <y>100</y>
165 |      <width>111</width>
166 |      <height>30</height>
167 |     </rect>
168 |    </property>
169 |    <property name="text">
170 |     <string>综述文件名：</string>
171 |    </property>
172 |   </widget>
173 |   <widget class="QToolButton" name="toolButton_5">
174 |    <property name="geometry">
175 |     <rect>
176 |      <x>570</x>
177 |      <y>230</y>
178 |      <width>130</width>
179 |      <height>30</height>
180 |     </rect>
181 |    </property>
182 |    <property name="text">
183 |     <string>查看源码</string>
184 |    </property>
185 |   </widget>
186 |   <widget class="QToolButton" name="toolButton_3">
187 |    <property name="geometry">
188 |     <rect>
189 |      <x>100</x>
190 |      <y>230</y>
191 |      <width>130</width>
192 |      <height>30</height>
193 |     </rect>
194 |    </property>
195 |    <property name="styleSheet">
196 |     <string notr="true">background-color: rgb(85, 170, 255);
197 | color: rgb(255, 255, 255);
198 | border: 0;</string>
199 |    </property>
200 |    <property name="text">
201 |     <string>开始生成</string>
202 |    </property>
203 |   </widget>
204 |   <widget class="QToolButton" name="toolButton_4">
205 |    <property name="geometry">
206 |     <rect>
207 |      <x>250</x>
208 |      <y>230</y>
209 |      <width>300</width>
210 |      <height>30</height>
211 |     </rect>
212 |    </property>
213 |    <property name="text">
214 |     <string>如何获取APP Key</string>
215 |    </property>
216 |   </widget>
217 |   <widget class="QTextBrowser" name="textBrowser">
218 |    <property name="geometry">
219 |     <rect>
220 |      <x>0</x>
221 |      <y>280</y>
222 |      <width>801</width>
223 |      <height>321</height>
224 |     </rect>
225 |    </property>
226 |   </widget>
227 |  </widget>
228 |  <resources/>
229 |  <connections/>
230 | </ui>
231 | 


--------------------------------------------------------------------------------
/GUI/review_generator_client.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | from PyQt5 import QtWidgets
  3 | from PyQt5.QtWidgets import QFileDialog
  4 | from review_generator import Ui_ReviewGenerator
  5 | import webbrowser
  6 | import hashlib
  7 | import requests
  8 | import uuid
  9 | import os
 10 | import time
 11 | import json
 12 | import importlib,sys
 13 | importlib.reload(sys)
 14 | from pdfminer.pdfparser import PDFParser, PDFDocument
 15 | from pdfminer.pdfdevice import PDFDevice
 16 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
 17 | from pdfminer.converter import PDFPageAggregator
 18 | from pdfminer.layout import LTTextBoxHorizontal, LAParams
 19 | from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
 20 | import re
 21 | 
# Suppress warnings: the root logger's threshold is raised to ERROR below.
import logging 
# NOTE(review): this assigns a class-level attribute on logging.Logger;
# whether it affects logger instances at all should be confirmed.
logging.Logger.propagate = False 
logging.getLogger().setLevel(logging.ERROR)
 26 | 
 27 | class YouDaoFanyi:
 28 |     def __init__(self, appKey, appSecret):
 29 |         self.YOUDAO_URL = 'https://openapi.youdao.com/api/'
 30 |         self.APP_KEY = appKey  # 应用id
 31 |         self.APP_SECRET = appSecret  # 应用密钥
 32 |         self.langFrom = 'en'   # 翻译前文字语言,auto为自动检查
 33 |         self.langTo = 'zh-CHS'     # 翻译后文字语言,auto为自动检查
 34 |         self.vocabId = "您的用户词表ID"
 35 | 
 36 |     def encrypt(self,signStr):
 37 |         hash_algorithm = hashlib.sha256()
 38 |         hash_algorithm.update(signStr.encode('utf-8'))
 39 |         return hash_algorithm.hexdigest()
 40 | 
 41 | 
 42 |     def truncate(self,q):
 43 |         if q is None:
 44 |             return None
 45 |         size = len(q)
 46 |         return q if size <= 20 else q[0:10] + str(size) + q[size - 10:size]
 47 | 
 48 |     def do_request(self,data):
 49 |         headers = {'Content-Type': 'application/x-www-form-urlencoded'}
 50 |         return requests.post(self.YOUDAO_URL, data=data, headers=headers)
 51 | 
 52 | 
 53 |     def translate(self,q):
 54 |         data = {}
 55 |         data['from'] = self.langFrom
 56 |         data['to'] = self.langTo
 57 |         data['signType'] = 'v3'
 58 |         curtime = str(int(time.time()))
 59 |         data['curtime'] = curtime
 60 |         salt = str(uuid.uuid1())
 61 |         signStr = self.APP_KEY + self.truncate(q) + salt + curtime + self.APP_SECRET
 62 |         sign = self.encrypt(signStr)
 63 |         data['appKey'] = self.APP_KEY
 64 |         data['q'] = q
 65 |         data['salt'] = salt
 66 |         data['sign'] = sign
 67 |         data['vocabId'] = self.vocabId
 68 | 
 69 |         response = self.do_request(data)
 70 |         contentType = response.headers['Content-Type']
 71 |         result = json.loads(response.content.decode('utf-8'))['translation'][0]
 72 |         print(result)
 73 |         return result
 74 | 
 75 | 
 76 | class ReviewGenerate:
 77 |     def __init__(self, folder, save_folder, write_txt_file, appKey, appSecret):
 78 |         self.folder = folder # 需要读取pdf的文件夹的路径，注意为绝对路径，如：E:/论文
 79 |         self.save_folder = save_folder # 保存结果的路径
 80 |         self.write_txt_file = write_txt_file # 保存结果的文件，为txt文件
 81 |         self.appKey = appKey  # 应用id
 82 |         self.appSecret = appSecret  # 应用密钥
 83 |         self.success_count = 0 # 统计成功的次数
 84 |         self.fail_count = 0 #统计失败的次数
 85 | 
 86 |     def generate_author(self, author):
 87 |         # 过滤掉作者名后面的各种符号，并生成引用的格式
 88 |         # print(author)
 89 |         author = re.sub('by |[\s\d\*∗\/@†\(\&\)]+$', '', author)
 90 |         author_list = re.split('\s+',author)
 91 |         author_str = author_list[len(author_list)-1]
 92 |         for i in range(0,len(author_list)-1):
 93 |             author_str = author_str + ' ' + author_list[i][0]
 94 |         return author_str
 95 | 
 96 |     def parse(self, QTextBrowser, DataIO, save_path, appKey, appSecret):
 97 |      
 98 |         #用文件对象创建一个PDF文档分析器
 99 |         parser = PDFParser(DataIO)
100 |         #创建一个PDF文档
101 |         doc = PDFDocument()
102 |         #分析器和文档相互连接
103 |         parser.set_document(doc)
104 |         doc.set_parser(parser)
105 |         #提供初始化密码，没有默认为空
106 |         doc.initialize()
107 |         #检查文档是否可以转成TXT，如果不可以就忽略
108 |         if not doc.is_extractable:
109 |             raise PDFTextExtractionNotAllowed
110 |         else:
111 |             #创建PDF资源管理器，来管理共享资源
112 |             rsrcmagr = PDFResourceManager()
113 |             #创建一个PDF设备对象
114 |             laparams = LAParams()
115 |             #将资源管理器和设备对象聚合
116 |             device = PDFPageAggregator(rsrcmagr, laparams=laparams)
117 |             #创建一个PDF解释器对象
118 |             interpreter = PDFPageInterpreter(rsrcmagr, device)
119 |             last_para = '' # 记录上一段文本
120 |             count = 0 # 对文本块进行计数，方便后续查找标题和作者
121 |             author = '' # 记录作者
122 |             ab_count = 0 # 记录已识别的摘要的数量，避免提取文中的abstract
123 | 
124 |             fanyi = YouDaoFanyi(appKey, appSecret)
125 |             #循环遍历列表，每次处理一个page内容
126 |             #doc.get_pages()获取page列表
127 |             for page in doc.get_pages():
128 |                 interpreter.process_page(page)
129 |                 #接收该页面的LTPage对象
130 |                 layout = device.get_result()
131 |                 #这里的layout是一个LTPage对象 里面存放着page解析出来的各种对象
132 |                 #一般包括LTTextBox，LTFigure，LTImage，LTTextBoxHorizontal等等一些对像
133 |                 #想要获取文本就得获取对象的text属性
134 |                 for x in layout:
135 |                     try:
136 |                         if(isinstance(x, LTTextBoxHorizontal)):
137 |                             with open('%s' % (save_path), 'a', encoding='utf-8') as f:
138 |                                 result = x.get_text() # 每块的内容
139 |                                 # print(result)
140 |                                 # 提取标题
141 |                                 if count==0:
142 |                                     # 如果是researchgate的文章，直接翻页
143 |                                     if re.findall('^see discussions', result.lower())!=[]:
144 |                                         break
145 |                                     # 如果第一行是各种页眉等干扰信息，直接略过
146 |                                     if re.findall('(^[0-9])|(^(research )?article)|(unclassified)|(www.)|(accepted (from|manuscript))|(proceedings of)|(vol.)|(volume \d)|(https?://)|(^ieee)|(sciencedirect)|(\d{4}\)$)|(\d{1,4} – \d{1,4}$)|(cid:)',re.split('\s+$',result.lower())[0])!=[] or '':
147 |                                         count -= 1
148 |                                     else:
149 |                                         # 将结果写入TXT
150 |                                         f.write('\n'+result.replace('\n', '')+'\n')
151 |                                 # 提取作者
152 |                                 elif count==1:
153 |                                     # 只取第一作者
154 |                                     author = result.split('\n')[0].split(',')[0].split(' and ')[0]
155 |                                     author = self.generate_author(author)
156 |                                     QTextBrowser.append('author '+ author)
157 |                                     QTextBrowser.moveCursor(QTextBrowser.textCursor().End)  #文本框显示到底部
158 |                                     QtWidgets.QApplication.processEvents()
159 |                                     QtWidgets.QApplication.processEvents()
160 |                                 # 去掉pdf文件读取的各种换行符
161 |                                 result = result.replace('\n', '')
162 |                                 try:
163 |                                     # 转为小写，去掉空格，方便正则识别
164 |                                     last_para = last_para.lower().replace(' ', '')
165 |                                     # print(result)
166 |                                     # 匹配Abstract和摘要内容分开的情况
167 |                                     if re.findall('abstract$', last_para)!=[]:
168 |                                         # 去掉关键词
169 |                                         oringin_result = re.split('(K|k)(eyword|EYWORD)[sS]?',result)[0]
170 |                                         # 翻译并转换人称
171 |                                         trans_result = fanyi.translate(oringin_result).replace('我们', '他们')
172 |                                         # print(result)
173 |                                         # 组织语言写入TXT
174 |                                         write_cont = author + '等人提出：' + trans_result + '\n'
175 |                                         ab_count += 1
176 |                                         f.write(write_cont)
177 |                                     # 匹配Abstract和摘要内容位于同一行的情况
178 |                                     elif re.findall('^abstract', result.lower().replace(' ', ''))!=[] and re.findall('abstract$', result.lower().replace(' ', ''))==[]:
179 |                                         # 确保摘要只匹配一次，不匹配文中的Abstract字眼
180 |                                         if ab_count==0:
181 |                                             # 去掉Abstract字眼及其后续的符号
182 |                                             oringin_result = re.sub('(a|A)(bstract|BSTRACT)[- —.]?','', result)
183 |                                             # 去掉关键词
184 |                                             oringin_result = re.split('(K|k)(eyword|EYWORD)[sS]?',oringin_result)[0]
185 |                                             # 翻译并转换人称
186 |                                             trans_result = fanyi.translate(oringin_result).replace('我们', '他们')
187 |                                             # print(result)
188 |                                             # 组织语言写入TXT
189 |                                             write_cont = author + '等人提出：' + trans_result + '\n'
190 |                                             ab_count += 1
191 |                                             f.write(write_cont)
192 |                                     # 匹配结论
193 |                                     elif re.findall('(^(i|v|x|\d)*\.?conclusions?)|(conclusions?$)', last_para)!=[]:
194 |                                             # 避免因图表在标题下方导致的识别错误
195 |                                             if re.findall('^fig', result.lower()):
196 |                                                 continue
197 |                                             # 翻译
198 |                                             trans_result = fanyi.translate(result)
199 |                                             # print(result)
200 |                                             # 转换人称
201 |                                             write_cont = trans_result.replace('我们', '他们') + '\n'
202 |                                             # 写入TXT
203 |                                             f.write(write_cont)
204 |                                 except Exception as e:
205 |                                     QTextBrowser.append(str(e))
206 |                                     QTextBrowser.moveCursor(QTextBrowser.textCursor().End)  #文本框显示到底部
207 |                                     QtWidgets.QApplication.processEvents()
208 |                                 last_para = result
209 |                                 count += 1
210 |                     except Exception as e:
211 |                         QTextBrowser.append(str(e))
212 |                         QTextBrowser.moveCursor(QTextBrowser.textCursor().End)  #文本框显示到底部
213 |                         QtWidgets.QApplication.processEvents()
214 |                 else:
215 |                     continue
216 |             with open('%s' % (save_path), 'a', encoding='utf-8') as f:
217 |                 f.write('\n')
218 |      
219 |     def getFileName(self, filepath):
220 |         file_list = []
221 |         for root,dirs,files in os.walk(filepath):
222 |             for filespath in files:
223 |                 if '.pdf' == filespath[-4:].lower():
224 |                     file_list.append(os.path.join(root,filespath))
225 |         return file_list
226 | 
class mywindow(QtWidgets.QWidget, Ui_ReviewGenerator):
    """Main window: collects the settings and runs the review generation."""

    def __init__(self):
        super(mywindow, self).__init__()
        self.setupUi(self)
        self.toolButton.clicked.connect(self.choose_folder1)
        self.toolButton_2.clicked.connect(self.choose_folder2)
        self.toolButton_3.clicked.connect(self.start_generate)
        self.toolButton_4.clicked.connect(self.get_app_key)
        self.toolButton_5.clicked.connect(self.get_source_code)

    def _log(self, message):
        """Append ``message`` to the text area, scroll down and refresh the UI."""
        self.textBrowser.append(message)
        self.textBrowser.moveCursor(self.textBrowser.textCursor().End)
        QtWidgets.QApplication.processEvents()

    def _choose_folder_into(self, line_edit):
        """Ask for a folder and store it in ``line_edit``.

        The dialog starts at the field's current value (or ``C:/``), and a
        cancelled dialog leaves the field unchanged.
        """
        foldername = QFileDialog.getExistingDirectory(
            self, "选取文件夹", line_edit.text() or "C:/")
        print(foldername)
        # An empty result means the dialog was cancelled; keep the old value.
        if foldername:
            line_edit.setText(foldername)

    def choose_folder1(self):
        """Pick the folder that contains the PDF papers."""
        self._choose_folder_into(self.lineEdit)

    def choose_folder2(self):
        """Pick the folder the review file is written to."""
        self._choose_folder_into(self.lineEdit_2)

    def get_app_key(self):
        """Open the Youdao guide for obtaining an APP Key in the browser."""
        url = "https://ai.youdao.com/doc.s#guide"
        webbrowser.open_new_tab(url)

    def get_source_code(self):
        """Open the project's source repository in the browser."""
        url = "https://github.com/you8023/Auto-Review-Generator"
        webbrowser.open_new_tab(url)

    @staticmethod
    def _output_path(save_folder, write_txt_file):
        """Return ``<save_folder>/<name>.txt`` for the entered file name.

        A ``.txt`` suffix typed by the user is removed as a suffix before
        the extension is added; ``str.rstrip('.txt')`` would instead strip
        any trailing '.', 't' or 'x' characters ("result" -> "resul").
        """
        name = write_txt_file
        if name.lower().endswith('.txt'):
            name = name[:-4]
        return os.path.join(save_folder, name + '.txt')

    def start_generate(self):
        """Validate the form, then process every PDF found in the input folder.

        Results and per-file failure messages are appended to the review
        file in the selected output folder; progress is shown in the text
        area.
        """
        folder = self.lineEdit.text()  # folder searched for PDFs
        save_folder = self.lineEdit_2.text()  # folder for the result
        write_txt_file = self.lineEdit_5.text()  # result file name
        appKey = self.lineEdit_3.text()  # application id
        appSecret = self.lineEdit_4.text()  # application secret
        if not (folder and save_folder and write_txt_file and appKey and appSecret):
            self._log("请检查上面的参数是否填写完整！")
            return
        self._log("程序已开始运行，请稍等...")

        review_generator = ReviewGenerate(folder, save_folder, write_txt_file, appKey, appSecret)
        pdf_list = review_generator.getFileName(folder)
        self._log("已读取到" + str(len(pdf_list)) + "个PDF，正在处理...")

        # One output file, in the chosen output folder, for results and errors.
        output_path = self._output_path(save_folder, write_txt_file)

        for file_item in pdf_list:
            self._log(file_item)
            try:
                # Opening happens inside the try so an unreadable file is
                # counted as a failure instead of aborting the whole run.
                with open(file_item, 'rb') as pdf_html:
                    review_generator.parse(self.textBrowser, pdf_html, output_path, appKey, appSecret)
                review_generator.success_count += 1
            except Exception as e:
                # Report the failure in the UI and record it in the result file.
                message = '文档读取失败：' + str(e) + '，路径为：' + file_item
                self._log(message)
                try:
                    with open(output_path, 'a', encoding='utf-8') as f:
                        f.write('\n' + message + '\n')
                except OSError as write_error:
                    self._log(str(write_error))
                review_generator.fail_count += 1

        self._log('共读取pdf文件' + str(review_generator.success_count + review_generator.fail_count) + '个，其中成功读取并翻译' + str(review_generator.success_count) + '个，失败' + str(review_generator.fail_count) + '个')
300 | 
301 | 
if __name__ == "__main__":
    # Create the Qt application, show the main window and run the event
    # loop; the loop's return value becomes the process exit status.
    application = QtWidgets.QApplication(sys.argv)
    window = mywindow()
    window.show()
    sys.exit(application.exec_())
308 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Auto-Review-Generator自动综述生成器
  2 | Read and translate English literature to generate review automatically
  3 | 
  4 | github上图片可能无法查看，完整教程见[博客](https://www.jianshu.com/p/3639aac9d520)
  5 | 
  6 | 如果不想了解技术细节，只想直接拿来用，可以直接跳过代码编写部分，直达最后代码使用部分。
  7 | 
  8 | **本代码免费[开源](https://github.com/you8023/Auto-Review-Generator)，如果你觉得好用，希望能够给我一个Star，也欢迎去[github](https://github.com/you8023/Auto-Review-Generator/issues)发表意见建议。**
  9 | 
 10 | 代码实现效果如下：
 11 | ![实现效果](https://upload-images.jianshu.io/upload_images/5714082-308ebb2f869973a9.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
 12 | 
 13 | ## 开发环境
 14 | * Windows 10
 15 | * Sublime Text 3
 16 | * Python 3.7
 17 | * Pdfminer
 18 | * 有道翻译API
 19 | 
 20 | ## 事前准备
 21 | ### 接口申请
 22 | 本代码使用了有道翻译的API，因此，如需使用，需要去有道翻译接口官方申请APP Key和Secret key，直接按照其[官方教程](https://ai.youdao.com/doc.s#guide)申请即可，后续需要在代码中配置。接口申请完全免费，初始会送100元的面值，用完需要续费，不过一般情况100元可以用很久了。
 23 | ### 安装pdfminer
 24 | 因为我使用的是python3，因此输入以下命令安装：
 25 | ```shell
 26 | pip install pdfminer3k
 27 | ```
 28 | ### 需求分析
 29 | 这里分析了本代码实现的关键点：
 30 | * 文献是已经下载下来的pdf文件
 31 | * 文献中，需要提取的部分主要为：
 32 |   * 标题
 33 |   * 作者
 34 |   * 摘要
 35 |   * 结论
 36 | 
 37 | 因此，本代码的思路是读取本地文件夹内的pdf文件，然后读取并识别出其关键元素，调用有道翻译的API进行翻译，并进行有机组合，写入TXT文件中。
 38 | ## 代码编写
 39 | ### 读取pdf文件
 40 | 依次遍历文件夹（含子文件夹）内的文件，如果后缀为pdf，则将其路径加入文件列表：
 41 | ```python
 42 | def getFileName(filepath):
 43 |     file_list = []
 44 |     for root,dirs,files in os.walk(filepath):
 45 |         for filespath in files:
 46 |             if 'pdf' in filespath.split('.')[1]:
 47 |                 file_list.append(os.path.join(root,filespath))
 48 |     return file_list
 49 | ```
 50 | ### 读取文件内容并提取标题、作者、摘要和结论
 51 | ```python
 52 | def parse(DataIO, save_path, appKey, appSecret):
 53 |  
 54 |     #用文件对象创建一个PDF文档分析器
 55 |     parser = PDFParser(DataIO)
 56 |     #创建一个PDF文档
 57 |     doc = PDFDocument()
 58 |     #分析器和文档相互连接
 59 |     parser.set_document(doc)
 60 |     doc.set_parser(parser)
 61 |     #提供初始化密码，没有默认为空
 62 |     doc.initialize()
 63 |     #检查文档是否可以转成TXT，如果不可以就忽略
 64 |     if not doc.is_extractable:
 65 |         raise PDFTextExtractionNotAllowed
 66 |     else:
 67 |         #创建PDF资源管理器，来管理共享资源
 68 |         rsrcmagr = PDFResourceManager()
 69 |         #创建一个PDF设备对象
 70 |         laparams = LAParams()
 71 |         #将资源管理器和设备对象聚合
 72 |         device = PDFPageAggregator(rsrcmagr, laparams=laparams)
 73 |         #创建一个PDF解释器对象
 74 |         interpreter = PDFPageInterpreter(rsrcmagr, device)
 75 |         last_para = '' # 记录上一段文本
 76 |         count = 0 # 对文本块进行计数，方便后续查找标题和作者
 77 |         author = '' # 记录作者
 78 |         ab_count = 0 # 记录已识别的摘要的数量，避免提取文中的abstract
 79 | 
 80 |         fanyi = YouDaoFanyi(appKey, appSecret)
 81 |         #循环遍历列表，每次处理一个page内容
 82 |         #doc.get_pages()获取page列表
 83 |         for page in doc.get_pages():
 84 |             interpreter.process_page(page)
 85 |             #接收该页面的LTPage对象
 86 |             layout = device.get_result()
 87 |             #这里的layout是一个LTPage对象 里面存放着page解析出来的各种对象
 88 |             #一般包括LTTextBox，LTFigure，LTImage，LTTextBoxHorizontal等等一些对像
 89 |             #想要获取文本就得获取对象的text属性
 90 |             for x in layout:
 91 |                 try:
 92 |                     if(isinstance(x, LTTextBoxHorizontal)):
 93 |                         with open('%s' % (save_path), 'a', encoding='utf-8') as f:
 94 |                             result = x.get_text() # 每块的内容
 95 |                             # print(result)
 96 |                             # 提取标题
 97 |                             if count==0:
 98 |                                 # 如果是researchgate的文章，直接翻页
 99 |                                 if re.findall('^see discussions', result.lower())!=[]:
100 |                                     break
101 |                                 # 如果第一行是各种页眉等干扰信息，直接略过
102 |                                 if re.findall('(^[0-9])|(^(research )?article)|(unclassified)|(www.)|(accepted (from|manuscript))|(proceedings of)|(vol.)|(volume \d)|(https?://)|(^ieee)|(sciencedirect)|(\d{4}\)$)|(\d{1,4} – \d{1,4}$)|(cid:)',re.split('\s+$',result.lower())[0])!=[] or '':
103 |                                     count -= 1
104 |                                 else:
105 |                                     # 将结果写入TXT
106 |                                     f.write('\n'+result.replace('\n', '')+'\n')
107 |                             # 提取作者
108 |                             elif count==1:
109 |                                 # 只取第一作者
110 |                                 author = result.split('\n')[0].split(',')[0].split(' and ')[0]
111 |                                 author = generate_author(author)
112 |                                 print('author '+ author)
113 |                             # 去掉pdf文件读取的各种换行符
114 |                             result = result.replace('\n', '')
115 |                             try:
116 |                                 # 转为小写，去掉空格，方便正则识别
117 |                                 last_para = last_para.lower().replace(' ', '')
118 |                                 # print(result)
119 |                                 # 匹配Abstract和摘要内容分开的情况
120 |                                 if re.findall('abstract$', last_para)!=[]:
121 |                                     # 去掉关键词
122 |                                     oringin_result = re.split('(K|k)(eyword|EYWORD)[sS]?',result)[0]
123 |                                     # 翻译并转换人称
124 |                                     trans_result = fanyi.translate(oringin_result).replace('我们', '他们')
125 |                                     # print(result)
126 |                                     # 组织语言写入TXT
127 |                                     write_cont = author + '等人提出：' + trans_result + '\n'
128 |                                     ab_count += 1
129 |                                     f.write(write_cont)
130 |                                 # 匹配Abstract和摘要内容位于同一行的情况
131 |                                 elif re.findall('^abstract', result.lower().replace(' ', ''))!=[] and re.findall('abstract$', result.lower().replace(' ', ''))==[]:
132 |                                     # 确保摘要只匹配一次，不匹配文中的Abstract字眼
133 |                                     if ab_count==0:
134 |                                         # 去掉Abstract字眼及其后续的符号
135 |                                         oringin_result = re.sub('(a|A)(bstract|BSTRACT)[- —.]?','', result)
136 |                                         # 去掉关键词
137 |                                         oringin_result = re.split('(K|k)(eyword|EYWORD)[sS]?',oringin_result)[0]
138 |                                         # 翻译并转换人称
139 |                                         trans_result = fanyi.translate(oringin_result).replace('我们', '他们')
140 |                                         # print(result)
141 |                                         # 组织语言写入TXT
142 |                                         write_cont = author + '等人提出：' + trans_result + '\n'
143 |                                         ab_count += 1
144 |                                         f.write(write_cont)
145 |                                 # 匹配结论
146 |                                 elif re.findall('(^(i|v|x|\d)*\.?conclusions?)|(conclusions?$)', last_para)!=[]:
147 |                                         # 避免因图表在标题下方导致的识别错误
148 |                                         if re.findall('^fig', result.lower()):
149 |                                             continue
150 |                                         # 翻译
151 |                                         trans_result = fanyi.translate(result)
152 |                                         # print(result)
153 |                                         # 转换人称
154 |                                         write_cont = trans_result.replace('我们', '他们') + '\n'
155 |                                         # 写入TXT
156 |                                         f.write(write_cont)
157 |                             except Exception as e:
158 |                                 print(e)
159 |                             last_para = result
160 |                             count += 1
161 |                 except Exception as e:
162 |                     print('out'+str(e))
163 |             else:
164 |                 continue
165 |         with open('%s' % (save_path), 'a', encoding='utf-8') as f:
166 |             f.write('\n')
167 | ```
168 | ### 按照引用的格式生成作者信息
169 | ```python
170 | def generate_author(author):
171 |     # 过滤掉作者名后面的各种符号，并生成引用的格式
172 |     # print(author)
173 |     author = re.sub('by |[\s\d\*∗\/@†\(\&\)]+$', '', author)
174 |     author_list = re.split('\s+',author)
175 |     author_str = author_list[len(author_list)-1]
176 |     for i in range(0,len(author_list)-1):
177 |         author_str = author_str + ' ' + author_list[i][0]
178 |     return author_str
179 | ```
180 | ### 翻译接口
181 | 其实直接抄有道官网文档就可以了，这里在其基础上做了更改：
182 | ```python
183 | class YouDaoFanyi:
184 |     def __init__(self, appKey, appSecret):
185 |         self.YOUDAO_URL = 'https://openapi.youdao.com/api/'
186 |         self.APP_KEY = appKey  # 应用id
187 |         self.APP_SECRET = appSecret  # 应用密钥
188 |         self.langFrom = 'en'   # 翻译前文字语言,auto为自动检查
189 |         self.langTo = 'zh-CHS'     # 翻译后文字语言,auto为自动检查
190 |         self.vocabId = "您的用户词表ID"
191 | 
192 |     def encrypt(self,signStr):
193 |         hash_algorithm = hashlib.sha256()
194 |         hash_algorithm.update(signStr.encode('utf-8'))
195 |         return hash_algorithm.hexdigest()
196 | 
197 | 
198 |     def truncate(self,q):
199 |         if q is None:
200 |             return None
201 |         size = len(q)
202 |         return q if size <= 20 else q[0:10] + str(size) + q[size - 10:size]
203 | 
204 |     def do_request(self,data):
205 |         headers = {'Content-Type': 'application/x-www-form-urlencoded'}
206 |         return requests.post(self.YOUDAO_URL, data=data, headers=headers)
207 | 
208 | 
209 |     def translate(self,q):
210 |         data = {}
211 |         data['from'] = self.langFrom
212 |         data['to'] = self.langTo
213 |         data['signType'] = 'v3'
214 |         curtime = str(int(time.time()))
215 |         data['curtime'] = curtime
216 |         salt = str(uuid.uuid1())
217 |         signStr = self.APP_KEY + self.truncate(q) + salt + curtime + self.APP_SECRET
218 |         sign = self.encrypt(signStr)
219 |         data['appKey'] = self.APP_KEY
220 |         data['q'] = q
221 |         data['salt'] = salt
222 |         data['sign'] = sign
223 |         data['vocabId'] = self.vocabId
224 | 
225 |         response = self.do_request(data)
226 |         contentType = response.headers['Content-Type']
227 |         result = json.loads(response.content.decode('utf-8'))['translation'][0]
228 |         print(result)
229 |         return result
230 | ```
231 | 最后，书写主函数进行调用：
232 | ```python
233 | if __name__ == '__main__':
234 |     #解析本地PDF文本，保存到本地TXT
 235 |     folder = '文件夹路径' # 需要读取pdf的文件夹的路径，注意为绝对路径且以/结尾（结果文件路径由folder与文件名直接拼接得到），如：E:/论文/
236 |     write_txt_file = 'result.txt' # 保存结果的文件
237 |     appKey = '应用ID'  # 应用id
238 |     appSecret = '应用秘钥'  # 应用密钥
239 |     success_count = 0 # 统计成功的次数
240 |     fail_count = 0 #统计失败的次数
241 | 
242 |     # 单次调用，供开发测试
243 |     # pdf_filename = folder+'文件名'
244 |     # with open(pdf_filename,'rb') as pdf_html:
245 |     #     try:
246 |     #         parse(pdf_html, folder + write_txt_file, appKey, appSecret)
247 |     #         success_count+=1
248 |     #     except Exception as e:
249 |     #         print(pdf_filename)
250 |     #         fail_count+=1
251 | 
252 |     pdf_list = getFileName(folder)
 253 |     # 依次读取列表，获取pdf文件位置
254 |     for file_item in pdf_list:
255 |         with open(file_item,'rb') as pdf_html:
256 |             try:
257 |                 print(file_item)
258 |                 parse(pdf_html, folder + write_txt_file, appKey, appSecret)
259 |                 success_count+=1
260 |             except Exception as e:
261 |                 # 文件读取或翻译失败则将错误信息写入TXT
262 |                 print('文档读取失败：' + str(e) +'，路径为：' + file_item)
263 |                 with open('%s' % (folder + write_txt_file), 'a', encoding='utf-8') as f:
264 |                     f.write('\n'+'文档读取失败：' + str(e) +'，路径为：' + file_item + '\n')
265 |                 fail_count+=1
266 | 
267 |     print('共读取pdf文件' + str(success_count+fail_count) + '个，其中成功读取并翻译' + str(success_count) + '个，失败' + str(fail_count) + '个')
268 | ```
269 | 至此，代码编写完毕
270 | ## 使用
271 | 代码可在[Github](https://github.com/you8023/Auto-Review-Generator)上下载
272 | ### 配置代码
273 | 更改代码主函数的配置变量（其中的应用ID和应用秘钥需要事先申请，见上文事前准备一节）：
274 | ```python
275 | if __name__ == '__main__':
276 |     #解析本地PDF文本，保存到本地TXT
277 |     folder = '文件夹路径' # 需要读取pdf的文件夹的路径，注意为绝对路径，如：E:/论文/
278 |     write_txt_file = 'result.txt' # 保存结果的文件
279 |     appKey = '应用ID'  # 应用id
280 |     appSecret = '应用秘钥'  # 应用密钥
281 | ```
282 | ### 运行代码
283 | 在代码所在的根目录下的命令行中输入以下命令即可：
284 | ```
285 | python pdfprocessor.py
286 | ```
287 | ## 运行结果
288 | 仅花了38秒的时间，就提取并翻译完成了14个pdf文件，翻译生成的字数合计6812个字：
289 | 
290 | ![运行结果](https://upload-images.jianshu.io/upload_images/5714082-6ebae187fdf9ac04.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
291 | 
292 | 试了一下45个文件，花了大概两分钟，生成了一万多字
293 | 
294 | 最后看一下翻译结果对比：
295 | 
296 | ![翻译结果中英对比](https://upload-images.jianshu.io/upload_images/5714082-308ebb2f869973a9.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
297 | 
298 | 


--------------------------------------------------------------------------------
/demo/An Ontology-Driven Approach to Automating the Process of Integrating Security Software Systems.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/you8023/Auto-Review-Generator/630d9ca2e64bd70bcfefa70fb6baed7a75962f66/demo/An Ontology-Driven Approach to Automating the Process of Integrating Security Software Systems.pdf


--------------------------------------------------------------------------------
/demo/result.txt:
--------------------------------------------------------------------------------
1 | 
2 | An Ontology-Driven Approach to Automating the Process of Integrating Security Software Systems 
3 | Islam C等人提出：各种各样的安全软件系统需要集成到安全编排平台(SecOrP)中，以简化防御和响应网络安全攻击的过程。安全系统之间缺乏可解释性和互操作性被认为是充分利用不同安全系统的集体能力潜力的关键挑战。集成安全系统的过程是重复的、耗时的和容易出错的;这些过程是由人工专家或使用特别的方法进行的。为了帮助自动化安全系统集成过程，他们提出了一种用于安全编排平台(OnSOAP)的本体驱动方法。开发的解决方案支持安全系统之间的可解释性和互操作性，这些系统可能存在于操作筒仓中。他们将演示OnSOAP对自动集成安全系统的支持，以使用三个安全系统(Splunk、Limacharlie和Snort)执行针对分布式拒绝服务(DDoS)攻击的事件响应过程。评估结果表明，OnSOAP使SecOrP能够解释不同安全系统的输入和输出，产生无错误的集成细节，并使安全系统之间的互操作，以自动化和加速事件响应过程。
4 | 他们提出了一种本体驱动的方法来自动化集成不同的安全系统在一个安全编排平台的过程。通过形式化安全系统的概念，他们的目标是支持安全系统集成过程中的自动化，从而进一步实现不同安全系统之间的互操作性。他们提供了一个本体论模型，描述了集成过程所需的所有SecOrP概念和关系。他们断言OnSOAP可以解释不同安全系统共享的输出的语义，并制定安全系统所需的输入。此外，OnSOAP使安全系统自动执行事件响应过程。他们已经通过开发和使用一个概念证明系统证明了所提议的方法的可行性。结果表明，OnSOAP可以(i)解释安全系统的输出，(ii)调用安全系统来分析一个安全系统的数据，(iii)自动化集成过程来执行事件响应计划。他们断言，他们的方法可以最大限度地减少人工集成过程所带来的挑战，并有效地将不同安全系统的集成过程自动化。开发和评估他们方法的结果使他们相信，OnSOAP可以很容易地与现有的SecOrP集成，在组织的SOC中实现大规模的安全编排和自动化。他们的工作需要详细定义不同安全系统的特点和事故响应计划。如果没有对安全系统的功能性和非功能性功能的合适定义，OnSOAP将无法执行上述任务。他们未来的工作目标是对拟议的系统进行大规模的评估。此外，他们的目标是设计一个概率学习模型来自动化集成过程，当缺少精确匹配时，可以使用本体模型和现有安全系统的配置来生成api。
5 | 
6 | 


--------------------------------------------------------------------------------
/pdfprocessor.py:
--------------------------------------------------------------------------------
  1 | import hashlib
  2 | import requests
  3 | import uuid
  4 | import os
  5 | import time
  6 | import json
  7 | import importlib,sys
  8 | importlib.reload(sys)
  9 | from pdfminer.pdfparser import PDFParser, PDFDocument
 10 | from pdfminer.pdfdevice import PDFDevice
 11 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
 12 | from pdfminer.converter import PDFPageAggregator
 13 | from pdfminer.layout import LTTextBoxHorizontal, LAParams
 14 | from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
 15 | import re
 16 | 
 17 | # 去除警告
 18 | import logging 
 19 | logging.Logger.propagate = False 
 20 | logging.getLogger().setLevel(logging.ERROR)
 21 | 
 22 | class YouDaoFanyi:
 23 |     def __init__(self, appKey, appSecret):
 24 |         self.YOUDAO_URL = 'https://openapi.youdao.com/api/'
 25 |         self.APP_KEY = appKey  # 应用id
 26 |         self.APP_SECRET = appSecret  # 应用密钥
 27 |         self.langFrom = 'en'   # 翻译前文字语言,auto为自动检查
 28 |         self.langTo = 'zh-CHS'     # 翻译后文字语言,auto为自动检查
 29 |         self.vocabId = "您的用户词表ID"
 30 | 
 31 |     def encrypt(self,signStr):
 32 |         hash_algorithm = hashlib.sha256()
 33 |         hash_algorithm.update(signStr.encode('utf-8'))
 34 |         return hash_algorithm.hexdigest()
 35 | 
 36 | 
 37 |     def truncate(self,q):
 38 |         if q is None:
 39 |             return None
 40 |         size = len(q)
 41 |         return q if size <= 20 else q[0:10] + str(size) + q[size - 10:size]
 42 | 
 43 |     def do_request(self,data):
 44 |         headers = {'Content-Type': 'application/x-www-form-urlencoded'}
 45 |         return requests.post(self.YOUDAO_URL, data=data, headers=headers)
 46 | 
 47 | 
 48 |     def translate(self,q):
 49 |         data = {}
 50 |         data['from'] = self.langFrom
 51 |         data['to'] = self.langTo
 52 |         data['signType'] = 'v3'
 53 |         curtime = str(int(time.time()))
 54 |         data['curtime'] = curtime
 55 |         salt = str(uuid.uuid1())
 56 |         signStr = self.APP_KEY + self.truncate(q) + salt + curtime + self.APP_SECRET
 57 |         sign = self.encrypt(signStr)
 58 |         data['appKey'] = self.APP_KEY
 59 |         data['q'] = q
 60 |         data['salt'] = salt
 61 |         data['sign'] = sign
 62 |         data['vocabId'] = self.vocabId
 63 | 
 64 |         response = self.do_request(data)
 65 |         contentType = response.headers['Content-Type']
 66 |         result = json.loads(response.content.decode('utf-8'))['translation'][0]
 67 |         print(result)
 68 |         return result
 69 | 
 70 | def generate_author(author):
 71 |     # 过滤掉作者名后面的各种符号，并生成引用的格式
 72 |     # print(author)
 73 |     author = re.sub('by |[\s\d\*∗\/@†\(\&\)]+$', '', author)
 74 |     author_list = re.split('\s+',author)
 75 |     author_str = author_list[len(author_list)-1]
 76 |     for i in range(0,len(author_list)-1):
 77 |         author_str = author_str + ' ' + author_list[i][0]
 78 |     return author_str
 79 | 
 80 | def parse(DataIO, save_path, appKey, appSecret):
 81 |  
 82 |     #用文件对象创建一个PDF文档分析器
 83 |     parser = PDFParser(DataIO)
 84 |     #创建一个PDF文档
 85 |     doc = PDFDocument()
 86 |     #分析器和文档相互连接
 87 |     parser.set_document(doc)
 88 |     doc.set_parser(parser)
 89 |     #提供初始化密码，没有默认为空
 90 |     doc.initialize()
 91 |     #检查文档是否可以转成TXT，如果不可以就忽略
 92 |     if not doc.is_extractable:
 93 |         raise PDFTextExtractionNotAllowed
 94 |     else:
 95 |         #创建PDF资源管理器，来管理共享资源
 96 |         rsrcmagr = PDFResourceManager()
 97 |         #创建一个PDF设备对象
 98 |         laparams = LAParams()
 99 |         #将资源管理器和设备对象聚合
100 |         device = PDFPageAggregator(rsrcmagr, laparams=laparams)
101 |         #创建一个PDF解释器对象
102 |         interpreter = PDFPageInterpreter(rsrcmagr, device)
103 |         last_para = '' # 记录上一段文本
104 |         count = 0 # 对文本块进行计数，方便后续查找标题和作者
105 |         author = '' # 记录作者
106 |         ab_count = 0 # 记录已识别的摘要的数量，避免提取文中的abstract
107 | 
108 |         fanyi = YouDaoFanyi(appKey, appSecret)
109 |         #循环遍历列表，每次处理一个page内容
110 |         #doc.get_pages()获取page列表
111 |         for page in doc.get_pages():
112 |             interpreter.process_page(page)
113 |             #接收该页面的LTPage对象
114 |             layout = device.get_result()
115 |             #这里的layout是一个LTPage对象 里面存放着page解析出来的各种对象
116 |             #一般包括LTTextBox，LTFigure，LTImage，LTTextBoxHorizontal等等一些对像
117 |             #想要获取文本就得获取对象的text属性
118 |             for x in layout:
119 |                 try:
120 |                     if(isinstance(x, LTTextBoxHorizontal)):
121 |                         with open('%s' % (save_path), 'a', encoding='utf-8') as f:
122 |                             result = x.get_text() # 每块的内容
123 |                             # print(result)
124 |                             # 提取标题
125 |                             if count==0:
126 |                                 # 如果是researchgate的文章，直接翻页
127 |                                 if re.findall('^see discussions', result.lower())!=[]:
128 |                                     break
129 |                                 # 如果第一行是各种页眉等干扰信息，直接略过
130 |                                 if re.findall('(^[0-9])|(^(research )?article)|(unclassified)|(www.)|(accepted (from|manuscript))|(proceedings of)|(vol.)|(volume \d)|(https?://)|(^ieee)|(sciencedirect)|(\d{4}\)$)|(\d{1,4} – \d{1,4}$)|(cid:)',re.split('\s+$',result.lower())[0])!=[] or '':
131 |                                     count -= 1
132 |                                 else:
133 |                                     # 将结果写入TXT
134 |                                     f.write('\n'+result.replace('\n', '')+'\n')
135 |                             # 提取作者
136 |                             elif count==1:
137 |                                 # 只取第一作者
138 |                                 author = result.split('\n')[0].split(',')[0].split(' and ')[0]
139 |                                 author = generate_author(author)
140 |                                 print('author '+ author)
141 |                             # 去掉pdf文件读取的各种换行符
142 |                             result = result.replace('\n', '')
143 |                             try:
144 |                                 # 转为小写，去掉空格，方便正则识别
145 |                                 last_para = last_para.lower().replace(' ', '')
146 |                                 # print(result)
147 |                                 # 匹配Abstract和摘要内容分开的情况
148 |                                 if re.findall('abstract$', last_para)!=[]:
149 |                                     # 去掉关键词
150 |                                     oringin_result = re.split('(K|k)(eyword|EYWORD)[sS]?',result)[0]
151 |                                     # 翻译并转换人称
152 |                                     trans_result = fanyi.translate(oringin_result).replace('我们', '他们')
153 |                                     # print(result)
154 |                                     # 组织语言写入TXT
155 |                                     write_cont = author + '等人提出：' + trans_result + '\n'
156 |                                     ab_count += 1
157 |                                     f.write(write_cont)
158 |                                 # 匹配Abstract和摘要内容位于同一行的情况
159 |                                 elif re.findall('^abstract', result.lower().replace(' ', ''))!=[] and re.findall('abstract$', result.lower().replace(' ', ''))==[]:
160 |                                     # 确保摘要只匹配一次，不匹配文中的Abstract字眼
161 |                                     if ab_count==0:
162 |                                         # 去掉Abstract字眼及其后续的符号
163 |                                         oringin_result = re.sub('(a|A)(bstract|BSTRACT)[- —.]?','', result)
164 |                                         # 去掉关键词
165 |                                         oringin_result = re.split('(K|k)(eyword|EYWORD)[sS]?',oringin_result)[0]
166 |                                         # 翻译并转换人称
167 |                                         trans_result = fanyi.translate(oringin_result).replace('我们', '他们')
168 |                                         # print(result)
169 |                                         # 组织语言写入TXT
170 |                                         write_cont = author + '等人提出：' + trans_result + '\n'
171 |                                         ab_count += 1
172 |                                         f.write(write_cont)
173 |                                 # 匹配结论
174 |                                 elif re.findall('(^(i|v|x|\d)*\.?conclusions?)|(conclusions?$)', last_para)!=[]:
175 |                                         # 避免因图表在标题下方导致的识别错误
176 |                                         if re.findall('^fig', result.lower()):
177 |                                             continue
178 |                                         # 翻译
179 |                                         trans_result = fanyi.translate(result)
180 |                                         # print(result)
181 |                                         # 转换人称
182 |                                         write_cont = trans_result.replace('我们', '他们') + '\n'
183 |                                         # 写入TXT
184 |                                         f.write(write_cont)
185 |                             except Exception as e:
186 |                                 print(e)
187 |                             last_para = result
188 |                             count += 1
189 |                 except Exception as e:
190 |                     print('out'+str(e))
191 |             else:
192 |                 continue
193 |         with open('%s' % (save_path), 'a', encoding='utf-8') as f:
194 |             f.write('\n')
195 |  
def getFileName(filepath):
    """Recursively collect the paths of all PDF files under *filepath*.

    A file counts as a PDF when its final extension is ``.pdf``
    (case-insensitive), so names with several dots ("paper.v2.pdf") are
    found and files without any extension are skipped instead of raising.

    Args:
        filepath: Directory to walk.

    Returns:
        A list of paths, each joined from the walked directory and file name.
    """
    file_list = []
    for root, _dirs, files in os.walk(filepath):
        for filename in files:
            if os.path.splitext(filename)[1].lower() == '.pdf':
                file_list.append(os.path.join(root, filename))
    return file_list
203 | 
204 | 
if __name__ == '__main__':
    # Parse every PDF under `folder` and append the generated review text
    # to a TXT file inside that same folder.
    folder = '文件夹路径'  # absolute path of the folder holding the PDFs, e.g. E:/论文
    write_txt_file = 'result.txt'  # name of the TXT file that receives the result
    appKey = '应用ID'  # Youdao application ID
    appSecret = '应用秘钥'  # Youdao application secret
    success_count = 0  # number of PDFs processed without an exception
    fail_count = 0  # number of PDFs that raised an exception

    # Join with os.path.join so the result path is correct whether or not
    # `folder` ends with a path separator.
    result_path = os.path.join(folder, write_txt_file)

    # Single-file call, kept for development and testing:
    # pdf_filename = os.path.join(folder, '文件名')
    # with open(pdf_filename, 'rb') as pdf_html:
    #     parse(pdf_html, result_path, appKey, appSecret)

    pdf_list = getFileName(folder)
    for file_item in pdf_list:
        print(file_item)
        try:
            # Opening happens inside the try so that one unreadable file is
            # counted as a failure instead of aborting the whole batch.
            with open(file_item, 'rb') as pdf_html:
                parse(pdf_html, result_path, appKey, appSecret)
            success_count += 1
        except Exception as e:
            # Record the failure in the result file as well as on stdout.
            print('文档读取失败：' + str(e) + '，路径为：' + file_item)
            with open(result_path, 'a', encoding='utf-8') as f:
                f.write('\n' + '文档读取失败：' + str(e) + '，路径为：' + file_item + '\n')
            fail_count += 1

    print('共读取pdf文件' + str(success_count + fail_count) + '个，其中成功读取并翻译' + str(success_count) + '个，失败' + str(fail_count) + '个')


--------------------------------------------------------------------------------