├── README.md
└── pyLTP_model.py


/README.md:
--------------------------------------------------------------------------------
# pyLTPServer
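
A thin Python wrapper around [pyltp](https://github.com/HIT-SCIR/pyltp) that exposes sentence splitting, word segmentation, POS tagging, named-entity recognition, dependency parsing, and simple fact-triple extraction.

A minimal usage sketch of `pyLTP_model.py` (assuming pyltp is installed and the LTP 3.4.0 model files are unpacked to `./ltp_data_v3.4.0`; the outputs in the comments are abbreviated):

```python
from pyLTP_model import LTP_MODEL

model = LTP_MODEL()                 # loads all four LTP models
texts = ['中国自称为炎黄子孙、龙的传人']
print(model.segment(texts))         # e.g. [['中国', '自称', ...]]
print(model.NamedEntityRecognizer(texts, entity_dict=True))
print(model.SyntaxParser(texts))
model.release()                     # free the loaded models
```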
--------------------------------------------------------------------------------
/pyLTP_model.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# created on 5/26/20
__author__ = 'sinsa'

import os
import logging
from logging import info
logging.basicConfig(level=logging.INFO, format='%(asctime)s - PID:%(process)d - %(levelname)s: %(message)s')
from pyltp import Segmentor
from pyltp import Postagger
from pyltp import NamedEntityRecognizer
from pyltp import Parser
from pyltp import SentenceSplitter


class LTP_MODEL:
    def __init__(self):
        LTP_DATA_DIR = './ltp_data_v3.4.0'  # path to the LTP model directory
        info('loading models ...')
        self.segmentor = Segmentor()
        self.cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # word segmentation model
        self.segmentor.load(self.cws_model_path)
        info('loaded word segmentation model')
        self.pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS tagging model
        self.postaggers = Postagger()
        self.postaggers.load(self.pos_model_path)
        info('loaded POS tagging model')
        self.ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')  # named entity recognition model
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(self.ner_model_path)
        info('loaded named entity recognition model')
        self.par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parsing model
        self.parser = Parser()
        self.parser.load(self.par_model_path)
        info('loaded dependency parsing model')

    def release(self):
        '''Free the resources held by the loaded models.'''
        self.segmentor.release()
        self.postaggers.release()
        self.recognizer.release()
        self.parser.release()

    def SplitSentence(self, sentence):
        sents_list = SentenceSplitter.split(sentence)  # sentence splitting
        return list(sents_list)

    def segment(self, input_list):
        '''
        Segment each text into words.
        Returns: one list per text, e.g. [['word1', 'word2'], ['word1', 'word3'], ...]
        '''
        segmented_text_list = []
        for text in input_list:
            words = self.segmentor.segment(text)  # word segmentation
            segmented_text_list.append(list(words))
        return segmented_text_list

    def postagger(self, input_list, return_words_list=False):
        '''
        POS-tag every word of each text.
        Returns: one list per text; each word is paired with its tag, e.g.
        [[('word1', 'O'), ('word2', 'O')], [('word2', 'O'), ('word5', 'O')], ...]
        '''
        postagger_text_list = []
        words_list = self.segment(input_list)
        postags_list = []
        for words in words_list:
            postags = self.postaggers.postag(words)  # POS tagging
            postags_list.append(list(postags))
            words_postags = list(zip(words, list(postags)))
            postagger_text_list.append(words_postags)
        if return_words_list:
            return words_list, postags_list
        else:
            return postagger_text_list

    def NamedEntityRecognizer(self, input_list, entity_dict=False, dedupe=False):
        '''
        Recognize the named entities in each text: persons, places and organizations.
        dedupe: whether to deduplicate the extracted entities (default: no).
        entity_dict: if True, return a dict of extracted entities per text instead
                     of the tagged word list (default: the tagged word list).
        Return forms:
        1. [[('word1', 'O'), ('word2', 'O'), ('word3', 'O')], ...]
        2. [{'person': [], 'place': [], 'organization': []}, ...]
        '''
        words_list, postags_list = self.postagger(input_list, return_words_list=True)

        entity_text_list = []
        for words, postags in zip(words_list, postags_list):
            netags = self.recognizer.recognize(words, postags)  # NER: person (Nh), place (Ns), organization (Ni)
            text = list(zip(words, netags))
            entity_text_list.append(text)

        if entity_dict:
            extract_entity_list = []
            for words_entity_note_list in entity_text_list:
                extract_entity_list.append(self.get_entity_dict(words_entity_note_list, dedupe))
            return extract_entity_list
        else:
            return entity_text_list

    def get_entity_dict(self, words_entity_note_list, dedupe):
        '''
        Collect the named entities of a text from its NE tags.
        dedupe: whether to deduplicate the entities (default: no).
        Returns: {'person': [], 'place': [], 'organization': []}

        Tag scheme:
        O: the word is not part of an NE
        S: the word is a single-word NE
        B: the word begins an NE
        I: the word is inside an NE
        E: the word ends an NE
        Nh: person name
        Ni: organization name
        Ns: place name
        '''
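        # Illustrative walk-through (hypothetical tags, not real model output):
        # given [('奥巴马', 'S-Nh'), ('访问', 'O'), ('白宫', 'B-Ni'), ('办公室', 'E-Ni')],
        # the loop below emits S-tagged words directly and glues B-/I-/E- spans
        # together, returning
        # {'person': ['奥巴马'], 'organization': ['白宫办公室'], 'place': []}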
        name_entity_dict = {}
        # one list per entity type
        name_entity_list = []
        place_entity_list = []
        organization_entity_list = []

        # buffers for multi-word entities under construction
        ntag_E_Nh = ""
        ntag_E_Ni = ""
        ntag_E_Ns = ""
        for word, ntag in words_entity_note_list:
            if ntag[0] != "O":
                if ntag[0] == "S":
                    if ntag[-2:] == "Nh":
                        name_entity_list.append(word)
                    elif ntag[-2:] == "Ni":
                        organization_entity_list.append(word)
                    else:
                        place_entity_list.append(word)
                elif ntag[0] == "B" or ntag[0] == "I":
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word
                    else:
                        ntag_E_Ns = ntag_E_Ns + word
                else:  # "E": the entity is complete, flush the buffer
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word
                        name_entity_list.append(ntag_E_Nh)
                        ntag_E_Nh = ""
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word
                        organization_entity_list.append(ntag_E_Ni)
                        ntag_E_Ni = ""
                    else:
                        ntag_E_Ns = ntag_E_Ns + word
                        place_entity_list.append(ntag_E_Ns)
                        ntag_E_Ns = ""

        if dedupe:
            name_entity_dict['person'] = list(set(name_entity_list))
            name_entity_dict['organization'] = list(set(organization_entity_list))
            name_entity_dict['place'] = list(set(place_entity_list))
        else:
            name_entity_dict['person'] = name_entity_list
            name_entity_dict['organization'] = organization_entity_list
            name_entity_dict['place'] = place_entity_list
        return name_entity_dict

    def SyntaxParser(self, input_list, return_words_pos=False):
        '''
        Dependency parsing.
        head = parent + 1 and relation = relate, so LTP's standard output could be
        assembled from these fields; all fields are returned so callers can pick
        what they need.
        Returns: one list per text, e.g.
        [[{'relate': 'WP', 'cont': '，', 'id': 4, 'parent': 3, 'pos': 'wp'},
          {'relate': 'RAD', 'cont': '的', 'id': 1, 'parent': 0, 'pos': 'u'}], ...]
        '''
        words_list, postags_list = self.postagger(input_list, return_words_list=True)

        syntaxparser_text_list = []
        for words, postags in zip(words_list, postags_list):
            arcs = self.parser.parse(words, postags)  # dependency parsing
            res = [(arc.head, arc.relation) for arc in arcs]
            text = []
            for i in range(len(words)):
                text.append({
                    'id': i,
                    'cont': words[i],
                    'pos': postags[i],
                    'parent': res[i][0],  # 1-based head index as returned by pyltp (0 = ROOT)
                    'relate': res[i][1],
                })
            syntaxparser_text_list.append(text)

        if return_words_pos:
            return words_list, postags_list, syntaxparser_text_list
        else:
            return syntaxparser_text_list

    def triple_extract(self, input_list):
        '''
        Extract fact triples from the given texts.
        Args:
            input_list: list of raw sentences, e.g. ['真实的句子'];
                        only the first one is processed.
        '''
        sentence = input_list[0]  # raw text, used below to validate extracted relations
        Subjective_guest = []  # subject-verb-object triples (e1, r, e2)
        Dynamic_relation = []  # verb-object relations
        Guest = []  # preposition-object relations
        Name_entity_relation = []  # relations between named entities
        # segmented words, POS tags, NE tags and dependency arcs of the sentence
        words = []
        postags = []
        netags = []
        arcs = []
        syntaxparser_text_list = self.SyntaxParser(input_list)
        entity_list = self.NamedEntityRecognizer(input_list)
        for words_property_list in syntaxparser_text_list[0]:
            words.append(words_property_list['cont'])
            postags.append(words_property_list['pos'])
            arcs.append({'head': words_property_list['parent'], 'relation': words_property_list['relate']})
        for words_entity_list in entity_list[0]:
            netags.append(words_entity_list[1])

        child_dict_list = self.build_parse_child_dict(words, postags, arcs)

        for index in range(len(postags)):

            # triples centered on a predicate
            if postags[index] == 'v':
                child_dict = child_dict_list[index]
                # subject-verb-object
                if 'SBV' in child_dict and 'VOB' in child_dict:
                    e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                    r = words[index]
                    e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                    Subjective_guest.append((e1, r, e2))

                # postposed attributive: verb-object relation
                if arcs[index]['relation'] == 'ATT':
                    if 'VOB' in child_dict:
                        e1 = self.complete_e(words, postags, child_dict_list, arcs[index]['head'] - 1)
                        r = words[index]
                        e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                        temp_string = r + e2
                        if temp_string == e1[:len(temp_string)]:
                            e1 = e1[len(temp_string):]
                        if temp_string not in e1:
                            Dynamic_relation.append((e1, r, e2))

                # subject-verb-complement containing a preposition-object
                if 'SBV' in child_dict and 'CMP' in child_dict:
                    e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                    cmp_index = child_dict['CMP'][0]
                    r = words[index] + words[cmp_index]
                    if 'POB' in child_dict_list[cmp_index]:
                        e2 = self.complete_e(words, postags, child_dict_list, child_dict_list[cmp_index]['POB'][0])
                        Guest.append((e1, r, e2))

            # try to extract triples involving named entities
            if netags[index][0] == 'S' or netags[index][0] == 'B':
                ni = index
                if netags[ni][0] == 'B':
                    while netags[ni][0] != 'E':
                        ni += 1
                    e1 = ''.join(words[index:ni + 1])
                else:
                    e1 = words[ni]
                # the entity is extracted above without checking its type
                if arcs[ni]['relation'] == 'ATT' and postags[arcs[ni]['head'] - 1] == 'n' and netags[arcs[ni]['head'] - 1] == 'O':
                    r = self.complete_e(words, postags, child_dict_list, arcs[ni]['head'] - 1)
                    if e1 in r:
                        r = r[(r.index(e1) + len(e1)):]
                    if arcs[arcs[ni]['head'] - 1]['relation'] == 'ATT' and netags[arcs[arcs[ni]['head'] - 1]['head'] - 1] != 'O':
                        e2 = self.complete_e(words, postags, child_dict_list, arcs[arcs[ni]['head'] - 1]['head'] - 1)
                        mi = arcs[arcs[ni]['head'] - 1]['head'] - 1
                        li = mi
                        if netags[mi][0] == 'B':
                            while netags[mi][0] != 'E':
                                mi += 1
                            e = ''.join(words[li + 1:mi + 1])
                            e2 += e
                        if r in e2:
                            e2 = e2[(e2.index(r) + len(r)):]
                        if r + e2 in sentence:
                            Name_entity_relation.append((e1, r, e2))
        return Subjective_guest, Dynamic_relation, Guest, Name_entity_relation

    def build_parse_child_dict(self, words, postags, arcs):
        """
        Build, for each word in the sentence, a dict mapping every dependency
        relation to the indices of that word's child nodes.
        Args:
            words: segmented words
            postags: POS tags
            arcs: dependency arcs
        """
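        # Illustrative shape (hypothetical parse): for the words
        # ['他', '送', '她', '一束', '花'] with arcs
        # [{'head': 2, 'relation': 'SBV'}, {'head': 0, 'relation': 'HED'},
        #  {'head': 2, 'relation': 'DBL'}, {'head': 5, 'relation': 'ATT'},
        #  {'head': 2, 'relation': 'VOB'}],
        # the entry for the verb '送' (index 1) would be {'SBV': [0], 'DBL': [2], 'VOB': [4]}.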
        child_dict_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index]['head'] == index + 1:  # heads are 1-based
                    child_dict.setdefault(arcs[arc_index]['relation'], []).append(arc_index)
            child_dict_list.append(child_dict)
        return child_dict_list

    def complete_e(self, words, postags, child_dict_list, word_index):
        """
        Complete a partially recognized entity: prepend its attributive (ATT)
        children and, for a verb, attach its subject (SBV) and object (VOB).
        """
        child_dict = child_dict_list[word_index]
        prefix = ''

        if 'ATT' in child_dict:
            for i in range(len(child_dict['ATT'])):
                prefix += self.complete_e(words, postags, child_dict_list, child_dict['ATT'][i])

        postfix = ''
        if postags[word_index] == 'v':
            if 'VOB' in child_dict:
                postfix += self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
            if 'SBV' in child_dict:
                prefix = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix

        return prefix + words[word_index] + postfix


if __name__ == '__main__':
    input_list = ['中国自称为炎黄子孙、龙的传人']
    model = LTP_MODEL()
    input_sentence = "雅生活服务的物业管理服务。"
    print(model.SplitSentence(input_sentence))
    print(model.segment(input_list))
    print(model.postagger(input_list))
    print(model.NamedEntityRecognizer(input_list, entity_dict=True))
    print(model.NamedEntityRecognizer(input_list))
    print(model.SyntaxParser(input_list))

    Subjective_guest, Dynamic_relation, Guest, Name_entity_relation = model.triple_extract(input_list)

    print('=' * 30)
    print(Subjective_guest, Dynamic_relation, Guest, Name_entity_relation)
    model.release()
--------------------------------------------------------------------------------