├── LTP_model.py
└── README.md

/LTP_model.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date   : 2017-07-28 11:25:02
# @Author : 周奇 (2590193099@qq.com)
# @Link   : ……
import json
import urllib, urllib2
'''
See the LTP server documentation: http://ltp.readthedocs.io/zh_CN/latest/ltpserver.html
Deploy LTP on a server and open the chosen port.
This module provides a free-to-use interface for Chinese word segmentation,
part-of-speech tagging, named entity recognition, semantic role labelling
and dependency parsing.
'''


class LTP_MODEL():
    def __init__(self, server_url="http://IP:port/ltp"):
        # task selects what to run: word segmentation 'ws', part-of-speech
        # tagging 'pos', dependency parsing 'dp', named entity recognition
        # 'ner', semantic role labelling 'srl', or everything at once 'all'.
        self.server_url = server_url

    def build_xml(self, input_list):
        '''
        Build the XML request body from the list of input texts.
        '''
        # Fragments of the XML wrapper expected by the LTP server (the request
        # is sent with x='y', i.e. "the input is XML"); fill these in to match
        # the template your server version expects.
        ss_start = ''
        ss_middle = ''
        ss_end = ''
        ss = ss_start + ss_middle.join(input_list) + ss_end
        return ss

    def output_json(self, task, input_xml):
        '''
        Upload the XML to the server and return the parsed JSON result for the given task.
        '''
        data = {'s': input_xml, 'x': 'y', 't': task}
        try:
            request = urllib2.Request(self.server_url)
            params = urllib.urlencode(data)
            response = urllib2.urlopen(request, params)
            content = response.read().strip()
        except Exception:
            return
        return json.loads(content)

    def segment(self, input_list, task='ws'):
        '''
        Segment each input text into words.
        Returns one list per text: [['word1', 'word2'], ['word1', 'word3'], ...]
        '''
        input_xml = self.build_xml(input_list)
        content = self.output_json(task, input_xml)
        segmented_text_list = []
        for text_other in content:
            sent = text_other[0]
            text = []
            for word in sent:
                text.append(word['cont'])
            segmented_text_list.append(text)
        return segmented_text_list

    def postagger(self, input_list, task='pos'):
        '''
        Tag the part of speech of every word in each text.
        Returns one list per text, each word as a [word, tag] pair:
        [[['word1', u'O'], ['word2', u'O']], [['word2', u'O'], ['word5', u'O']], ...]
        '''
        input_xml = self.build_xml(input_list)
        content = self.output_json(task, input_xml)
        postagger_text_list = []
        for text_other in content:
            sent = text_other[0]
            text = []
            for word in sent:
                text.append([word['cont'], word['pos']])
            postagger_text_list.append(text)
        return postagger_text_list

    def NamedEntityRecognizer(self, input_list, task='ner', Entity_dist=False, repead=False):
        '''
        Recognise named entities in each text: person, place and organization names.
        repead: whether to deduplicate the extracted entities (default: no deduplication).
        Entity_dist: whether to return, for each text, the tagged word list or a dict
        of extracted entities (default: the tagged word list).
        Return value, one of:
        1. [[['word1', u'O'], ['word2', u'O'], ['word3', u'O']], [['word2', u'O'], ['word3', u'O'], ['word4', u'O']], ...]
        2. [{'person': [], 'place': [], 'organization': []}, {'person': [], 'place': [], 'organization': []}, ...]
        '''
        input_xml = self.build_xml(input_list)
        content = self.output_json(task, input_xml)
        entity_text_list = []
        for text_other in content:
            sent = text_other[0]
            text = []
            for word in sent:
                text.append([word['cont'], word['ne']])
            entity_text_list.append(text)
        if Entity_dist:
            extract_entity_list = []
            for words_entity_note_list in entity_text_list:
                extract_entity_list.append(self.get_entity_dict(words_entity_note_list, repead))
            return extract_entity_list
        else:
            return entity_text_list

    def get_entity_dict(self, words_entity_note_list, repead):
        '''
        Collect the named entities in a text according to the NER tags.
        repead: whether to deduplicate the extracted entities (default: no deduplication).
        Returns: {'person': [], 'place': [], 'organization': []}
        '''
        '''
        NER tag scheme:
        O: the word is not part of a named entity
        S: the word is a named entity by itself
        B: the word begins a named entity
        I: the word is inside a named entity
        E: the word ends a named entity
        Nh: person name
        Ni: organization name
        Ns: place name
        '''
        name_entity_dist = {}
        # Lists holding the different entity types
        name_entity_list = []
        place_entity_list = []
        organization_entity_list = []

        ntag_E_Nh = ""
        ntag_E_Ni = ""
        ntag_E_Ns = ""
        for word, ntag in words_entity_note_list:
            # print word + "/" + ntag,
            if ntag[0] != "O":
                if ntag[0] == "S":
                    if ntag[-2:] == "Nh":
                        name_entity_list.append(word)
                    elif ntag[-2:] == "Ni":
                        organization_entity_list.append(word)
                    else:
                        place_entity_list.append(word)
                elif ntag[0] == "B":
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word
                    else:
                        ntag_E_Ns = ntag_E_Ns + word
                elif ntag[0] == "I":
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word
                    else:
                        ntag_E_Ns = ntag_E_Ns + word
                else:
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word
                        name_entity_list.append(ntag_E_Nh)
                        ntag_E_Nh = ""
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word
                        organization_entity_list.append(ntag_E_Ni)
                        ntag_E_Ni = ""
                    else:
                        ntag_E_Ns = ntag_E_Ns + word
                        place_entity_list.append(ntag_E_Ns)
                        ntag_E_Ns = ""

        if repead:
            name_entity_dist['person'] = list(set(name_entity_list))
            name_entity_dist['organization'] = list(set(organization_entity_list))
            name_entity_dist['place'] = list(set(place_entity_list))
        else:
            name_entity_dist['person'] = name_entity_list
            name_entity_dist['organization'] = organization_entity_list
            name_entity_dist['place'] = place_entity_list
        return name_entity_dist

    def SyntaxParser(self, input_list, task='dp'):
        '''
        Dependency parsing.
        The standard LTP output can be derived from each entry via head = parent + 1
        and relation = relate; the full server response is returned here so callers
        can pick out whatever they need.
        Returns one list per text:
        [[{u'relate': u'WP', u'cont': u'\uff0c', u'id': 4, u'parent': 3, u'pos': u'wp'},
          {u'relate': u'RAD', u'cont': u'\u7684', u'id': 1, u'parent': 0, u'pos': u'u'}], ...]
        '''
        input_xml = self.build_xml(input_list)
        content = self.output_json(task, input_xml)
        syntaxparser_text_list = []
        for text_other in content:
            sent = text_other[0]
            text = []
            for word in sent:
                text.append(word)
            syntaxparser_text_list.append(text)
        return syntaxparser_text_list

    def triple_extract(self, sentence):
        '''
        Extract fact triples from the given sentence.
        Args:
            sentence: the sentence to process, passed as a plain string.
        '''
        Subjective_guest = []      # subject-predicate-object triples (e1, r, e2)
        Dynamic_relation = []      # verb-object relations
        Guest = []                 # preposition-object relations
        Name_entity_relation = []  # relations between named entities
        # words: segmented words; postags: POS tags; netags: NER tags; arcs: dependency arcs
        words = []
        postags = []
        netags = []
        arcs = []
        syntaxparser_text_list = self.SyntaxParser([sentence])
        entity_list = self.NamedEntityRecognizer([sentence])
        for words_property_list in syntaxparser_text_list[0]:
            words.append(words_property_list['cont'])
            postags.append(words_property_list['pos'])
            arcs.append({'head': words_property_list['parent'] + 1, 'relation': words_property_list['relate']})
        for words_entity_list in entity_list[0]:
            netags.append(words_entity_list[1])

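        # child_dict_list[i] maps each dependency relation label (e.g. 'SBV',
        # 'VOB', 'ATT') to the indices of word i's children in the parse; the
        # extraction rules below read the triples off this structure.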
        child_dict_list = self.build_parse_child_dict(words, postags, arcs)

        for index in range(len(postags)):

            # Extract fact triples centred on the predicate (verb)
            if postags[index] == 'v':
                child_dict = child_dict_list[index]
                # Subject-predicate-object
                if 'SBV' in child_dict and 'VOB' in child_dict:
                    e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                    r = words[index]
                    e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                    Subjective_guest.append((e1, r, e2))

                # Postposed attributive: verb-object relation
                if arcs[index]['relation'] == 'ATT':
                    if 'VOB' in child_dict:
                        e1 = self.complete_e(words, postags, child_dict_list, arcs[index]['head'] - 1)
                        r = words[index]
                        e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                        temp_string = r + e2
                        if temp_string == e1[:len(temp_string)]:
                            e1 = e1[len(temp_string):]
                        if temp_string not in e1:
                            Dynamic_relation.append((e1, r, e2))

                # Subject-verb-complement constructions containing a prepositional object
                if 'SBV' in child_dict and 'CMP' in child_dict:
                    # e1 = words[child_dict['SBV'][0]]
                    e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                    cmp_index = child_dict['CMP'][0]
                    r = words[index] + words[cmp_index]
                    if 'POB' in child_dict_list[cmp_index]:
                        e2 = self.complete_e(words, postags, child_dict_list, child_dict_list[cmp_index]['POB'][0])
                        Guest.append((e1, r, e2))

            # Try to extract triples involving named entities
            if netags[index][0] == 'S' or netags[index][0] == 'B':
                ni = index
                if netags[ni][0] == 'B':
                    while netags[ni][0] != 'E':
                        ni += 1
                    e1 = ''.join(words[index:ni + 1])
                else:
                    e1 = words[ni]
                # Only the entity span is extracted here; its type is not checked.
                if arcs[ni]['relation'] == 'ATT' and postags[arcs[ni]['head'] - 1] == 'n' and netags[arcs[ni]['head'] - 1] == 'O':
                    r = self.complete_e(words, postags, child_dict_list, arcs[ni]['head'] - 1)
                    if e1 in r:
                        r = r[(r.index(e1) + len(e1)):]
                    if arcs[arcs[ni]['head'] - 1]['relation'] == 'ATT' and netags[arcs[arcs[ni]['head'] - 1]['head'] - 1] != 'O':
                        e2 = self.complete_e(words, postags, child_dict_list, arcs[arcs[ni]['head'] - 1]['head'] - 1)
                        mi = arcs[arcs[ni]['head'] - 1]['head'] - 1
                        li = mi
                        if netags[mi][0] == 'B':
                            while netags[mi][0] != 'E':
                                mi += 1
                            e = ''.join(words[li + 1:mi + 1])
                            e2 += e
                        if r in e2:
                            e2 = e2[(e2.index(r) + len(r)):]
                        if r + e2 in sentence:
                            Name_entity_relation.append((e1, r, e2))
        return Subjective_guest, Dynamic_relation, Guest, Name_entity_relation

    def build_parse_child_dict(self, words, postags, arcs):
        """
        For every word in the sentence, build a dict of its dependency children.
        Args:
            words: list of segmented words
            postags: list of POS tags
            arcs: list of dependency arcs
        """
        child_dict_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index]['head'] == index + 1:
                    if arcs[arc_index]['relation'] in child_dict:
                        child_dict[arcs[arc_index]['relation']].append(arc_index)
                    else:
                        child_dict[arcs[arc_index]['relation']] = []
                        child_dict[arcs[arc_index]['relation']].append(arc_index)
            child_dict_list.append(child_dict)
        return child_dict_list

    def complete_e(self, words, postags, child_dict_list, word_index):
        """
        Expand a partially recognised entity into the full phrase around it.
        """
        child_dict = child_dict_list[word_index]
        prefix = ''
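        # Recursively glue attributive (ATT) modifiers in front of the word; for
        # verbs, also attach the subject (SBV) and object (VOB) children, so that
        # a head word is expanded into the full phrase it governs.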
        if 'ATT' in child_dict:
            for i in range(len(child_dict['ATT'])):
                prefix += self.complete_e(words, postags, child_dict_list, child_dict['ATT'][i])

        postfix = ''
        if postags[word_index] == 'v':
            if 'VOB' in child_dict:
                postfix += self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
            if 'SBV' in child_dict:
                prefix = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix

        return prefix + words[word_index] + postfix

    def SementicRoleLabeller(self, input_list, task='srl'):
        '''
        Semantic role labelling.
        Returns, for each text, the words that carry a semantic role together with
        their role labels, e.g.
        [u'\u662f', [{u'type': u'A0', u'end': 1, u'beg': 0, u'id': 0}]]
        '''
        input_xml = self.build_xml(input_list)
        content = self.output_json(task, input_xml)
        rolelabeller_text_list = []
        for text_other in content:
            sent = text_other[0]
            text = []
            for word in sent:
                if word['arg'] != []:
                    text.append([word['cont'], word['arg']])
            rolelabeller_text_list.append(text)
        return rolelabeller_text_list


if __name__ == '__main__':
    input_list = ['中国,是以华夏文明为源泉、中华文化为基础,并以汉族为主体民族的多民族国家,通用汉语、汉字,汉族与少数民族被统称为“中华民族”,又自称为炎黄子孙、龙的传人']
    model = LTP_MODEL()
    input_sentence = "中国,是以华夏文明为源泉、中华文化为基础,并以汉族为主体民族的多民族国家,通用汉语、汉字,汉族与少数民族被统称为“中华民族”,又自称为炎黄子孙、龙的传人"
    print model.segment(input_list)
    print model.postagger(input_list)
    print model.NamedEntityRecognizer(input_list, Entity_dist=True)[0]['place'][0]
    print model.NamedEntityRecognizer(input_list)
    print model.SyntaxParser(input_list)
    Subjective_guest, Dynamic_relation, Guest, Name_entity_relation = model.triple_extract(input_sentence)
    for e in Subjective_guest[0]:
        print e,
    print "\n"
    for e in Dynamic_relation[0]:
        print e,
    print model.SementicRoleLabeller(input_list)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# A Python interface to LTP
> Deploy LTP on your own server and open a port for LTP_MODEL to use. On top of that server, this interface implements word segmentation, part-of-speech tagging, named entity recognition, dependency parsing and semantic role labelling. Building on it, **named entity extraction and fact triple extraction** are implemented as well.

### Advantages of LTP_MODEL
- A Python interface to LTP that needs no additional libraries
- The code is easy to adapt to your own input and output formats
- Named entity extraction: person, place and organization names
- Triple extraction: subject-predicate-object, verb-object and preposition-object relations, and (entity, relation, entity) triples

### How to use LTP_MODEL
- Word segmentation

```
input_list = ['中国,是以华夏文明为源泉、中华文化为基础,并以汉族为主体民族的多民族国家,通用汉语、汉字,汉族与少数民族被统称为“中华民族”,又自称为炎黄子孙、龙的传人。']
model = LTP_MODEL()
print model.segment(input_list)
```
- Part-of-speech tagging

```
input_list = ['中国,是以华夏文明为源泉、中华文化为基础,并以汉族为主体民族的多民族国家,通用汉语、汉字,汉族与少数民族被统称为“中华民族”,又自称为炎黄子孙、龙的传人。']
model = LTP_MODEL()
print model.postagger(input_list)
```
- Named entity recognition

```
input_list = ['中国,是以华夏文明为源泉、中华文化为基础,并以汉族为主体民族的多民族国家,通用汉语、汉字,汉族与少数民族被统称为“中华民族”,又自称为炎黄子孙、龙的传人。']
model = LTP_MODEL()
print model.NamedEntityRecognizer(input_list, Entity_dist=True)[0]['place'][0]
print model.NamedEntityRecognizer(input_list)
```

- Dependency parsing

```
input_list = ['中国,是以华夏文明为源泉、中华文化为基础,并以汉族为主体民族的多民族国家,通用汉语、汉字,汉族与少数民族被统称为“中华民族”,又自称为炎黄子孙、龙的传人。']
model = LTP_MODEL()
print model.SyntaxParser(input_list)
```
- Triple extraction

```
input_sentence = "中国,是以华夏文明为源泉、中华文化为基础,并以汉族为主体民族的多民族国家,通用汉语、汉字,汉族与少数民族被统称为“中华民族”,又自称为炎黄子孙、龙的传人"
model = LTP_MODEL()
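# triple_extract returns four lists of triples:
#   Subjective_guest     - subject-predicate-object triples (e1, r, e2)
#   Dynamic_relation     - verb-object relations
#   Guest                - preposition-object relations
#   Name_entity_relation - relations between named entities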
Subjective_guest, Dynamic_relation, Guest, Name_entity_relation = model.triple_extract(input_sentence)
for e in Subjective_guest[0]:
    print e,
print "\n"
for e in Dynamic_relation[0]:
    print e,
```
- Semantic role labelling

```
input_list = ["中国,是以华夏文明为源泉、中华文化为基础,并以汉族为主体民族的多民族国家,通用汉语、汉字,汉族与少数民族被统称为“中华民族”,又自称为炎黄子孙、龙的传人"]
model = LTP_MODEL()
print model.SementicRoleLabeller(input_list)
```
### Keyword extraction will be added later

--------------------------------------------------------------------------------