├── LTP_model.py
└── README.md

/LTP_model.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date   : 2017-07-28 11:25:02
# @Author : 周奇 (2590193099@qq.com)
# @Link   : ……
import json
import urllib, urllib2
'''
See the LTP server documentation: http://ltp.readthedocs.io/zh_CN/latest/ltpserver.html
Deploy LTP on a server and open the chosen port.
This module provides a free-to-use interface for Chinese word segmentation,
part-of-speech tagging, named entity recognition, semantic role labelling
and dependency parsing.
'''


class LTP_MODEL():
    def __init__(self, server_url="http://IP:port/ltp"):
        # task selects what to run: word segmentation 'ws', part-of-speech
        # tagging 'pos', dependency parsing 'dp', named entity recognition
        # 'ner', semantic role labelling 'srl', or everything at once 'all'.
        self.server_url = server_url

    def build_xml(self, input_list):
        '''
        Build the XML request body from the list of input texts.
        '''
        # Fragments of the XML wrapper expected by the LTP server (the request
        # is sent with x='y', i.e. "the input is XML"); fill these in to match
        # the template your server version expects.
        ss_start = ''
        ss_middle = ''
        ss_end = ''
        ss = ss_start + ss_middle.join(input_list) + ss_end
        return ss

    def output_json(self, task, input_xml):
        '''
        Upload the XML to the server and return the parsed JSON result for the given task.
        '''
        data = {'s': input_xml, 'x': 'y', 't': task}
        try:
            request = urllib2.Request(self.server_url)
            params = urllib.urlencode(data)
            response = urllib2.urlopen(request, params)
            content = response.read().strip()
        except Exception:
            return
        return json.loads(content)

    def segment(self, input_list, task='ws'):
        '''
        Segment each input text into words.
        Returns one list per text: [['word1', 'word2'], ['word1', 'word3'], ...]
        '''
        input_xml = self.build_xml(input_list)
        content = self.output_json(task, input_xml)
        segmented_text_list = []
        for text_other in content:
            sent = text_other[0]
            text = []
            for word in sent:
                text.append(word['cont'])
            segmented_text_list.append(text)
        return segmented_text_list

    def postagger(self, input_list, task='pos'):
        '''
        Tag the part of speech of every word in each text.
        Returns one list per text, each word as a [word, tag] pair:
        [[['word1', u'O'], ['word2', u'O']], [['word2', u'O'], ['word5', u'O']], ...]
        '''
        input_xml = self.build_xml(input_list)
        content = self.output_json(task, input_xml)
        postagger_text_list = []
        for text_other in content:
            sent = text_other[0]
            text = []
            for word in sent:
                text.append([word['cont'], word['pos']])
            postagger_text_list.append(text)
        return postagger_text_list

    def NamedEntityRecognizer(self, input_list, task='ner', Entity_dist=False, repead=False):
        '''
        Recognise named entities in each text: person, place and organization names.
        repead: whether to deduplicate the extracted entities (default: no deduplication).
        Entity_dist: whether to return, for each text, the tagged word list or a dict
        of extracted entities (default: the tagged word list).
        Return value, one of:
        1. [[['word1', u'O'], ['word2', u'O'], ['word3', u'O']], [['word2', u'O'], ['word3', u'O'], ['word4', u'O']], ...]
        2. [{'person': [], 'place': [], 'organization': []}, {'person': [], 'place': [], 'organization': []}, ...]
        '''
        input_xml = self.build_xml(input_list)
        content = self.output_json(task, input_xml)
        entity_text_list = []
        for text_other in content:
            sent = text_other[0]
            text = []
            for word in sent:
                text.append([word['cont'], word['ne']])
            entity_text_list.append(text)
        if Entity_dist:
            extract_entity_list = []
            for words_entity_note_list in entity_text_list:
                extract_entity_list.append(self.get_entity_dict(words_entity_note_list, repead))
            return extract_entity_list
        else:
            return entity_text_list

    def get_entity_dict(self, words_entity_note_list, repead):
        '''
        Collect the named entities in a text according to the NER tags.
        repead: whether to deduplicate the extracted entities (default: no deduplication).
        Returns: {'person': [], 'place': [], 'organization': []}
        '''
        '''
        NER tag scheme:
        O: the word is not part of a named entity
        S: the word is a named entity by itself
        B: the word begins a named entity
        I: the word is inside a named entity
        E: the word ends a named entity
        Nh: person name
        Ni: organization name
        Ns: place name
        '''
        name_entity_dist = {}
        # Lists holding the different entity types
        name_entity_list = []
        place_entity_list = []
        organization_entity_list = []

        ntag_E_Nh = ""
        ntag_E_Ni = ""
        ntag_E_Ns = ""
        for word, ntag in words_entity_note_list:
            # print word + "/" + ntag,
            if ntag[0] != "O":
                if ntag[0] == "S":
                    if ntag[-2:] == "Nh":
                        name_entity_list.append(word)
                    elif ntag[-2:] == "Ni":
                        organization_entity_list.append(word)
                    else:
                        place_entity_list.append(word)
                elif ntag[0] == "B":
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word
                    else:
                        ntag_E_Ns = ntag_E_Ns + word
                elif ntag[0] == "I":
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word
                    else:
                        ntag_E_Ns = ntag_E_Ns + word
                else:
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word
                        name_entity_list.append(ntag_E_Nh)
                        ntag_E_Nh = ""
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word
                        organization_entity_list.append(ntag_E_Ni)
                        ntag_E_Ni = ""
                    else:
                        ntag_E_Ns = ntag_E_Ns + word
                        place_entity_list.append(ntag_E_Ns)
                        ntag_E_Ns = ""

        if repead:
            name_entity_dist['person'] = list(set(name_entity_list))
            name_entity_dist['organization'] = list(set(organization_entity_list))
            name_entity_dist['place'] = list(set(place_entity_list))
        else:
            name_entity_dist['person'] = name_entity_list
            name_entity_dist['organization'] = organization_entity_list
            name_entity_dist['place'] = place_entity_list
        return name_entity_dist

    def SyntaxParser(self, input_list, task='dp'):
        '''
        Dependency parsing.
        The standard LTP output can be derived from each entry via head = parent + 1
        and relation = relate; the full server response is returned here so callers
        can pick out whatever they need.
        Returns one list per text:
        [[{u'relate': u'WP', u'cont': u'\uff0c', u'id': 4, u'parent': 3, u'pos': u'wp'},
          {u'relate': u'RAD', u'cont': u'\u7684', u'id': 1, u'parent': 0, u'pos': u'u'}], ...]
        '''
        input_xml = self.build_xml(input_list)
        content = self.output_json(task, input_xml)
        syntaxparser_text_list = []
        for text_other in content:
            sent = text_other[0]
            text = []
            for word in sent:
                text.append(word)
            syntaxparser_text_list.append(text)
        return syntaxparser_text_list

    def triple_extract(self, sentence):
        '''
        Extract fact triples from the given sentence.
        Args:
            sentence: the sentence to process, passed as a plain string.
        '''
        Subjective_guest = []      # subject-predicate-object triples (e1, r, e2)
        Dynamic_relation = []      # verb-object relations
        Guest = []                 # preposition-object relations
        Name_entity_relation = []  # relations between named entities
        # words: segmented words; postags: POS tags; netags: NER tags; arcs: dependency arcs
        words = []
        postags = []
        netags = []
        arcs = []
        syntaxparser_text_list = self.SyntaxParser([sentence])
        entity_list = self.NamedEntityRecognizer([sentence])
        for words_property_list in syntaxparser_text_list[0]:
            words.append(words_property_list['cont'])
            postags.append(words_property_list['pos'])
            arcs.append({'head': words_property_list['parent'] + 1, 'relation': words_property_list['relate']})
        for words_entity_list in entity_list[0]:
            netags.append(words_entity_list[1])

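        # child_dict_list[i] maps each dependency relation label (e.g. 'SBV',
        # 'VOB', 'ATT') to the indices of word i's children in the parse; the
        # extraction rules below read the triples off this structure.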
        child_dict_list = self.build_parse_child_dict(words, postags, arcs)

        for index in range(len(postags)):

            # Extract fact triples centred on the predicate (verb)
            if postags[index] == 'v':
                child_dict = child_dict_list[index]
                # Subject-predicate-object
                if 'SBV' in child_dict and 'VOB' in child_dict:
                    e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                    r = words[index]
                    e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                    Subjective_guest.append((e1, r, e2))

                # Postposed attributive: verb-object relation
                if arcs[index]['relation'] == 'ATT':
                    if 'VOB' in child_dict:
                        e1 = self.complete_e(words, postags, child_dict_list, arcs[index]['head'] - 1)
                        r = words[index]
                        e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                        temp_string = r + e2
                        if temp_string == e1[:len(temp_string)]:
                            e1 = e1[len(temp_string):]
                        if temp_string not in e1:
                            Dynamic_relation.append((e1, r, e2))

                # Subject-verb-complement constructions containing a prepositional object
                if 'SBV' in child_dict and 'CMP' in child_dict:
                    # e1 = words[child_dict['SBV'][0]]
                    e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                    cmp_index = child_dict['CMP'][0]
                    r = words[index] + words[cmp_index]
                    if 'POB' in child_dict_list[cmp_index]:
                        e2 = self.complete_e(words, postags, child_dict_list, child_dict_list[cmp_index]['POB'][0])
                        Guest.append((e1, r, e2))

            # Try to extract triples involving named entities
            if netags[index][0] == 'S' or netags[index][0] == 'B':
                ni = index
                if netags[ni][0] == 'B':
                    while netags[ni][0] != 'E':
                        ni += 1
                    e1 = ''.join(words[index:ni + 1])
                else:
                    e1 = words[ni]
                # Only the entity span is extracted here; its type is not checked.
                if arcs[ni]['relation'] == 'ATT' and postags[arcs[ni]['head'] - 1] == 'n' and netags[arcs[ni]['head'] - 1] == 'O':
                    r = self.complete_e(words, postags, child_dict_list, arcs[ni]['head'] - 1)
                    if e1 in r:
                        r = r[(r.index(e1) + len(e1)):]
                    if arcs[arcs[ni]['head'] - 1]['relation'] == 'ATT' and netags[arcs[arcs[ni]['head'] - 1]['head'] - 1] != 'O':
                        e2 = self.complete_e(words, postags, child_dict_list, arcs[arcs[ni]['head'] - 1]['head'] - 1)
                        mi = arcs[arcs[ni]['head'] - 1]['head'] - 1
                        li = mi
                        if netags[mi][0] == 'B':
                            while netags[mi][0] != 'E':
                                mi += 1
                            e = ''.join(words[li + 1:mi + 1])
                            e2 += e
                        if r in e2:
                            e2 = e2[(e2.index(r) + len(r)):]
                        if r + e2 in sentence:
                            Name_entity_relation.append((e1, r, e2))
        return Subjective_guest, Dynamic_relation, Guest, Name_entity_relation

    def build_parse_child_dict(self, words, postags, arcs):
        """
        For every word in the sentence, build a dict of its dependency children.
        Args:
            words: list of segmented words
            postags: list of POS tags
            arcs: list of dependency arcs
        """
        child_dict_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index]['head'] == index + 1:
                    if arcs[arc_index]['relation'] in child_dict:
                        child_dict[arcs[arc_index]['relation']].append(arc_index)
                    else:
                        child_dict[arcs[arc_index]['relation']] = []
                        child_dict[arcs[arc_index]['relation']].append(arc_index)
            child_dict_list.append(child_dict)
        return child_dict_list

    def complete_e(self, words, postags, child_dict_list, word_index):
        """
        Expand a partially recognised entity into the full phrase around it.
        """
        child_dict = child_dict_list[word_index]
        prefix = ''
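        # Recursively glue attributive (ATT) modifiers in front of the word; for
        # verbs, also attach the subject (SBV) and object (VOB) children, so that
        # a head word is expanded into the full phrase it governs.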
        if 'ATT' in child_dict:
            for i in range(len(child_dict['ATT'])):
                prefix += self.complete_e(words, postags, child_dict_list, child_dict['ATT'][i])

        postfix = ''
        if postags[word_index] == 'v':
            if 'VOB' in child_dict:
                postfix += self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
            if 'SBV' in child_dict:
                prefix = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix

        return prefix + words[word_index] + postfix

    def SementicRoleLabeller(self, input_list, task='srl'):
        '''
        Semantic role labelling.
        Returns, for each text, the words that carry a semantic role together with
        their role labels, e.g.
        [u'\u662f', [{u'type': u'A0', u'end': 1, u'beg': 0, u'id': 0}]]
        '''
        input_xml = self.build_xml(input_list)
        content = self.output_json(task, input_xml)
        rolelabeller_text_list = []
        for text_other in content:
            sent = text_other[0]
            text = []
            for word in sent:
                if word['arg'] != []:
                    text.append([word['cont'], word['arg']])
            rolelabeller_text_list.append(text)
        return rolelabeller_text_list


if __name__ == '__main__':
    input_list = ['中国,是以华夏文明为源泉、中华文化为基础,并以汉族为主体民族的多民族国家,通用汉语、汉字,汉族与少数民族被统称为“中华民族”,又自称为炎黄子孙、龙的传人']
    model = LTP_MODEL()
    input_sentence = "中国,是以华夏文明为源泉、中华文化为基础,并以汉族为主体民族的多民族国家,通用汉语、汉字,汉族与少数民族被统称为“中华民族”,又自称为炎黄子孙、龙的传人"
    print model.segment(input_list)
    print model.postagger(input_list)
    print model.NamedEntityRecognizer(input_list, Entity_dist=True)[0]['place'][0]
    print model.NamedEntityRecognizer(input_list)
    print model.SyntaxParser(input_list)
    Subjective_guest, Dynamic_relation, Guest, Name_entity_relation = model.triple_extract(input_sentence)
    for e in Subjective_guest[0]:
        print e,
    print "\n"
    for e in Dynamic_relation[0]:
        print e,
    print model.SementicRoleLabeller(input_list)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# A Python interface to LTP
> Deploy LTP on your own server and open a port for LTP_MODEL to use. On top of that server, this interface implements word segmentation, part-of-speech tagging, named entity recognition, dependency parsing and semantic role labelling. Building on it, **named entity extraction and fact triple extraction** are implemented as well.

### Advantages of LTP_MODEL
- A Python interface to LTP that needs no additional libraries
- The code is easy to adapt to your own input and output formats
- Named entity extraction: person, place and organization names
- Triple extraction: subject-predicate-object, verb-object and preposition-object relations, and (entity, relation, entity) triples

### How to use LTP_MODEL
- Word segmentation

```
input_list = ['中国,是以华夏文明为源泉、中华文化为基础,并以汉族为主体民族的多民族国家,通用汉语、汉字,汉族与少数民族被统称为“中华民族”,又自称为炎黄子孙、龙的传人。']
model = LTP_MODEL()
print model.segment(input_list)
```
- Part-of-speech tagging

```
input_list = ['中国,是以华夏文明为源泉、中华文化为基础,并以汉族为主体民族的多民族国家,通用汉语、汉字,汉族与少数民族被统称为“中华民族”,又自称为炎黄子孙、龙的传人。']
model = LTP_MODEL()
print model.postagger(input_list)
```
- Named entity recognition

```
input_list = ['中国,是以华夏文明为源泉、中华文化为基础,并以汉族为主体民族的多民族国家,通用汉语、汉字,汉族与少数民族被统称为“中华民族”,又自称为炎黄子孙、龙的传人。']
model = LTP_MODEL()
print model.NamedEntityRecognizer(input_list, Entity_dist=True)[0]['place'][0]
print model.NamedEntityRecognizer(input_list)
```

- Dependency parsing

```
input_list = ['中国,是以华夏文明为源泉、中华文化为基础,并以汉族为主体民族的多民族国家,通用汉语、汉字,汉族与少数民族被统称为“中华民族”,又自称为炎黄子孙、龙的传人。']
model = LTP_MODEL()
print model.SyntaxParser(input_list)
```
- Triple extraction

```
input_sentence = "中国,是以华夏文明为源泉、中华文化为基础,并以汉族为主体民族的多民族国家,通用汉语、汉字,汉族与少数民族被统称为“中华民族”,又自称为炎黄子孙、龙的传人"
model = LTP_MODEL()
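# triple_extract returns four lists of triples:
#   Subjective_guest     - subject-predicate-object triples (e1, r, e2)
#   Dynamic_relation     - verb-object relations
#   Guest                - preposition-object relations
#   Name_entity_relation - relations between named entities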
Subjective_guest, Dynamic_relation, Guest, Name_entity_relation = model.triple_extract(input_sentence)
for e in Subjective_guest[0]:
    print e,
print "\n"
for e in Dynamic_relation[0]:
    print e,
```
- Semantic role labelling

```
input_list = ["中国,是以华夏文明为源泉、中华文化为基础,并以汉族为主体民族的多民族国家,通用汉语、汉字,汉族与少数民族被统称为“中华民族”,又自称为炎黄子孙、龙的传人"]
model = LTP_MODEL()
print model.SementicRoleLabeller(input_list)
```
### Keyword extraction will be added later

--------------------------------------------------------------------------------