├── README.md
└── pyLTP_model.py


/README.md:
--------------------------------------------------------------------------------
# pyLTPServer
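
A thin Python wrapper around [pyltp](https://github.com/HIT-SCIR/pyltp) that exposes sentence splitting, word segmentation, POS tagging, named-entity recognition, dependency parsing, and simple fact-triple extraction.

A minimal usage sketch of `pyLTP_model.py` (assuming pyltp is installed and the LTP 3.4.0 model files are unpacked to `./ltp_data_v3.4.0`; the outputs in the comments are abbreviated):

```python
from pyLTP_model import LTP_MODEL

model = LTP_MODEL()                 # loads all four LTP models
texts = ['中国自称为炎黄子孙、龙的传人']
print(model.segment(texts))         # e.g. [['中国', '自称', ...]]
print(model.NamedEntityRecognizer(texts, entity_dict=True))
print(model.SyntaxParser(texts))
model.release()                     # free the loaded models
```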
--------------------------------------------------------------------------------
/pyLTP_model.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# created on 5/26/20
__author__ = 'sinsa'

import os
import logging
from logging import info
logging.basicConfig(level=logging.INFO, format='%(asctime)s - PID:%(process)d - %(levelname)s: %(message)s')
from pyltp import Segmentor
from pyltp import Postagger
from pyltp import NamedEntityRecognizer
from pyltp import Parser
from pyltp import SentenceSplitter


class LTP_MODEL:
    def __init__(self):
        LTP_DATA_DIR = './ltp_data_v3.4.0'  # path to the LTP model directory
        info('loading models ...')
        self.segmentor = Segmentor()
        self.cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # word segmentation model
        self.segmentor.load(self.cws_model_path)
        info('loaded word segmentation model')
        self.pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS tagging model
        self.postaggers = Postagger()
        self.postaggers.load(self.pos_model_path)
        info('loaded POS tagging model')
        self.ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')  # named entity recognition model
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(self.ner_model_path)
        info('loaded named entity recognition model')
        self.par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parsing model
        self.parser = Parser()
        self.parser.load(self.par_model_path)
        info('loaded dependency parsing model')

    def release(self):
        '''Free the resources held by the loaded models.'''
        self.segmentor.release()
        self.postaggers.release()
        self.recognizer.release()
        self.parser.release()

    def SplitSentence(self, sentence):
        sents_list = SentenceSplitter.split(sentence)  # sentence splitting
        return list(sents_list)

    def segment(self, input_list):
        '''
        Segment each text into words.
        Returns: one list per text, e.g. [['word1', 'word2'], ['word1', 'word3'], ...]
        '''
        segmented_text_list = []
        for text in input_list:
            words = self.segmentor.segment(text)  # word segmentation
            segmented_text_list.append(list(words))
        return segmented_text_list

    def postagger(self, input_list, return_words_list=False):
        '''
        POS-tag every word of each text.
        Returns: one list per text; each word is paired with its tag, e.g.
        [[('word1', 'O'), ('word2', 'O')], [('word2', 'O'), ('word5', 'O')], ...]
        '''
        postagger_text_list = []
        words_list = self.segment(input_list)
        postags_list = []
        for words in words_list:
            postags = self.postaggers.postag(words)  # POS tagging
            postags_list.append(list(postags))
            words_postags = list(zip(words, list(postags)))
            postagger_text_list.append(words_postags)
        if return_words_list:
            return words_list, postags_list
        else:
            return postagger_text_list

    def NamedEntityRecognizer(self, input_list, entity_dict=False, dedupe=False):
        '''
        Recognize the named entities in each text: persons, places and organizations.
        dedupe: whether to deduplicate the extracted entities (default: no).
        entity_dict: if True, return a dict of extracted entities per text instead
                     of the tagged word list (default: the tagged word list).
        Return forms:
        1. [[('word1', 'O'), ('word2', 'O'), ('word3', 'O')], ...]
        2. [{'person': [], 'place': [], 'organization': []}, ...]
        '''
        words_list, postags_list = self.postagger(input_list, return_words_list=True)

        entity_text_list = []
        for words, postags in zip(words_list, postags_list):
            netags = self.recognizer.recognize(words, postags)  # NER: person (Nh), place (Ns), organization (Ni)
            text = list(zip(words, netags))
            entity_text_list.append(text)

        if entity_dict:
            extract_entity_list = []
            for words_entity_note_list in entity_text_list:
                extract_entity_list.append(self.get_entity_dict(words_entity_note_list, dedupe))
            return extract_entity_list
        else:
            return entity_text_list

    def get_entity_dict(self, words_entity_note_list, dedupe):
        '''
        Collect the named entities of a text from its NE tags.
        dedupe: whether to deduplicate the entities (default: no).
        Returns: {'person': [], 'place': [], 'organization': []}

        Tag scheme:
        O: the word is not part of an NE
        S: the word is a single-word NE
        B: the word begins an NE
        I: the word is inside an NE
        E: the word ends an NE
        Nh: person name
        Ni: organization name
        Ns: place name
        '''
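        # Illustrative walk-through (hypothetical tags, not real model output):
        # given [('奥巴马', 'S-Nh'), ('访问', 'O'), ('白宫', 'B-Ni'), ('办公室', 'E-Ni')],
        # the loop below emits S-tagged words directly and glues B-/I-/E- spans
        # together, returning
        # {'person': ['奥巴马'], 'organization': ['白宫办公室'], 'place': []}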
        name_entity_dict = {}
        # one list per entity type
        name_entity_list = []
        place_entity_list = []
        organization_entity_list = []

        # buffers for multi-word entities under construction
        ntag_E_Nh = ""
        ntag_E_Ni = ""
        ntag_E_Ns = ""
        for word, ntag in words_entity_note_list:
            if ntag[0] != "O":
                if ntag[0] == "S":
                    if ntag[-2:] == "Nh":
                        name_entity_list.append(word)
                    elif ntag[-2:] == "Ni":
                        organization_entity_list.append(word)
                    else:
                        place_entity_list.append(word)
                elif ntag[0] == "B" or ntag[0] == "I":
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word
                    else:
                        ntag_E_Ns = ntag_E_Ns + word
                else:  # "E": the entity is complete, flush the buffer
                    if ntag[-2:] == "Nh":
                        ntag_E_Nh = ntag_E_Nh + word
                        name_entity_list.append(ntag_E_Nh)
                        ntag_E_Nh = ""
                    elif ntag[-2:] == "Ni":
                        ntag_E_Ni = ntag_E_Ni + word
                        organization_entity_list.append(ntag_E_Ni)
                        ntag_E_Ni = ""
                    else:
                        ntag_E_Ns = ntag_E_Ns + word
                        place_entity_list.append(ntag_E_Ns)
                        ntag_E_Ns = ""

        if dedupe:
            name_entity_dict['person'] = list(set(name_entity_list))
            name_entity_dict['organization'] = list(set(organization_entity_list))
            name_entity_dict['place'] = list(set(place_entity_list))
        else:
            name_entity_dict['person'] = name_entity_list
            name_entity_dict['organization'] = organization_entity_list
            name_entity_dict['place'] = place_entity_list
        return name_entity_dict

    def SyntaxParser(self, input_list, return_words_pos=False):
        '''
        Dependency parsing.
        head = parent + 1 and relation = relate, so LTP's standard output could be
        assembled from these fields; all fields are returned so callers can pick
        what they need.
        Returns: one list per text, e.g.
        [[{'relate': 'WP', 'cont': '，', 'id': 4, 'parent': 3, 'pos': 'wp'},
          {'relate': 'RAD', 'cont': '的', 'id': 1, 'parent': 0, 'pos': 'u'}], ...]
        '''
        words_list, postags_list = self.postagger(input_list, return_words_list=True)

        syntaxparser_text_list = []
        for words, postags in zip(words_list, postags_list):
            arcs = self.parser.parse(words, postags)  # dependency parsing
            res = [(arc.head, arc.relation) for arc in arcs]
            text = []
            for i in range(len(words)):
                text.append({
                    'id': i,
                    'cont': words[i],
                    'pos': postags[i],
                    'parent': res[i][0],  # 1-based head index as returned by pyltp (0 = ROOT)
                    'relate': res[i][1],
                })
            syntaxparser_text_list.append(text)

        if return_words_pos:
            return words_list, postags_list, syntaxparser_text_list
        else:
            return syntaxparser_text_list

    def triple_extract(self, input_list):
        '''
        Extract fact triples from the given texts.
        Args:
            input_list: list of raw sentences, e.g. ['真实的句子'];
                        only the first one is processed.
        '''
        sentence = input_list[0]  # raw text, used below to validate extracted relations
        Subjective_guest = []  # subject-verb-object triples (e1, r, e2)
        Dynamic_relation = []  # verb-object relations
        Guest = []  # preposition-object relations
        Name_entity_relation = []  # relations between named entities
        # segmented words, POS tags, NE tags and dependency arcs of the sentence
        words = []
        postags = []
        netags = []
        arcs = []
        syntaxparser_text_list = self.SyntaxParser(input_list)
        entity_list = self.NamedEntityRecognizer(input_list)
        for words_property_list in syntaxparser_text_list[0]:
            words.append(words_property_list['cont'])
            postags.append(words_property_list['pos'])
            arcs.append({'head': words_property_list['parent'], 'relation': words_property_list['relate']})
        for words_entity_list in entity_list[0]:
            netags.append(words_entity_list[1])

        child_dict_list = self.build_parse_child_dict(words, postags, arcs)

        for index in range(len(postags)):

            # triples centered on a predicate
            if postags[index] == 'v':
                child_dict = child_dict_list[index]
                # subject-verb-object
                if 'SBV' in child_dict and 'VOB' in child_dict:
                    e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                    r = words[index]
                    e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                    Subjective_guest.append((e1, r, e2))

                # postposed attributive: verb-object relation
                if arcs[index]['relation'] == 'ATT':
                    if 'VOB' in child_dict:
                        e1 = self.complete_e(words, postags, child_dict_list, arcs[index]['head'] - 1)
                        r = words[index]
                        e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                        temp_string = r + e2
                        if temp_string == e1[:len(temp_string)]:
                            e1 = e1[len(temp_string):]
                        if temp_string not in e1:
                            Dynamic_relation.append((e1, r, e2))

                # subject-verb-complement containing a preposition-object
                if 'SBV' in child_dict and 'CMP' in child_dict:
                    e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                    cmp_index = child_dict['CMP'][0]
                    r = words[index] + words[cmp_index]
                    if 'POB' in child_dict_list[cmp_index]:
                        e2 = self.complete_e(words, postags, child_dict_list, child_dict_list[cmp_index]['POB'][0])
                        Guest.append((e1, r, e2))

            # try to extract triples involving named entities
            if netags[index][0] == 'S' or netags[index][0] == 'B':
                ni = index
                if netags[ni][0] == 'B':
                    while netags[ni][0] != 'E':
                        ni += 1
                    e1 = ''.join(words[index:ni + 1])
                else:
                    e1 = words[ni]
                # the entity is extracted above without checking its type
                if arcs[ni]['relation'] == 'ATT' and postags[arcs[ni]['head'] - 1] == 'n' and netags[arcs[ni]['head'] - 1] == 'O':
                    r = self.complete_e(words, postags, child_dict_list, arcs[ni]['head'] - 1)
                    if e1 in r:
                        r = r[(r.index(e1) + len(e1)):]
                    if arcs[arcs[ni]['head'] - 1]['relation'] == 'ATT' and netags[arcs[arcs[ni]['head'] - 1]['head'] - 1] != 'O':
                        e2 = self.complete_e(words, postags, child_dict_list, arcs[arcs[ni]['head'] - 1]['head'] - 1)
                        mi = arcs[arcs[ni]['head'] - 1]['head'] - 1
                        li = mi
                        if netags[mi][0] == 'B':
                            while netags[mi][0] != 'E':
                                mi += 1
                            e = ''.join(words[li + 1:mi + 1])
                            e2 += e
                        if r in e2:
                            e2 = e2[(e2.index(r) + len(r)):]
                        if r + e2 in sentence:
                            Name_entity_relation.append((e1, r, e2))
        return Subjective_guest, Dynamic_relation, Guest, Name_entity_relation

    def build_parse_child_dict(self, words, postags, arcs):
        """
        Build, for each word in the sentence, a dict mapping every dependency
        relation to the indices of that word's child nodes.
        Args:
            words: segmented words
            postags: POS tags
            arcs: dependency arcs
        """
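        # Illustrative shape (hypothetical parse): for the words
        # ['他', '送', '她', '一束', '花'] with arcs
        # [{'head': 2, 'relation': 'SBV'}, {'head': 0, 'relation': 'HED'},
        #  {'head': 2, 'relation': 'DBL'}, {'head': 5, 'relation': 'ATT'},
        #  {'head': 2, 'relation': 'VOB'}],
        # the entry for the verb '送' (index 1) would be {'SBV': [0], 'DBL': [2], 'VOB': [4]}.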
        child_dict_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index]['head'] == index + 1:  # heads are 1-based
                    child_dict.setdefault(arcs[arc_index]['relation'], []).append(arc_index)
            child_dict_list.append(child_dict)
        return child_dict_list

    def complete_e(self, words, postags, child_dict_list, word_index):
        """
        Complete a partially recognized entity: prepend its attributive (ATT)
        children and, for a verb, attach its subject (SBV) and object (VOB).
        """
        child_dict = child_dict_list[word_index]
        prefix = ''

        if 'ATT' in child_dict:
            for i in range(len(child_dict['ATT'])):
                prefix += self.complete_e(words, postags, child_dict_list, child_dict['ATT'][i])

        postfix = ''
        if postags[word_index] == 'v':
            if 'VOB' in child_dict:
                postfix += self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
            if 'SBV' in child_dict:
                prefix = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix

        return prefix + words[word_index] + postfix


if __name__ == '__main__':
    input_list = ['中国自称为炎黄子孙、龙的传人']
    model = LTP_MODEL()
    input_sentence = "雅生活服务的物业管理服务。"
    print(model.SplitSentence(input_sentence))
    print(model.segment(input_list))
    print(model.postagger(input_list))
    print(model.NamedEntityRecognizer(input_list, entity_dict=True))
    print(model.NamedEntityRecognizer(input_list))
    print(model.SyntaxParser(input_list))

    Subjective_guest, Dynamic_relation, Guest, Name_entity_relation = model.triple_extract(input_list)

    print('=' * 30)
    print(Subjective_guest, Dynamic_relation, Guest, Name_entity_relation)
    model.release()
--------------------------------------------------------------------------------