├── README.md
├── SentenceDispose.py
├── Law Title.py
├── CrawDict.py
├── LTP-Cloud.py
├── TextRank.py
├── ExtLocation.py
├── test2.py
├── SentenceSplit.py
├── CrawlMedicine.py
├── Neo4j.py
├── CrawHospital.py
├── Pyltp.py
├── ExtEntity.py
├── ExtEntity_UseLTP.py
├── FileDispose.py
├── test.py
└── ExtRelation.py

/README.md:
--------------------------------------------------------------------------------
# MedicalInsuranceKG
Knowledge graph for the medical insurance domain (医疗保险领域知识图谱)
--------------------------------------------------------------------------------
/SentenceDispose.py:
--------------------------------------------------------------------------------
# -*- encoding: utf-8 -*-
import os

inputpath = 'E:\\医疗保险测试语料库\\'
filesname = os.listdir(inputpath)
sentence = []

def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

# keep only the lines that contain at least one digit
for file in filesname:
    filename = inputpath + file
    fileobject = open(filename, 'r', encoding='utf-8')
    for line in fileobject.readlines():
        if hasNumbers(line):
            sentence.append(line)
    fileobject.close()

with open('./sentence.txt', 'w', encoding='utf-8') as f:
    for s in sentence:
        f.write(s if s.endswith('\n') else s + '\n')  # lines from readlines() usually keep '\n'; avoid doubling it

--------------------------------------------------------------------------------
/Law Title.py:
--------------------------------------------------------------------------------
import os
import re

pathHead = "E:\\医疗保险测试语料库\\"
outpath = 'E:\\规则抽取法律\\'
pathdirs = os.listdir(pathHead)

lawtitle = set()
pattern = re.compile(r'《[^《》]*》')  # law titles are enclosed in 《 》 brackets
for path in pathdirs:
    in_file = pathHead + path
    inputfile = open(in_file, 'r', encoding='utf-8')
    for line in inputfile.readlines():
        names = pattern.findall(line)
        if len(names) != 0:
            for name in names:
                lawtitle.add(name)
    inputfile.close()
    out_file = outpath + path
    outputfile = open(out_file, 'a', encoding='utf-8')
    for name in lawtitle:
        outputfile.write(name + '\n')
    outputfile.close()
    lawtitle.clear()

--------------------------------------------------------------------------------
/CrawDict.py:
--------------------------------------------------------------------------------
# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup
from urllib.request import urlopen

baseurl = 'http://wiki.mbalib.com'
urllinks = set()
urllinks.add('/wiki/%E5%8C%BB%E7%96%97%E4%BF%9D%E9%99%A9')
donelink = []
words = set()

def PrepareDict(url):
    html = urlopen(str(baseurl + url))
    bsobj = BeautifulSoup(html, "html.parser")
    urls = bsobj.findAll("a")
    print(urls)
    for u in urls:
        # some anchors carry no href/title attribute; skip them to avoid a KeyError
        if 'href' in u.attrs and 'title' in u.attrs:
            urllinks.add(u.attrs['href'])
            words.add(u.attrs['title'])
    donelink.append(url)

k = 0
while len(urllinks) != 0 and k < 20:
    url = urllinks.pop()
    print(url)
    if url not in donelink:
        PrepareDict(url)
        k += 1

print(words)
--------------------------------------------------------------------------------
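CrawDict.py above only prints the harvested link titles. A minimal follow-on sketch (illustrative, not part of the original scripts) of persisting them as a term list in the shape that 领域词典.txt is used elsewhere in this repo — the output path is an assumption:

# Illustrative only: write one harvested term per line, longest first,
# mirroring the ordering that FileDispose.PrepareEntity() applies to entity files.
def save_terms(terms, path='E:\\医疗保险语料库\\领域词典.txt'):
    with open(path, 'w', encoding='utf-8') as f:
        for t in sorted(terms, key=len, reverse=True):
            f.write(t + '\n')

# save_terms(words)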
/LTP-Cloud.py:
--------------------------------------------------------------------------------
# -*- coding:utf8 -*-
import os

# rewrite each relation csv, dropping (deduplicated) rows whose first column is a single character
path = 'E:\\实体关系抽取2\\'
files = os.listdir(path)
for f in files:
    print(f)
    file = path + f
    e = set()
    with open(file, 'r', encoding='utf-8') as ff:
        for line in ff.readlines():
            part = line.split(',')
            # if not part[0].isdigit():
            if len(part[0]) != 1:
                e.add(line)
    with open(file, 'w', encoding='utf-8') as fff:
        for ee in e:
            fff.write(ee)

# path = 'E:\\实体抽取\\'
# files = os.listdir(path)
# entity = set()
#
# for file in files:
#     filename = path + file
#     with open(filename, 'r', encoding='utf-8') as f:
#         for line in f.readlines():
#             entity.add(line)
#
# print(len(entity))

--------------------------------------------------------------------------------
/TextRank.py:
--------------------------------------------------------------------------------
#-*- encoding:utf-8 -*-
from __future__ import print_function

import sys
# try:
#     reload(sys)
#     sys.setdefaultencoding('utf-8')
# except:
#     pass

import codecs
from textrank4zh import TextRank4Keyword, TextRank4Sentence

text = codecs.open('E:\\医疗保险测试语料库\\1.txt', 'r', 'utf-8').read()
tr4w = TextRank4Keyword()

tr4w.analyze(text=text, lower=True, window=2)  # on py2 text must be a utf-8 str or unicode object, on py3 a utf-8 bytes or str object

print('关键词:')
for item in tr4w.get_keywords(20, word_min_len=1):
    print(item.word, item.weight)

print()
print('关键短语:')
for phrase in tr4w.get_keyphrases(keywords_num=20, min_occur_num=2):
    print(phrase)

tr4s = TextRank4Sentence()
tr4s.analyze(text=text, lower=True, source='all_filters')

print()
print('摘要:')
for item in tr4s.get_key_sentences(num=3):
    print(item.index, item.weight, item.sentence)  # index is the sentence's position in the text, weight is its score
--------------------------------------------------------------------------------
/ExtLocation.py:
--------------------------------------------------------------------------------
import os
# from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer
# LTP_DATA_DIR = 'E:\\ltp_data_v3.4.0'  # path of the LTP model directory
# cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # word-segmentation model, file name `cws.model`
# pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # part-of-speech tagging model, file name `pos.model`
#
# segmenter = Segmentor()
# segmenter.load_with_lexicon(cws_model_path, 'E:\\ltp_data_v3.4.0\\personal_seg.txt')
#
# postaggor = Postagger()
# postaggor.load_with_lexicon(pos_model_path, 'E:\\ltp_data_v3.4.0\\personal_pos.txt')

def ExtbySentence(sentence, segmentor, postagger):
    words = segmentor.segment(sentence)
    postags = postagger.postag(words)
    location = ''
    for i in range(len(postags)):
        if postags[i] == 'ns':  # 'ns' is LTP's tag for place names
            location = words[i]
            # print(location)
    return location

def ExtLocation(filename, segmentor, postagger):
    """Extract the place name that a corpus file belongs to."""
    # ExtbySentence returns '' when a sentence contains no place name, so keep reading
    # sentences until one is found (the original `while loc == None` test stopped after
    # the first sentence, because '' is not None).
    loc = ''
    with open(filename, 'r', encoding='utf-8') as file:
        for sentence in file:
            loc = ExtbySentence(sentence, segmentor, postagger)
            if loc:
                break
    return loc
--------------------------------------------------------------------------------
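ExtLocation expects pyltp models that are already loaded (ExtRelation.py passes in its own). A minimal usage sketch, assuming the repo's model directory and an illustrative corpus file name:

# Sketch only: load the segmenter/POS tagger once, then reuse them per file.
import os
from pyltp import Segmentor, Postagger
import ExtLocation

LTP_DATA_DIR = 'E:\\ltp_data_v3.4.0'
segmentor = Segmentor()
segmentor.load(os.path.join(LTP_DATA_DIR, 'cws.model'))
postagger = Postagger()
postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))

print(ExtLocation.ExtLocation('E:\\医疗保险语料待解析\\1.txt', segmentor, postagger))

segmentor.release()
postagger.release()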
/test2.py:
--------------------------------------------------------------------------------
# import os
#
# file = 'E:\\医疗保险语料库\\领域词典.txt'
# fo = open(file, 'r', encoding='utf-8')
# e = []
# for l in fo.readlines():
#     e.append(l.strip('\n'))
# e.sort(key=lambda x: len(x))
# fo.close()
# e.reverse()
# ff = open(file, 'w', encoding='utf-8')
# for ee in e:
#     ff.write(ee + '\n')
# ff.close()

# count the rows of every csv file in a folder (multi-threaded)
import threading
import csv
import os


class MyThreadLine(threading.Thread):  # thread class that counts the rows of one csv file
    def __init__(self, path):
        threading.Thread.__init__(self)  # initialise the parent class
        self.path = path  # file path
        self.line = -1  # row count

    def run(self):
        reader = csv.reader(open(self.path, "r", encoding='utf-8'))  # read the csv file
        lines = 0
        for item in reader:  # iterate over every row
            lines += 1
        self.line = lines  # store the count
        print(self.getName(), self.line)


path = "E:\\实体关系抽取2"  # folder holding all the csv files
filelist = os.listdir(path)  # all the csv file names
threadlist = []  # list of threads
for filename in filelist:
    newpath = path + "\\" + filename  # absolute path
    mythd = MyThreadLine(newpath)  # create the thread object
    mythd.start()  # start counting
    threadlist.append(mythd)  # remember the thread
for mythd in threadlist:  # for every thread
    mythd.join()  # wait for all threads to finish before going on
linelist = []  # per-file row counts
for mythd in threadlist:
    linelist.append(mythd.line)
print(linelist)
ans = 0
for l in linelist:
    ans += l
print(ans)
--------------------------------------------------------------------------------
/SentenceSplit.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from pyltp import SentenceSplitter
from bs4 import BeautifulSoup
import os

filepath0 = 'E:\\医疗保险语料库\\医疗保险语料原文\\'
outpath = 'E:\\MedicareCorpus\\'
filepath1 = os.listdir(filepath0)
files = []
for path in filepath1:
    fp = filepath0 + path
    f = os.listdir(fp)
    for fi in f:
        files.append(fp + '\\' + fi)
k = 1

# sentence-splitting helper
def sentence_splitter(sentence):
    sents = SentenceSplitter.split(sentence)
    sents_list = list(sents)
    return sents_list

# inputpath = "E:\\MedicareCorpus"
# outputpath = "E:\\医疗保险语料"
# k = 1
# inputfiles = os.listdir(inputpath)
# print(inputfiles)
# for file in inputfiles:
#     # open the file to be split
#     fname = inputpath + "\\" + file
#     f = open(fname, 'r', encoding='utf-8')
#     # open the output file
#     opath = outputpath + "\\" + str(k) + ".txt"
#     outputfile = open(opath, 'w', encoding='utf-8')
#     for line in f.readlines():
#         sents = sentence_splitter(line)
#         for sent in sents:
#             if len(sent) > 1:
#                 outputfile.write(sent + '\n')
#     f.close()
#     outputfile.close()
#     k = k + 1

for file in files:
    try:
        fobj = open(file, 'r', encoding='utf-8')
        bsobj = BeautifulSoup(fobj.read(), "lxml")
        s = bsobj.text
        slist = sentence_splitter(s)
        outfile = outpath + str(k) + '.txt'
        with open(outfile, 'w', encoding='utf-8') as f:
            for ss in slist:
                if len(ss) > 1:
                    ss = ss.strip()
                    ss = ss.strip('\n')
                    f.write(ss + '\n')
        k += 1
    except UnicodeDecodeError:
        print(file)
--------------------------------------------------------------------------------
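For a quick sanity check of the splitter used above, SentenceSplitter.split works on any plain string; the sample sentence here is invented:

# Quick check of the splitter on an arbitrary two-sentence string.
from pyltp import SentenceSplitter
print(list(SentenceSplitter.split('参保人员应当按时缴费。未按时缴费的,暂停待遇。')))
# roughly: ['参保人员应当按时缴费。', '未按时缴费的,暂停待遇。']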
/CrawlMedicine.py:
--------------------------------------------------------------------------------
# -*- coding: UTF-8 -*-
from urllib.request import urlopen
from bs4 import BeautifulSoup
from multiprocessing.pool import Pool
import csv
import time
import requests

baseurl = 'https://www.zgylbx.com/index.php?m=content&c=index&a=lists&catid=105&page='
list = []  # note: shadows the built-in list; kept for compatibility with the code below
k = 0

def CrawlHospital(pagenum):
    print(pagenum)
    pageurl = baseurl + str(pagenum) + "&k1=&k2=&k3=&k4="
    session = requests.Session()
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    }
    try:
        html = session.get(pageurl, headers=headers)
    except requests.exceptions.ConnectionError as e:  # requests raises its own ConnectionError, not the built-in one
        print(e)
        print(str(pagenum) + "页请求失败")
        return
    bsobj = BeautifulSoup(html.text, "html.parser")
    hos1 = bsobj.findAll("tr", {"class": " tr-dt"})
    hos2 = bsobj.findAll("tr", {"class": "tr-b tr-dt"})
    hos3 = bsobj.findAll("tr", {"class": "tr-dd dn"})
    list_hos1 = []
    list_hos2 = []

    for h in hos1:
        s = h.get_text()
        tmp = s.split('\n')
        temp = {}
        temp['Med_Name'] = tmp[1]
        temp['Med_Kind'] = tmp[2]
        temp['Med_Plc'] = tmp[3]
        list_hos1.append(temp)

    for h in hos2:
        s = h.get_text()
        tmp = s.split('\n')
        temp = {}
        temp['Med_Name'] = tmp[1]
        temp['Med_Kind'] = tmp[2]
        temp['Med_Plc'] = tmp[3]
        list_hos2.append(temp)

    # the two summary-row classes are assumed to appear in equal numbers on every page
    for i in range(len(list_hos1)):
        list.append(list_hos1[i])
        list.append(list_hos2[i])

    global k
    for h in hos3:
        s = h.get_text()
        s = s.strip()
        tmp = s.split('\n')
        # for t in tmp:
        #     print(t.strip())
        list[k]['Med_Remark'] = tmp[0].strip()
        k += 1

if __name__ == '__main__':
    for i in range(1, 800):
        # time.sleep(1)
        CrawlHospital(i)
    headers = ['Med_Name', 'Med_Kind', 'Med_Plc', 'Med_Remark']
    with open('./药品信息2.csv', 'w', encoding='utf-8', newline='') as f:  # newline='' keeps csv from inserting blank rows on Windows
        f_csv = csv.DictWriter(f, headers)
        f_csv.writeheader()
        try:
            f_csv.writerows(list)
        except UnicodeEncodeError as e:
            print(e)
--------------------------------------------------------------------------------
/Neo4j.py:
--------------------------------------------------------------------------------
from py2neo import Graph
from py2neo import Node, Relationship
import os
import csv
graph = Graph("http://127.0.0.1:7474", username="chenjialinily@outlook.com", password="123456")
# path = 'E:\\实体关系抽取\\ '
# files = os.listdir(path)

# import the hospital records into the graph database
# hos_file = 'E:\\PyCharm Project\\NLP\\医院信息2.csv'
def ImportHospital(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        f_csv = csv.reader(f)
        headers = next(f_csv)
        for row in f_csv:
            if len(row) != 0:
                str = row[1]
                s = str.split('-')
                graph.run('MERGE (p:Province {Pro_Name:{proname}}) '
                          'MERGE (c:City {City_Name:{cityname}}) '
                          'MERGE (c)-[s:属于]->(p) '
                          'CREATE (h:Hospital {Hos_Name:{hosname},Hos_Grade:{hosgrade},Hos_Speciality:{hosspe},Hos_Address:{address},Hos_PhoneNumber:{pn},Hos_Email:{email},Hos_Website:{web}})-[b:附属]->(c)'
                          , proname=s[0], cityname=s[1], hosname=row[0], hosgrade=row[2], hosspe=row[3], address=row[4], pn=row[5], email=row[6], web=row[7])

# import the medicine records into the graph database
med_file = 'E:\\PyCharm Project\\NLP\\药品信息2.csv'
def ImportMedicine(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        f_csv = csv.reader(f)
        headers = next(f_csv)
        for row in f_csv:
            if len(row) != 0:
                # the original query also tried to use {proname} as a second label, which Cypher
                # does not allow; the MERGE now matches on Pro_Name only, as ImportHospital does
                graph.run('CREATE (m:Medicine {Med_Name:{medname},Med_Kind:{medkind},Med_Remark:{remark}}) '
                          'MERGE (p:Province {Pro_Name:{proname}}) '
                          'CREATE (m)-[r:参保地区]->(p)'
                          , medname=row[0], medkind=row[1], proname=row[2], remark=row[3])

def Import(filename, cityname):
    """
    Import the given csv file of triples into the Neo4j database.
    :param filename: csv file name
    :param cityname: city the triples belong to
    :return:
    """
    with open(filename, 'r', encoding='utf-8') as f:
        f_csv = csv.reader(f)
        headers = next(f_csv)
        for row in f_csv:
            graph.run('MERGE (a:Entity1 {Entity_Name:{e1name},City_Name:{cityname}}) '
                      'MERGE (b:Entity2 {Entity_Name:{e2name},City_Name:{cityname}}) '
                      'CREATE (a)-[r:Relation {name:{rname},City_Name:{cityname},Detail:{detail}}]->(b)'
                      , e1name=row[0], cityname=cityname, rname=row[1], e2name=row[2], detail=row[3])

# import the extracted triples
# for file in files:
#     print(file)
#     filepath = path + file
#     city = file.strip('.csv')
#     Import(filepath, city)

# data = graph.data("MATCH (h:Hospital)-[]->(c:City) WHERE c.City_Name='长沙市' RETURN h.Hos_Name")
# print(data)
ImportMedicine(med_file)
--------------------------------------------------------------------------------
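The `{param}` placeholders in Neo4j.py target older Neo4j/py2neo releases; Neo4j 4+ expects `$param`. A hedged sketch of ImportMedicine's statement in that newer style (illustrative helper, not part of the original module):

# Illustrative only: the same MERGE/CREATE pattern using Neo4j 4+ parameter syntax.
from py2neo import Graph

def import_medicine_row(graph: Graph, row):
    graph.run('CREATE (m:Medicine {Med_Name: $medname, Med_Kind: $medkind, Med_Remark: $remark}) '
              'MERGE (p:Province {Pro_Name: $proname}) '
              'CREATE (m)-[:参保地区]->(p)',
              medname=row[0], medkind=row[1], proname=row[2], remark=row[3])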
/CrawHospital.py:
--------------------------------------------------------------------------------
# -*- coding: UTF-8 -*-
from urllib.request import urlopen
from bs4 import BeautifulSoup
from multiprocessing.pool import Pool
import csv
import time
import requests
k = 0
baseurl = 'https://www.zgylbx.com/index.php?m=content&c=index&a=lists&catid=106&page='
list = []  # note: shadows the built-in list; kept as in CrawlMedicine.py


def CrawlHospital(pagenum):
    print(pagenum)
    pageurl = baseurl + str(pagenum) + "&k1=&k2=&k3=&k4="
    session = requests.Session()
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    }
    try:
        html = session.get(pageurl, headers=headers)
    except requests.exceptions.ConnectionError as e:  # requests raises its own ConnectionError, not the built-in one
        print(e)
        print(str(pagenum) + "页请求失败")
        return
    bsobj = BeautifulSoup(html.text, "html.parser")
    hos1 = bsobj.findAll("tr", {"class": " tr-dt"})
    hos2 = bsobj.findAll("tr", {"class": "tr-b tr-dt"})
    hos3 = bsobj.findAll("tr", {"class": "tr-dd dn"})
    list_hos1 = []
    list_hos2 = []

    for h in hos1:
        s = h.get_text()
        tmp = s.split('\n')
        temp = {}
        temp['Hos_Name'] = tmp[1]
        temp['Hos_City'] = tmp[2]
        temp['Hos_Grade'] = tmp[3]
        temp['Hos_Speciality'] = tmp[4]
        # hos_info.append(tuple(temp))
        list_hos1.append(temp)

    for h in hos2:
        s = h.get_text()
        tmp = s.split('\n')
        temp = {}
        temp['Hos_Name'] = tmp[1]
        temp['Hos_City'] = tmp[2]
        temp['Hos_Grade'] = tmp[3]
        temp['Hos_Speciality'] = tmp[4]
        # hos_info.append(tuple(temp))
        list_hos2.append(temp)

    # the two summary-row classes are assumed to appear in equal numbers on every page
    for i in range(len(list_hos1)):
        list.append(list_hos1[i])
        list.append(list_hos2[i])

    global k
    for h in hos3:
        s = h.get_text()
        s = s.strip()
        tmp = s.split('\n')
        # for t in tmp:
        #     print(t.strip())
        list[k]['Hos_Address'] = tmp[0].strip()
        list[k]['Hos_PhoneNumber'] = tmp[1].strip()
        list[k]['Hos_Email'] = tmp[2].strip()
        list[k]['Hos_Website'] = tmp[3].strip()
        k += 1


if __name__ == '__main__':
    for i in range(1, 1530):
        # time.sleep(1)
        CrawlHospital(i)
    headers = ['Hos_Name', 'Hos_City', 'Hos_Grade', 'Hos_Speciality', 'Hos_Address', 'Hos_PhoneNumber', 'Hos_Email', 'Hos_Website']
    with open('./医院信息2.csv', 'w', encoding='utf-8', newline='') as f:  # newline='' keeps csv from inserting blank rows on Windows
        f_csv = csv.DictWriter(f, headers)
        f_csv.writeheader()
        try:
            f_csv.writerows(list)
        except UnicodeEncodeError as e:
            print(e)
--------------------------------------------------------------------------------
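Both crawlers pair the two summary-row classes by index and then attach detail rows through a module-level counter, which silently misaligns if a page ever returns unequal row counts. A hedged per-page alternative (illustrative function and field name; the equal-length assumption is the same, just made explicit):

# Illustrative: interleave the two summary-row lists and attach detail rows page-locally,
# so a short page raises a clear error instead of corrupting the global counter.
def merge_page(list_hos1, list_hos2, details):
    merged = []
    for a, b in zip(list_hos1, list_hos2):
        merged.append(a)
        merged.append(b)
    if len(details) != len(merged):
        raise ValueError('summary/detail row count mismatch on this page')
    for rec, det in zip(merged, details):
        rec['Hos_Remark'] = det  # field name illustrative; the real scripts fill Med_/Hos_ columns
    return merged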
/Pyltp.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import os
LTP_DATA_DIR = 'E:\\ltp_data_v3.4.0'  # path of the LTP model directory
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # word-segmentation model, file name `cws.model`
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # part-of-speech tagging model, file name `pos.model`
ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')  # named-entity recognition model, file name `ner.model`
par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parsing model, file name `parser.model`
srl_model_path = os.path.join(LTP_DATA_DIR, 'pisrl_win.model')  # semantic role labelling model

contents = '各档次大病医疗保险所需资金从相应档次基本医疗保险统筹基金中划拨'

def SrlFunction(contents):
    from pyltp import Segmentor
    segmentor = Segmentor()  # create the instance
    # segmentor.load(cws_model_path)  # load the model
    segmentor.load_with_lexicon(cws_model_path, 'E:\\ltp_data_v3.4.0\\personal_seg.txt')
    words = segmentor.segment(contents)  # word segmentation
    k = 1
    for word in words:
        print(word + str(k) + ' ', end='')
        k = k + 1
    print('\n')
    # print('\t'.join(words))
    segmentor.release()  # release the model
    wordslist = list(words)

    from pyltp import Postagger
    postagger = Postagger()
    # postagger.load(pos_model_path)
    # note: this lexicon path points at D:\ although LTP_DATA_DIR above is E:\ — adjust to the actual layout
    postagger.load_with_lexicon(pos_model_path, 'D:\\ltp_data_v3.4.0\\personal_pos.txt')
    postags = postagger.postag(wordslist)
    print('\t'.join(postags))
    postagger.release()

    # wordslist = ['人力资源社会保障局', '主管', '医疗保险', '工作']
    # postags = ['n', 'v', 'n', 'v']

    from pyltp import Parser
    parser = Parser()  # create the instance
    parser.load(par_model_path)  # load the model
    arcs = parser.parse(wordslist, postags)  # dependency parsing
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()  # release the model

    from pyltp import SementicRoleLabeller
    labeller = SementicRoleLabeller()  # create the instance
    labeller.load(srl_model_path)  # load the model
    # arcs is the dependency-parse result from above
    roles = labeller.label(wordslist, postags, arcs)  # semantic role labelling

    # print the result
    for role in roles:
        print(role.index, "".join(["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    labeller.release()  # release the model

    # A1 = []
    # A0 = []
    # Op = []
    # for role in roles:
    #     k = 0
    #     a0 = ''
    #     a1 = ''
    #     for arg in role.arguments:
    #         if arg.name == 'A0':
    #             a0 = ''.join(wordslist[arg.range.start:arg.range.end])
    #             k = k + 1
    #         if arg.name == 'A1':
    #             a1 = ''.join(wordslist[arg.range.start:arg.range.end])
    #             k = k + 1
    #     if k == 2:
    #         A0.append(a0)
    #         A1.append(a1)
    #         Op.append(wordslist[role.index])
    #
    # for (a0, o, a1) in zip(A0, Op, A1):
    #     print(a0 + '-' + o + '-' + a1)

# with open('./sentences.txt', 'r', encoding='utf-8') as f:
#     for line in f.readlines():
#         SrlFunction(line)
SrlFunction(contents)
--------------------------------------------------------------------------------
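The personal_seg.txt / personal_pos.txt lexicons loaded above are external files not included in this dump. As an assumption-labelled sketch, they could be generated from 领域词典.txt (which FileDispose.Dictionary() writes as one `词 n` line per term); the one-term-per-line and `term POS` formats are assumptions — check them against the pyltp documentation:

# Assumption-labelled sketch: derive both lexicons from 领域词典.txt.
def build_lexicons(dict_path='E:\\医疗保险语料库\\领域词典.txt',
                   seg_path='E:\\ltp_data_v3.4.0\\personal_seg.txt',
                   pos_path='E:\\ltp_data_v3.4.0\\personal_pos.txt'):
    with open(dict_path, 'r', encoding='utf-8') as src, \
         open(seg_path, 'w', encoding='utf-8') as seg, \
         open(pos_path, 'w', encoding='utf-8') as pos:
        for line in src:
            parts = line.split()
            if not parts:
                continue
            term = parts[0]
            seg.write(term + '\n')    # segmenter lexicon: one term per line (assumed format)
            pos.write(term + ' n\n')  # tagger lexicon: "term POS" per line (assumed format)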
/ExtEntity.py:
--------------------------------------------------------------------------------
# -*- encoding: utf-8 -*-
from __future__ import print_function, unicode_literals
from bosonnlp import BosonNLP
import os
import re
# import ExtLocation

nlp = BosonNLP('YGrN2PDi.22096.Qp-06MnNaWED')

def ExtOrg(wordlist, taglist, filepath):
    # tags that make a token an entity on its own, and centre words that anchor pattern matching
    centretag = ['#nr', '#ns', '#nt', '#m', '#nz', '#t']
    centreword = ['机构', '部门', '局', '管理局', '站', '服务站', '中心', '保险', '对象', '基金', '费用', '资金', '公司', '人群', '人员', '保险费']
    # one backwards tag pattern per centre word in centreword
    relist = [r'#n(#n)+(#b|#nz)*', r'#n(#n)+(#an|#nz)?', r'#n(#n)+', r'#n(#n)+', r'#n#an(#n)+', r'#n#an(#n)+', r'#n(#n)+(#an)?(#n)*', r'#n(#n)+(#a(#n)*)?', r'(#n)+(#a(#n)*)?', r'#n(#n)+(#a)?(#n)+', r'#n(#n)+(#a|#b)?', r'#n(#n)+(#a)?', r'#n(#n)+', r'(#n)+(#a)?(#n)?', r'#n(#n)+((#b)+|(#a)*|#an(#n)+)', r'#n(#n)+(#v|#a(#n)+)']
    ecrelist = [r'[0-9]+']
    entities = set()
    for it1, it2 in zip(wordlist, taglist):
        if it2 in centretag:
            if it2 == '#m':
                for i in range(len(ecrelist)):
                    pat = re.compile(ecrelist[i])
                    m = pat.match(it1)
                    if m:
                        entities.add(it1)
                        break
            else:
                entities.add(it1)

    for i in range(0, len(wordlist)):
        for k in range(0, len(centreword)):
            if wordlist[i] == centreword[k]:
                s = ''
                w = []
                # walk backwards at most 10 tokens; max(i - 10, -1) stops at the start
                # of the list instead of wrapping around to the end
                for j in range(i, max(i - 10, -1), -1):
                    s = s + taglist[j]
                    w.append(wordlist[j])
                # print(s)
                pattern = re.compile(relist[k])
                m = pattern.match(s)
                # print(m)
                if m:
                    cm = m.group()
                    n = cm.count('#')
                    str = ''
                    for l in range(n - 1, -1, -1):
                        str = str + w[l]
                    # print(str)
                    entities.add(str)
    file = open(filepath, 'w', encoding='utf-8')
    for en in entities:
        file.write(en + '\n')
    file.close()


def ExtEntity(in_file, out_file):
    inputfile = open(in_file, 'r', encoding='utf-8')
    input = inputfile.read()
    result = nlp.tag(input)
    wordlist = []
    taglist = []
    for d in result:
        for it1, it2 in zip(d['word'], d['tag']):
            wordlist.append(it1)
            taglist.append('#' + it2)  # prefix every tag with '#' so the patterns above can match
    ExtOrg(wordlist, taglist, out_file)


in_filepath = 'E:\\MedicareCorpus\\'
out_filepath = 'E:\\实体抽取\\'
in_files = os.listdir(in_filepath)
ffiles = os.listdir(out_filepath)
files = []
# only process corpus files that have not been handled yet
for f in in_files:
    if f not in ffiles:
        files.append(f)
# print(files)
for file in files:
    in_file = in_filepath + file  # input file path
    out_file = out_filepath + file  # output file path
    print(file)
    ExtEntity(in_file, out_file)
# for file in files:
#     in_file = in_filepath + file  # input file path
#     out_file = out_filepath + file  # output file path
#     print(file)
#     ExtEntity(in_file, out_file)
--------------------------------------------------------------------------------
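The heart of ExtOrg is: walk backwards up to ten tokens from a centre word, concatenate their '#'-prefixed tags, regex-match that string, and rebuild the matched words in reading order. A compact sketch of just that step (helper name and sample data are illustrative):

# Illustrative helper: return the phrase whose reversed tag sequence matches `pattern`,
# ending at position i (the centre word), or None if nothing matches.
import re

def match_backwards(wordlist, taglist, i, pattern, window=10):
    tags, words = '', []
    for j in range(i, max(i - window, -1), -1):
        tags += taglist[j]
        words.append(wordlist[j])
    m = re.match(pattern, tags)
    if not m:
        return None
    n = m.group().count('#')             # number of matched tokens
    return ''.join(reversed(words[:n]))  # restore reading order

# e.g. match_backwards(['城镇', '职工', '医疗', '保险'], ['#n', '#n', '#n', '#n'], 3, r'#n(#n)+')
# -> '城镇职工医疗保险'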
/ExtEntity_UseLTP.py:
--------------------------------------------------------------------------------
# -*- encoding: utf-8 -*-
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer
import os
import re
LTP_DATA_DIR = 'E:\\ltp_data_v3.4.0'  # path of the LTP model directory
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # word-segmentation model, file name `cws.model`
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # part-of-speech tagging model, file name `pos.model`
ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')  # named-entity recognition model, file name `ner.model`

segmentor = Segmentor()
segmentor.load(cws_model_path)
postagger = Postagger()
postagger.load(pos_model_path)

def ExtOrg(wordlist, taglist, filepath):
    # LTP's tag set differs slightly from BosonNLP's used in ExtEntity.py (nh/ni/nt, a instead of an)
    centretag = ['#nh', '#ns', '#ni', '#m', '#nz', '#nt', '#j']
    centreword = ['机构', '部门', '局', '管理局', '站', '服务站', '中心', '保险', '对象', '基金', '费用', '资金', '公司', '人群', '人员', '保险费']
    relist = [r'#n(#n)+(#b|#nz)*', r'#n(#n)+(#an|#nz)?', r'#n(#n)+', r'#n(#n)+', r'#n#a(#n)+', r'#n#a(#n)+', r'#n(#n)+(#a)?(#n)*', r'#n(#n)+(#a(#n)*)?', r'(#n)+(#a(#n)*)?', r'#n(#n)+(#a)?(#n)+', r'#n(#n)+(#a|#b)?', r'#n(#n)+(#a)?', r'#n(#n)+', r'(#n)+(#a)?(#n)?', r'#n(#n)+((#b)+|(#a)*|#a(#n)+)', r'#n(#n)+(#v|#a(#n)+)']
    ecrelist = [r'[0-9]+']
    entities = set()
    for it1, it2 in zip(wordlist, taglist):
        if it2 in centretag:
            if it2 == '#m':
                for i in range(len(ecrelist)):
                    pat = re.compile(ecrelist[i])
                    m = pat.match(it1)
                    if m:
                        entities.add(it1)
                        break
            else:
                entities.add(it1)

    for i in range(0, len(wordlist)):
        for k in range(0, len(centreword)):
            if wordlist[i] == centreword[k]:
                s = ''
                w = []
                # walk backwards at most 10 tokens without wrapping past the start of the list
                for j in range(i, max(i - 10, -1), -1):
                    s = s + taglist[j]
                    w.append(wordlist[j])
                # print(s)
                pattern = re.compile(relist[k])
                m = pattern.match(s)
                # print(m)
                if m:
                    cm = m.group()
                    n = cm.count('#')
                    str = ''
                    for l in range(n - 1, -1, -1):
                        str = str + w[l]
                    # print(str)
                    entities.add(str)
    file = open(filepath, 'w', encoding='utf-8')
    for en in entities:
        file.write(en + '\n')
    file.close()


def ExtEntity(in_file, out_file):
    inputfile = open(in_file, 'r', encoding='utf-8')
    sentence = inputfile.read()
    words = segmentor.segment(sentence)
    postags = postagger.postag(words)
    postags = ['#' + t for t in postags]  # prefix every tag with '#' (built as a plain list)
    # print(' '.join(words))
    # print(' '.join(postags))
    ExtOrg(words, postags, out_file)


in_filepath = 'E:\\MedicareCorpus\\'
out_filepath = 'E:\\实体抽取LTP\\'
in_files = os.listdir(in_filepath)
ffiles = os.listdir(out_filepath)
files = []
for f in in_files:
    if f not in ffiles:
        files.append(f)

for file in files:
    in_file = in_filepath + file  # input file path
    out_file = out_filepath + file  # output file path
    print(file)
    ExtEntity(in_file, out_file)
--------------------------------------------------------------------------------
/FileDispose.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from pyltp import SentenceSplitter
from bs4 import BeautifulSoup
import os
import re
import csv, operator
from pypinyin import lazy_pinyin


def DisposeOrignalFile():
    pathHead = "E:\\医疗保险语料库\\医疗保险语料原文"
    txtpath = "E:\\MedicareCorpus2"
    pathdirs = os.listdir(pathHead)
    # print(pathdirs)
    k = 1
    filedir = []
    for dir in pathdirs:
        pathdir = pathHead + "\\" + str(dir)
        # print(pathdir)
        filedirs = os.listdir(pathdir)
        # print(filedirs)
        for files in filedirs:
            filedir.append(pathdir + "\\" + files)
    print(filedir)
    for file in filedir:
        try:
            f = open(file, 'r', encoding='utf-8')
            bsobj = BeautifulSoup(f, "html.parser")
            # open the temporary file
            p = txtpath + "\\tempfile.txt"
            fileobject = open(p, 'w', encoding='utf-8')
            fileobject.write(bsobj.text)
            fileobject.close()
            fileobject = open(p, 'r', encoding='utf-8')
            # clean the temporary file and write the final file
            fpath = txtpath + "\\" + str(k) + ".txt"
            finalfile = open(fpath, 'w', encoding='utf-8')
            for line in fileobject.readlines():
                sents = SentenceSplitter.split(line)
                for s in sents:
                    data = s.strip()
                    data = data.strip('\n')
                    if len(data) != 0:
                        finalfile.write(data + '\n')
        except UnicodeDecodeError:
            print(file + "解析失败")
            continue
        fileobject.close()
        k = k + 1
        f.close()
        finalfile.close()

def CleanSentence(sentence):
    """
    Clean a single sentence.
    :param sentence: the sentence to clean
    :return:
    """
pattern = [r'第.*(条|章)', r'([0-9]{1,2})', r'((一|二|三|四|五|六|七|八|九|十)*)', r'[0-9]{1,2}(\.|\.)' 60 | , r'(一|二|三|四|五|六|七|八|九|十|[0-9])+(、|\.|\.)', r'(◆|)', r'(?.(府|政|国|法|字|发|办|综)+.[0-9]+.综?[0-9]+号)?'] 61 | for p in pattern: 62 | sentence = re.sub(p, '', sentence) 63 | sentence = re.sub(r'( | )+', '\n', sentence) 64 | # print(sentence) 65 | return sentence 66 | 67 | def Dictionary(): 68 | file = open('E:\\医疗保险语料库\\领域词典.txt', 'r', encoding='utf-8') 69 | words = set() 70 | for line in file.readlines(): 71 | words.add(line) 72 | file.close() 73 | file2 = open('E:\\医疗保险语料库\\领域词典.txt', 'w', encoding='utf-8') 74 | for w in words: 75 | s = w.strip('\n')+ ' n'+ '\n' 76 | print(s) 77 | file2.write(s) 78 | file2.close() 79 | 80 | def PrepareText(): 81 | filepath = 'E:\\MedicareCorpus2\\' 82 | efilepath = 'E:\\实体抽取\\' 83 | outpath = 'E:\\医疗保险语料待解析\\' 84 | files = os.listdir(filepath) 85 | for file in files: 86 | print(file) 87 | entities = [] 88 | foj = open(filepath + file, 'r', encoding = 'utf-8') 89 | efoj = open(efilepath + file, 'r', encoding = 'utf-8') 90 | for e in efoj.readlines(): 91 | entities.append(e.strip('\n')) 92 | sentences = [] 93 | for line in foj.readlines(): 94 | for e in entities: 95 | if line.find(e) != -1: 96 | line = CleanSentence(line) 97 | line = line.strip() 98 | if len(line) > 1: 99 | sentences.append(line.strip('\n')) 100 | break 101 | with open(outpath + file, 'w', encoding='utf-8') as f: 102 | for s in sentences: 103 | f.write(s + '\n') 104 | foj.close() 105 | efoj.close() 106 | 107 | def PrepareEntity(filename): 108 | entities = set() 109 | with open(filename, 'r', encoding='utf-8') as f: 110 | for line in f.readlines(): 111 | entities.add(line.strip('\n')) 112 | # for e in entities2: 113 | # entities.add(e) 114 | en = [] 115 | for e in entities: 116 | en.append(e) 117 | en.sort(key=lambda x: len(x), reverse=True) 118 | with open(filename, 'w', encoding='utf-8') as ff: 119 | for e in en: 120 | ff.write(e + '\n') 121 | 122 | def SortCSVfile(filename): 123 | data = csv.reader(open(filename, 'r', encoding='utf-8')) 124 | sortedlist = sorted(data, key=lambda x:(lazy_pinyin(x[0])[0], lazy_pinyin(x[1])[0],lazy_pinyin(x[2])[0])) 125 | with open(filename, 'w', encoding='utf-8') as f: 126 | filewriter = csv.writer(f) 127 | for row in sortedlist: 128 | filewriter.writerow(row) 129 | f.close() 130 | 131 | # SortCSVfile('E:\\哈尔滨市.csv') 132 | filepath = 'E:\\实体关系抽取\\' 133 | files = os.listdir(filepath) 134 | 135 | for file in files: 136 | print(file) 137 | filename = filepath + file 138 | SortCSVfile(filename) 139 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Set your own model path 3 | MODELDIR = "D:\\ltp_data_v3.4.0" 4 | 5 | import sys 6 | import os 7 | 8 | from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer 9 | 10 | print("正在加载LTP模型... 
...") 11 | 12 | segmentor = Segmentor() 13 | segmentor.load(os.path.join(MODELDIR, "cws.model")) 14 | # segmentor.load_with_lexicon(os.path.join(MODELDIR, "cws.model"),'D:\\ltp_data_v3.4.0\\personal_seg.txt') 15 | 16 | postagger = Postagger() 17 | postagger.load(os.path.join(MODELDIR, "pos.model")) 18 | 19 | parser = Parser() 20 | parser.load(os.path.join(MODELDIR, "parser.model")) 21 | 22 | recognizer = NamedEntityRecognizer() 23 | recognizer.load(os.path.join(MODELDIR, "ner.model")) 24 | 25 | 26 | print("加载模型完毕。") 27 | 28 | in_file_name = "input.txt" 29 | out_file_name = "output.txt" 30 | begin_line = 1 31 | end_line = 0 32 | 33 | 34 | def extraction_start(in_file_name, out_file_name, begin_line, end_line): 35 | """ 36 | 事实三元组抽取的总控程序 37 | Args: 38 | in_file_name: 输入文件的名称 39 | #out_file_name: 输出文件的名称 40 | begin_line: 读文件的起始行 41 | end_line: 读文件的结束行 42 | """ 43 | in_file = open(in_file_name, 'r', encoding='utf-8') 44 | out_file = open(out_file_name, 'w', encoding='utf-8') 45 | 46 | for line in in_file: 47 | fact_triple_extract(line.strip(), out_file) 48 | in_file.close() 49 | out_file.close() 50 | 51 | 52 | def fact_triple_extract(sentence, out_file): 53 | """ 54 | 对于给定的句子进行事实三元组抽取 55 | Args: 56 | sentence: 要处理的语句 57 | """ 58 | # print sentence 59 | words = segmentor.segment(sentence) 60 | print("\t".join(words)) 61 | postags = postagger.postag(words) 62 | print("\t".join(postags)) 63 | netags = recognizer.recognize(words, postags) 64 | arcs = parser.parse(words, postags) 65 | print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)) 66 | 67 | child_dict_list = build_parse_child_dict(words, postags, arcs) 68 | for index in range(len(postags)): 69 | # 抽取以谓词为中心的事实三元组 70 | if postags[index] == 'v': 71 | child_dict = child_dict_list[index] 72 | # 主谓宾 73 | if 'SBV' in child_dict and 'VOB' in child_dict: 74 | e1 = complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) 75 | r = words[index] 76 | e2 = complete_e(words, postags, child_dict_list, child_dict['VOB'][0]) 77 | out_file.write("主语谓语宾语关系\t(%s, %s, %s)\n" % (e1, r, e2)) 78 | out_file.flush() 79 | # 定语后置,动宾关系 80 | if arcs[index].relation == 'ATT': 81 | if 'VOB' in child_dict: 82 | e1 = complete_e(words, postags, child_dict_list, arcs[index].head - 1) 83 | r = words[index] 84 | e2 = complete_e(words, postags, child_dict_list, child_dict['VOB'][0]) 85 | temp_string = r + e2 86 | if temp_string == e1[:len(temp_string)]: 87 | e1 = e1[len(temp_string):] 88 | if temp_string not in e1: 89 | out_file.write("定语后置动宾关系\t(%s, %s, %s)\n" % (e1, r, e2)) 90 | out_file.flush() 91 | # 含有介宾关系的主谓动补关系 92 | if 'SBV' in child_dict and 'CMP' in child_dict: 93 | # e1 = words[child_dict['SBV'][0]] 94 | e1 = complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) 95 | cmp_index = child_dict['CMP'][0] 96 | r = words[index] + words[cmp_index] 97 | if 'POB' in child_dict_list[cmp_index]: 98 | e2 = complete_e(words, postags, child_dict_list, child_dict_list[cmp_index]['POB'][0]) 99 | out_file.write("介宾关系主谓动补\t(%s, %s, %s)\n" % (e1, r, e2)) 100 | out_file.flush() 101 | 102 | # 尝试抽取命名实体有关的三元组 103 | if netags[index][0] == 'S' or netags[index][0] == 'B': 104 | ni = index 105 | if netags[ni][0] == 'B': 106 | while netags[ni][0] != 'E': 107 | ni += 1 108 | e1 = ''.join(words[index:ni + 1]) 109 | else: 110 | e1 = words[ni] 111 | if arcs[ni].relation == 'ATT' and postags[arcs[ni].head - 1] == 'n' and netags[arcs[ni].head - 1] == 'O': 112 | r = complete_e(words, postags, child_dict_list, arcs[ni].head - 1) 113 | if e1 in r: 114 | r = r[(r.index(e1) 
+ len(e1)):] 115 | if arcs[arcs[ni].head - 1].relation == 'ATT' and netags[arcs[arcs[ni].head - 1].head - 1] != 'O': 116 | e2 = complete_e(words, postags, child_dict_list, arcs[arcs[ni].head - 1].head - 1) 117 | mi = arcs[arcs[ni].head - 1].head - 1 118 | li = mi 119 | if netags[mi][0] == 'B': 120 | while netags[mi][0] != 'E': 121 | mi += 1 122 | e = ''.join(words[li + 1:mi + 1]) 123 | e2 += e 124 | if r in e2: 125 | e2 = e2[(e2.index(r) + len(r)):] 126 | if r + e2 in sentence: 127 | out_file.write("人名//地名//机构\t(%s, %s, %s)\n" % (e1, r, e2)) 128 | out_file.flush() 129 | 130 | 131 | def build_parse_child_dict(words, postags, arcs): 132 | """ 133 | 为句子中的每个词语维护一个保存句法依存儿子节点的字典 134 | Args: 135 | words: 分词列表 136 | postags: 词性列表 137 | arcs: 句法依存列表 138 | """ 139 | child_dict_list = [] 140 | for index in range(len(words)): 141 | child_dict = dict() 142 | for arc_index in range(len(arcs)): 143 | if arcs[arc_index].head == index + 1: 144 | keys = child_dict.keys() 145 | if arcs[arc_index].relation in keys: 146 | child_dict[arcs[arc_index].relation].append(arc_index) 147 | else: 148 | child_dict[arcs[arc_index].relation] = [] 149 | child_dict[arcs[arc_index].relation].append(arc_index) 150 | # if child_dict.has_key('SBV'): 151 | # print words[index],child_dict['SBV'] 152 | child_dict_list.append(child_dict) 153 | return child_dict_list 154 | 155 | 156 | def complete_e(words, postags, child_dict_list, word_index): 157 | """ 158 | 完善识别的部分实体 159 | """ 160 | child_dict = child_dict_list[word_index] 161 | prefix = '' 162 | 163 | if 'ATT' in child_dict: 164 | for i in range(len(child_dict['ATT'])): 165 | prefix += complete_e(words, postags, child_dict_list, child_dict['ATT'][i]) 166 | 167 | postfix = '' 168 | if postags[word_index] == 'v': 169 | if 'VOB' in child_dict: 170 | postfix += complete_e(words, postags, child_dict_list, child_dict['VOB'][0]) 171 | if 'SBV' in child_dict: 172 | prefix = complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix 173 | 174 | return prefix + words[word_index] + postfix 175 | 176 | 177 | if __name__ == "__main__": 178 | extraction_start(in_file_name, out_file_name, begin_line, end_line) 179 | -------------------------------------------------------------------------------- /ExtRelation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pyltp import Segmentor, Postagger, Parser 3 | import os 4 | import csv 5 | import ExtLocation 6 | LTP_DATA_DIR = 'E:\\ltp_data_v3.4.0' # ltp模型目录的路径 7 | cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model') # 分词模型路径,模型名称为`cws.model` 8 | pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model') # 词性标注模型路径,模型名称为`pos.model` 9 | par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model') # 依存句法分析模型路径,模型名称为`parser.model` 10 | 11 | segmentor = Segmentor() 12 | segmentor.load_with_lexicon(cws_model_path,'E:\\ltp_data_v3.4.0\\personal_seg.txt') 13 | 14 | postagger = Postagger() 15 | postagger.load_with_lexicon(pos_model_path,'E:\\ltp_data_v3.4.0\\personal_pos.txt') 16 | 17 | parser = Parser() 18 | parser.load(par_model_path) 19 | 20 | #输入文件 21 | in_file_path = 'E:\\医疗保险语料待解析\\' 22 | in_files_name = os.listdir(in_file_path) 23 | #实体集 24 | entity_file_path = 'E:\\实体抽取\\' 25 | out_file_path = 'E:\\实体关系抽取2\\' 26 | 27 | def extraction_start(in_file_name, out_file_name, entity_file_name): 28 | """ 29 | 总控程序 30 | :param in_file_name: 输入文件名 31 | :param out_file_name: 输出文件名 32 | :param entity_file_name: 实体文件名 33 | :return: 34 | """ 35 | #获取实体 36 | entities = [] 37 | 
entity_file = open(entity_file_name, 'r', encoding='utf-8') 38 | for e in entity_file.readlines(): 39 | entities.append(e.strip('\n')) 40 | entity_file.close() 41 | #输入文件 42 | in_file = open(in_file_name, 'r', encoding='utf-8') 43 | rows = [] 44 | for line in in_file.readlines(): 45 | fact_triple_extract(line.strip('\n'), rows) 46 | in_file.close() 47 | #对写入数据进行清洗 48 | clean_rows(entities, rows) 49 | write_out_file(out_file_name, rows) 50 | 51 | def fact_triple_extract(sentence, rows): 52 | """ 53 | 抽取实体关系三元组 54 | :param sentence: 待抽取的句子 55 | :param rows: 字典序列 56 | :return: 57 | """ 58 | global segmentor,postagger,parser 59 | words = segmentor.segment(sentence) 60 | # print("\t".join(words)) 61 | postags = postagger.postag(words) 62 | # print('\t'.join(postags)) 63 | arcs = parser.parse(words, postags) 64 | # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)) 65 | 66 | child_dict_list = build_parse_child_dict(words, arcs) 67 | for index in range(len(postags)): 68 | # 抽取以谓词为中心的事实三元组 69 | if postags[index] == 'v': 70 | child_dict = child_dict_list[index] 71 | # 主谓宾 72 | if 'SBV' in child_dict and 'VOB' in child_dict: 73 | e1_index = child_dict['SBV'][0] 74 | e1_child_dict = child_dict_list[e1_index] 75 | e1 = [] 76 | if 'COO' in e1_child_dict.keys() and arcs[index].relation == 'HED': 77 | for i in range(len(e1_child_dict['COO'])): 78 | e1.append(complete_e(words, postags, child_dict_list, e1_child_dict['COO'][i])) 79 | e1.append(complete_e(words, postags, child_dict_list, child_dict['SBV'][0])) 80 | r = complete_r(words, postags, child_dict_list, index) 81 | e2_index = child_dict['VOB'][0] 82 | e2_child_dict = child_dict_list[e2_index] 83 | e2 = [] 84 | e2.append(complete_e(words, postags, child_dict_list, e2_index)) 85 | if 'COO' in e2_child_dict.keys(): 86 | for i in range(len(e2_child_dict['COO'])): 87 | e2.append(complete_e(words, postags, child_dict_list, e2_child_dict['COO'][i])) 88 | for e_1 in e1: 89 | for e_2 in e2: 90 | relation = dict() 91 | relation['e1'] = e_1 92 | relation['r'] = r 93 | relation['e2'] = e_2 94 | relation['sentence'] = sentence 95 | rows.append(relation) 96 | #前置宾语 97 | if 'FOB' in child_dict and 'ADV' in child_dict: 98 | e1_index = child_dict['ADV'][0] 99 | e1_index_dict = child_dict_list[e1_index] 100 | e1 = [] 101 | if 'POB' in e1_index_dict: 102 | for i in range(len(e1_index_dict['POB'])): 103 | e1_coo_index = e1_index_dict['POB'][i] 104 | e1.append(complete_e(words, postags, child_dict_list, e1_coo_index)) 105 | e1_coo_child_dict = child_dict_list[e1_coo_index] 106 | if 'COO' in e1_coo_child_dict.keys(): 107 | for i in range(len(e1_coo_child_dict['COO'])): 108 | e1.append(complete_e(words, postags, child_dict_list, e1_coo_child_dict['COO'][i])) 109 | r = complete_r(words, postags, child_dict_list, index) 110 | e2 = complete_e(words, postags, child_dict_list, child_dict['FOB'][0]) 111 | for e0 in e1: 112 | relation = dict() 113 | relation['e1'] = e0 114 | relation['r'] = r 115 | relation['e2'] = e2 116 | relation['sentence'] = sentence 117 | rows.append(relation) 118 | 119 | if arcs[index].relation == 'HED': 120 | child_dict = child_dict_list[index] 121 | #介宾关系 122 | if 'SBV' in child_dict and 'POB' in child_dict: 123 | e1 = complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) 124 | eindex = child_dict['POB'][0] 125 | r = complete_r(words, postags, child_dict_list, index) 126 | e2 = complete_e(words, postags, child_dict_list, eindex) 127 | relation = dict() 128 | relation['e1'] = e1 129 | relation['r'] = r 130 | relation['e2'] 
= e2 131 | relation['sentence'] = sentence 132 | rows.append(relation) 133 | 134 | def build_parse_child_dict(words, arcs): 135 | """ 136 | 为句子中的每个词语维护一个保存句法依存儿子节点的字典 137 | Args: 138 | words: 分词列表 139 | postags: 词性列表 140 | arcs: 句法依存列表 141 | """ 142 | child_dict_list = [] 143 | for index in range(len(words)): 144 | child_dict = dict() 145 | for arc_index in range(len(arcs)): 146 | if arcs[arc_index].head == index + 1: 147 | keys = child_dict.keys() 148 | if arcs[arc_index].relation in keys: 149 | child_dict[arcs[arc_index].relation].append(arc_index) 150 | else: 151 | child_dict[arcs[arc_index].relation] = [] 152 | child_dict[arcs[arc_index].relation].append(arc_index) 153 | # if child_dict.has_key('SBV'): 154 | # print words[index],child_dict['SBV'] 155 | child_dict_list.append(child_dict) 156 | return child_dict_list 157 | 158 | def complete_e(words, postags, child_dict_list, word_index): 159 | """完善论元""" 160 | child_dict = child_dict_list[word_index] 161 | prefix = '' 162 | 163 | if 'FOB' in child_dict: 164 | word_index = child_dict['FOB'][0] 165 | child_dict = child_dict_list[word_index] 166 | 167 | if 'ATT' in child_dict: 168 | for i in range(len(child_dict['ATT'])): 169 | prefix += complete_e(words, postags, child_dict_list, child_dict['ATT'][i]) 170 | 171 | postfix = '' 172 | if postags[word_index] == 'v': 173 | if 'VOB' in child_dict: 174 | postfix += complete_e(words, postags, child_dict_list, child_dict['VOB'][0]) 175 | if 'SBV' in child_dict: 176 | prefix = complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix 177 | 178 | return prefix + words[word_index] + postfix 179 | 180 | def complete_r(words, postags, child_dict_list, word_index): 181 | """完善关系""" 182 | child_dict = child_dict_list[word_index] 183 | prefix = '' 184 | if 'ADV' in child_dict: 185 | for i in range(len(child_dict['ADV'])): 186 | prefix += complete_r(words, postags, child_dict_list, child_dict['ADV'][i]) 187 | if 'ATT' in child_dict: 188 | for i in range(len(child_dict['ATT'])): 189 | prefix += complete_r(words, postags, child_dict_list, child_dict['ATT'][i]) 190 | postfix = '' 191 | if 'CMP' in child_dict: 192 | for i in range(len(child_dict['CMP'])): 193 | postfix += complete_r(words, postags, child_dict_list, child_dict['CMP'][i]) 194 | if 'VOB' in child_dict: 195 | for i in range(len(child_dict['VOB'])): 196 | postfix += complete_r(words, postags, child_dict_list, child_dict['VOB'][i]) 197 | if 'POB' in child_dict: 198 | for i in range(len(child_dict['POB'])): 199 | postfix += complete_r(words, postags, child_dict_list, child_dict['POB'][i]) 200 | 201 | return prefix + words[word_index] + postfix 202 | 203 | def write_out_file(out_file_name, rows): 204 | """ 205 | 写入输出文件 206 | :param out_file_name: 输出文件名称 207 | :param rows: 写入的数据 208 | :return: 209 | """ 210 | #输出文件 211 | headers = ['e1', 'r', 'e2', 'sentence'] 212 | with open(out_file_name, 'a', encoding='utf-8') as f: 213 | f_csv = csv.DictWriter(f, headers) 214 | f_csv.writerows(rows) 215 | f.flush() 216 | 217 | def clean_rows(entities, rows): 218 | """ 219 | 对要写入的字典序列rows进行清洗,只写入包含实体的rows 220 | :param rows: 实体关系三元组字典序 221 | :return: 222 | """ 223 | tmp = [] 224 | for row in rows: 225 | e1find = 0 226 | e2find = 0 227 | str1 = str(row['e1']) 228 | str2 = str(row['e2']) 229 | for e in entities: 230 | if str1.find(e) != -1 and e1find == 0: 231 | row['e1'] = e 232 | e1find = 1 233 | if str2.find(e) != -1 and e2find == 0: 234 | row['e2'] = e 235 | e2find = 1 236 | if e1find == 1 or e2find ==1: 237 | tmp.append(row) 238 | rows.clear() 239 
| for t in tmp: 240 | rows.append(t) 241 | 242 | if __name__ == "__main__": 243 | # in_files_name2 = ['1.txt'] 244 | ff = [] 245 | with open('./已处理文件.txt', 'r', encoding='utf-8') as f: 246 | for l in f.readlines(): 247 | ff.append(l.strip('\n')) 248 | in_files_name2 = [] 249 | for file in in_files_name: 250 | if file not in ff: 251 | in_files_name2.append(file.strip('\n')) 252 | for file in in_files_name2: 253 | print(file) 254 | # 合成输入文件位置 255 | in_file_name = in_file_path + file 256 | # 获取输入文件的省市 257 | location = ExtLocation.ExtLocation(in_file_name, segmentor, postagger) 258 | out_file_name = out_file_path + str(location) + '.csv' 259 | entity_file_name = entity_file_path + file 260 | extraction_start(in_file_name, out_file_name, entity_file_name) 261 | --------------------------------------------------------------------------------
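The CSV rows produced above (columns e1, r, e2, sentence) are what Neo4j.Import() later loads into the graph. As a small illustration of what clean_rows does to one candidate triple — the entity list and expected output are invented, and the call assumes ExtRelation.py's own namespace, since importing the module loads the LTP models:

# Illustration only: clean_rows keeps a triple when either argument contains a known
# entity, and replaces that argument with the matched entity string.
entities = ['基本医疗保险', '统筹基金']
rows = [{'e1': '各档次基本医疗保险', 'r': '划拨', 'e2': '所需资金',
         'sentence': '各档次大病医疗保险所需资金从相应档次基本医疗保险统筹基金中划拨'}]
clean_rows(entities, rows)
print(rows)  # -> e1 is normalised to '基本医疗保险', e2 stays '所需资金', and the row is kept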