├── README.md
├── baidubaike.py
├── baikecontent.py
├── hudongbaike.py
├── mergebaike.py
└── sogoubaike.py

/README.md:
--------------------------------------------------------------------------------

# BaikeInfoExtraction
Structured extraction of entry infobox information from Hudong Baike, Baidu Baike, and Sogou Baike, and fusion of the resulting encyclopedia knowledge.

--------------------------------------------------------------------------------
/baidubaike.py:
--------------------------------------------------------------------------------

#!/usr/bin/env python3
# coding: utf-8
# File: baidubaike.py
# Author: lhy
# Date: 18-3-8

from urllib import request
from lxml import etree
from urllib import parse


class BaiduBaike():
    def __init__(self):
        pass

    def get_html(self, url):
        return request.urlopen(url).read().decode('utf-8').replace(' ', '')

    def info_extract_baidu(self, word):  # Baidu Baike
        url = "http://baike.baidu.com/item/%s" % parse.quote(word)
        print(url)
        selector = etree.HTML(self.get_html(url))
        info_list = list()
        info_list.append(self.extract_baidu(selector))
        polysemantics = self.checkbaidu_polysemantic(selector)
        if polysemantics:
            info_list += polysemantics
        infos = [info for info in info_list if len(info) > 2]
        return infos

    def extract_baidu(self, selector):
        info_data = {}
        if selector.xpath('//h2/text()'):
            info_data['current_semantic'] = selector.xpath('//h2/text()')[0].replace('    ', '').replace('(', '').replace(')', '')
        else:
            info_data['current_semantic'] = ''
        if info_data['current_semantic'] == '目录':  # '目录' is the table-of-contents heading, not a word sense
            info_data['current_semantic'] = ''

        info_data['tags'] = [item.replace('\n', '') for item in selector.xpath('//span[@class="taglist"]/text()')]
        if selector.xpath("//div[starts-with(@class,'basic-info')]"):
            for li_result in selector.xpath("//div[starts-with(@class,'basic-info')]")[0].xpath('./dl'):
                attributes = [attribute.xpath('string(.)').replace('\n', '') for attribute in li_result.xpath('./dt')]
                values = [value.xpath('string(.)').replace('\n', '') for value in li_result.xpath('./dd')]
                for item in zip(attributes, values):
                    info_data[item[0].replace('    ', '')] = item[1].replace('    ', '')
        return info_data

    def checkbaidu_polysemantic(self, selector):
        semantics = ['https://baike.baidu.com' + sem for sem in
                     selector.xpath("//ul[starts-with(@class,'polysemantList-wrapper')]/li/a/@href")]
        names = [name for name in selector.xpath("//ul[starts-with(@class,'polysemantList-wrapper')]/li/a/text()")]
        info_list = []
        if semantics:
            for item in zip(names, semantics):
                selector = etree.HTML(self.get_html(item[1]))
                info_data = self.extract_baidu(selector)
                info_data['current_semantic'] = item[0].replace('    ', '').replace('(', '').replace(')', '')
                if info_data:
                    info_list.append(info_data)
        return info_list

'''
baidu = BaiduBaike()
while(1):
    word = input('enter a word:')
    baidu.info_extract_baidu(word)
'''

--------------------------------------------------------------------------------
/baikecontent.py:
--------------------------------------------------------------------------------

#!/usr/bin/env python3
# coding: utf-8
# File: baikecontent.py
# Author: lhy
# Date: 18-3-9
from hudongbaike import *
from pyltp import SentenceSplitter
import jieba.posseg as pseg


hudong = HudongBaike()

def collect_infos(word):
    infos = hudong.info_extract_hudong(word)
    for info in infos:
        intro_sents = [sent for sent in SentenceSplitter.split(info['intro']) if len(sent) > 0]
        desc_sents = [sent for sent in SentenceSplitter.split(info['desc']) if len(sent) > 0]
        print(intro_sents)
        print('****'*5)
        print(desc_sents)

def question_parser(sentence):
    filter_pos = ['n', 'd', 'm']
    # keep only segments whose part-of-speech tag starts with n, d or m
    segments = [word.word + '/' + word.flag for word in pseg.cut(sentence) if word.flag[0] in filter_pos]
    print(segments)


if __name__ == "__main__":
    while(1):
        sentence = input('enter a sentence to search:')
        #collect_infos(word)
        question_parser(sentence)

--------------------------------------------------------------------------------
/hudongbaike.py:
--------------------------------------------------------------------------------

#!/usr/bin/env python3
# coding: utf-8
# File: hudongbaike.py
# Author: lhy
# Date: 18-3-8

from urllib import request
from lxml import etree
from urllib import parse


class HudongBaike():
    def __init__(self):
        pass

    def get_html(self, url):
        return request.urlopen(url).read().decode('utf-8').replace(' ', '')

    def info_extract_hudong(self, word):  # Hudong Baike
        url = "http://www.baike.com/wiki/%s" % parse.quote(word)
        print(url)
        selector = etree.HTML(self.get_html(url))
        info_list = list()
        info_data = self.extract_hudong(selector)
        if selector.xpath('//li[@class="current"]/strong/text()'):
            info_data['current_semantic'] = selector.xpath('//li[@class="current"]/strong/text()')[0].replace('    ', '').replace('(', '').replace(')', '')
        else:
            info_data['current_semantic'] = ''
        info_list.append(info_data)
        polysemantics = self.checkhudong_polysemantic(selector)
        if polysemantics:
            info_list += polysemantics
        infos = [info for info in info_list if len(info) > 2]
        return infos

    def extract_hudong(self, selector):
        info_data = {}
        info_data['desc'] = selector.xpath('//div[@id="content"]')[0].xpath('string(.)')
        info_data['intro'] = selector.xpath('//div[@class="summary"]')[0].xpath('string(.)').replace('编辑摘要', '')  # strip the "edit summary" link text
        info_data['tags'] = [item.replace('\n', '') for item in selector.xpath('//p[@id="openCatp"]/a/text()')]
        for info in selector.xpath('//td'):
            attribute = info.xpath('./strong/text()')
            val = info.xpath('./span')
            if attribute and val:
                value = val[0].xpath('string(.)')
                info_data[attribute[0].replace(':', '')] = value.replace('\n', '').replace(' ', '').replace('    ', '')
        return info_data

    def checkhudong_polysemantic(self, selector):
        semantics = [sem for sem in selector.xpath("//ul[@id='polysemyAll']/li/a/@href") if 'doc_title' not in sem]
        names = [name for name in selector.xpath("//ul[@id='polysemyAll']/li/a/text()")]
        info_list = list()
        if semantics:
            for item in zip(names, semantics):
                selector = etree.HTML(self.get_html(item[1]))
                info_data = self.extract_hudong(selector)
                info_data['current_semantic'] = item[0].replace('(', '').replace(')', '')
                if info_data:
                    info_list.append(info_data)
        return info_list

'''Testing'''
'''
hudong = HudongBaike()
while(1):
    word = input('enter a word to search:')
    info = hudong.info_extract_hudong(word)
    for item in info:
        print(item['desc'])
'''
--------------------------------------------------------------------------------
/mergebaike.py:
--------------------------------------------------------------------------------

#!/usr/bin/env python3
# coding: utf-8
# File: mergebaike.py
# Author: lhy
# Date: 18-3-9
from baidubaike import *
from hudongbaike import *
from sogoubaike import *
import jieba

def collect_infos(word):
    baidu = BaiduBaike()
    hudong = HudongBaike()
    sogou = SougouBaike()
    merge_infos = list()
    baidu_infos = baidu.info_extract_baidu(word)
    hudong_infos = hudong.info_extract_hudong(word)
    sogou_infos = sogou.info_extract_sogou(word)
    merge_infos += baidu_infos
    merge_infos += hudong_infos
    merge_infos += sogou_infos
    return merge_infos

def merge_infos_semantic(infos):
    sems_all = [item['current_semantic'] for item in infos]
    '''merge infos by semantics'''
    update_infos = list()
    for sem in set(sems_all):
        sems_dict = {}
        for item in infos:
            if item['current_semantic'] == sem:
                sems_dict.update(item)
        update_infos.append(sems_dict)
    return update_infos

def rank_infos(infos):
    att_nums = 0
    cover = 0.0
    score_dict = {}
    ranked_infos = list()
    covered_list = []
    covered_rate = 0.6
    covered_index = 0

    for info in infos:
        att_nums += len(info)
    for index, info in enumerate(infos):
        info['score'] = len(info) / att_nums
        info['tags'] = ' '.join(info['tags'])
        score_dict[index] = info['score']
    score_dict = sorted(score_dict.items(), key=lambda asd: asd[1], reverse=True)
    '''rank the infos'''
    # keep the highest-scoring infos while their cumulative score stays below covered_rate
    for tmp in score_dict:
        cover += tmp[1]
        if cover < covered_rate:
            covered_index += 1
        else:
            continue
        ranked_infos.append(infos[tmp[0]])
    '''print'''
    for index, info in enumerate(ranked_infos):
        print(index, info['score'], info['current_semantic'], info)
    return ranked_infos, covered_index

def compute_similarity(a, b):
    return len(set(a).intersection(set(b)))

def merge_infos_sim(infos, covered_index):
    for index in range(0, covered_index):
        for index_ in range(index + 1, covered_index):
            score_attr = compute_similarity(infos[index].keys(), infos[index_].keys())
            score_val = compute_similarity(infos[index].values(), infos[index_].values())
            score_pair = compute_similarity([key + str(value) for key, value in infos[index].items()],
                                            [key + str(value) for key, value in infos[index_].items()])
            print(index, index_, score_attr, score_val, score_pair)

def merge_infos(word):
    infos = collect_infos(word)
    update_infos = merge_infos_semantic(infos)
    ranked_infos, covered_index = rank_infos(update_infos)
    return ranked_infos


if __name__ == "__main__":
    while(1):
        word = input('enter a word to search:\n')
        infos = collect_infos(word)
        update_infos = merge_infos_semantic(infos)
        ranked_infos, covered_index = rank_infos(update_infos)
        merge_infos_sim(ranked_infos, covered_index)

--------------------------------------------------------------------------------
/sogoubaike.py:
--------------------------------------------------------------------------------

#!/usr/bin/env python3
# coding: utf-8
# File: sogoubaike.py
# Author: lhy
# Date: 18-3-8
from urllib import request
from lxml import etree
from urllib import parse

class SougouBaike():
    def __init__(self):
        pass

    def get_html(self, url):
        return request.urlopen(url).read().decode('utf-8').replace(' ', '')

    def find_sogouid(self, word):
        url = "http://baike.sogou.com/Search.e?sp=S%s" % parse.quote(word)
        print(url)
        selector = etree.HTML(self.get_html(url))
        id = selector.xpath('//h2/a/@href')[0].split(';')[0]
        info_url = "http://baike.sogou.com/%s" % id
        return info_url

    def info_extract_sogou(self, word):  # Sogou Baike
        info_url = self.find_sogouid(word)
        selector = etree.HTML(self.get_html(info_url))
        info_list = list()
        info_data = self.extract_sogou(selector)
        if selector.xpath('//li[@class="current_item"]/text()'):
            info_data['current_semantic'] = selector.xpath('//li[@class="current_item"]/text()')[0].replace('    ', '').replace('(', '').replace(')', '')
        else:
            info_data['current_semantic'] = ''

        info_list.append(info_data)
        polysemantics = self.checksogou_polysemantic(selector)
        if polysemantics:
            info_list += polysemantics
        infos = [info for info in info_list if len(info) > 2]
        return infos

    def extract_sogou(self, selector):
        info_data = {}
        info_data['tags'] = [item.replace('\n', '') for item in selector.xpath('//div[@class="relevant_wrap"]/a/text()')]
        if selector.xpath('//li[@class="current_item"]/text()'):
            info_data['current_semantic'] = selector.xpath('//li[@class="current_item"]/text()')[0].replace('    ', '').replace('(', '').replace(')', '')
        else:
            info_data['current_semantic'] = ''
        tables = selector.xpath('//table[@class="abstract_list"]')
        for table in tables:
            attributes = table.xpath('./tbody/tr/th/text()')
            values = [td.xpath('string(.)') for td in table.xpath('./tbody/tr/td')]
            for item in zip(attributes, values):
                info_data[item[0].replace(' ', '').replace('\xa0', '')] = item[1].replace('    ', '')
        return info_data

    def checksogou_polysemantic(self, selector):
        semantics = ['http://baike.sogou.com' + sem.split('?')[0] for sem in selector.xpath("//ol[@class='semantic_item_list']/li/a/@href")]
        names = [name for name in selector.xpath("//ol[@class='semantic_item_list']/li/a/text()")]
        info_list = list()
        if semantics:
            for item in zip(names, semantics):
                selector = etree.HTML(self.get_html(item[1]))
                info_data = self.extract_sogou(selector)
                info_data['current_semantic'] = item[0].replace('(', '').replace(')', '')
                if info_data:
                    info_list.append(info_data)
        return info_list

'''Testing'''
'''
if __name__ == "__main__":
    baikeinfo = SougouBaike()
    while(1):
        word = input('enter a word:')
        baikeinfo.info_extract_sogou(word)
'''

--------------------------------------------------------------------------------
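
A minimal end-to-end usage sketch (hypothetical, not a file in this repository): it only strings together the functions defined in mergebaike.py above, assuming the three baike sites are still reachable and that lxml and jieba are installed.

```python
# usage_sketch.py -- hypothetical driver, not part of the original repository
from mergebaike import collect_infos, merge_infos_semantic, rank_infos, merge_infos_sim

if __name__ == "__main__":
    word = input('enter a word to search:\n')    # an encyclopedia entry name
    infos = collect_infos(word)                  # scrape Baidu / Hudong / Sogou infoboxes
    merged = merge_infos_semantic(infos)         # fuse records that share one word sense
    ranked, covered_index = rank_infos(merged)   # score records by attribute coverage
    merge_infos_sim(ranked, covered_index)       # print pairwise attribute/value overlaps
```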