├── README.md
├── baidubaike.py
├── baikecontent.py
├── hudongbaike.py
├── mergebaike.py
└── sogoubaike.py

/README.md:
--------------------------------------------------------------------------------

# BaikeInfoExtraction
Structured extraction of entry infobox information from Hudong Baike, Baidu Baike, and Sogou Baike, and fusion of the resulting encyclopedia knowledge.

--------------------------------------------------------------------------------
/baidubaike.py:
--------------------------------------------------------------------------------

#!/usr/bin/env python3
# coding: utf-8
# File: baidubaike.py
# Author: lhy
# Date: 18-3-8

from urllib import request
from lxml import etree
from urllib import parse


class BaiduBaike():
    def __init__(self):
        pass

    def get_html(self, url):
        return request.urlopen(url).read().decode('utf-8').replace(' ', '')

    def info_extract_baidu(self, word):  # Baidu Baike
        url = "http://baike.baidu.com/item/%s" % parse.quote(word)
        print(url)
        selector = etree.HTML(self.get_html(url))
        info_list = list()
        info_list.append(self.extract_baidu(selector))
        polysemantics = self.checkbaidu_polysemantic(selector)
        if polysemantics:
            info_list += polysemantics
        infos = [info for info in info_list if len(info) > 2]
        return infos

    def extract_baidu(self, selector):
        info_data = {}
        if selector.xpath('//h2/text()'):
            info_data['current_semantic'] = selector.xpath('//h2/text()')[0].replace('    ', '').replace('(', '').replace(')', '')
        else:
            info_data['current_semantic'] = ''
        if info_data['current_semantic'] == '目录':  # '目录' is the table-of-contents heading, not a word sense
            info_data['current_semantic'] = ''

        info_data['tags'] = [item.replace('\n', '') for item in selector.xpath('//span[@class="taglist"]/text()')]
        if selector.xpath("//div[starts-with(@class,'basic-info')]"):
            for li_result in selector.xpath("//div[starts-with(@class,'basic-info')]")[0].xpath('./dl'):
                attributes = [attribute.xpath('string(.)').replace('\n', '') for attribute in li_result.xpath('./dt')]
                values = [value.xpath('string(.)').replace('\n', '') for value in li_result.xpath('./dd')]
                for item in zip(attributes, values):
                    info_data[item[0].replace('    ', '')] = item[1].replace('    ', '')
        return info_data

    def checkbaidu_polysemantic(self, selector):
        semantics = ['https://baike.baidu.com' + sem for sem in
                     selector.xpath("//ul[starts-with(@class,'polysemantList-wrapper')]/li/a/@href")]
        names = [name for name in selector.xpath("//ul[starts-with(@class,'polysemantList-wrapper')]/li/a/text()")]
        info_list = []
        if semantics:
            for item in zip(names, semantics):
                selector = etree.HTML(self.get_html(item[1]))
                info_data = self.extract_baidu(selector)
                info_data['current_semantic'] = item[0].replace('    ', '').replace('(', '').replace(')', '')
                if info_data:
                    info_list.append(info_data)
        return info_list

'''
baidu = BaiduBaike()
while(1):
    word = input('enter a word:')
    baidu.info_extract_baidu(word)
'''

--------------------------------------------------------------------------------
/baikecontent.py:
--------------------------------------------------------------------------------

#!/usr/bin/env python3
# coding: utf-8
# File: baikecontent.py
# Author: lhy
# Date: 18-3-9
from hudongbaike import *
from pyltp import SentenceSplitter
import jieba.posseg as pseg


hudong = HudongBaike()

def collect_infos(word):
    infos = hudong.info_extract_hudong(word)
    for info in infos:
        intro_sents = [sent for sent in SentenceSplitter.split(info['intro']) if len(sent) > 0]
        desc_sents = [sent for sent in SentenceSplitter.split(info['desc']) if len(sent) > 0]
        print(intro_sents)
        print('****'*5)
        print(desc_sents)

def question_parser(sentence):
    filter_pos = ['n', 'd', 'm']
    # keep only segments whose part-of-speech tag starts with n, d or m
    segments = [word.word + '/' + word.flag for word in pseg.cut(sentence) if word.flag[0] in filter_pos]
    print(segments)


if __name__ == "__main__":
    while(1):
        sentence = input('enter a sentence to search:')
        #collect_infos(word)
        question_parser(sentence)

--------------------------------------------------------------------------------
/hudongbaike.py:
--------------------------------------------------------------------------------

#!/usr/bin/env python3
# coding: utf-8
# File: hudongbaike.py
# Author: lhy
# Date: 18-3-8

from urllib import request
from lxml import etree
from urllib import parse


class HudongBaike():
    def __init__(self):
        pass

    def get_html(self, url):
        return request.urlopen(url).read().decode('utf-8').replace(' ', '')

    def info_extract_hudong(self, word):  # Hudong Baike
        url = "http://www.baike.com/wiki/%s" % parse.quote(word)
        print(url)
        selector = etree.HTML(self.get_html(url))
        info_list = list()
        info_data = self.extract_hudong(selector)
        if selector.xpath('//li[@class="current"]/strong/text()'):
            info_data['current_semantic'] = selector.xpath('//li[@class="current"]/strong/text()')[0].replace('    ', '').replace('(', '').replace(')', '')
        else:
            info_data['current_semantic'] = ''
        info_list.append(info_data)
        polysemantics = self.checkhudong_polysemantic(selector)
        if polysemantics:
            info_list += polysemantics
        infos = [info for info in info_list if len(info) > 2]
        return infos

    def extract_hudong(self, selector):
        info_data = {}
        info_data['desc'] = selector.xpath('//div[@id="content"]')[0].xpath('string(.)')
        info_data['intro'] = selector.xpath('//div[@class="summary"]')[0].xpath('string(.)').replace('编辑摘要', '')  # strip the "edit summary" link text
        info_data['tags'] = [item.replace('\n', '') for item in selector.xpath('//p[@id="openCatp"]/a/text()')]
        for info in selector.xpath('//td'):
            attribute = info.xpath('./strong/text()')
            val = info.xpath('./span')
            if attribute and val:
                value = val[0].xpath('string(.)')
                info_data[attribute[0].replace(':', '')] = value.replace('\n', '').replace(' ', '').replace('    ', '')
        return info_data

    def checkhudong_polysemantic(self, selector):
        semantics = [sem for sem in selector.xpath("//ul[@id='polysemyAll']/li/a/@href") if 'doc_title' not in sem]
        names = [name for name in selector.xpath("//ul[@id='polysemyAll']/li/a/text()")]
        info_list = list()
        if semantics:
            for item in zip(names, semantics):
                selector = etree.HTML(self.get_html(item[1]))
                info_data = self.extract_hudong(selector)
                info_data['current_semantic'] = item[0].replace('(', '').replace(')', '')
                if info_data:
                    info_list.append(info_data)
        return info_list

'''Testing'''
'''
hudong = HudongBaike()
while(1):
    word = input('enter a word to search:')
    info = hudong.info_extract_hudong(word)
    for item in info:
        print(item['desc'])
'''
--------------------------------------------------------------------------------
/mergebaike.py:
--------------------------------------------------------------------------------

#!/usr/bin/env python3
# coding: utf-8
# File: mergebaike.py
# Author: lhy
# Date: 18-3-9
from baidubaike import *
from hudongbaike import *
from sogoubaike import *
import jieba

def collect_infos(word):
    baidu = BaiduBaike()
    hudong = HudongBaike()
    sogou = SougouBaike()
    merge_infos = list()
    baidu_infos = baidu.info_extract_baidu(word)
    hudong_infos = hudong.info_extract_hudong(word)
    sogou_infos = sogou.info_extract_sogou(word)
    merge_infos += baidu_infos
    merge_infos += hudong_infos
    merge_infos += sogou_infos
    return merge_infos

def merge_infos_semantic(infos):
    sems_all = [item['current_semantic'] for item in infos]
    '''merge infos by semantics'''
    update_infos = list()
    for sem in set(sems_all):
        sems_dict = {}
        for item in infos:
            if item['current_semantic'] == sem:
                sems_dict.update(item)
        update_infos.append(sems_dict)
    return update_infos

def rank_infos(infos):
    att_nums = 0
    cover = 0.0
    score_dict = {}
    ranked_infos = list()
    covered_list = []
    covered_rate = 0.6
    covered_index = 0

    for info in infos:
        att_nums += len(info)
    for index, info in enumerate(infos):
        info['score'] = len(info) / att_nums
        info['tags'] = ' '.join(info['tags'])
        score_dict[index] = info['score']
    score_dict = sorted(score_dict.items(), key=lambda asd: asd[1], reverse=True)
    '''rank the infos'''
    # keep the highest-scoring infos while their cumulative score stays below covered_rate
    for tmp in score_dict:
        cover += tmp[1]
        if cover < covered_rate:
            covered_index += 1
        else:
            continue
        ranked_infos.append(infos[tmp[0]])
    '''print'''
    for index, info in enumerate(ranked_infos):
        print(index, info['score'], info['current_semantic'], info)
    return ranked_infos, covered_index

def compute_similarity(a, b):
    return len(set(a).intersection(set(b)))

def merge_infos_sim(infos, covered_index):
    for index in range(0, covered_index):
        for index_ in range(index + 1, covered_index):
            score_attr = compute_similarity(infos[index].keys(), infos[index_].keys())
            score_val = compute_similarity(infos[index].values(), infos[index_].values())
            score_pair = compute_similarity([key + str(value) for key, value in infos[index].items()],
                                            [key + str(value) for key, value in infos[index_].items()])
            print(index, index_, score_attr, score_val, score_pair)

def merge_infos(word):
    infos = collect_infos(word)
    update_infos = merge_infos_semantic(infos)
    ranked_infos, covered_index = rank_infos(update_infos)
    return ranked_infos


if __name__ == "__main__":
    while(1):
        word = input('enter a word to search:\n')
        infos = collect_infos(word)
        update_infos = merge_infos_semantic(infos)
        ranked_infos, covered_index = rank_infos(update_infos)
        merge_infos_sim(ranked_infos, covered_index)

--------------------------------------------------------------------------------
/sogoubaike.py:
--------------------------------------------------------------------------------

#!/usr/bin/env python3
# coding: utf-8
# File: sogoubaike.py
# Author: lhy
# Date: 18-3-8
from urllib import request
from lxml import etree
from urllib import parse

class SougouBaike():
    def __init__(self):
        pass

    def get_html(self, url):
        return request.urlopen(url).read().decode('utf-8').replace(' ', '')

    def find_sogouid(self, word):
        url = "http://baike.sogou.com/Search.e?sp=S%s" % parse.quote(word)
        print(url)
        selector = etree.HTML(self.get_html(url))
        id = selector.xpath('//h2/a/@href')[0].split(';')[0]
        info_url = "http://baike.sogou.com/%s" % id
        return info_url

    def info_extract_sogou(self, word):  # Sogou Baike
        info_url = self.find_sogouid(word)
        selector = etree.HTML(self.get_html(info_url))
        info_list = list()
        info_data = self.extract_sogou(selector)
        if selector.xpath('//li[@class="current_item"]/text()'):
            info_data['current_semantic'] = selector.xpath('//li[@class="current_item"]/text()')[0].replace('    ', '').replace('(', '').replace(')', '')
        else:
            info_data['current_semantic'] = ''

        info_list.append(info_data)
        polysemantics = self.checksogou_polysemantic(selector)
        if polysemantics:
            info_list += polysemantics
        infos = [info for info in info_list if len(info) > 2]
        return infos

    def extract_sogou(self, selector):
        info_data = {}
        info_data['tags'] = [item.replace('\n', '') for item in selector.xpath('//div[@class="relevant_wrap"]/a/text()')]
        if selector.xpath('//li[@class="current_item"]/text()'):
            info_data['current_semantic'] = selector.xpath('//li[@class="current_item"]/text()')[0].replace('    ', '').replace('(', '').replace(')', '')
        else:
            info_data['current_semantic'] = ''
        tables = selector.xpath('//table[@class="abstract_list"]')
        for table in tables:
            attributes = table.xpath('./tbody/tr/th/text()')
            values = [td.xpath('string(.)') for td in table.xpath('./tbody/tr/td')]
            for item in zip(attributes, values):
                info_data[item[0].replace(' ', '').replace('\xa0', '')] = item[1].replace('    ', '')
        return info_data

    def checksogou_polysemantic(self, selector):
        semantics = ['http://baike.sogou.com' + sem.split('?')[0] for sem in selector.xpath("//ol[@class='semantic_item_list']/li/a/@href")]
        names = [name for name in selector.xpath("//ol[@class='semantic_item_list']/li/a/text()")]
        info_list = list()
        if semantics:
            for item in zip(names, semantics):
                selector = etree.HTML(self.get_html(item[1]))
                info_data = self.extract_sogou(selector)
                info_data['current_semantic'] = item[0].replace('(', '').replace(')', '')
                if info_data:
                    info_list.append(info_data)
        return info_list

'''Testing'''
'''
if __name__ == "__main__":
    baikeinfo = SougouBaike()
    while(1):
        word = input('enter a word:')
        baikeinfo.info_extract_sogou(word)
'''

--------------------------------------------------------------------------------
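
A minimal end-to-end usage sketch (hypothetical, not a file in this repository): it only strings together the functions defined in mergebaike.py above, assuming the three baike sites are still reachable and that lxml and jieba are installed.

```python
# usage_sketch.py -- hypothetical driver, not part of the original repository
from mergebaike import collect_infos, merge_infos_semantic, rank_infos, merge_infos_sim

if __name__ == "__main__":
    word = input('enter a word to search:\n')    # an encyclopedia entry name
    infos = collect_infos(word)                  # scrape Baidu / Hudong / Sogou infoboxes
    merged = merge_infos_semantic(infos)         # fuse records that share one word sense
    ranked, covered_index = rank_infos(merged)   # score records by attribute coverage
    merge_infos_sim(ranked, covered_index)       # print pairwise attribute/value overlaps
```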