├── ChineseAntiword.py
├── README.md
├── antisem.txt
└── collect_antiword.py


/ChineseAntiword.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # coding: utf-8
 3 | # File: ChineseAntiWord.py
 4 | # Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
 5 | # Date: 18-8-26
 6 | 
 7 | import os
 8 | class ChineseAntiword:
 9 |     def __init__(self):
10 |         cur_dir = '/'.join(os.path.abspath(__file__).split('/')[:-1])
11 |         antifile = os.path.join(cur_dir, 'antisem.txt')
12 |         self.antidict, self.simdict = self.build_antidict(antifile)
13 | 
14 |     '''构造反义词词典'''
15 |     def build_antidict(self, file):
16 |         antidict = {}
17 |         simdict = {}
18 |         for line in open(file):
19 |             line = line.strip().split(':')
20 |             wd = line[0]
21 |             antis = line[1].strip().split(';')
22 |             if wd not in antidict:
23 |                 antidict[wd] = antis
24 |             else:
25 |                 antidict[wd] += antis
26 | 
27 |             for anti in antis:
28 |                 if anti not in simdict:
29 |                     simdict[anti] = [i for i in antis if i != anti]
30 |                 else:
31 |                     simdict[anti] += [i for i in antis if i != anti]
32 | 
33 |         return antidict, simdict
34 | 
35 |     '''根据目标词获取反义词'''
36 |     def get_antiword(self, word):
37 |         return self.antidict.get(word, 'None')
38 | 
39 |     '''根据目标词获取近义词'''
40 |     def get_simword(self, word):
41 |         return self.simdict.get(word, 'no')
42 | 
43 | 
def demo():
    """Look up a sample word in the bundled dictionary and print the result."""
    sample = '批判'
    lookup = ChineseAntiword()
    print(lookup.get_antiword(sample))


if __name__ == "__main__":
    demo()


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # ChineseAntiword
 2 | ChineseAntiword,针对中文词语的反义词查询接口．在当前的中文信息处理当中，有大量的近义词词典，如同义词词林等，但少有反义词词典，反义词词典在构造对立语义上有很大用途，本项目目的是为提供这一接口
 3 | 
 4 | 
 5 | # 使用方式
 6 |     from ChineseAntiword import ChineseAntiword
 7 |     antiwords = ChineseAntiword().get_antiword(word)
 8 | 
 9 | 
10 | # 测试样例
11 |     word = '天才'
12 |     antiwords:['庸才', '庸人', '蠢材']
13 |     word = '快乐'
14 |     antiwords:['悲伤', '伤心', '难过', '痛苦', '烦恼', '苦恼']
15 |     word = '和蔼'
16 |     antiwords:['凶狠', '凶残', '粗暴', '凶横', '凶恶', '严厉', '蛮横']
17 |     word = '批判'
18 |     antiwords:['表扬', '表彰', '赞颂']
19 | 
20 | 
21 | # 问题
22 | 1) 基于词典的反义词查询并不可取，因为词典无法穷举所有反义词
23 | 2) 要是能够找到像word2vec这样的技术大规模地挖掘反义词，该有多好．
24 | 
25 | 
26 | 
27 | 


--------------------------------------------------------------------------------
/collect_antiword.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # coding: utf-8
 3 | # File: spider.py
 4 | # Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
 5 | # Date: 18-8-26
 6 | 
 7 | 
 8 | from urllib import request
 9 | from lxml import etree
10 | 
def get_html(url):
    """Fetch *url* and return the response body decoded as GBK text.

    Bytes that cannot be decoded as GBK are silently dropped
    (``errors='ignore'``). The response is closed as soon as the body has
    been read, so repeated calls do not leak connections.

    Args:
        url: Address to fetch; anything ``urllib.request.urlopen`` accepts.

    Returns:
        The decoded body as ``str``.
    """
    with request.urlopen(url) as response:
        return response.read().decode('GBK', 'ignore')
13 | 
def main():
    """Crawl fyc.5156edu.com and write the results to ``antisem_fyc.txt``.

    Index pages 1..74 are fetched; every ``<td><a>`` entry on an index page
    supplies a word (link text) and a detail-page link (``href``). From each
    detail page, the text nodes at ``//tr[2]/td[2]`` that contain ``(`` and
    contain neither ``。`` nor ``，`` are kept as antonyms, with ``(`` removed.
    One ``word:anti1;anti2;...`` line is written per word.

    The output file is written as UTF-8 and is closed even when a request or
    parse step raises.
    """
    site_root = 'http://fyc.5156edu.com'
    index_urls = ['http://fyc.5156edu.com/html2/%s.html' % i for i in range(1, 75)]
    with open('antisem_fyc.txt', 'w+', encoding='utf-8') as f:
        for index_url in index_urls:
            index_page = etree.HTML(get_html(index_url))
            words = list(index_page.xpath('//td/a/text()'))
            word_urls = [site_root + href for href in index_page.xpath('//td/a/@href')]
            print(len(words), len(word_urls))
            # zip() pairs words with links positionally and stops at the
            # shorter list, exactly as the printed lengths let you verify.
            for wd, word_url in zip(words, word_urls):
                detail_page = etree.HTML(get_html(word_url))
                antis = [
                    text.replace('(', '')
                    for text in detail_page.xpath('//tr[2]/td[2]/text()')
                    if '(' in text and '。' not in text and '，' not in text
                ]
                print(wd, word_url, antis)
                f.write(wd + ':' + ';'.join(antis) + '\n')
33 | 
def main2():
    """Crawl fyc.kxue.com and write the results to ``antisem_kxue.txt``.

    List pages 1..119 are fetched. On each page the words come from
    ``//span[@class="hz"]/a/text()`` and the matching antonym text from
    ``//span[@class="hz"]/span[@class="js"]/text()``; the two lists are
    paired positionally and written as ``word:antonyms`` lines.

    The output file is written as UTF-8 and is closed even when a request or
    parse step raises.
    """
    page_urls = ['http://fyc.kxue.com/list/index_%s.html' % i for i in range(1, 120)]
    with open('antisem_kxue.txt', 'w+', encoding='utf-8') as f:
        for page_url in page_urls:
            print(page_url)
            page = etree.HTML(get_html(page_url))
            wds = list(page.xpath('//span[@class="hz"]/a/text()'))
            antis = list(page.xpath('//span[@class="hz"]/span[@class="js"]/text()'))
            print(len(wds))
            print(len(antis))
            # zip() stops at the shorter list, so unmatched trailing entries
            # are dropped; the two printed lengths expose any mismatch.
            for wd, anti in zip(wds, antis):
                f.write(wd + ':' + anti + '\n')
48 | 
def merge():
    """Merge duplicate words of ``antisem2.txt`` into ``antisem_full.txt``.

    Input lines have the form ``word:anti1;anti2;...``; lines without a
    ``:`` separator (including blank lines) are skipped. Antonyms of a word
    that occurs on several lines are concatenated, then de-duplicated while
    keeping first-seen order, so the output is deterministic between runs.

    Both files are handled as UTF-8 and are closed on every path. The input
    is read completely before the output is opened, so a missing input file
    no longer leaves behind a truncated output file.
    """
    wd_dict = {}
    with open('antisem2.txt', encoding='utf-8') as src:
        for raw in src:
            parts = raw.strip().split(':')
            if len(parts) < 2:
                continue
            wd_dict.setdefault(parts[0], []).extend(parts[1].split(';'))

    with open('antisem_full.txt', 'w+', encoding='utf-8') as out:
        for wd, antis in wd_dict.items():
            # dict.fromkeys removes duplicates but, unlike set(), preserves
            # the order in which antonyms were first seen.
            out.write(wd + ':' + ';'.join(dict.fromkeys(antis)) + '\n')
68 | 
69 | #
70 | # if __name__=='__main__':
71 | #     merge()


--------------------------------------------------------------------------------