├── ChineseAntiword.py ├── README.md ├── antisem.txt └── collect_antiword.py /ChineseAntiword.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding: utf-8 3 | # File: ChineseAntiWord.py 4 | # Author: lhy 5 | # Date: 18-8-26 6 | 7 | import os 8 | class ChineseAntiword: 9 | def __init__(self): 10 | cur_dir = '/'.join(os.path.abspath(__file__).split('/')[:-1]) 11 | antifile = os.path.join(cur_dir, 'antisem.txt') 12 | self.antidict, self.simdict = self.build_antidict(antifile) 13 | 14 | '''构造反义词词典''' 15 | def build_antidict(self, file): 16 | antidict = {} 17 | simdict = {} 18 | for line in open(file): 19 | line = line.strip().split(':') 20 | wd = line[0] 21 | antis = line[1].strip().split(';') 22 | if wd not in antidict: 23 | antidict[wd] = antis 24 | else: 25 | antidict[wd] += antis 26 | 27 | for anti in antis: 28 | if anti not in simdict: 29 | simdict[anti] = [i for i in antis if i != anti] 30 | else: 31 | simdict[anti] += [i for i in antis if i != anti] 32 | 33 | return antidict, simdict 34 | 35 | '''根据目标词获取反义词''' 36 | def get_antiword(self, word): 37 | return self.antidict.get(word, 'None') 38 | 39 | '''根据目标词获取近义词''' 40 | def get_simword(self, word): 41 | return self.simdict.get(word, 'no') 42 | 43 | 44 | def demo(): 45 | handler = ChineseAntiword() 46 | word = '批判' 47 | antiwords = handler.get_antiword(word) 48 | print(antiwords) 49 | 50 | if __name__=="__main__": 51 | demo() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ChineseAntiword 2 | ChineseAntiword,针对中文词语的反义词查询接口.在当前的中文信息处理当中,有大量的近义词词典,如同义词词林等,但少有反义词词典,反义词词典在构造对立语义上有很大用途,本项目目的是为提供这一接口 3 | 4 | 5 | # 使用方式 6 | from ChineseAntiword import * 7 | antiwords = handler.get_antiword(word) 8 | 9 | 10 | # 测试样例 11 | word = '天才' 12 | antiwords:['庸才', '庸人', '蠢材'] 13 | word = '快乐' 14 | antiwords:['悲伤', '伤心', '难过', '痛苦', '烦恼', '苦恼'] 15 | word = '和蔼' 16 | antiwords:['凶狠', '凶残', '粗暴', '凶横', '凶恶', '严厉', '蛮横'] 17 | word = '批判' 18 | antiwords:['表扬', '表彰', '赞颂'] 19 | 20 | 21 | # 问题 22 | 1) 基于词典的反义词查询,不可取,取无法穷举 23 | 2) 要是能够找到像word2vec这样的技术大规模地挖掘反义词,该有多好. 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /collect_antiword.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding: utf-8 3 | # File: spider.py 4 | # Author: lhy 5 | # Date: 18-8-26 6 | 7 | 8 | from urllib import request 9 | from lxml import etree 10 | 11 | def get_html(url): 12 | return request.urlopen(url).read().decode('GBK', 'ignore') 13 | 14 | def main(): 15 | f = open('antisem_fyc.txt', 'w+') 16 | links = ['http://fyc.5156edu.com/html2/%s.html'%i for i in range(1, 75)] 17 | for link in links: 18 | html = get_html(link) 19 | selector = etree.HTML(html) 20 | words = [i for i in selector.xpath('//td/a/text()')] 21 | links = ['http://fyc.5156edu.com' + i for i in selector.xpath('//td/a/@href')] 22 | print(len(words), len(links)) 23 | word_dict = list(zip(words, links)) 24 | for i in word_dict: 25 | wd = i[0] 26 | link = i[1] 27 | html = get_html(link) 28 | selector = etree.HTML(html) 29 | antis = [i.replace('(','') for i in selector.xpath('//tr[2]/td[2]/text()') if '(' in i and '。' not in i and ',' not in i] 30 | print(wd, link, antis) 31 | f.write(wd + ':' + ';'.join(antis) + '\n') 32 | f.close() 33 | 34 | def main2(): 35 | f = open('antisem_kxue.txt', 'w+') 36 | links = ['http://fyc.kxue.com/list/index_%s.html'%i for i in range(1, 120)] 37 | for link in links: 38 | print(link) 39 | html = get_html(link) 40 | selector = etree.HTML(html) 41 | wds = [i for i in selector.xpath('//span[@class="hz"]/a/text()')] 42 | antis = [i for i in selector.xpath('//span[@class="hz"]/span[@class="js"]/text()')] 43 | print(len(wds)) 44 | print(len(antis)) 45 | for i in zip(wds, antis): 46 | f.write(i[0] + ':' + i[1] + '\n') 47 | f.close() 48 | 49 | def merge(): 50 | f = open('antisem_full.txt', 'w+') 51 | wd_dict = {} 52 | for line in open('antisem2.txt'): 53 | line = line.strip().split(':') 54 | if len(line) < 2: 55 | continue 56 | if not line: 57 | continue 58 | wd = line[0] 59 | antis = [i for i in line[1].split(';')] 60 | if wd not in wd_dict: 61 | wd_dict[wd] = antis 62 | else: 63 | wd_dict[wd] += antis 64 | 65 | for wd, anti in wd_dict.items(): 66 | f.write(wd+':' + ';'.join(list(set(anti))) + '\n') 67 | f.close() 68 | 69 | # 70 | # if __name__=='__main__': 71 | # merge() --------------------------------------------------------------------------------