├── count_ngrams
├── evaluate.py
├── README.md
└── word_discovery.py

/count_ngrams:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bojone/word-discovery/HEAD/count_ngrams
--------------------------------------------------------------------------------
/evaluate.py:
--------------------------------------------------------------------------------
#! -*- coding: utf-8 -*-

import os
import codecs
import jieba

# Use the discovered vocabulary as jieba's dictionary
# (thucnews.vocab manually truncated to the top 50000 words).
jieba.set_dictionary('thucnews.vocab')

# Warm-up call so that jieba actually loads the dictionary.
jieba.lcut(u'今天天气很不错')

# The bakeoff2005 PKU test set and the expected output are GBK-encoded.
F = codecs.open('myresult.txt', 'w', encoding='gbk')

with codecs.open('../testing/pku_test.txt', encoding='gbk') as f:
    for l in f:
        l = l.strip()
        l = ' '.join(jieba.cut(l, HMM=False))
        F.write(l + '\r\n')

F.close()

# Score the segmentation with the official bakeoff2005 scoring script.
os.system('./score ../gold/pku_training_words.txt ../gold/pku_test_gold.txt myresult.txt > score.txt')
os.system('cat score.txt')
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Faster, Better Chinese New-Word Discovery

A reimplementation of the new-word discovery algorithm from the earlier post "[Chinese Word Segmentation Series] 8. A Better New-Word Discovery Algorithm".

- Algorithm details: https://kexue.fm/archives/4256
- Reimplementation details: https://kexue.fm/archives/6920

## Results

With sufficient training, evaluating on the bakeoff2005 PKU corpus gives an F1 of 0.765, better than the 0.731 reported in the ICLR 2019 paper "Unsupervised Word Discovery with Segmental Neural Language Models".

(Note: this is only meant to give an intuitive sense of the performance. The comparison may well be unfair, since I am not sure which corpora that paper used for training. My feeling is that, given the same amount of time, this algorithm will beat the paper's, because the paper's model is intuitively slow to train. The authors did not release code either, so there is plenty of uncertainty; corrections from readers are welcome.)

## Usage

Before use, be sure to run
```
chmod +x count_ngrams
```
to make `count_ngrams` executable, then modify `word_discovery.py` to fit your own data, and finally run
```
python word_discovery.py
```
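
### Example: using the resulting vocabulary

The pipeline writes the discovered words and their counts to `thucnews.vocab` (one `word count` pair per line, sorted by descending count). As a rough sketch of how that file can be consumed (this mirrors what `evaluate.py` does, after optionally truncating the file to the top 50000 words; the input sentence is only an illustration), it can be loaded directly as a jieba dictionary:

```python
# -*- coding: utf-8 -*-
import jieba

# Point jieba at the vocabulary produced by word_discovery.py.
jieba.set_dictionary('thucnews.vocab')

# Segment with the dictionary only; HMM=False disables jieba's own OOV model.
print(' '.join(jieba.cut(u'今天天气很不错', HMM=False)))
```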
## Updates

- 2019.12.04: Python 3 compatible; tested under Python 2.7 and Python 3.5.

## Contact

QQ group: 67729435; for the WeChat group, add the bot account spaces_ac_cn.
--------------------------------------------------------------------------------
/word_discovery.py:
--------------------------------------------------------------------------------
#! -*- coding: utf-8 -*-

import struct
import os
import codecs
import math
import logging
logging.basicConfig(level=logging.INFO, format=u'%(asctime)s - %(levelname)s - %(message)s')


class Progress:
    """Progress display: a simple home-made wrapper, a bit more controllable than tqdm.
    iterator: an iterable object;
    period: how often (in steps) to report progress;
    steps: total number of steps of the iterator, i.e. len(iterator).
    """
    def __init__(self, iterator, period=1, steps=None, desc=None):
        self.iterator = iterator
        self.period = period
        if hasattr(iterator, '__len__'):
            self.steps = len(iterator)
        else:
            self.steps = steps
        self.desc = desc
        if self.steps:
            self._format_ = u'%s/%s passed' % ('%s', self.steps)
        else:
            self._format_ = u'%s passed'
        if self.desc:
            self._format_ = self.desc + ' - ' + self._format_
        self.logger = logging.getLogger()
    def __iter__(self):
        for i, j in enumerate(self.iterator):
            if (i + 1) % self.period == 0:
                self.logger.info(self._format_ % (i + 1))
            yield j


class KenlmNgrams:
    """Load the ngram counts produced by KenLM.
    vocab_file: the word (character) vocabulary written by KenLM;
    ngram_file: the ngram count file written by KenLM;
    order: the n used when counting ngrams, must match ngram_file;
    min_count: user-chosen frequency cutoff.
    """
    def __init__(self, vocab_file, ngram_file, order, min_count):
        self.vocab_file = vocab_file
        self.ngram_file = ngram_file
        self.order = order
        self.min_count = min_count
        self.read_chars()
        self.read_ngrams()
    def read_chars(self):
        # The vocabulary file is a '\x00'-separated list of UTF-8 tokens.
        with open(self.vocab_file, 'rb') as f:
            chars = f.read()
        self.chars = [i.decode('utf-8') for i in chars.split(b'\x00')]
    def read_ngrams(self):
        """The reading approach follows https://github.com/kpu/kenlm/issues/201
        """
        self.ngrams = [{} for _ in range(self.order)]
        self.total = 0
        # Each record consists of `order` 4-byte vocabulary ids plus an 8-byte count.
        size_per_item = self.order * 4 + 8
        def ngrams():
            with open(self.ngram_file, 'rb') as f:
                while True:
                    s = f.read(size_per_item)
                    if len(s) == size_per_item:
                        n = self.unpack('q', s[-8:])  # the count is a 64-bit integer
                        yield s, n
                    else:
                        break
        for s, n in Progress(ngrams(), 100000, desc=u'loading ngrams'):
            if n >= self.min_count:
                self.total += n
                c = [self.unpack('i', s[j * 4: (j + 1) * 4]) for j in range(self.order)]
                # Ids 0-2 are KenLM's reserved tokens (<unk>, <s>, </s>); skip them.
                c = ''.join([self.chars[j] for j in c if j > 2])
                for j in range(len(c)):
                    self.ngrams[j][c[:j + 1]] = self.ngrams[j].get(c[:j + 1], 0) + n
    def unpack(self, t, s):
        return struct.unpack(t, s)[0]


def write_corpus(texts, filename):
    """Write the corpus to a file, with words (characters) separated by spaces.
    """
    with codecs.open(filename, 'w', encoding='utf-8') as f:
        for s in Progress(texts, 10000, desc=u'exporting corpus'):
            s = ' '.join(s) + '\n'
            f.write(s)


def count_ngrams(corpus_file, order, vocab_file, ngram_file, memory=0.5):
    """Count ngram frequencies by calling KenLM's count_ngrams via os.system.
    memory is the fraction of RAM to use; in principle it must not exceed the
    fraction of memory actually available.
    """
    done = os.system(
        './count_ngrams -o %s --memory=%d%% --write_vocab_list %s <%s >%s'
        % (order, memory * 100, vocab_file, corpus_file, ngram_file)
    )
    if done != 0:
        raise ValueError('Failed to count ngrams by KenLM.')


def filter_ngrams(ngrams, total, min_pmi=1):
    """Filter ngrams by mutual information, keeping only the "solid" ones:
    an ngram survives if, for every split into a prefix and a suffix, the log
    of the PMI of the two parts reaches the threshold set for its length.
    """
    order = len(ngrams)
    if hasattr(min_pmi, '__iter__'):
        min_pmi = list(min_pmi)
    else:
        min_pmi = [min_pmi] * order
    output_ngrams = set()
    total = float(total)
    for i in range(order - 1, 0, -1):
        for w, v in ngrams[i].items():
            pmi = min([
                total * v / (ngrams[j].get(w[:j + 1], total) * ngrams[i - j - 1].get(w[j + 1:], total))
                for j in range(i)
            ])
            if math.log(pmi) >= min_pmi[i]:
                output_ngrams.add(w)
    return output_ngrams


class SimpleTrie:
    """A trie used to search for contiguous fragments made up of the kept ngrams.
    """
    def __init__(self):
        self.dic = {}
        self.end = True
    def add_word(self, word):
        _ = self.dic
        for c in word:
            if c not in _:
                _[c] = {}
            _ = _[c]
        _[self.end] = word
    def tokenize(self, sent):  # segment the sentence by longest-match concatenation
        result = []
        start, end = 0, 1
        for i, c1 in enumerate(sent):
            _ = self.dic
            if i == end:
                result.append(sent[start: end])
                start, end = i, i + 1
            for j, c2 in enumerate(sent[i:]):
                if c2 in _:
                    _ = _[c2]
                    if self.end in _:
                        if i + j + 1 > end:
                            end = i + j + 1
                else:
                    break
        result.append(sent[start: end])
        return result
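
# A quick illustration of SimpleTrie.tokenize (not part of the original pipeline;
# the fragments below are hypothetical). Characters not covered by any added ngram
# come out as single-character pieces, while covered spans are merged greedily:
#
#   trie = SimpleTrie()
#   for w in [u'科学', u'空间', u'科学空间']:
#       trie.add_word(w)
#   trie.tokenize(u'欢迎来到科学空间')
#   # -> [u'欢', u'迎', u'来', u'到', u'科学空间']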

def filter_vocab(candidates, ngrams, order):
    """Rule out unreliable candidate words by checking them back against the
    kept ngrams (backtracking).
    """
    result = {}
    for i, j in candidates.items():
        if len(i) < 3:
            result[i] = j
        elif len(i) <= order and i in ngrams:
            result[i] = j
        elif len(i) > order:
            flag = True
            for k in range(len(i) + 1 - order):
                if i[k: k + order] not in ngrams:
                    flag = False
            if flag:
                result[i] = j
    return result


# ======= The algorithm is in place; below is the full vocabulary-building pipeline =======

import re
import glob

# Corpus generator, with some light preprocessing.
# The details of this particular generator do not matter; all it needs to do is
# yield the texts one by one.
def text_generator():
    txts = glob.glob('/root/thuctc/THUCNews/*/*.txt')
    for txt in txts:
        with codecs.open(txt, encoding='utf-8') as f:
            d = f.read()
        d = d.replace(u'\u3000', ' ').strip()
        yield re.sub(u'[^\u4e00-\u9fa50-9a-zA-Z ]+', '\n', d)


min_count = 32
order = 4
corpus_file = 'thucnews.corpus'  # file the corpus is exported to
vocab_file = 'thucnews.chars'  # file the character set is written to
ngram_file = 'thucnews.ngrams'  # file the ngram counts are written to
output_file = 'thucnews.vocab'  # file the final vocabulary is exported to
memory = 0.5  # fraction of RAM to use; in principle it must not exceed the fraction actually available

write_corpus(text_generator(), corpus_file)  # export the corpus as plain text
count_ngrams(corpus_file, order, vocab_file, ngram_file, memory)  # count ngrams with KenLM
ngrams = KenlmNgrams(vocab_file, ngram_file, order, min_count)  # load the ngram counts
ngrams = filter_ngrams(ngrams.ngrams, ngrams.total, [0, 2, 4, 6])  # filter ngrams by PMI
ngtrie = SimpleTrie()  # build a trie over the kept ngrams

for w in Progress(ngrams, 100000, desc=u'build ngram trie'):
    ngtrie.add_word(w)

candidates = {}  # collect candidate words
for t in Progress(text_generator(), 1000, desc='discovering words'):
    for w in ngtrie.tokenize(t):  # rough pre-segmentation
        candidates[w] = candidates.get(w, 0) + 1

# frequency cutoff
candidates = {i: j for i, j in candidates.items() if j >= min_count}
# mutual-information filtering (backtracking)
candidates = filter_vocab(candidates, ngrams, order)

# write the result file
with codecs.open(output_file, 'w', encoding='utf-8') as f:
    for i, j in sorted(candidates.items(), key=lambda s: -s[1]):
        s = '%s %s\n' % (i, j)
        f.write(s)
--------------------------------------------------------------------------------