├── count_ngrams
├── evaluate.py
├── README.md
└── word_discovery.py
/count_ngrams:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bojone/word-discovery/HEAD/count_ngrams
--------------------------------------------------------------------------------
/evaluate.py:
--------------------------------------------------------------------------------
#! -*- coding: utf-8 -*-

import os
import codecs
import jieba
jieba.set_dictionary('thucnews.vocab') # vocabulary manually truncated to the top 50000 words


jieba.lcut(u'今天天气很不错') # warm-up call so that the custom dictionary is actually loaded


F = codecs.open('myresult.txt', 'w', encoding='gbk')

# segment the bakeoff2005 PKU test set with jieba (HMM disabled) and the discovered vocabulary
with codecs.open('../testing/pku_test.txt', encoding='gbk') as f:
    for l in f:
        l = l.strip()
        l = ' '.join(jieba.cut(l, HMM=False))
        l += '\r\n'
        F.write(l)


F.close()

# score the segmentation with the official bakeoff2005 scoring script
os.system('./score ../gold/pku_training_words.txt ../gold/pku_test_gold.txt myresult.txt > score.txt')
os.system('cat score.txt')

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Faster, Better Chinese New Word Discovery

A reimplementation of the new word discovery algorithm from the earlier post 《【中文分词系列】 8. 更好的新词发现算法》 ("Chinese Word Segmentation Series, Part 8: A Better New Word Discovery Algorithm").

- Algorithm details: https://kexue.fm/archives/4256
- Reimplementation notes: https://kexue.fm/archives/6920

## Benchmark

With sufficient training, testing on the bakeoff2005 PKU corpus gives an F1 of 0.765, better than the 0.731 reported in the ICLR 2019 paper "Unsupervised Word Discovery with Segmental Neural Language Models".

(Note: this is only meant to give an intuitive sense of the quality. The comparison may well be unfair, since I am not sure which corpora that paper used for training. Still, my feeling is that for the same training time this algorithm would beat the paper's, because the paper's model looks slow to train. The authors did not release their code either, so there is quite a bit of uncertainty here; corrections from readers are welcome.)

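The score above was produced with `evaluate.py` in this repo: `word_discovery.py` exports the discovered vocabulary to `thucnews.vocab` (manually truncated to the top 50,000 entries, per the comment in `evaluate.py`), which is then loaded into jieba to segment the PKU test set and scored with the official bakeoff2005 `score` script. Roughly (assuming the bakeoff2005 data is laid out as in `evaluate.py`):
```
python word_discovery.py   # builds thucnews.vocab from the corpus
python evaluate.py         # segments pku_test.txt with jieba + thucnews.vocab and runs the scorer
```
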
## Usage

Before running, be sure to grant `count_ngrams` execute permission with
```
chmod +x count_ngrams
```
then modify `word_discovery.py` to fit your own data (mainly the `text_generator()` at the bottom of the script; see the sketch below), and finally run
```
python word_discovery.py
```

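For reference, here is a minimal sketch of such an adaptation. It assumes a hypothetical corpus file `my_corpus.txt` with one document per line; any generator that yields cleaned-up text will do:
```
import re
import codecs

def text_generator():
    # hypothetical corpus layout: one document per line in my_corpus.txt
    with codecs.open('my_corpus.txt', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # keep CJK characters, digits, latin letters and spaces, as the
            # original THUCNews generator does; everything else becomes a break
            yield re.sub(u'[^\u4e00-\u9fa50-9a-zA-Z ]+', '\n', line)
```
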
## Updates
- 2019.12.04: compatible with Python 3; tested under Python 2.7 and Python 3.5.

## Contact
QQ group: 67729435; for the WeChat group, add the bot account spaces_ac_cn

--------------------------------------------------------------------------------
/word_discovery.py:
--------------------------------------------------------------------------------
#! -*- coding: utf-8 -*-

import struct
import os
import six
import codecs
import math
import logging
logging.basicConfig(level=logging.INFO, format=u'%(asctime)s - %(levelname)s - %(message)s')


class Progress:
    """A simple progress logger; more controllable than tqdm for this use case.
    iterator: any iterable;
    period: how often (in steps) progress is logged;
    steps: total number of steps the iterator yields, i.e. len(iterator).
    """
    def __init__(self, iterator, period=1, steps=None, desc=None):
        self.iterator = iterator
        self.period = period
        if hasattr(iterator, '__len__'):
            self.steps = len(iterator)
        else:
            self.steps = steps
        self.desc = desc
        if self.steps:
            self._format_ = u'%s/%s passed' % ('%s', self.steps)
        else:
            self._format_ = u'%s passed'
        if self.desc:
            self._format_ = self.desc + ' - ' + self._format_
        self.logger = logging.getLogger()
    def __iter__(self):
        for i, j in enumerate(self.iterator):
            if (i + 1) % self.period == 0:
                self.logger.info(self._format_ % (i+1))
            yield j

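# Usage sketch for Progress (illustrative only; 'corpus.txt' and do_something are hypothetical):
#     for line in Progress(codecs.open('corpus.txt', encoding='utf-8'), 100000, desc=u'reading'):
#         do_something(line)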

class KenlmNgrams:
    """Load the ngram counts produced by KenLM's count_ngrams.
    vocab_file: the word (character) vocabulary written by KenLM;
    ngram_file: the ngram count file written by KenLM;
    order: the n used when counting ngrams; must match ngram_file;
    min_count: user-chosen frequency cutoff.
    """
    def __init__(self, vocab_file, ngram_file, order, min_count):
        self.vocab_file = vocab_file
        self.ngram_file = ngram_file
        self.order = order
        self.min_count = min_count
        self.read_chars()
        self.read_ngrams()
    def read_chars(self):
        f = open(self.vocab_file)
        chars = f.read()
        f.close()
        chars = chars.split('\x00')  # the vocabulary is a '\x00'-separated string
        self.chars = [i.decode('utf-8') if six.PY2 else i for i in chars]
    def read_ngrams(self):
        """Reading approach based on https://github.com/kpu/kenlm/issues/201
        """
        self.ngrams = [{} for _ in range(self.order)]
        self.total = 0
        size_per_item = self.order * 4 + 8
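        # Each record in the KenLM ngram count file is a fixed-size struct:
        # `order` 4-byte word ids followed by an 8-byte count (see the issue
        # linked above), hence order * 4 + 8 bytes per record.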
        def ngrams():
            with open(self.ngram_file, 'rb') as f:
                while True:
                    s = f.read(size_per_item)
                    if len(s) == size_per_item:
                        n = self.unpack('l', s[-8:])
                        yield s, n
                    else:
                        break
        for s, n in Progress(ngrams(), 100000, desc=u'loading ngrams'):
            if n >= self.min_count:
                self.total += n
                c = [self.unpack('i', s[j*4: (j+1)*4]) for j in range(self.order)]
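                # word ids 0-2 are KenLM's reserved tokens (<unk>, <s>, </s>),
                # so only ids > 2 are mapped back to characters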
                c = ''.join([self.chars[j] for j in c if j > 2])
                for j in range(len(c)):
                    self.ngrams[j][c[:j+1]] = self.ngrams[j].get(c[:j+1], 0) + n
    def unpack(self, t, s):
        return struct.unpack(t, s)[0]


def write_corpus(texts, filename):
    """Write the corpus to a file, words (characters) separated by single spaces.
    """
    with codecs.open(filename, 'w', encoding='utf-8') as f:
        for s in Progress(texts, 10000, desc=u'exporting corpus'):
            s = ' '.join(s) + '\n'
            f.write(s)


def count_ngrams(corpus_file, order, vocab_file, ngram_file, memory=0.5):
    """Count ngram frequencies by calling KenLM's count_ngrams via os.system.
    memory is the fraction of RAM to use; in principle it must not exceed
    the fraction of memory that is actually free.
    """
    done = os.system(
        './count_ngrams -o %s --memory=%d%% --write_vocab_list %s <%s >%s'
        % (order, memory * 100, vocab_file, corpus_file, ngram_file)
    )
    if done != 0:
        raise ValueError('Failed to count ngrams by KenLM.')

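# With the settings used at the bottom of this script, the command expands to
#     ./count_ngrams -o 4 --memory=50% --write_vocab_list thucnews.chars <thucnews.corpus >thucnews.ngrams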

def filter_ngrams(ngrams, total, min_pmi=1):
    """Filter ngrams by pointwise mutual information, keeping only the 'solid' ones.
    """
    order = len(ngrams)
    if hasattr(min_pmi, '__iter__'):
        min_pmi = list(min_pmi)
    else:
        min_pmi = [min_pmi] * order
    output_ngrams = set()
    total = float(total)
    for i in range(order-1, 0, -1):
        for w, v in ngrams[i].items():
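            # Solidness test: over every split point j, the PMI of the two halves,
            # log(total * count(w) / (count(w[:j+1]) * count(w[j+1:]))),
            # must reach the threshold set for this ngram length.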
            pmi = min([
                total * v / (ngrams[j].get(w[:j+1], total) * ngrams[i-j-1].get(w[j+1:], total))
                for j in range(i)
            ])
            if math.log(pmi) >= min_pmi[i]:
                output_ngrams.add(w)
    return output_ngrams


class SimpleTrie:
    """A trie used to find contiguous fragments made up of retained ngrams.
    """
    def __init__(self):
        self.dic = {}
        self.end = True
    def add_word(self, word):
        _ = self.dic
        for c in word:
            if c not in _:
                _[c] = {}
            _ = _[c]
        _[self.end] = word
    def tokenize(self, sent): # split the sentence by greedy longest-match over the ngram trie
        result = []
        start, end = 0, 1
        for i, c1 in enumerate(sent):
            _ = self.dic
            if i == end:
                result.append(sent[start: end])
                start, end = i, i+1
            for j, c2 in enumerate(sent[i:]):
                if c2 in _:
                    _ = _[c2]
                    if self.end in _:
                        if i + j + 1 > end:
                            end = i + j + 1
                else:
                    break
        result.append(sent[start: end])
        return result

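# Illustrative example (hypothetical trie contents): if the trie holds u'天气'
# and u'不错', then tokenize(u'今天天气不错') returns
# [u'今', u'天', u'天气', u'不错']: characters covered by a retained ngram are
# merged into one fragment, while the remaining characters stay on their own.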

def filter_vocab(candidates, ngrams, order):
    """Cross-check candidates against the retained ngrams to weed out unreliable
    words (a backtracking check): a candidate is kept if it is shorter than 3
    characters, is itself a retained ngram, or, when longer than order, has all
    of its length-order windows among the retained ngrams.
    """
    result = {}
    for i, j in candidates.items():
        if len(i) < 3:
            result[i] = j
        elif len(i) <= order and i in ngrams:
            result[i] = j
        elif len(i) > order:
            flag = True
            for k in range(len(i) + 1 - order):
                if i[k: k+order] not in ngrams:
                    flag = False
            if flag:
                result[i] = j
    return result


# ======= The algorithm parts are all defined; the full vocabulary-building pipeline runs below =======

import re
import glob

# Corpus generator with some light preprocessing.
# The details of this particular generator do not matter; all it has to do is
# yield the text of the corpus piece by piece.
def text_generator():
    txts = glob.glob('/root/thuctc/THUCNews/*/*.txt')
    for txt in txts:
        d = codecs.open(txt, encoding='utf-8').read()
        d = d.replace(u'\u3000', ' ').strip()
        yield re.sub(u'[^\u4e00-\u9fa50-9a-zA-Z ]+', '\n', d)


min_count = 32
order = 4
corpus_file = 'thucnews.corpus' # file the exported corpus is written to
vocab_file = 'thucnews.chars' # file the character set is written to
ngram_file = 'thucnews.ngrams' # file the ngram counts are written to
output_file = 'thucnews.vocab' # file the final vocabulary is exported to
memory = 0.5 # fraction of RAM to use; in principle it must not exceed the fraction of free memory

write_corpus(text_generator(), corpus_file) # dump the corpus to a text file
count_ngrams(corpus_file, order, vocab_file, ngram_file, memory) # count ngrams with KenLM
ngrams = KenlmNgrams(vocab_file, ngram_file, order, min_count) # load the ngram counts
ngrams = filter_ngrams(ngrams.ngrams, ngrams.total, [0, 2, 4, 6]) # filter ngrams by PMI
ngtrie = SimpleTrie() # build a trie over the retained ngrams

for w in Progress(ngrams, 100000, desc=u'build ngram trie'):
    _ = ngtrie.add_word(w)

candidates = {} # collect candidate words
for t in Progress(text_generator(), 1000, desc='discovering words'):
    for w in ngtrie.tokenize(t): # pre-segmentation via the trie
        candidates[w] = candidates.get(w, 0) + 1

# frequency filtering
candidates = {i: j for i, j in candidates.items() if j >= min_count}
# mutual-information filtering (backtracking check)
candidates = filter_vocab(candidates, ngrams, order)

# write the result file: one "word count" pair per line, sorted by descending count
with codecs.open(output_file, 'w', encoding='utf-8') as f:
    for i, j in sorted(candidates.items(), key=lambda s: -s[1]):
        s = '%s %s\n' % (i, j)
        f.write(s)

--------------------------------------------------------------------------------