├── README.md
└── words_generate.py


/README.md:
--------------------------------------------------------------------------------
# New_words_find
New word discovery with information entropy and left/right mutual information.

jieba segmentation is usually the first step of a Chinese NLP pipeline, but it has trouble recognizing domain-specific terms and newly coined words. This program discovers such words by collecting statistics over a domain corpus and filtering candidates with mutual information and information entropy.

# input
A corpus file (txt), one query per line:
```
万灵石4级哪里可以获取
天下第一擂是干什么的呢。。。
雄才伟略
宝宝丢了,伤心啊
其他点卡可以充值不?
苏州打造台
小票获得什么
游戏视野这么拉远
黄龙洞在哪里
宝宝套怎么升星?
比武大会什么时候开始
制作装备的过程,需要多长时间
3万评分时什么级别的擂台
.
.
.
```

# output
The words found in the corpus, sorted by frequency:
```
怎么 2836108
是什么 1551873
账号 896048
在哪 825500
如何 784982
多少 774153
珍兽 661080
哪里 613245
可以 598351
任务 578104
技能 540769
装备 529157
手机 511519
升级 494052
5级 488499
神器 461028
钓鱼 425015
宝石 394782
密码 377159
称号 374558
怎么获得 366338
在哪里 317517
宝宝 301391
属性 301280
武器 300026
.
.
.
```
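# how it works
A candidate string is accepted as a word when it is internally cohesive (high mutual information between its parts) and externally free (high entropy of its neighboring characters). A minimal sketch of the two scores, using illustrative names rather than the script's own variables: `freq` maps every n-gram to its corpus count and `total` is the sum of all counts (in words_generate.py these roles are played by `word_dict` and `counts`):
```python
from math import log

def cohesion(word, freq, total):
    """Mutual information of the worst split of `word`.

    words_generate.py only checks the two edge splits (first character vs.
    the rest, and the rest vs. the last character); this sketch checks all.
    """
    return min(
        freq[word] * total / (freq[word[:i]] * freq[word[i:]])
        for i in range(1, len(word))
    )

def branch_entropy(neighbors):
    """Entropy of the characters observed next to a candidate word,
    given a dict mapping each neighboring character to its count."""
    n = sum(neighbors.values())
    return -sum(c / n * log(c / n, 2) for c in neighbors.values())
```
A candidate must clear both thresholds (`thr_mtro` and `thr_entro` in words_generate.py). A frequent fragment that almost always appears inside the same longer phrase has low branching entropy and is dropped.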
--------------------------------------------------------------------------------
/words_generate.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

"""
Build a word list automatically from a large corpus. Useful for mining
trending words, building segmentation dictionaries, analysing chat habits
and interests, etc.
Technique: word frequency + mutual information + information entropy
http://blog.csdn.net/xiaokang06/article/details/50616983

time   : 2017-7-6
author : zlw

note: in Python 3, encode() turns str into bytes and decode() turns bytes
back into str.
"""

import time
import re
from math import log


# hanzi_re = re.compile(u"[\u4E00-\u9FD5]+", re.U)  # Chinese characters only
hanzi_re = re.compile(u"[\w]+", re.U)
PHRASE_MAX_LENGTH = 6


def str_decode(sentence):
    """Pass-through: lines read with an explicit encoding are already str.

    (The Python 2 version decoded utf-8 with a gbk fallback here.)
    """
    return sentence


def extract_hanzi(sentence):
    """Extract runs of word characters (the candidate text)."""
    return hanzi_re.findall(sentence)


def cut_sentence(sentence):
    """Count every n-gram of the sentence up to PHRASE_MAX_LENGTH characters."""
    result = {}
    sentence_length = len(sentence)
    for i in range(sentence_length):
        for j in range(1, min(sentence_length - i + 1, PHRASE_MAX_LENGTH + 1)):
            tmp = sentence[i: i + j]
            result[tmp] = result.get(tmp, 0) + 1
    return result


def gen_word_dict(path):
    """Count all candidate n-grams (single characters included) in the file."""
    word_dict = {}
    with open(path, 'r', encoding='gbk') as fp:  # corpus file is GBK-encoded
        for line in fp:
            utf_rdd = str_decode(line)
            hanzi_rdd = extract_hanzi(utf_rdd)  # list of character runs
            for words in hanzi_rdd:
                raw_phrase_rdd = cut_sentence(words)  # dict of n-gram counts
                for word, freq in raw_phrase_rdd.items():
                    word_dict[word] = word_dict.get(word, 0) + freq
    return word_dict


def gen_lr_dict(word_dict, counts, thr_fq, thr_mtro):
    """For every candidate longer than one character, record the frequencies
    of its single-character right/left extensions, filtering by frequency and
    mutual information.

    Result shape: {'一个': [1208, 2, 8, 1, 15, ...], ...} where the first
    element is the candidate's own frequency and the rest are frequencies of
    the candidate extended by one character on the right (l_dict) or on the
    left (r_dict).
    """
    l_dict = {}
    r_dict = {}
    k = 0
    for word in word_dict:
        k += 1
        if len(word) < 3:
            continue

        # word = candidate + one extra character on the right
        wordl = word[:-1]
        ml = word_dict[wordl]
        if ml > thr_fq:  # frequency filter
            # mutual information over the two edge splits of the candidate
            mul_info1 = ml * counts / (word_dict[wordl[1:]] * word_dict[wordl[0]])
            mul_info2 = ml * counts / (word_dict[wordl[-1]] * word_dict[wordl[:-1]])
            mul_info = min(mul_info1, mul_info2)
            if mul_info > thr_mtro:  # mutual information filter
                if wordl in l_dict:
                    l_dict[wordl].append(word_dict[word])
                else:
                    l_dict[wordl] = [ml, word_dict[word]]

        # word = candidate + one extra character on the left
        wordr = word[1:]
        mr = word_dict[wordr]
        if mr > thr_fq:  # frequency filter
            mul_info1 = mr * counts / (word_dict[wordr[1:]] * word_dict[wordr[0]])
            mul_info2 = mr * counts / (word_dict[wordr[-1]] * word_dict[wordr[:-1]])
            mul_info = min(mul_info1, mul_info2)
            if mul_info > thr_mtro:  # mutual information filter
                if wordr in r_dict:
                    r_dict[wordr].append(word_dict[word])
                else:
                    r_dict[wordr] = [mr, word_dict[word]]

        if k % 1000000 == 0:
            print('---------------', k)
    return l_dict, r_dict


def cal_entro(r_dict):
    """Compute the left or right branching entropy of each candidate."""
    entro_r_dict = {}
    for word in r_dict:
        m_list = r_dict[word]
        r_list = m_list[1:]   # frequencies of the recorded extensions
        fm = m_list[0]        # the candidate's own frequency

        entro_r = 0
        krm = fm - sum(r_list)
        if krm > 0:
            # occurrences with no recorded neighbor (e.g. at a line boundary)
            # are treated as krm distinct single-occurrence continuations,
            # which raises the entropy
            entro_r -= 1 / fm * log(1 / fm, 2) * krm

        for rm in r_list:
            entro_r -= rm / fm * log(rm / fm, 2)
        entro_r_dict[word] = entro_r

    return entro_r_dict
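# A worked example of cal_entro with hypothetical counts: a candidate seen
# 10 times whose recorded right extensions occur 5, 3 and 2 times gets
#   -(5/10)*log2(5/10) - (3/10)*log2(3/10) - (2/10)*log2(2/10) ≈ 1.4855
# i.e. cal_entro({'宝宝': [10, 5, 3, 2]}) ≈ {'宝宝': 1.4855}. A candidate
# always followed by the same character scores 0 and is later removed by
# entro_filter.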
def entro_lr_fusion(entro_r_dict, entro_l_dict):
    """Merge the left- and right-entropy dictionaries."""
    entro_in_rl_dict = {}
    entro_in_r_dict = {}
    entro_in_l_dict = entro_l_dict.copy()
    for word in entro_r_dict:
        if word in entro_l_dict:
            entro_in_rl_dict[word] = [entro_l_dict[word], entro_r_dict[word]]
            entro_in_l_dict.pop(word)
        else:
            entro_in_r_dict[word] = entro_r_dict[word]
    return entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict


def entro_filter(entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict, word_dict, thr_entro):
    """Filter candidates by information entropy."""
    entro_dict = {}
    l, r, rl = 0, 0, 0
    for word in entro_in_rl_dict:
        if entro_in_rl_dict[word][0] > thr_entro and entro_in_rl_dict[word][1] > thr_entro:
            entro_dict[word] = word_dict[word]
            rl += 1

    for word in entro_in_l_dict:
        if entro_in_l_dict[word] > thr_entro:
            entro_dict[word] = word_dict[word]
            l += 1

    for word in entro_in_r_dict:
        if entro_in_r_dict[word] > thr_entro:
            entro_dict[word] = word_dict[word]
            r += 1

    print('Word counts after entropy filtering (both/left/right):', rl, l, r)

    return entro_dict


def train_corpus_words(path):
    """Read the corpus file and train a word list using mutual information
    and left/right information entropy."""
    thr_fq = 10     # frequency threshold
    thr_mtro = 80   # mutual information threshold
    thr_entro = 3   # information entropy threshold

    total_st = time.time()

    # Step 1: count all candidate n-grams (single characters included)
    st = time.time()
    word_dict = gen_word_dict(path)
    et = time.time()
    print('Reading time:', et - st)
    counts = sum(word_dict.values())  # total n-gram count
    print('Total n-gram count:', counts, 'candidate count:', len(word_dict))

    # Step 2: record the single-character right/left extensions of every
    # candidate longer than one character, filtering by frequency and mutual
    # information. The two dictionaries differ because a candidate that is
    # never extended on a given side is not recorded there.
    print('rl_dict is starting...')
    st = time.time()
    l_dict, r_dict = gen_lr_dict(word_dict, counts, thr_fq, thr_mtro)
    et = time.time()
    print('Mutual information filtering time:', et - st)
    print('Left/right candidate counts after frequency and mutual information filtering:', len(l_dict), len(r_dict))

    # Step 3: compute the branching entropies, e.g. {'一个': 5.37, ...}
    entro_r_dict = cal_entro(l_dict)  # right entropy of right-extended candidates
    entro_l_dict = cal_entro(r_dict)  # left entropy of left-extended candidates
    del l_dict, r_dict  # free memory

    # Step 4: merge, giving rl = {'一个': [left entropy, right entropy], ...},
    # l = {word: left entropy, ...} and r = {word: right entropy, ...}
    entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict = entro_lr_fusion(entro_r_dict, entro_l_dict)
    print('Word counts after merging (both/left/right):', len(entro_in_rl_dict), len(entro_in_l_dict), len(entro_in_r_dict))
    del entro_r_dict, entro_l_dict

    # Step 5: filter by information entropy
    entro_dict = entro_filter(entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict, word_dict, thr_entro)
    del entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict, word_dict

    # Step 6: write out the surviving words, sorted by frequency
    result = sorted(entro_dict.items(), key=lambda x: x[1], reverse=True)
    with open('userdict.txt', 'w', encoding='utf-8') as kf:
        for w, m in result:
            kf.write(w + ' %d\n' % m)

    print('\nTraining finished! Total time:', time.time() - total_st)


if __name__ == "__main__":

    path = 'query_text.txt'
    print('Training started...')
    train_corpus_words(path)
    print('training is ok !')
--------------------------------------------------------------------------------