├── README.md
└── words_generate.py


/README.md:
--------------------------------------------------------------------------------
# New_words_find
New word discovery with information entropy and left/right mutual information.

jieba segmentation is usually the first step of a Chinese NLP pipeline, but it has trouble recognizing domain-specific terms and newly coined words. This program discovers such words by collecting statistics over a domain corpus and filtering candidates with mutual information and information entropy.

# input
A corpus file (txt), one query per line:
```
万灵石4级哪里可以获取
天下第一擂是干什么的呢。。。
雄才伟略
宝宝丢了,伤心啊
其他点卡可以充值不?
苏州打造台
小票获得什么
游戏视野这么拉远
黄龙洞在哪里
宝宝套怎么升星?
比武大会什么时候开始
制作装备的过程,需要多长时间
3万评分时什么级别的擂台
.
.
.
```

# output
The words found in the corpus, sorted by frequency:
```
怎么 2836108
是什么 1551873
账号 896048
在哪 825500
如何 784982
多少 774153
珍兽 661080
哪里 613245
可以 598351
任务 578104
技能 540769
装备 529157
手机 511519
升级 494052
5级 488499
神器 461028
钓鱼 425015
宝石 394782
密码 377159
称号 374558
怎么获得 366338
在哪里 317517
宝宝 301391
属性 301280
武器 300026
.
.
.
```
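# how it works
A candidate string is accepted as a word when it is internally cohesive (high mutual information between its parts) and externally free (high entropy of its neighboring characters). A minimal sketch of the two scores, using illustrative names rather than the script's own variables: `freq` maps every n-gram to its corpus count and `total` is the sum of all counts (in words_generate.py these roles are played by `word_dict` and `counts`):
```python
from math import log

def cohesion(word, freq, total):
    """Mutual information of the worst split of `word`.

    words_generate.py only checks the two edge splits (first character vs.
    the rest, and the rest vs. the last character); this sketch checks all.
    """
    return min(
        freq[word] * total / (freq[word[:i]] * freq[word[i:]])
        for i in range(1, len(word))
    )

def branch_entropy(neighbors):
    """Entropy of the characters observed next to a candidate word,
    given a dict mapping each neighboring character to its count."""
    n = sum(neighbors.values())
    return -sum(c / n * log(c / n, 2) for c in neighbors.values())
```
A candidate must clear both thresholds (`thr_mtro` and `thr_entro` in words_generate.py). A frequent fragment that almost always appears inside the same longer phrase has low branching entropy and is dropped.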
--------------------------------------------------------------------------------
/words_generate.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

"""
Build a word list automatically from a large corpus. Useful for mining
trending words, building segmentation dictionaries, analysing chat habits
and interests, etc.
Technique: word frequency + mutual information + information entropy
http://blog.csdn.net/xiaokang06/article/details/50616983

time   : 2017-7-6
author : zlw

note: in Python 3, encode() turns str into bytes and decode() turns bytes
back into str.
"""

import time
import re
from math import log


# hanzi_re = re.compile(u"[\u4E00-\u9FD5]+", re.U)  # Chinese characters only
hanzi_re = re.compile(u"[\w]+", re.U)
PHRASE_MAX_LENGTH = 6


def str_decode(sentence):
    """Pass-through: lines read with an explicit encoding are already str.

    (The Python 2 version decoded utf-8 with a gbk fallback here.)
    """
    return sentence


def extract_hanzi(sentence):
    """Extract runs of word characters (the candidate text)."""
    return hanzi_re.findall(sentence)


def cut_sentence(sentence):
    """Count every n-gram of the sentence up to PHRASE_MAX_LENGTH characters."""
    result = {}
    sentence_length = len(sentence)
    for i in range(sentence_length):
        for j in range(1, min(sentence_length - i + 1, PHRASE_MAX_LENGTH + 1)):
            tmp = sentence[i: i + j]
            result[tmp] = result.get(tmp, 0) + 1
    return result


def gen_word_dict(path):
    """Count all candidate n-grams (single characters included) in the file."""
    word_dict = {}
    with open(path, 'r', encoding='gbk') as fp:  # corpus file is GBK-encoded
        for line in fp:
            utf_rdd = str_decode(line)
            hanzi_rdd = extract_hanzi(utf_rdd)  # list of character runs
            for words in hanzi_rdd:
                raw_phrase_rdd = cut_sentence(words)  # dict of n-gram counts
                for word, freq in raw_phrase_rdd.items():
                    word_dict[word] = word_dict.get(word, 0) + freq
    return word_dict


def gen_lr_dict(word_dict, counts, thr_fq, thr_mtro):
    """For every candidate longer than one character, record the frequencies
    of its single-character right/left extensions, filtering by frequency and
    mutual information.

    Result shape: {'一个': [1208, 2, 8, 1, 15, ...], ...} where the first
    element is the candidate's own frequency and the rest are frequencies of
    the candidate extended by one character on the right (l_dict) or on the
    left (r_dict).
    """
    l_dict = {}
    r_dict = {}
    k = 0
    for word in word_dict:
        k += 1
        if len(word) < 3:
            continue

        # word = candidate + one extra character on the right
        wordl = word[:-1]
        ml = word_dict[wordl]
        if ml > thr_fq:  # frequency filter
            # mutual information over the two edge splits of the candidate
            mul_info1 = ml * counts / (word_dict[wordl[1:]] * word_dict[wordl[0]])
            mul_info2 = ml * counts / (word_dict[wordl[-1]] * word_dict[wordl[:-1]])
            mul_info = min(mul_info1, mul_info2)
            if mul_info > thr_mtro:  # mutual information filter
                if wordl in l_dict:
                    l_dict[wordl].append(word_dict[word])
                else:
                    l_dict[wordl] = [ml, word_dict[word]]

        # word = candidate + one extra character on the left
        wordr = word[1:]
        mr = word_dict[wordr]
        if mr > thr_fq:  # frequency filter
            mul_info1 = mr * counts / (word_dict[wordr[1:]] * word_dict[wordr[0]])
            mul_info2 = mr * counts / (word_dict[wordr[-1]] * word_dict[wordr[:-1]])
            mul_info = min(mul_info1, mul_info2)
            if mul_info > thr_mtro:  # mutual information filter
                if wordr in r_dict:
                    r_dict[wordr].append(word_dict[word])
                else:
                    r_dict[wordr] = [mr, word_dict[word]]

        if k % 1000000 == 0:
            print('---------------', k)
    return l_dict, r_dict


def cal_entro(r_dict):
    """Compute the left or right branching entropy of each candidate."""
    entro_r_dict = {}
    for word in r_dict:
        m_list = r_dict[word]
        r_list = m_list[1:]   # frequencies of the recorded extensions
        fm = m_list[0]        # the candidate's own frequency

        entro_r = 0
        krm = fm - sum(r_list)
        if krm > 0:
            # occurrences with no recorded neighbor (e.g. at a line boundary)
            # are treated as krm distinct single-occurrence continuations,
            # which raises the entropy
            entro_r -= 1 / fm * log(1 / fm, 2) * krm

        for rm in r_list:
            entro_r -= rm / fm * log(rm / fm, 2)
        entro_r_dict[word] = entro_r

    return entro_r_dict
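# A worked example of cal_entro with hypothetical counts: a candidate seen
# 10 times whose recorded right extensions occur 5, 3 and 2 times gets
#   -(5/10)*log2(5/10) - (3/10)*log2(3/10) - (2/10)*log2(2/10) ≈ 1.4855
# i.e. cal_entro({'宝宝': [10, 5, 3, 2]}) ≈ {'宝宝': 1.4855}. A candidate
# always followed by the same character scores 0 and is later removed by
# entro_filter.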
def entro_lr_fusion(entro_r_dict, entro_l_dict):
    """Merge the left- and right-entropy dictionaries."""
    entro_in_rl_dict = {}
    entro_in_r_dict = {}
    entro_in_l_dict = entro_l_dict.copy()
    for word in entro_r_dict:
        if word in entro_l_dict:
            entro_in_rl_dict[word] = [entro_l_dict[word], entro_r_dict[word]]
            entro_in_l_dict.pop(word)
        else:
            entro_in_r_dict[word] = entro_r_dict[word]
    return entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict


def entro_filter(entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict, word_dict, thr_entro):
    """Filter candidates by information entropy."""
    entro_dict = {}
    l, r, rl = 0, 0, 0
    for word in entro_in_rl_dict:
        if entro_in_rl_dict[word][0] > thr_entro and entro_in_rl_dict[word][1] > thr_entro:
            entro_dict[word] = word_dict[word]
            rl += 1

    for word in entro_in_l_dict:
        if entro_in_l_dict[word] > thr_entro:
            entro_dict[word] = word_dict[word]
            l += 1

    for word in entro_in_r_dict:
        if entro_in_r_dict[word] > thr_entro:
            entro_dict[word] = word_dict[word]
            r += 1

    print('Word counts after entropy filtering (both/left/right):', rl, l, r)

    return entro_dict


def train_corpus_words(path):
    """Read the corpus file and train a word list using mutual information
    and left/right information entropy."""
    thr_fq = 10     # frequency threshold
    thr_mtro = 80   # mutual information threshold
    thr_entro = 3   # information entropy threshold

    total_st = time.time()

    # Step 1: count all candidate n-grams (single characters included)
    st = time.time()
    word_dict = gen_word_dict(path)
    et = time.time()
    print('Reading time:', et - st)
    counts = sum(word_dict.values())  # total n-gram count
    print('Total n-gram count:', counts, 'candidate count:', len(word_dict))

    # Step 2: record the single-character right/left extensions of every
    # candidate longer than one character, filtering by frequency and mutual
    # information. The two dictionaries differ because a candidate that is
    # never extended on a given side is not recorded there.
    print('rl_dict is starting...')
    st = time.time()
    l_dict, r_dict = gen_lr_dict(word_dict, counts, thr_fq, thr_mtro)
    et = time.time()
    print('Mutual information filtering time:', et - st)
    print('Left/right candidate counts after frequency and mutual information filtering:', len(l_dict), len(r_dict))

    # Step 3: compute the branching entropies, e.g. {'一个': 5.37, ...}
    entro_r_dict = cal_entro(l_dict)  # right entropy of right-extended candidates
    entro_l_dict = cal_entro(r_dict)  # left entropy of left-extended candidates
    del l_dict, r_dict  # free memory

    # Step 4: merge, giving rl = {'一个': [left entropy, right entropy], ...},
    # l = {word: left entropy, ...} and r = {word: right entropy, ...}
    entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict = entro_lr_fusion(entro_r_dict, entro_l_dict)
    print('Word counts after merging (both/left/right):', len(entro_in_rl_dict), len(entro_in_l_dict), len(entro_in_r_dict))
    del entro_r_dict, entro_l_dict

    # Step 5: filter by information entropy
    entro_dict = entro_filter(entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict, word_dict, thr_entro)
    del entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict, word_dict

    # Step 6: write out the surviving words, sorted by frequency
    result = sorted(entro_dict.items(), key=lambda x: x[1], reverse=True)
    with open('userdict.txt', 'w', encoding='utf-8') as kf:
        for w, m in result:
            kf.write(w + ' %d\n' % m)

    print('\nTraining finished! Total time:', time.time() - total_st)


if __name__ == "__main__":

    path = 'query_text.txt'
    print('Training started...')
    train_corpus_words(path)
    print('training is ok !')
--------------------------------------------------------------------------------