├── dict.txt
├── entropy.py
├── extract.py
├── README.md
└── wordseg.py

/dict.txt:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/entropy.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-
'''
Left/right neighbor entropy for new word detection
Author: Xylander
Reference:
https://github.com/Moonshile/ChineseWordSegmentation
http://www.matrix67.com/blog/archives/5044
https://zlc1994.com/2017/01/04/
'''

import math


def compute_entropy(_list):
    # Shannon entropy of the neighbor characters in _list (a list or set)
    length = float(len(_list))
    frequence = {}
    if length == 0:
        return 0
    for i in _list:
        frequence[i] = frequence.get(i, 0) + 1
    return sum(map(lambda x: -x / length * math.log(x / length), frequence.values()))

--------------------------------------------------------------------------------
/extract.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-


def extract_cadicateword(_doc, _max_word_len):
    # Collect the (start, end) index pairs of every candidate word span of up to
    # _max_word_len characters, skipping spans that contain a space.
    indexes = []
    doc_length = len(_doc)
    for i in range(doc_length):
        for j in range(i + 1, min(i + 1 + _max_word_len, doc_length + 1)):
            skip_flag = False
            for k in range(i, j):
                if _doc[k] == " ":
                    skip_flag = True
                    break
            if not skip_flag:
                indexes.append((i, j))
    return sorted(indexes, key=lambda _word: _doc[_word[0]:_word[1]])


def gen_bigram(_word_str):
    '''
    Split a word into two parts at every possible position.
    For instance, 'abb' can be split into (a, bb) and (ab, b).
    :param _word_str: candidate word string
    :return: list of (prefix, suffix) tuples
    '''
    return [(_word_str[0:_i], _word_str[_i:]) for _i in range(1, len(_word_str))]

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# **新词发现算法**(**New Word Detection**)

与 NCXavier 原本 repo 的区别 (Differences from NCXavier's original repo):
1. 更新至 Python 3 (Updated to Python 3)
2. 针对原本就带 空格/符号/拆分符号 的语料做一些优化 (Optimizations for corpora that already contain spaces/symbols/delimiters)
3. 优化左右熵(自由度),将左右邻接字从 list 改为 set (Optimized the left/right entropy (degree of freedom) computation by storing left/right neighbors in a set instead of a list)


**参考:**
**References:**
- 算法(Algorithm):
  - > http://www.matrix67.com/blog/archives/5044
- 代码(Code):
  - > https://github.com/xiulonghan/wordSeg
  - > https://github.com/Moonshile/ChineseWordSegmentation

**代码说明:**
**Code description:**
- extract.py:
  提供文档中所有成词的可能组合,以及计算词的点互信息时的一个组合。
  Enumerates all candidate word spans in the document and generates the bigram splits used when computing a word's PMI.
- entropy.py:
  计算左右邻居熵的大小
  Computes the entropy of a word's left and right neighbors.
- wordseg.py:
  根据计算出的频数、点互信息和左右熵来找出成词可能的组合,最后与词典对比找出新词。
  Scores candidate words by frequency, PMI and left/right neighbor entropy, then compares the surviving candidates against the dictionary to detect new words.
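
**示例 (Usage sketch):**
A minimal sketch of how the pieces fit together, assuming a short in-memory string instead of the `train_for_ws.txt` corpus used in `wordseg.py`; the thresholds below are illustrative, not tuned defaults:

```python
# -*- coding: utf-8 -*-
from wordseg import segdocument

# hypothetical toy corpus; any long enough Chinese text works
doc = u'我们今天去电影院看电影,电影院里人很多。'
seg = segdocument(doc, max_word_len=3, min_tf=1e-6, min_entropy=0.5, min_pmi=1.0)

# word_tf_pmi_ent holds (word, length, frequency, pmi, min(left, right) entropy)
for text, length, freq, pmi, entropy in seg.word_tf_pmi_ent:
    print(text, length, freq, pmi, entropy)
```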

--------------------------------------------------------------------------------
/wordseg.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 -*-
"""
Chinese new word detection over a corpus
Author: "Xylander"
"""


import os
import re
import math
import time
from entropy import compute_entropy
from extract import extract_cadicateword, gen_bigram
import pandas as pd
import codecs


class wordinfo(object):
    '''
    Record the information of every candidate word: left neighbors, right neighbors, frequency and PMI
    '''
    def __init__(self, text):
        super(wordinfo, self).__init__()
        self.text = text
        self.freq = 0.0
        self.left = set()   # record left neighbors
        self.right = set()  # record right neighbors
        self.pmi = 0

    def update_data(self, left, right):
        self.freq += 1.0
        if left:
            self.left.add(left)
        if right:
            self.right.add(right)

    def compute_indexes(self, length):
        # compute the relative frequency of the word and its left/right entropy
        self.freq /= length
        self.left = compute_entropy(self.left)
        self.right = compute_entropy(self.right)

    def compute_pmi(self, words_dict):
        # PMI over every bigram split of the word; keep the weakest split
        sub_part = gen_bigram(self.text)
        if len(sub_part) > 0:
            self.pmi = min(map(lambda word: math.log(self.freq / words_dict[word[0]].freq / words_dict[word[1]].freq), sub_part))


class segdocument(object):
    '''
    Main class for Chinese new word detection
    1. Generate candidate words from a long enough document
    2. Score the candidates and filter them against the thresholds
    '''
    def __init__(self, doc, max_word_len=5, min_tf=0.000005, min_entropy=0.07, min_pmi=6.0):
        super(segdocument, self).__init__()
        self.max_word_len = max_word_len
        self.min_tf = min_tf
        self.min_entropy = min_entropy
        self.min_pmi = min_pmi
        # analyse the document
        self.word_info = self.gen_words(doc)
        count = float(len(self.word_info))
        self.avg_frq = sum(map(lambda w: w.freq, self.word_info)) / count
        self.avg_entropy = sum(map(lambda w: min(w.left, w.right), self.word_info)) / count
        self.avg_pmi = sum(map(lambda w: w.pmi, self.word_info)) / count
        filter_function = lambda f: len(f.text) > 1 and f.pmi > self.min_pmi and f.freq > self.min_tf \
            and min(f.left, f.right) > self.min_entropy
        self.word_tf_pmi_ent = list(map(lambda w: (w.text, len(w.text), w.freq, w.pmi, min(w.left, w.right)), filter(filter_function, self.word_info)))

    def gen_words(self, doc):
        #pattern = re.compile('[:“。”,!?、《》……;’‘\n——\r\t)、(——^[1-9]d*$]')
        #pattern = re.compile('[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。??:、~@#”“¥:%……&*()]+|[[A-Za-z0-9]*$]'.decode('utf-8'))
        pattern = re.compile(u'[\\s\\d,.<>/?:;\'\"[\\]{}()\\|~!@#$%^&*\\-_=+a-zA-Z,。《》、?:;“”‘’{}【】()…¥!—┄-]+')
        doc = pattern.sub(' ', doc)
        word_index = extract_cadicateword(doc, self.max_word_len)
        word_cad = {}  # dictionary of candidate words
        for suffix in word_index:
            word = doc[suffix[0]:suffix[1]]
            if word not in word_cad:
                word_cad[word] = wordinfo(word)
            # record the frequency of the word and its left/right neighbors
            word_cad[word].update_data(doc[suffix[0]-1:suffix[0]], doc[suffix[1]:suffix[1]+1])
        length = len(doc)
        # compute the relative frequency of each candidate and its left/right neighbor entropy
        for word in word_cad:
            word_cad[word].compute_indexes(length)
        # sort candidates by word length
        values = sorted(word_cad.values(), key=lambda x: len(x.text))
        for v in values:
            if len(v.text) == 1:
                continue
            v.compute_pmi(word_cad)
        # return candidates sorted by word length (ascending)
        return sorted(values, key=lambda v: len(v.text), reverse=False)
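
# A hypothetical worked example of the PMI score used in wordinfo.compute_pmi
# (the numbers are illustrative, not taken from any corpus): for a candidate
# "abc" with relative frequency p(abc) = 1e-4, gen_bigram("abc") yields the
# splits ("a", "bc") and ("ab", "c"). With p(a) = 1e-2, p(bc) = 5e-4,
# p(ab) = 2e-4 and p(c) = 2e-2:
#   pmi = min(log(1e-4 / (1e-2 * 5e-4)), log(1e-4 / (2e-4 * 2e-2)))
#       = min(log(20), log(25)) ≈ 3.00
# i.e. the weakest split decides whether the candidate is kept as one word.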


if __name__ == '__main__':
    starttime = time.perf_counter()
    path = os.path.abspath('.')
    wordlist = []
    word_candidate = []
    dict_bank = []
    dict_path = os.path.join(path, 'dict.txt')

    doc = codecs.open(os.path.join(path, 'train_for_ws.txt'), "r", "utf-8").read()

    word = segdocument(doc, max_word_len=3, min_tf=(1e-08), min_entropy=1.0, min_pmi=3.0)
    print('avg_frq:' + str(word.avg_frq))
    print('avg_pmi:' + str(word.avg_pmi))
    print('avg_entropy:' + str(word.avg_entropy))

    for i in codecs.open(dict_path, 'r', "utf-8"):
        dict_bank.append(i.split(' ')[0])

    print('result:')
    for i in word.word_tf_pmi_ent:
        if i[0] not in dict_bank:
            word_candidate.append(i[0])
            wordlist.append([i[0], i[1], i[2], i[3], i[4]])

    # rank on entropy (primary key) and PMI (secondary key): the stable sort
    # on pmi first, then on entropy, keeps pmi as the tie-breaker
    wordlist = sorted(wordlist, key=lambda word: word[3], reverse=True)
    wordlist = sorted(wordlist, key=lambda word: word[4], reverse=True)

    seg = pd.DataFrame(wordlist, columns=['word', 'length', 'fre', 'pmi', 'entropy'])
    seg.to_csv(os.path.join(path, 'extractword.csv'), index=False, encoding="utf-8")

    # intersection = set(word_candidate) & set(dict_bank)
    # newwordset = set(word_candidate) - intersection

    # for i in wordlist:
    #     print(i[0], i[1], i[2], i[3], i[4])

    endtime = time.perf_counter()
    print(endtime - starttime)

--------------------------------------------------------------------------------