├── .idea
│   └── vcs.xml
├── README.md
├── entropy.py
├── extract.py
└── wordseg.py

--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
(IDE version-control mapping file; its XML content is not preserved in this dump)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# **新词发现算法**(**New Word Detection**)

**参考:**
**Reference:**
- 算法 (Algorithm):
  - http://www.matrix67.com/blog/archives/5044
- 代码 (Code):
  - https://github.com/xiulonghan/wordSeg
  - https://github.com/Moonshile/ChineseWordSegmentation

**代码说明:**
**Code description:**
- extract.py:
  提供文档中所有成词的可能组合,以及计算词的点互信息时的一个组合。
  Enumerates every candidate string in the document, and generates the two-part splits of a word used when computing its PMI.
- entropy.py:
  计算左右邻居熵的大小。
  Computes the entropy of a word's left and right neighbors.
- wordseg.py:
  根据计算出的频数、点互信息和左右熵来找出成词可能的组合,最后与词典对比找出新词。
  Scores the candidate strings by frequency, PMI and left/right neighbor entropy, then compares the surviving candidates against a dictionary to detect new words.
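
**用法示例:**
**Usage sketch:**

A minimal sketch of how the pieces fit together, assuming a plain-text Chinese corpus (the file name `corpus.txt` below is only a placeholder) and the same thresholds that `wordseg.py` uses in its `__main__` block:

```python
# -*- coding: utf-8 -*-
# Python 2, like the rest of the repository; 'corpus.txt' is a placeholder.
from wordseg import segdocument

doc = open('corpus.txt', 'r').read().decode('utf-8')
seg = segdocument(doc, max_word_len=4, min_tf=0, min_entropy=0.05, min_pmi=3.3)

# word_tf_pmi_ent holds (text, length, frequency, PMI, min(left, right) entropy)
# for every candidate that passed all three thresholds.
for text, length, freq, pmi, entropy in seg.word_tf_pmi_ent:
    print text, freq, pmi, entropy
```

Candidates that survive the thresholds but are absent from a reference dictionary (`dict.txt` in `wordseg.py`) are reported as new words.
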
--------------------------------------------------------------------------------
/entropy.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-
'''
left/right neighbors entropy for new word detection
Author: Xylander
Reference:
    https://github.com/Moonshile/ChineseWordSegmentation
    http://www.matrix67.com/blog/archives/5044
    https://zlc1994.com/2017/01/04/
'''

import math


def compute_entropy(_list):
    length = float(len(_list))
    frequence = {}
    if length == 0:
        return 0
    else:
        for i in _list:
            frequence[i] = frequence.get(i, 0) + 1
        return sum(map(lambda x: -x / length * math.log(x / length), frequence.values()))
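

if __name__ == '__main__':
    # A small sanity check, a sketch only: four neighbors with relative
    # frequencies 0.5 / 0.25 / 0.25 give -(0.5*ln(0.5) + 2*0.25*ln(0.25)) ~= 1.0397,
    # while a single repeated neighbor gives 0.0.  Low entropy means the string
    # almost always sees the same neighbor, i.e. it is a poor word boundary.
    print compute_entropy(['a', 'a', 'b', 'c'])   # ~1.0397
    print compute_entropy(['a', 'a', 'a', 'a'])   # 0.0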
--------------------------------------------------------------------------------
/extract.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import os
import re


def extract_cadicateword(_doc, _max_word_len):
    # enumerate every (start, end) span of length 1 to _max_word_len in the document
    indexes = []
    doc_length = len(_doc)
    for i in xrange(doc_length):
        for j in xrange(i + 1, min(i + 1 + _max_word_len, doc_length + 1)):
            indexes.append((i, j))
    return sorted(indexes, key=lambda (_i, _j): _doc[_i:_j])


def gen_bigram(_word_str):
    '''
    Split a word into two parts in every possible way.
    For instance, 'abb' can be split into ('a', 'bb') and ('ab', 'b').
    :param _word_str:
    :return:
    '''
    return [(_word_str[0:_i], _word_str[_i:]) for _i in xrange(1, len(_word_str))]
--------------------------------------------------------------------------------
/wordseg.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 -*-
"""
Chinese word segmentation algorithm with corpus
Author: "Xylander"
"""


import os
import re
import math
import time
from entropy import compute_entropy
from extract import extract_cadicateword, gen_bigram
import pandas as pd


class wordinfo(object):
    '''
    Record the information of every candidate word: left neighbors, right neighbors, frequency and PMI.
    '''
    def __init__(self, text):
        super(wordinfo, self).__init__()
        self.text = text
        self.freq = 0.0
        self.left = []   # record left neighbors
        self.right = []  # record right neighbors
        self.pmi = 0

    def update_data(self, left, right):
        self.freq += 1.0
        if left:
            self.left.append(left)
        if right:
            self.right.append(right)

    def compute_indexes(self, length):
        # compute the frequency of the word and its left/right entropy
        self.freq /= length
        self.left = compute_entropy(self.left)
        self.right = compute_entropy(self.right)

    def compute_pmi(self, words_dict):
        # compute the PMI of every two-part split of the word and keep the minimum
        sub_part = gen_bigram(self.text)
        if len(sub_part) > 0:
            self.pmi = min(map(lambda (left, right): math.log(self.freq / words_dict[left].freq / words_dict[right].freq), sub_part))
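
# To make compute_pmi concrete (a worked example with made-up probabilities):
# for a three-character candidate 'abc', gen_bigram yields the splits
# ('a', 'bc') and ('ab', 'c').  With p('abc') = 1e-4, p('a') = 1e-2,
# p('bc') = 5e-4, p('ab') = 2e-4 and p('c') = 5e-2, the two split scores are
#     log(1e-4 / (1e-2 * 5e-4)) = log(20) ~= 3.00
#     log(1e-4 / (2e-4 * 5e-2)) = log(10) ~= 2.30
# and self.pmi keeps the minimum, 2.30: a candidate is only as cohesive as its
# weakest split.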

class segdocument(object):
    '''
    Main class for Chinese word segmentation.
    1. Generate candidate words from a long enough document.
    2. Do the segmentation work with the document.
    '''
    def __init__(self, doc, max_word_len=5, min_tf=0.000005, min_entropy=0.07, min_pmi=6):
        super(segdocument, self).__init__()
        self.max_word_len = max_word_len
        self.min_tf = min_tf
        self.min_entropy = min_entropy
        self.min_pmi = min_pmi
        # analyse the document
        self.word_info = self.gen_words(doc)
        count = float(len(self.word_info))
        self.avg_frq = sum(map(lambda w: w.freq, self.word_info)) / count
        self.avg_entropy = sum(map(lambda w: min(w.left, w.right), self.word_info)) / count
        self.avg_pmi = sum(map(lambda w: w.pmi, self.word_info)) / count
        filter_function = lambda f: len(f.text) > 1 and f.pmi > self.min_pmi and f.freq > self.min_tf \
            and min(f.left, f.right) > self.min_entropy
        self.word_tf_pmi_ent = map(lambda w: (w.text, len(w.text), w.freq, w.pmi, min(w.left, w.right)),
                                   filter(filter_function, self.word_info))

    def gen_words(self, doc):
        #pattern = re.compile('[:“。”,!?、《》……;’‘\n——\r\t)、(——^[1-9]d*$]')
        #pattern = re.compile('[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。??:、~@#”“¥:%……&*()]+|[[A-Za-z0-9]*$]'.decode('utf-8'))
        pattern = re.compile(u'[\\s\\d,.<>/?:;\'\"[\\]{}()\\|~!@#$%^&*\\-_=+a-zA-Z,。《》、?:;“”‘’{}【】()…¥!—┄-]+')
        doc = pattern.sub(r'', doc)
        word_index = extract_cadicateword(doc, self.max_word_len)
        word_cad = {}  # dictionary of candidate words
        for suffix in word_index:
            word = doc[suffix[0]:suffix[1]]
            if word not in word_cad:
                word_cad[word] = wordinfo(word)
            # record the frequency of the word and its left and right neighbors
            word_cad[word].update_data(doc[suffix[0] - 1:suffix[0]], doc[suffix[1]:suffix[1] + 1])
        length = len(doc)
        # compute the frequency of each candidate word and the entropy of its left/right neighbors
        for word in word_cad:
            word_cad[word].compute_indexes(length)
        # rank candidates by word length
        values = sorted(word_cad.values(), key=lambda x: len(x.text))
        for v in values:
            if len(v.text) == 1:
                continue
            v.compute_pmi(word_cad)
        # return the candidates ordered by word length, shortest first
        return sorted(values, key=lambda v: len(v.text), reverse=False)


if __name__ == '__main__':
    starttime = time.clock()
    path = os.path.abspath('.')
    wordlist = []
    word_candidate = []
    dict_bank = []
    dict_path = path + '/dict.txt'
    decode_list = ['gb18030', 'gbk', 'utf-8', 'ISO-8859-2', 'unicode']  # try several encodings
    for decode_way in decode_list:
        try:
            doc = open(path + '/guangkai.txt', 'r').read().decode(decode_way)
            print 'Great! {0} decoded the document successfully!'.format(decode_way)
            break
        except:
            print 'Oops! {0} cannot decode the document!'.format(decode_way)
    word = segdocument(doc, max_word_len=4, min_tf=0, min_entropy=0.05, min_pmi=3.3)
    print 'average frequency: ' + str(word.avg_frq)
    print 'average PMI: ' + str(word.avg_pmi)
    print 'average entropy: ' + str(word.avg_entropy)

    for i in open(dict_path, 'r'):
        dict_bank.append(i.strip().split(' ')[0])

    print 'result:'
    for i in word.word_tf_pmi_ent:
        if i[0].encode('utf-8') not in dict_bank:
            word_candidate.append(i[0].encode('utf-8'))
            wordlist.append([i[0].encode('utf-8'), i[1], i[2], i[3], i[4]])
    seg = pd.DataFrame(wordlist, columns=['word', 'length', 'fre', 'pmi', 'entropy'])
    seg.to_csv(path + '/extractword.csv', index=False)
    # intersection = set(word_candidate) & set(dict_bank)
    # newwordset = set(word_candidate) - intersection
    for i in wordlist:
        print i[0], i[1], i[2], i[3], i[4]

    endtime = time.clock()
    print endtime - starttime

--------------------------------------------------------------------------------