├── .idea
│   └── vcs.xml
├── README.md
├── entropy.py
├── extract.py
└── wordseg.py

--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
(IDE version-control mapping file; its XML content is not preserved in this dump)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# **新词发现算法**(**New Word Detection**)

**参考:**
**Reference:**
- 算法 (Algorithm):
  - http://www.matrix67.com/blog/archives/5044
- 代码 (Code):
  - https://github.com/xiulonghan/wordSeg
  - https://github.com/Moonshile/ChineseWordSegmentation

**代码说明:**
**Code description:**
- extract.py:
  提供文档中所有成词的可能组合,以及计算词的点互信息时的一个组合。
  Enumerates every candidate string in the document, and generates the two-part splits of a word used when computing its PMI.
- entropy.py:
  计算左右邻居熵的大小。
  Computes the entropy of a word's left and right neighbors.
- wordseg.py:
  根据计算出的频数、点互信息和左右熵来找出成词可能的组合,最后与词典对比找出新词。
  Scores the candidate strings by frequency, PMI and left/right neighbor entropy, then compares the surviving candidates against a dictionary to detect new words.
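
**用法示例:**
**Usage sketch:**

A minimal sketch of how the pieces fit together, assuming a plain-text Chinese corpus (the file name `corpus.txt` below is only a placeholder) and the same thresholds that `wordseg.py` uses in its `__main__` block:

```python
# -*- coding: utf-8 -*-
# Python 2, like the rest of the repository; 'corpus.txt' is a placeholder.
from wordseg import segdocument

doc = open('corpus.txt', 'r').read().decode('utf-8')
seg = segdocument(doc, max_word_len=4, min_tf=0, min_entropy=0.05, min_pmi=3.3)

# word_tf_pmi_ent holds (text, length, frequency, PMI, min(left, right) entropy)
# for every candidate that passed all three thresholds.
for text, length, freq, pmi, entropy in seg.word_tf_pmi_ent:
    print text, freq, pmi, entropy
```

Candidates that survive the thresholds but are absent from a reference dictionary (`dict.txt` in `wordseg.py`) are reported as new words.
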
--------------------------------------------------------------------------------
/entropy.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-
'''
left/right neighbors entropy for new word detection
Author: Xylander
Reference:
    https://github.com/Moonshile/ChineseWordSegmentation
    http://www.matrix67.com/blog/archives/5044
    https://zlc1994.com/2017/01/04/
'''

import math


def compute_entropy(_list):
    length = float(len(_list))
    frequence = {}
    if length == 0:
        return 0
    else:
        for i in _list:
            frequence[i] = frequence.get(i, 0) + 1
        return sum(map(lambda x: -x / length * math.log(x / length), frequence.values()))
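

if __name__ == '__main__':
    # A small sanity check, a sketch only: four neighbors with relative
    # frequencies 0.5 / 0.25 / 0.25 give -(0.5*ln(0.5) + 2*0.25*ln(0.25)) ~= 1.0397,
    # while a single repeated neighbor gives 0.0.  Low entropy means the string
    # almost always sees the same neighbor, i.e. it is a poor word boundary.
    print compute_entropy(['a', 'a', 'b', 'c'])   # ~1.0397
    print compute_entropy(['a', 'a', 'a', 'a'])   # 0.0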
--------------------------------------------------------------------------------
/extract.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import os
import re


def extract_cadicateword(_doc, _max_word_len):
    # enumerate every (start, end) span of length 1 to _max_word_len in the document
    indexes = []
    doc_length = len(_doc)
    for i in xrange(doc_length):
        for j in xrange(i + 1, min(i + 1 + _max_word_len, doc_length + 1)):
            indexes.append((i, j))
    return sorted(indexes, key=lambda (_i, _j): _doc[_i:_j])


def gen_bigram(_word_str):
    '''
    Split a word into two parts in every possible way.
    For instance, 'abb' can be split into ('a', 'bb') and ('ab', 'b').
    :param _word_str:
    :return:
    '''
    return [(_word_str[0:_i], _word_str[_i:]) for _i in xrange(1, len(_word_str))]
--------------------------------------------------------------------------------
/wordseg.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 -*-
"""
Chinese word segmentation algorithm with corpus
Author: "Xylander"
"""


import os
import re
import math
import time
from entropy import compute_entropy
from extract import extract_cadicateword, gen_bigram
import pandas as pd


class wordinfo(object):
    '''
    Record the information of every candidate word: left neighbors, right neighbors, frequency and PMI.
    '''
    def __init__(self, text):
        super(wordinfo, self).__init__()
        self.text = text
        self.freq = 0.0
        self.left = []   # record left neighbors
        self.right = []  # record right neighbors
        self.pmi = 0

    def update_data(self, left, right):
        self.freq += 1.0
        if left:
            self.left.append(left)
        if right:
            self.right.append(right)

    def compute_indexes(self, length):
        # compute the frequency of the word and its left/right entropy
        self.freq /= length
        self.left = compute_entropy(self.left)
        self.right = compute_entropy(self.right)

    def compute_pmi(self, words_dict):
        # compute the PMI of every two-part split of the word and keep the minimum
        sub_part = gen_bigram(self.text)
        if len(sub_part) > 0:
            self.pmi = min(map(lambda (left, right): math.log(self.freq / words_dict[left].freq / words_dict[right].freq), sub_part))
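
# To make compute_pmi concrete (a worked example with made-up probabilities):
# for a three-character candidate 'abc', gen_bigram yields the splits
# ('a', 'bc') and ('ab', 'c').  With p('abc') = 1e-4, p('a') = 1e-2,
# p('bc') = 5e-4, p('ab') = 2e-4 and p('c') = 5e-2, the two split scores are
#     log(1e-4 / (1e-2 * 5e-4)) = log(20) ~= 3.00
#     log(1e-4 / (2e-4 * 5e-2)) = log(10) ~= 2.30
# and self.pmi keeps the minimum, 2.30: a candidate is only as cohesive as its
# weakest split.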

class segdocument(object):
    '''
    Main class for Chinese word segmentation.
    1. Generate candidate words from a long enough document.
    2. Do the segmentation work with the document.
    '''
    def __init__(self, doc, max_word_len=5, min_tf=0.000005, min_entropy=0.07, min_pmi=6):
        super(segdocument, self).__init__()
        self.max_word_len = max_word_len
        self.min_tf = min_tf
        self.min_entropy = min_entropy
        self.min_pmi = min_pmi
        # analyse the document
        self.word_info = self.gen_words(doc)
        count = float(len(self.word_info))
        self.avg_frq = sum(map(lambda w: w.freq, self.word_info)) / count
        self.avg_entropy = sum(map(lambda w: min(w.left, w.right), self.word_info)) / count
        self.avg_pmi = sum(map(lambda w: w.pmi, self.word_info)) / count
        filter_function = lambda f: len(f.text) > 1 and f.pmi > self.min_pmi and f.freq > self.min_tf \
            and min(f.left, f.right) > self.min_entropy
        self.word_tf_pmi_ent = map(lambda w: (w.text, len(w.text), w.freq, w.pmi, min(w.left, w.right)),
                                   filter(filter_function, self.word_info))

    def gen_words(self, doc):
        #pattern = re.compile('[:“。”,!?、《》……;’‘\n——\r\t)、(——^[1-9]d*$]')
        #pattern = re.compile('[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。??:、~@#”“¥:%……&*()]+|[[A-Za-z0-9]*$]'.decode('utf-8'))
        pattern = re.compile(u'[\\s\\d,.<>/?:;\'\"[\\]{}()\\|~!@#$%^&*\\-_=+a-zA-Z,。《》、?:;“”‘’{}【】()…¥!—┄-]+')
        doc = pattern.sub(r'', doc)
        word_index = extract_cadicateword(doc, self.max_word_len)
        word_cad = {}  # dictionary of candidate words
        for suffix in word_index:
            word = doc[suffix[0]:suffix[1]]
            if word not in word_cad:
                word_cad[word] = wordinfo(word)
            # record the frequency of the word and its left and right neighbors
            word_cad[word].update_data(doc[suffix[0] - 1:suffix[0]], doc[suffix[1]:suffix[1] + 1])
        length = len(doc)
        # compute the frequency of each candidate word and the entropy of its left/right neighbors
        for word in word_cad:
            word_cad[word].compute_indexes(length)
        # rank candidates by word length
        values = sorted(word_cad.values(), key=lambda x: len(x.text))
        for v in values:
            if len(v.text) == 1:
                continue
            v.compute_pmi(word_cad)
        # return the candidates ordered by word length, shortest first
        return sorted(values, key=lambda v: len(v.text), reverse=False)


if __name__ == '__main__':
    starttime = time.clock()
    path = os.path.abspath('.')
    wordlist = []
    word_candidate = []
    dict_bank = []
    dict_path = path + '/dict.txt'
    decode_list = ['gb18030', 'gbk', 'utf-8', 'ISO-8859-2', 'unicode']  # try several encodings
    for decode_way in decode_list:
        try:
            doc = open(path + '/guangkai.txt', 'r').read().decode(decode_way)
            print 'Great! {0} decoded the document successfully!'.format(decode_way)
            break
        except:
            print 'Oops! {0} cannot decode the document!'.format(decode_way)
    word = segdocument(doc, max_word_len=4, min_tf=0, min_entropy=0.05, min_pmi=3.3)
    print 'average frequency: ' + str(word.avg_frq)
    print 'average PMI: ' + str(word.avg_pmi)
    print 'average entropy: ' + str(word.avg_entropy)

    for i in open(dict_path, 'r'):
        dict_bank.append(i.strip().split(' ')[0])

    print 'result:'
    for i in word.word_tf_pmi_ent:
        if i[0].encode('utf-8') not in dict_bank:
            word_candidate.append(i[0].encode('utf-8'))
            wordlist.append([i[0].encode('utf-8'), i[1], i[2], i[3], i[4]])
    seg = pd.DataFrame(wordlist, columns=['word', 'length', 'fre', 'pmi', 'entropy'])
    seg.to_csv(path + '/extractword.csv', index=False)
    # intersection = set(word_candidate) & set(dict_bank)
    # newwordset = set(word_candidate) - intersection
    for i in wordlist:
        print i[0], i[1], i[2], i[3], i[4]

    endtime = time.clock()
    print endtime - starttime

--------------------------------------------------------------------------------