├── dict.txt
├── entropy.py
├── extract.py
├── README.md
└── wordseg.py

/dict.txt:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/entropy.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-
'''
Left/right neighbor entropy for new word detection
Author: Xylander
Reference:
https://github.com/Moonshile/ChineseWordSegmentation
http://www.matrix67.com/blog/archives/5044
https://zlc1994.com/2017/01/04/
'''

import math


def compute_entropy(_list):
    # Shannon entropy of the neighbor characters in _list (a list or set)
    length = float(len(_list))
    frequence = {}
    if length == 0:
        return 0
    for i in _list:
        frequence[i] = frequence.get(i, 0) + 1
    return sum(map(lambda x: -x / length * math.log(x / length), frequence.values()))

--------------------------------------------------------------------------------
/extract.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-


def extract_cadicateword(_doc, _max_word_len):
    # Collect the (start, end) index pairs of every candidate word span of up to
    # _max_word_len characters, skipping spans that contain a space.
    indexes = []
    doc_length = len(_doc)
    for i in range(doc_length):
        for j in range(i + 1, min(i + 1 + _max_word_len, doc_length + 1)):
            skip_flag = False
            for k in range(i, j):
                if _doc[k] == " ":
                    skip_flag = True
                    break
            if not skip_flag:
                indexes.append((i, j))
    return sorted(indexes, key=lambda _word: _doc[_word[0]:_word[1]])


def gen_bigram(_word_str):
    '''
    Split a word into two parts at every possible position.
    For instance, 'abb' can be split into (a, bb) and (ab, b).
    :param _word_str: candidate word string
    :return: list of (prefix, suffix) tuples
    '''
    return [(_word_str[0:_i], _word_str[_i:]) for _i in range(1, len(_word_str))]

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# **新词发现算法**(**New Word Detection**)

与 NCXavier 原本 repo 的区别 (Differences from NCXavier's original repo):
1. 更新至 Python 3 (Updated to Python 3)
2. 针对原本就带 空格/符号/拆分符号 的语料做一些优化 (Optimizations for corpora that already contain spaces/symbols/delimiters)
3. 优化左右熵(自由度),将左右邻接字从 list 改为 set (Optimized the left/right entropy (degree of freedom) computation by storing left/right neighbors in a set instead of a list)


**参考:**
**References:**
- 算法(Algorithm):
  - > http://www.matrix67.com/blog/archives/5044
- 代码(Code):
  - > https://github.com/xiulonghan/wordSeg
  - > https://github.com/Moonshile/ChineseWordSegmentation

**代码说明:**
**Code description:**
- extract.py:
  提供文档中所有成词的可能组合,以及计算词的点互信息时的一个组合。
  Enumerates all candidate word spans in the document and generates the bigram splits used when computing a word's PMI.
- entropy.py:
  计算左右邻居熵的大小
  Computes the entropy of a word's left and right neighbors.
- wordseg.py:
  根据计算出的频数、点互信息和左右熵来找出成词可能的组合,最后与词典对比找出新词。
  Scores candidate words by frequency, PMI and left/right neighbor entropy, then compares the surviving candidates against the dictionary to detect new words.
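
**示例 (Usage sketch):**
A minimal sketch of how the pieces fit together, assuming a short in-memory string instead of the `train_for_ws.txt` corpus used in `wordseg.py`; the thresholds below are illustrative, not tuned defaults:

```python
# -*- coding: utf-8 -*-
from wordseg import segdocument

# hypothetical toy corpus; any long enough Chinese text works
doc = u'我们今天去电影院看电影,电影院里人很多。'
seg = segdocument(doc, max_word_len=3, min_tf=1e-6, min_entropy=0.5, min_pmi=1.0)

# word_tf_pmi_ent holds (word, length, frequency, pmi, min(left, right) entropy)
for text, length, freq, pmi, entropy in seg.word_tf_pmi_ent:
    print(text, length, freq, pmi, entropy)
```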

--------------------------------------------------------------------------------
/wordseg.py:
--------------------------------------------------------------------------------
#-*- coding:utf-8 -*-
"""
Chinese new word detection over a corpus
Author: "Xylander"
"""


import os
import re
import math
import time
from entropy import compute_entropy
from extract import extract_cadicateword, gen_bigram
import pandas as pd
import codecs


class wordinfo(object):
    '''
    Record the information of every candidate word: left neighbors, right neighbors, frequency and PMI
    '''
    def __init__(self, text):
        super(wordinfo, self).__init__()
        self.text = text
        self.freq = 0.0
        self.left = set()   # record left neighbors
        self.right = set()  # record right neighbors
        self.pmi = 0

    def update_data(self, left, right):
        self.freq += 1.0
        if left:
            self.left.add(left)
        if right:
            self.right.add(right)

    def compute_indexes(self, length):
        # compute the relative frequency of the word and its left/right entropy
        self.freq /= length
        self.left = compute_entropy(self.left)
        self.right = compute_entropy(self.right)

    def compute_pmi(self, words_dict):
        # PMI over every bigram split of the word; keep the weakest split
        sub_part = gen_bigram(self.text)
        if len(sub_part) > 0:
            self.pmi = min(map(lambda word: math.log(self.freq / words_dict[word[0]].freq / words_dict[word[1]].freq), sub_part))


class segdocument(object):
    '''
    Main class for Chinese new word detection
    1. Generate candidate words from a long enough document
    2. Score the candidates and filter them against the thresholds
    '''
    def __init__(self, doc, max_word_len=5, min_tf=0.000005, min_entropy=0.07, min_pmi=6.0):
        super(segdocument, self).__init__()
        self.max_word_len = max_word_len
        self.min_tf = min_tf
        self.min_entropy = min_entropy
        self.min_pmi = min_pmi
        # analyse the document
        self.word_info = self.gen_words(doc)
        count = float(len(self.word_info))
        self.avg_frq = sum(map(lambda w: w.freq, self.word_info)) / count
        self.avg_entropy = sum(map(lambda w: min(w.left, w.right), self.word_info)) / count
        self.avg_pmi = sum(map(lambda w: w.pmi, self.word_info)) / count
        filter_function = lambda f: len(f.text) > 1 and f.pmi > self.min_pmi and f.freq > self.min_tf \
            and min(f.left, f.right) > self.min_entropy
        self.word_tf_pmi_ent = list(map(lambda w: (w.text, len(w.text), w.freq, w.pmi, min(w.left, w.right)), filter(filter_function, self.word_info)))

    def gen_words(self, doc):
        #pattern = re.compile('[:“。”,!?、《》……;’‘\n——\r\t)、(——^[1-9]d*$]')
        #pattern = re.compile('[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。??:、~@#”“¥:%……&*()]+|[[A-Za-z0-9]*$]'.decode('utf-8'))
        pattern = re.compile(u'[\\s\\d,.<>/?:;\'\"[\\]{}()\\|~!@#$%^&*\\-_=+a-zA-Z,。《》、?:;“”‘’{}【】()…¥!—┄-]+')
        doc = pattern.sub(' ', doc)
        word_index = extract_cadicateword(doc, self.max_word_len)
        word_cad = {}  # dictionary of candidate words
        for suffix in word_index:
            word = doc[suffix[0]:suffix[1]]
            if word not in word_cad:
                word_cad[word] = wordinfo(word)
            # record the frequency of the word and its left/right neighbors
            word_cad[word].update_data(doc[suffix[0]-1:suffix[0]], doc[suffix[1]:suffix[1]+1])
        length = len(doc)
        # compute the relative frequency of each candidate and its left/right neighbor entropy
        for word in word_cad:
            word_cad[word].compute_indexes(length)
        # sort candidates by word length
        values = sorted(word_cad.values(), key=lambda x: len(x.text))
        for v in values:
            if len(v.text) == 1:
                continue
            v.compute_pmi(word_cad)
        # return candidates sorted by word length (ascending)
        return sorted(values, key=lambda v: len(v.text), reverse=False)
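
# A hypothetical worked example of the PMI score used in wordinfo.compute_pmi
# (the numbers are illustrative, not taken from any corpus): for a candidate
# "abc" with relative frequency p(abc) = 1e-4, gen_bigram("abc") yields the
# splits ("a", "bc") and ("ab", "c"). With p(a) = 1e-2, p(bc) = 5e-4,
# p(ab) = 2e-4 and p(c) = 2e-2:
#   pmi = min(log(1e-4 / (1e-2 * 5e-4)), log(1e-4 / (2e-4 * 2e-2)))
#       = min(log(20), log(25)) ≈ 3.00
# i.e. the weakest split decides whether the candidate is kept as one word.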


if __name__ == '__main__':
    starttime = time.perf_counter()
    path = os.path.abspath('.')
    wordlist = []
    word_candidate = []
    dict_bank = []
    dict_path = os.path.join(path, 'dict.txt')

    doc = codecs.open(os.path.join(path, 'train_for_ws.txt'), "r", "utf-8").read()

    word = segdocument(doc, max_word_len=3, min_tf=(1e-08), min_entropy=1.0, min_pmi=3.0)
    print('avg_frq:' + str(word.avg_frq))
    print('avg_pmi:' + str(word.avg_pmi))
    print('avg_entropy:' + str(word.avg_entropy))

    for i in codecs.open(dict_path, 'r', "utf-8"):
        dict_bank.append(i.split(' ')[0])

    print('result:')
    for i in word.word_tf_pmi_ent:
        if i[0] not in dict_bank:
            word_candidate.append(i[0])
            wordlist.append([i[0], i[1], i[2], i[3], i[4]])

    # rank on entropy (primary key) and PMI (secondary key): the stable sort
    # on pmi first, then on entropy, keeps pmi as the tie-breaker
    wordlist = sorted(wordlist, key=lambda word: word[3], reverse=True)
    wordlist = sorted(wordlist, key=lambda word: word[4], reverse=True)

    seg = pd.DataFrame(wordlist, columns=['word', 'length', 'fre', 'pmi', 'entropy'])
    seg.to_csv(os.path.join(path, 'extractword.csv'), index=False, encoding="utf-8")

    # intersection = set(word_candidate) & set(dict_bank)
    # newwordset = set(word_candidate) - intersection

    # for i in wordlist:
    #     print(i[0], i[1], i[2], i[3], i[4])

    endtime = time.perf_counter()
    print(endtime - starttime)

--------------------------------------------------------------------------------