├── File_Interface.py
├── HuffmanTree.py
├── WordCount.py
├── pyword2vec.py
├── static
│   ├── stop_words.pkl
│   └── 中文停用词表(比较全面,有1208个停用词).txt
└── word2vec_v2.0.py

/File_Interface.py:
--------------------------------------------------------------------------------
1 | __author__ = 'multiangle'
2 | 
3 | import csv,pickle
4 | 
5 | def read_csv(path):     # read the raw csv file, without any changes
6 |     file=open(path,'r')
7 |     reader=csv.reader(file)
8 |     data=[row for row in reader]
9 |     return data
10 | def load_pickle(path):  # load a pickle file, without any changes
11 |     file=open(path,'rb')
12 |     data=pickle.load(file)
13 |     file.close()
14 |     return data
15 | def save_pickle(data,path):
16 |     file=open(path,'wb')
17 |     pickle.dump(data,file)
18 |     file.close()
--------------------------------------------------------------------------------
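The two pickle helpers above are what the rest of the project uses to persist word-frequency tables and trained models. A minimal usage sketch, not part of the repository (the file name and the toy dict are made up):

```python
# Hypothetical round trip through the helpers in File_Interface.py.
import File_Interface as FI

word_freq = {'苹果': 12, '香蕉': 7, '樱桃': 3}             # toy word-frequency table
FI.save_pickle(word_freq, './static/word_freq_demo.pkl')   # path is illustrative only
restored = FI.load_pickle('./static/word_freq_demo.pkl')
assert restored == word_freq
```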
/HuffmanTree.py:
--------------------------------------------------------------------------------
1 | __author__ = 'multiangle'
2 | 
3 | import numpy as np
4 | 
5 | class HuffmanTreeNode():
6 |     def __init__(self,value,possibility):
7 |         # common part of leaf node and tree node
8 |         self.possibility = possibility
9 |         self.left = None
10 |         self.right = None
11 |         # the value of a leaf node is the word itself,
12 |         # in an inner node it is the intermediate vector
13 |         self.value = value      # the value of word
14 |         self.Huffman = ""       # store the huffman code
15 | 
16 |     def __str__(self):
17 |         return 'HuffmanTreeNode object, value: {v}, possibility: {p}, Huffman: {h}'\
18 |             .format(v=self.value,p=self.possibility,h=self.Huffman)
19 | 
20 | class HuffmanTree():
21 |     def __init__(self, word_dict, vec_len=15000):
22 |         self.vec_len = vec_len      # the length of word vector
23 |         self.root = None
24 | 
25 |         word_dict_list = list(word_dict.values())
26 |         node_list = [HuffmanTreeNode(x['word'],x['possibility']) for x in word_dict_list]
27 |         self.build_tree(node_list)
28 |         # self.build_CBT(node_list)
29 |         self.generate_huffman_code(self.root, word_dict)
30 | 
31 |     def build_tree(self,node_list):
32 |         # node_list.sort(key=lambda x:x.possibility,reverse=True)
33 |         # for i in range(node_list.__len__()-1)[::-1]:
34 |         #     top_node = self.merge(node_list[i],node_list[i+1])
35 |         #     node_list.insert(i,top_node)
36 |         # self.root = node_list[0]
37 | 
38 |         while node_list.__len__()>1:
39 |             i1 = 0  # i1 is the node with the smallest possibility
40 |             i2 = 1  # i2 is the node with the second smallest possibility
41 |             if node_list[i2].possibility < node_list[i1].possibility :
42 |                 [i1,i2] = [i2,i1]
43 |             for i in range(2,node_list.__len__()):  # find the two smallest nodes
44 |                 if node_list[i].possibility < node_list[i2].possibility :
45 |                     i2 = i
46 |                     if node_list[i2].possibility < node_list[i1].possibility :
47 |                         [i1,i2] = [i2,i1]
48 |             top_node = self.merge(node_list[i1],node_list[i2])
49 |             if i1 < i2:
50 |                 node_list.pop(i2)
51 |                 node_list.pop(i1)
52 |             elif i1 > i2:
53 |                 node_list.pop(i1)
54 |                 node_list.pop(i2)
55 |             else:
56 |                 raise RuntimeError('i1 should not be equal to i2')
57 |             node_list.insert(0,top_node)
58 |         self.root = node_list[0]
59 | 
60 |     def build_CBT(self,node_list):  # build a complete binary tree
61 |         node_list.sort(key=lambda x:x.possibility,reverse=True)
62 |         node_num = node_list.__len__()
63 |         before_start = 0
64 |         while node_num>1 :
65 |             for i in range(node_num>>1):
66 |                 top_node = self.merge(node_list[before_start+i*2],node_list[before_start+i*2+1])
67 |                 node_list.append(top_node)
68 |             if node_num%2==1:
69 |                 top_node = self.merge(node_list[before_start+i*2+2],node_list[-1])
70 |                 node_list[-1] = top_node
71 |             before_start = before_start + node_num
72 |             node_num = node_num>>1
73 |         self.root = node_list[-1]
74 | 
75 |     def generate_huffman_code(self, node, word_dict):
76 |         # # use recursion in this edition
77 |         # if node.left==None and node.right==None :
78 |         #     word = node.value
79 |         #     code = node.Huffman
80 |         #     print(word,code)
81 |         #     word_dict[word]['Huffman'] = code
82 |         #     return -1
83 |         #
84 |         # code = node.Huffman
85 |         # if code==None:
86 |         #     code = ""
87 |         # node.left.Huffman = code + "1"
88 |         # node.right.Huffman = code + "0"
89 |         # self.generate_huffman_code(node.left, word_dict)
90 |         # self.generate_huffman_code(node.right, word_dict)
91 | 
92 |         # use a stack instead of recursion in this edition
93 |         stack = [self.root]
94 |         while (stack.__len__()>0):
95 |             node = stack.pop()
96 |             # go along the left branch until a leaf is reached
97 |             while node.left or node.right :
98 |                 code = node.Huffman
99 |                 node.left.Huffman = code + "1"
100 |                 node.right.Huffman = code + "0"
101 |                 stack.append(node.right)
102 |                 node = node.left
103 |             word = node.value
104 |             code = node.Huffman
105 |             # print(word,'\t',code.__len__(),'\t',node.possibility)
106 |             word_dict[word]['Huffman'] = code
107 | 
108 |     def merge(self,node1,node2):
109 |         top_pos = node1.possibility + node2.possibility
110 |         top_node = HuffmanTreeNode(np.zeros([1,self.vec_len]), top_pos)
111 |         if node1.possibility >= node2.possibility :
112 |             top_node.left = node1
113 |             top_node.right = node2
114 |         else:
115 |             top_node.left = node2
116 |             top_node.right = node1
117 |         return top_node
--------------------------------------------------------------------------------
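To see what HuffmanTree does in isolation, the sketch below (not part of the repository) builds a tree over a four-word toy vocabulary and prints the code assigned to each word. The dict layout mirrors what the Word2Vec word dict provides; more probable words should end up with shorter codes.

```python
# Toy check of HuffmanTree.py: frequent words get short Huffman codes.
from HuffmanTree import HuffmanTree

word_dict = {
    w: {'word': w, 'possibility': p, 'vector': None, 'Huffman': None}
    for w, p in [('the', 0.4), ('cat', 0.3), ('sat', 0.2), ('mat', 0.1)]
}
tree = HuffmanTree(word_dict, vec_len=10)   # small vec_len keeps the node vectors tiny
for w, info in word_dict.items():
    print(w, info['Huffman'])
# expected: 'the' receives the shortest code, 'mat' one of the longest
```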
/WordCount.py:
--------------------------------------------------------------------------------
1 | __author__ = 'multiangle'
2 | 
3 | 
4 | from collections import Counter
5 | from operator import itemgetter as _itemgetter
6 | import jieba
7 | import File_Interface as FI
8 | 
9 | class WordCounter():
10 |     # can calculate the freq of words in a text list
11 | 
12 |     # for example
13 |     # >>> data = ['Merge multiple sorted inputs into a single sorted output',
14 |     #             'The API below differs from textbook heap algorithms in two aspects']
15 |     # >>> wc = WordCounter(data)
16 |     # >>> print(wc.count_res)
17 | 
18 |     # >>> MulCounter({' ': 18, 'sorted': 2, 'single': 1, 'below': 1, 'inputs': 1, 'The': 1, 'into': 1, 'textbook': 1,
19 |     #                 'API': 1, 'algorithms': 1, 'in': 1, 'output': 1, 'heap': 1, 'differs': 1, 'two': 1, 'from': 1,
20 |     #                 'aspects': 1, 'multiple': 1, 'a': 1, 'Merge': 1})
21 | 
22 |     def __init__(self, text_list):
23 |         self.text_list = text_list
24 |         self.stop_word = self.Get_Stop_Words()
25 |         self.count_res = None
26 | 
27 |         self.Word_Count(self.text_list)
28 | 
29 |     def Get_Stop_Words(self):
30 |         ret = []
31 |         ret = FI.load_pickle('./static/stop_words.pkl')
32 |         return ret
33 | 
34 |     def Word_Count(self,text_list,cut_all=False):
35 | 
36 |         filtered_word_list = []
37 |         count = 0
38 |         for line in text_list:
39 |             res = jieba.cut(line,cut_all=cut_all)
40 |             res = list(res)
41 |             text_list[count] = res
42 |             count += 1
43 |             filtered_word_list += res
44 | 
45 |         self.count_res = MulCounter(filtered_word_list)
46 |         for word in self.stop_word:
47 |             try:
48 |                 self.count_res.pop(word)
49 |             except KeyError:
50 |                 pass
51 | 
52 | class MulCounter(Counter):
53 |     # a class extending collections.Counter,
54 |     # adding the methods larger_than and less_than
55 |     def __init__(self,element_list):
56 |         super().__init__(element_list)
57 | 
58 |     def larger_than(self,minvalue,ret='list'):
59 |         temp = sorted(self.items(),key=_itemgetter(1),reverse=True)
60 |         low = 0
61 |         high = temp.__len__()
62 |         while(high - low > 1):
63 |             mid = (low+high) >> 1
64 |             if temp[mid][1] >= minvalue:
65 |                 low = mid
66 |             else:
67 |                 high = mid
68 |         if temp[low][1] < minvalue:
69 |             if ret=='dict':
70 |                 return {}
71 |             else:
72 |                 return []
73 |         if ret=='dict':
74 |             ret_data = {}
75 |             for ele,count in temp[:high]:
76 |                 ret_data[ele]=count
77 |             return ret_data
78 |         else:
79 |             return temp[:high]
80 | 
81 |     def less_than(self,maxvalue,ret='list'):
82 |         temp = sorted(self.items(),key=_itemgetter(1))
83 |         low = 0
84 |         high = temp.__len__()
85 |         while(high - low > 1):
86 |             mid = (low+high) >> 1
87 |             if temp[mid][1] <= maxvalue:
88 |                 low = mid
89 |             else:
90 |                 high = mid
91 |         if temp[low][1]>maxvalue:
92 |             if ret=='dict':
93 |                 return {}
94 |             else:
95 |                 return []
96 |         if ret=='dict':
97 |             ret_data = {}
98 |             for ele,count in temp[:high]:
99 |                 ret_data[ele]=count
100 |             return ret_data
101 |         else:
102 |             return temp[:high]
103 | 
104 | if __name__ == '__main__':
105 |     # text = FI.load_pickle('./static/demo.pkl')
106 |     # text =[ x['dealed_text']['left_content'][0] for x in text]
107 |     # wc = WordCounter(text)
108 |     # print(wc.count_res.larger_than(5))
109 | 
110 |     data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
111 |     wc = WordCounter(data)
112 |     c = wc.count_res
113 |     print(sum(c.values()))
114 | 
115 | 
116 |     # wc = FI.load_pickle('./static/test.pkl')
117 | 
118 |     # print(sorted(x.items(),key=lambda x:x[1]))
119 |     # print(x)
120 | 
121 |     # c=MulCounter('abcdeabcdaffbcabag')
122 |     # print(sorted(c.items(),key=_itemgetter(1),reverse=True))
123 |     # print(c.larger_than(1))
--------------------------------------------------------------------------------
/pyword2vec.py:
--------------------------------------------------------------------------------
1 | __author__ = 'multiangle'
2 | 
3 | import math
4 | 
5 | from WordCount import WordCounter,MulCounter
6 | import File_Interface as FI
7 | from HuffmanTree import HuffmanTree
8 | 
9 | import numpy as np
10 | import jieba
11 | from sklearn import preprocessing
12 | 
13 | class Word2Vec():
14 |     def __init__(self, vec_len=15000, learn_rate=0.025, win_len=5, model='cbow'):
15 |         self.cutted_text_list = None
16 |         self.vec_len = vec_len
17 |         self.learn_rate = learn_rate
18 |         self.win_len = win_len
19 |         self.model = model
20 |         self.word_dict = None   # each element is a dict, including: word, possibility, vector, huffman code
21 |         self.huffman = None     # the object of HuffmanTree
22 | 
23 |     def Load_Word_Freq(self,word_freq_path):
24 |         # load the word frequency info
25 |         # and generate a word dict from it
26 |         if self.word_dict is not None:
27 |             raise RuntimeError('the word dict is not empty')
28 |         word_freq = FI.load_pickle(word_freq_path)
29 |         self.__Gnerate_Word_Dict(word_freq)
30 | 
31 |     def __Gnerate_Word_Dict(self,word_freq):
32 |         # generate a word dict containing, for every word: the word itself, its freq,
33 |         # its possibility, a random initial vector and a slot for the Huffman code
34 |         if not isinstance(word_freq,dict) and not isinstance(word_freq,list):
35 |             raise ValueError('the word freq info should be a dict or list')
36 | 
37 |         word_dict = {}
38 |         if isinstance(word_freq,dict):
39 |             # if word_freq is a dictionary
40 |             sum_count = sum(word_freq.values())
41 |             for word in word_freq:
42 |                 temp_dict = dict(
43 |                     word = word,
44 |                     freq = word_freq[word],
45 |                     possibility = word_freq[word]/sum_count,
46 |                     vector = np.random.random([1,self.vec_len]),
47 |                     Huffman = None
48 |                 )
49 |                 word_dict[word] = temp_dict
50 |         else:
51 |             # if word_freq is a list
52 |             freq_list = [x[1] for x in word_freq]
53 |             sum_count = sum(freq_list)
54 | 
55 |             for item in word_freq:
56 |                 temp_dict = dict(
57 |                     word = item[0],
58 |                     freq = item[1],
59 |                     possibility = item[1]/sum_count,
60 |                     vector = np.random.random([1,self.vec_len]),
61 |                     Huffman = None
62 |                 )
63 |                 word_dict[item[0]] = temp_dict
64 |         self.word_dict = word_dict
65 | 
66 |     def Import_Model(self,model_path):
67 |         model = FI.load_pickle(model_path)  # a dict {'word_dict','huffman','vec_len',...} as written by Export_Model
68 |         self.word_dict = model['word_dict']
69 |         self.huffman = model['huffman']
70 |         self.vec_len = model['vec_len']
71 |         self.learn_rate = model['learn_rate']
72 |         self.win_len = model['win_len']
73 |         self.model = model['model']
74 | 
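The private __Gnerate_Word_Dict method above is the heart of the preprocessing step: every surviving word gets its raw frequency, a probability, a randomly initialised vector and an empty slot for its Huffman code. A standalone re-implementation for illustration only, not the class method itself:

```python
# Sketch of the word-dict layout that Word2Vec works with; names are illustrative.
import numpy as np

def build_word_dict(word_freq, vec_len=50):
    total = sum(word_freq.values())
    return {
        w: dict(word=w, freq=f, possibility=f / total,
                vector=np.random.random([1, vec_len]), Huffman=None)
        for w, f in word_freq.items()
    }

d = build_word_dict({'word2vec': 5, 'huffman': 3, 'softmax': 2})
print(d['word2vec']['possibility'])   # 0.5
```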
def Export_Model(self,model_path): 76 | data=dict( 77 | word_dict = self.word_dict, 78 | huffman = self.huffman, 79 | vec_len = self.vec_len, 80 | learn_rate = self.learn_rate, 81 | win_len = self.win_len, 82 | model = self.model 83 | ) 84 | FI.save_pickle(data,model_path) 85 | 86 | def Train_Model(self,text_list): 87 | 88 | # generate the word_dict and huffman tree 89 | if self.huffman==None: 90 | # if the dict is not loaded, it will generate a new dict 91 | if self.word_dict==None : 92 | wc = WordCounter(text_list) 93 | self.__Gnerate_Word_Dict(wc.count_res.larger_than(5)) 94 | self.cutted_text_list = wc.text_list 95 | 96 | # generate a huffman tree according to the possibility of words 97 | self.huffman = HuffmanTree(self.word_dict,vec_len=self.vec_len) 98 | print('word_dict and huffman tree already generated, ready to train vector') 99 | 100 | # start to train word vector 101 | before = (self.win_len-1) >> 1 102 | after = self.win_len-1-before 103 | 104 | if self.model=='cbow': 105 | method = self.__Deal_Gram_CBOW 106 | else: 107 | method = self.__Deal_Gram_SkipGram 108 | 109 | if self.cutted_text_list: 110 | # if the text has been cutted 111 | total = self.cutted_text_list.__len__() 112 | count = 0 113 | for line in self.cutted_text_list: 114 | line_len = line.__len__() 115 | for i in range(line_len): 116 | method(line[i],line[max(0,i-before):i]+line[i+1:min(line_len,i+after+1)]) 117 | count += 1 118 | print('{c} of {d}'.format(c=count,d=total)) 119 | 120 | else: 121 | # if the text has note been cutted 122 | for line in text_list: 123 | line = list(jieba.cut(line,cut_all=False)) 124 | line_len = line.__len__() 125 | for i in range(line_len): 126 | method(line[i],line[max(0,i-before):i]+line[i+1:min(line_len,i+after+1)]) 127 | print('word vector has been generated') 128 | 129 | def __Deal_Gram_CBOW(self,word,gram_word_list): 130 | 131 | if not self.word_dict.__contains__(word): 132 | return 133 | 134 | word_huffman = self.word_dict[word]['Huffman'] 135 | gram_vector_sum = np.zeros([1,self.vec_len]) 136 | for i in range(gram_word_list.__len__())[::-1]: 137 | item = gram_word_list[i] 138 | if self.word_dict.__contains__(item): 139 | gram_vector_sum += self.word_dict[item]['vector'] 140 | else: 141 | gram_word_list.pop(i) 142 | 143 | if gram_word_list.__len__()==0: 144 | return 145 | 146 | e = self.__GoAlong_Huffman(word_huffman,gram_vector_sum,self.huffman.root) 147 | 148 | for item in gram_word_list: 149 | self.word_dict[item]['vector'] += e 150 | self.word_dict[item]['vector'] = preprocessing.normalize(self.word_dict[item]['vector']) 151 | 152 | def __Deal_Gram_SkipGram(self,word,gram_word_list): 153 | 154 | if not self.word_dict.__contains__(word): 155 | return 156 | 157 | word_vector = self.word_dict[word]['vector'] 158 | for i in range(gram_word_list.__len__())[::-1]: 159 | if not self.word_dict.__contains__(gram_word_list[i]): 160 | gram_word_list.pop(i) 161 | 162 | if gram_word_list.__len__()==0: 163 | return 164 | 165 | for u in gram_word_list: 166 | u_huffman = self.word_dict[u]['Huffman'] 167 | e = self.__GoAlong_Huffman(u_huffman,word_vector,self.huffman.root) 168 | self.word_dict[word]['vector'] += e 169 | self.word_dict[word]['vector'] = preprocessing.normalize(self.word_dict[word]['vector']) 170 | 171 | def __GoAlong_Huffman(self,word_huffman,input_vector,root): 172 | 173 | node = root 174 | e = np.zeros([1,self.vec_len]) 175 | for level in range(word_huffman.__len__()): 176 | huffman_charat = word_huffman[level] 177 | q = self.__Sigmoid(input_vector.dot(node.value.T)) 178 
| grad = self.learn_rate * (1-int(huffman_charat)-q) 179 | e += grad * node.value 180 | node.value += grad * input_vector 181 | node.value = preprocessing.normalize(node.value) 182 | if huffman_charat=='0': 183 | node = node.right 184 | else: 185 | node = node.left 186 | return e 187 | 188 | def __Sigmoid(self,value): 189 | return 1/(1+math.exp(-value)) 190 | 191 | if __name__ == '__main__': 192 | # text = FI.load_pickle('./static/demo.pkl') 193 | # text =[ x['dealed_text']['left_content'][0] for x in text] 194 | # # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects'] 195 | # wv = Word2Vec(vec_len=500) 196 | # wv.Train_Model(text) 197 | # FI.save_pickle(wv.word_dict,'./static/wv.pkl') 198 | # 199 | # data = FI.load_pickle('./static/wv.pkl') 200 | # x = {} 201 | # for key in data: 202 | # temp = data[key]['vector'] 203 | # temp = preprocessing.normalize(temp) 204 | # x[key] = temp 205 | # FI.save_pickle(x,'./static/normal_wv.pkl') 206 | 207 | x = FI.load_pickle('./static/normal_wv.pkl') 208 | def cal_simi(data,key1,key2): 209 | return data[key1].dot(data[key2].T)[0][0] 210 | keys=list(x.keys()) 211 | for key in keys: 212 | print(key,'\t',cal_simi(x,'姚明',key)) 213 | 214 | -------------------------------------------------------------------------------- /static/stop_words.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/multiangle/pyword2vec/e3c7c5ff0308d0d00501ee26356b46df2438f68f/static/stop_words.pkl -------------------------------------------------------------------------------- /static/中文停用词表(比较全面,有1208个停用词).txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/multiangle/pyword2vec/e3c7c5ff0308d0d00501ee26356b46df2438f68f/static/中文停用词表(比较全面,有1208个停用词).txt -------------------------------------------------------------------------------- /word2vec_v2.0.py: -------------------------------------------------------------------------------- 1 | __author__ = 'multiangle' 2 | 3 | import math 4 | import File_Interface as FI 5 | from operator import itemgetter as _itemgetter 6 | import numpy as np 7 | import jieba 8 | from sklearn import preprocessing 9 | from collections import Counter 10 | import numpy as np 11 | 12 | class Word2Vec(): 13 | def __init__(self, vec_len=15000, learn_rate=0.025, win_len=5, model='cbow'): 14 | self.cutted_text_list = None 15 | self.vec_len = vec_len 16 | self.learn_rate = learn_rate 17 | self.win_len = win_len 18 | self.model = model 19 | self.word_dict = None # each element is a dict, including: word,possibility,vector,huffmancode 20 | self.huffman = None # the object of HuffmanTree 21 | 22 | def Load_Word_Freq(self,word_freq_path): 23 | # load the info of word frequence 24 | # will generate a word dict 25 | if self.word_dict is not None: 26 | raise RuntimeError('the word dict is not empty') 27 | word_freq = FI.load_pickle(word_freq_path) 28 | self.__Gnerate_Word_Dict(word_freq) 29 | 30 | def __Gnerate_Word_Dict(self,word_freq): 31 | # generate a word dict 32 | # which containing the word, freq, possibility, a random initial vector and Huffman value 33 | if not isinstance(word_freq,dict) and not isinstance(word_freq,list): 34 | raise ValueError('the word freq info should be a dict or list') 35 | 36 | word_dict = {} 37 | if isinstance(word_freq,dict): 38 | # if word_freq is in type of dictionary 39 | sum_count = sum(word_freq.values()) 40 | for word 
in word_freq:
41 |                 temp_dict = dict(
42 |                     word = word,
43 |                     freq = word_freq[word],
44 |                     possibility = word_freq[word]/sum_count,
45 |                     vector = np.random.random([1,self.vec_len]),
46 |                     Huffman = None
47 |                 )
48 |                 word_dict[word] = temp_dict
49 |         else:
50 |             # if word_freq is a list
51 |             freq_list = [x[1] for x in word_freq]
52 |             sum_count = sum(freq_list)
53 | 
54 |             for item in word_freq:
55 |                 temp_dict = dict(
56 |                     word = item[0],
57 |                     freq = item[1],
58 |                     possibility = item[1]/sum_count,
59 |                     vector = np.random.random([1,self.vec_len]),
60 |                     Huffman = None
61 |                 )
62 |                 word_dict[item[0]] = temp_dict
63 |         self.word_dict = word_dict
64 | 
65 |     def Import_Model(self,model_path):
66 |         model = FI.load_pickle(model_path)  # a dict {'word_dict','huffman','vec_len',...} as written by Export_Model
67 |         self.word_dict = model['word_dict']
68 |         self.huffman = model['huffman']
69 |         self.vec_len = model['vec_len']
70 |         self.learn_rate = model['learn_rate']
71 |         self.win_len = model['win_len']
72 |         self.model = model['model']
73 | 
74 |     def Export_Model(self,model_path):
75 |         data=dict(
76 |             word_dict = self.word_dict,
77 |             huffman = self.huffman,
78 |             vec_len = self.vec_len,
79 |             learn_rate = self.learn_rate,
80 |             win_len = self.win_len,
81 |             model = self.model
82 |         )
83 |         FI.save_pickle(data,model_path)
84 | 
85 |     def Train_Model(self,text_list):
86 | 
87 |         # generate the word_dict and huffman tree
88 |         if self.huffman==None:
89 |             # if the dict is not loaded, it will generate a new dict
90 |             if self.word_dict==None :
91 |                 wc = WordCounter(text_list)
92 |                 self.__Gnerate_Word_Dict(wc.count_res.larger_than(5))
93 |                 self.cutted_text_list = wc.text_list
94 | 
95 |             # generate a huffman tree according to the possibility of words
96 |             self.huffman = HuffmanTree(self.word_dict,vec_len=self.vec_len)
97 |         print('word_dict and huffman tree already generated, ready to train vector')
98 | 
99 |         # start to train word vector
100 |         before = (self.win_len-1) >> 1
101 |         after = self.win_len-1-before
102 | 
103 |         if self.model=='cbow':
104 |             method = self.__Deal_Gram_CBOW
105 |         else:
106 |             method = self.__Deal_Gram_SkipGram
107 | 
108 |         if self.cutted_text_list:
109 |             # if the text has already been segmented
110 |             total = self.cutted_text_list.__len__()
111 |             count = 0
112 |             for line in self.cutted_text_list:
113 |                 line_len = line.__len__()
114 |                 for i in range(line_len):
115 |                     method(line[i],line[max(0,i-before):i]+line[i+1:min(line_len,i+after+1)])
116 |                 count += 1
117 |                 print('{c} of {d}'.format(c=count,d=total))
118 | 
119 |         else:
120 |             # if the text has not been segmented yet
121 |             for line in text_list:
122 |                 line = list(jieba.cut(line,cut_all=False))
123 |                 line_len = line.__len__()
124 |                 for i in range(line_len):
125 |                     method(line[i],line[max(0,i-before):i]+line[i+1:min(line_len,i+after+1)])
126 |         print('word vector has been generated')
127 | 
128 |     def __Deal_Gram_CBOW(self,word,gram_word_list):
129 | 
130 |         if not self.word_dict.__contains__(word):
131 |             return
132 | 
133 |         word_huffman = self.word_dict[word]['Huffman']
134 |         gram_vector_sum = np.zeros([1,self.vec_len])
135 |         for i in range(gram_word_list.__len__())[::-1]:
136 |             item = gram_word_list[i]
137 |             if self.word_dict.__contains__(item):
138 |                 gram_vector_sum += self.word_dict[item]['vector']
139 |             else:
140 |                 gram_word_list.pop(i)
141 | 
142 |         if gram_word_list.__len__()==0:
143 |             return
144 | 
145 |         e = self.__GoAlong_Huffman(word_huffman,gram_vector_sum,self.huffman.root)
146 | 
147 |         for item in gram_word_list:
148 |             self.word_dict[item]['vector'] += e
149 |             self.word_dict[item]['vector'] = preprocessing.normalize(self.word_dict[item]['vector'])
150 | 
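Train_Model above walks a sliding window over each segmented sentence: for the word at position i it takes up to `before` words on the left and `after` words on the right, excluding the centre word itself. The slicing is easiest to see on a toy sentence (illustration only, using the same expressions as the code above):

```python
# How the context window in Train_Model is built for win_len = 5.
line = ['I', 'like', 'natural', 'language', 'processing']
win_len = 5
before = (win_len - 1) >> 1
after = win_len - 1 - before
for i, w in enumerate(line):
    context = line[max(0, i - before):i] + line[i + 1:min(len(line), i + after + 1)]
    print(w, context)
# e.g. 'natural' -> ['I', 'like', 'language', 'processing']
```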
151 |     def __Deal_Gram_SkipGram(self,word,gram_word_list):
152 | 
153 |         if not self.word_dict.__contains__(word):
154 |             return
155 | 
156 |         word_vector = self.word_dict[word]['vector']
157 |         for i in range(gram_word_list.__len__())[::-1]:
158 |             if not self.word_dict.__contains__(gram_word_list[i]):
159 |                 gram_word_list.pop(i)
160 | 
161 |         if gram_word_list.__len__()==0:
162 |             return
163 | 
164 |         for u in gram_word_list:
165 |             u_huffman = self.word_dict[u]['Huffman']
166 |             e = self.__GoAlong_Huffman(u_huffman,word_vector,self.huffman.root)
167 |             self.word_dict[word]['vector'] += e
168 |             self.word_dict[word]['vector'] = preprocessing.normalize(self.word_dict[word]['vector'])
169 | 
170 |     def __GoAlong_Huffman(self,word_huffman,input_vector,root):
171 | 
172 |         node = root
173 |         e = np.zeros([1,self.vec_len])
174 |         for level in range(word_huffman.__len__()):
175 |             huffman_charat = word_huffman[level]
176 |             q = self.__Sigmoid(input_vector.dot(node.value.T))
177 |             grad = self.learn_rate * (1-int(huffman_charat)-q)
178 |             e += grad * node.value
179 |             node.value += grad * input_vector
180 |             node.value = preprocessing.normalize(node.value)
181 |             if huffman_charat=='0':
182 |                 node = node.right
183 |             else:
184 |                 node = node.left
185 |         return e
186 | 
187 |     def __Sigmoid(self,value):
188 |         return 1/(1+math.exp(-value))
189 | 
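__GoAlong_Huffman above is the hierarchical-softmax update: at every inner node on the word's Huffman path it computes q = sigmoid(x·θ), derives a gradient from the current Huffman bit, accumulates the error e that is later added back to the input word vectors, and updates the node vector θ. A self-contained numeric sketch of a single pass (illustration only; the vectors and the code '101' are made up):

```python
# One hierarchical-softmax pass along a Huffman path, mirroring __GoAlong_Huffman.
import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

learn_rate, vec_len = 0.025, 8
input_vector = np.random.random([1, vec_len])
path_vectors = [np.random.random([1, vec_len]) for _ in range(3)]  # one per inner node
huffman_code = '101'

e = np.zeros([1, vec_len])
for bit, theta in zip(huffman_code, path_vectors):
    q = sigmoid(input_vector.dot(theta.T))      # predicted branch probability
    grad = learn_rate * (1 - int(bit) - q)      # same gradient form as in the repo
    e += grad * theta                           # error pushed back to the input words
    theta += grad * input_vector                # inner-node vector update
print(e)
```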
190 | class HuffmanTreeNode():
191 |     def __init__(self,value,possibility):
192 |         # common part of leaf node and tree node
193 |         self.possibility = possibility
194 |         self.left = None
195 |         self.right = None
196 |         # the value of a leaf node is the word itself,
197 |         # in an inner node it is the intermediate vector
198 |         self.value = value      # the value of word
199 |         self.Huffman = ""       # store the huffman code
200 | 
201 |     def __str__(self):
202 |         return 'HuffmanTreeNode object, value: {v}, possibility: {p}, Huffman: {h}' \
203 |             .format(v=self.value,p=self.possibility,h=self.Huffman)
204 | 
205 | class HuffmanTree():
206 |     def __init__(self, word_dict, vec_len=15000):
207 |         self.vec_len = vec_len      # the length of word vector
208 |         self.root = None
209 | 
210 |         word_dict_list = list(word_dict.values())
211 |         node_list = [HuffmanTreeNode(x['word'],x['possibility']) for x in word_dict_list]
212 |         self.build_tree(node_list)
213 |         # self.build_CBT(node_list)
214 |         self.generate_huffman_code(self.root, word_dict)
215 | 
216 |     def build_tree(self,node_list):
217 |         # node_list.sort(key=lambda x:x.possibility,reverse=True)
218 |         # for i in range(node_list.__len__()-1)[::-1]:
219 |         #     top_node = self.merge(node_list[i],node_list[i+1])
220 |         #     node_list.insert(i,top_node)
221 |         # self.root = node_list[0]
222 | 
223 |         while node_list.__len__()>1:
224 |             i1 = 0  # i1 is the node with the smallest possibility
225 |             i2 = 1  # i2 is the node with the second smallest possibility
226 |             if node_list[i2].possibility < node_list[i1].possibility :
227 |                 [i1,i2] = [i2,i1]
228 |             for i in range(2,node_list.__len__()):  # find the two smallest nodes
229 |                 if node_list[i].possibility < node_list[i2].possibility :
230 |                     i2 = i
231 |                     if node_list[i2].possibility < node_list[i1].possibility :
232 |                         [i1,i2] = [i2,i1]
233 |             top_node = self.merge(node_list[i1],node_list[i2])
234 |             if i1 < i2:
235 |                 node_list.pop(i2)
236 |                 node_list.pop(i1)
237 |             elif i1 > i2:
238 |                 node_list.pop(i1)
239 |                 node_list.pop(i2)
240 |             else:
241 |                 raise RuntimeError('i1 should not be equal to i2')
242 |             node_list.insert(0,top_node)
243 |         self.root = node_list[0]
244 | 
245 |     def build_CBT(self,node_list):  # build a complete binary tree
246 |         node_list.sort(key=lambda x:x.possibility,reverse=True)
247 |         node_num = node_list.__len__()
248 |         before_start = 0
249 |         while node_num>1 :
250 |             for i in range(node_num>>1):
251 |                 top_node = self.merge(node_list[before_start+i*2],node_list[before_start+i*2+1])
252 |                 node_list.append(top_node)
253 |             if node_num%2==1:
254 |                 top_node = self.merge(node_list[before_start+i*2+2],node_list[-1])
255 |                 node_list[-1] = top_node
256 |             before_start = before_start + node_num
257 |             node_num = node_num>>1
258 |         self.root = node_list[-1]
259 | 
260 |     def generate_huffman_code(self, node, word_dict):
261 |         # # use recursion in this edition
262 |         # if node.left==None and node.right==None :
263 |         #     word = node.value
264 |         #     code = node.Huffman
265 |         #     print(word,code)
266 |         #     word_dict[word]['Huffman'] = code
267 |         #     return -1
268 |         #
269 |         # code = node.Huffman
270 |         # if code==None:
271 |         #     code = ""
272 |         # node.left.Huffman = code + "1"
273 |         # node.right.Huffman = code + "0"
274 |         # self.generate_huffman_code(node.left, word_dict)
275 |         # self.generate_huffman_code(node.right, word_dict)
276 | 
277 |         # use a stack instead of recursion in this edition
278 |         stack = [self.root]
279 |         while (stack.__len__()>0):
280 |             node = stack.pop()
281 |             # go along the left branch until a leaf is reached
282 |             while node.left or node.right :
283 |                 code = node.Huffman
284 |                 node.left.Huffman = code + "1"
285 |                 node.right.Huffman = code + "0"
286 |                 stack.append(node.right)
287 |                 node = node.left
288 |             word = node.value
289 |             code = node.Huffman
290 |             # print(word,'\t',code.__len__(),'\t',node.possibility)
291 |             word_dict[word]['Huffman'] = code
292 | 
293 |     def merge(self,node1,node2):
294 |         top_pos = node1.possibility + node2.possibility
295 |         top_node = HuffmanTreeNode(np.zeros([1,self.vec_len]), top_pos)
296 |         if node1.possibility >= node2.possibility :
297 |             top_node.left = node1
298 |             top_node.right = node2
299 |         else:
300 |             top_node.left = node2
301 |             top_node.right = node1
302 |         return top_node
303 | 
304 | class WordCounter():
305 |     # can calculate the freq of words in a text list
306 | 
307 |     # for example
308 |     # >>> data = ['Merge multiple sorted inputs into a single sorted output',
309 |     #             'The API below differs from textbook heap algorithms in two aspects']
310 |     # >>> wc = WordCounter(data)
311 |     # >>> print(wc.count_res)
312 | 
313 |     # >>> MulCounter({' ': 18, 'sorted': 2, 'single': 1, 'below': 1, 'inputs': 1, 'The': 1, 'into': 1, 'textbook': 1,
314 |     #                 'API': 1, 'algorithms': 1, 'in': 1, 'output': 1, 'heap': 1, 'differs': 1, 'two': 1, 'from': 1,
315 |     #                 'aspects': 1, 'multiple': 1, 'a': 1, 'Merge': 1})
316 | 
317 |     def __init__(self, text_list):
318 |         self.text_list = text_list
319 |         self.stop_word = self.Get_Stop_Words()
320 |         self.count_res = None
321 | 
322 |         self.Word_Count(self.text_list)
323 | 
324 |     def Get_Stop_Words(self):
325 |         ret = []
326 |         ret = FI.load_pickle('./static/stop_words.pkl')
327 |         return ret
328 | 
329 |     def Word_Count(self,text_list,cut_all=False):
330 | 
331 |         filtered_word_list = []
332 |         count = 0
333 |         for line in text_list:
334 |             res = jieba.cut(line,cut_all=cut_all)
335 |             res = list(res)
336 |             text_list[count] = res
337 |             count += 1
338 |             filtered_word_list += res
339 | 
340 |         self.count_res = MulCounter(filtered_word_list)
341 |         for word in self.stop_word:
342 |             try:
343 |                 self.count_res.pop(word)
344 |             except KeyError:
345 |                 pass
346 | 
347 | class MulCounter(Counter):
348 |     # a class extending collections.Counter,
349 |     # adding the methods larger_than and less_than
350 |     def __init__(self,element_list):
351 |         super().__init__(element_list)
352 | 
353 |     def larger_than(self,minvalue,ret='list'):
354 |         temp = sorted(self.items(),key=_itemgetter(1),reverse=True)
355 |         low = 0
356 |         high = temp.__len__()
357 |         while(high - low > 1):
358 |             mid = (low+high) >> 1
359 |             if temp[mid][1] >= minvalue:
360 |                 low = mid
361 |             else:
362 |                 high = mid
363 |         if temp[low][1] < minvalue:
364 |             if ret=='dict':
365 |                 return {}
366 |             else:
367 |                 return []
368 |         if ret=='dict':
369 |             ret_data = {}
370 |             for ele,count in temp[:high]:
371 |                 ret_data[ele]=count
372 |             return ret_data
373 |         else:
374 |             return temp[:high]
375 | 
376 |     def less_than(self,maxvalue,ret='list'):
377 |         temp = sorted(self.items(),key=_itemgetter(1))
378 |         low = 0
379 |         high = temp.__len__()
380 |         while(high - low > 1):
381 |             mid = (low+high) >> 1
382 |             if temp[mid][1] <= maxvalue:
383 |                 low = mid
384 |             else:
385 |                 high = mid
386 |         if temp[low][1]>maxvalue:
387 |             if ret=='dict':
388 |                 return {}
389 |             else:
390 |                 return []
391 |         if ret=='dict':
392 |             ret_data = {}
393 |             for ele,count in temp[:high]:
394 |                 ret_data[ele]=count
395 |             return ret_data
396 |         else:
397 |             return temp[:high]
398 | 
399 | if __name__ == '__main__':
400 |     # text = FI.load_pickle('./static/demo.pkl')
401 |     # text =[ x['dealed_text']['left_content'][0] for x in text]
402 |     data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
403 |     wv = Word2Vec(vec_len=500)
404 |     wv.Train_Model(data)
405 |     # FI.save_pickle(wv.word_dict,'./static/wv.pkl')
406 |     #
407 |     # data = FI.load_pickle('./static/wv.pkl')
408 |     # x = {}
409 |     # for key in data:
410 |     #     temp = data[key]['vector']
411 |     #     temp = preprocessing.normalize(temp)
412 |     #     x[key] = temp
413 |     # FI.save_pickle(x,'./static/normal_wv.pkl')
414 | 
415 |     # x = FI.load_pickle('./static/normal_wv.pkl')
416 |     # def cal_simi(data,key1,key2):
417 |     #     return data[key1].dot(data[key2].T)[0][0]
418 |     # keys=list(x.keys())
419 |     # for key in keys:
420 |     #     print(key,'\t',cal_simi(x,'姚明',key))
421 | 
422 | 
--------------------------------------------------------------------------------
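The commented-out part of __main__ shows the intended end-to-end use: train, save the word dict, L2-normalise every vector, and then compare words by a plain dot product, which equals cosine similarity on normalised vectors. A hedged sketch of that last step, independent of any saved model file:

```python
# Cosine similarity on normalised vectors, mirroring the cal_simi helper above.
import numpy as np
from sklearn import preprocessing

def cal_simi(data, key1, key2):
    return data[key1].dot(data[key2].T)[0][0]

vecs = {k: preprocessing.normalize(np.random.random([1, 50]))
        for k in ['姚明', '篮球', '上海']}    # random stand-ins for trained vectors
print(cal_simi(vecs, '姚明', '篮球'))
```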