├── File_Interface.py
├── HuffmanTree.py
├── WordCount.py
├── pyword2vec.py
├── static
│   ├── stop_words.pkl
│   └── 中文停用词表(比较全面,有1208个停用词).txt
└── word2vec_v2.0.py

/File_Interface.py:
--------------------------------------------------------------------------------
1 | __author__ = 'multiangle'
2 | 
3 | import csv,pickle
4 | 
5 | def read_csv(path):     # read the raw csv file, without any changes
6 |     file=open(path,'r')
7 |     reader=csv.reader(file)
8 |     data=[row for row in reader]
9 |     return data
10 | def load_pickle(path):  # load a pickle file, without any changes
11 |     file=open(path,'rb')
12 |     data=pickle.load(file)
13 |     file.close()
14 |     return data
15 | def save_pickle(data,path):
16 |     file=open(path,'wb')
17 |     pickle.dump(data,file)
18 |     file.close()
--------------------------------------------------------------------------------
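The two pickle helpers above are what the rest of the project uses to persist word-frequency tables and trained models. A minimal usage sketch, not part of the repository (the file name and the toy dict are made up):

```python
# Hypothetical round trip through the helpers in File_Interface.py.
import File_Interface as FI

word_freq = {'苹果': 12, '香蕉': 7, '樱桃': 3}             # toy word-frequency table
FI.save_pickle(word_freq, './static/word_freq_demo.pkl')   # path is illustrative only
restored = FI.load_pickle('./static/word_freq_demo.pkl')
assert restored == word_freq
```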
/HuffmanTree.py:
--------------------------------------------------------------------------------
1 | __author__ = 'multiangle'
2 | 
3 | import numpy as np
4 | 
5 | class HuffmanTreeNode():
6 |     def __init__(self,value,possibility):
7 |         # common part of leaf node and tree node
8 |         self.possibility = possibility
9 |         self.left = None
10 |         self.right = None
11 |         # the value of a leaf node is the word itself,
12 |         # in an inner node it is the intermediate vector
13 |         self.value = value      # the value of word
14 |         self.Huffman = ""       # store the huffman code
15 | 
16 |     def __str__(self):
17 |         return 'HuffmanTreeNode object, value: {v}, possibility: {p}, Huffman: {h}'\
18 |             .format(v=self.value,p=self.possibility,h=self.Huffman)
19 | 
20 | class HuffmanTree():
21 |     def __init__(self, word_dict, vec_len=15000):
22 |         self.vec_len = vec_len      # the length of word vector
23 |         self.root = None
24 | 
25 |         word_dict_list = list(word_dict.values())
26 |         node_list = [HuffmanTreeNode(x['word'],x['possibility']) for x in word_dict_list]
27 |         self.build_tree(node_list)
28 |         # self.build_CBT(node_list)
29 |         self.generate_huffman_code(self.root, word_dict)
30 | 
31 |     def build_tree(self,node_list):
32 |         # node_list.sort(key=lambda x:x.possibility,reverse=True)
33 |         # for i in range(node_list.__len__()-1)[::-1]:
34 |         #     top_node = self.merge(node_list[i],node_list[i+1])
35 |         #     node_list.insert(i,top_node)
36 |         # self.root = node_list[0]
37 | 
38 |         while node_list.__len__()>1:
39 |             i1 = 0  # i1 is the node with the smallest possibility
40 |             i2 = 1  # i2 is the node with the second smallest possibility
41 |             if node_list[i2].possibility < node_list[i1].possibility :
42 |                 [i1,i2] = [i2,i1]
43 |             for i in range(2,node_list.__len__()):  # find the two smallest nodes
44 |                 if node_list[i].possibility < node_list[i2].possibility :
45 |                     i2 = i
46 |                     if node_list[i2].possibility < node_list[i1].possibility :
47 |                         [i1,i2] = [i2,i1]
48 |             top_node = self.merge(node_list[i1],node_list[i2])
49 |             if i1 < i2:
50 |                 node_list.pop(i2)
51 |                 node_list.pop(i1)
52 |             elif i1 > i2:
53 |                 node_list.pop(i1)
54 |                 node_list.pop(i2)
55 |             else:
56 |                 raise RuntimeError('i1 should not be equal to i2')
57 |             node_list.insert(0,top_node)
58 |         self.root = node_list[0]
59 | 
60 |     def build_CBT(self,node_list):  # build a complete binary tree
61 |         node_list.sort(key=lambda x:x.possibility,reverse=True)
62 |         node_num = node_list.__len__()
63 |         before_start = 0
64 |         while node_num>1 :
65 |             for i in range(node_num>>1):
66 |                 top_node = self.merge(node_list[before_start+i*2],node_list[before_start+i*2+1])
67 |                 node_list.append(top_node)
68 |             if node_num%2==1:
69 |                 top_node = self.merge(node_list[before_start+i*2+2],node_list[-1])
70 |                 node_list[-1] = top_node
71 |             before_start = before_start + node_num
72 |             node_num = node_num>>1
73 |         self.root = node_list[-1]
74 | 
75 |     def generate_huffman_code(self, node, word_dict):
76 |         # # use recursion in this edition
77 |         # if node.left==None and node.right==None :
78 |         #     word = node.value
79 |         #     code = node.Huffman
80 |         #     print(word,code)
81 |         #     word_dict[word]['Huffman'] = code
82 |         #     return -1
83 |         #
84 |         # code = node.Huffman
85 |         # if code==None:
86 |         #     code = ""
87 |         # node.left.Huffman = code + "1"
88 |         # node.right.Huffman = code + "0"
89 |         # self.generate_huffman_code(node.left, word_dict)
90 |         # self.generate_huffman_code(node.right, word_dict)
91 | 
92 |         # use a stack instead of recursion in this edition
93 |         stack = [self.root]
94 |         while (stack.__len__()>0):
95 |             node = stack.pop()
96 |             # go along the left branch until a leaf is reached
97 |             while node.left or node.right :
98 |                 code = node.Huffman
99 |                 node.left.Huffman = code + "1"
100 |                 node.right.Huffman = code + "0"
101 |                 stack.append(node.right)
102 |                 node = node.left
103 |             word = node.value
104 |             code = node.Huffman
105 |             # print(word,'\t',code.__len__(),'\t',node.possibility)
106 |             word_dict[word]['Huffman'] = code
107 | 
108 |     def merge(self,node1,node2):
109 |         top_pos = node1.possibility + node2.possibility
110 |         top_node = HuffmanTreeNode(np.zeros([1,self.vec_len]), top_pos)
111 |         if node1.possibility >= node2.possibility :
112 |             top_node.left = node1
113 |             top_node.right = node2
114 |         else:
115 |             top_node.left = node2
116 |             top_node.right = node1
117 |         return top_node
--------------------------------------------------------------------------------
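To see what HuffmanTree does in isolation, the sketch below (not part of the repository) builds a tree over a four-word toy vocabulary and prints the code assigned to each word. The dict layout mirrors what the Word2Vec word dict provides; more probable words should end up with shorter codes.

```python
# Toy check of HuffmanTree.py: frequent words get short Huffman codes.
from HuffmanTree import HuffmanTree

word_dict = {
    w: {'word': w, 'possibility': p, 'vector': None, 'Huffman': None}
    for w, p in [('the', 0.4), ('cat', 0.3), ('sat', 0.2), ('mat', 0.1)]
}
tree = HuffmanTree(word_dict, vec_len=10)   # small vec_len keeps the node vectors tiny
for w, info in word_dict.items():
    print(w, info['Huffman'])
# expected: 'the' receives the shortest code, 'mat' one of the longest
```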
/WordCount.py:
--------------------------------------------------------------------------------
1 | __author__ = 'multiangle'
2 | 
3 | 
4 | from collections import Counter
5 | from operator import itemgetter as _itemgetter
6 | import jieba
7 | import File_Interface as FI
8 | 
9 | class WordCounter():
10 |     # can calculate the freq of words in a text list
11 | 
12 |     # for example
13 |     # >>> data = ['Merge multiple sorted inputs into a single sorted output',
14 |     #             'The API below differs from textbook heap algorithms in two aspects']
15 |     # >>> wc = WordCounter(data)
16 |     # >>> print(wc.count_res)
17 | 
18 |     # >>> MulCounter({' ': 18, 'sorted': 2, 'single': 1, 'below': 1, 'inputs': 1, 'The': 1, 'into': 1, 'textbook': 1,
19 |     #                 'API': 1, 'algorithms': 1, 'in': 1, 'output': 1, 'heap': 1, 'differs': 1, 'two': 1, 'from': 1,
20 |     #                 'aspects': 1, 'multiple': 1, 'a': 1, 'Merge': 1})
21 | 
22 |     def __init__(self, text_list):
23 |         self.text_list = text_list
24 |         self.stop_word = self.Get_Stop_Words()
25 |         self.count_res = None
26 | 
27 |         self.Word_Count(self.text_list)
28 | 
29 |     def Get_Stop_Words(self):
30 |         ret = []
31 |         ret = FI.load_pickle('./static/stop_words.pkl')
32 |         return ret
33 | 
34 |     def Word_Count(self,text_list,cut_all=False):
35 | 
36 |         filtered_word_list = []
37 |         count = 0
38 |         for line in text_list:
39 |             res = jieba.cut(line,cut_all=cut_all)
40 |             res = list(res)
41 |             text_list[count] = res
42 |             count += 1
43 |             filtered_word_list += res
44 | 
45 |         self.count_res = MulCounter(filtered_word_list)
46 |         for word in self.stop_word:
47 |             try:
48 |                 self.count_res.pop(word)
49 |             except KeyError:
50 |                 pass
51 | 
52 | class MulCounter(Counter):
53 |     # a class extending collections.Counter,
54 |     # adding the methods larger_than and less_than
55 |     def __init__(self,element_list):
56 |         super().__init__(element_list)
57 | 
58 |     def larger_than(self,minvalue,ret='list'):
59 |         temp = sorted(self.items(),key=_itemgetter(1),reverse=True)
60 |         low = 0
61 |         high = temp.__len__()
62 |         while(high - low > 1):
63 |             mid = (low+high) >> 1
64 |             if temp[mid][1] >= minvalue:
65 |                 low = mid
66 |             else:
67 |                 high = mid
68 |         if temp[low][1] < minvalue:
69 |             if ret=='dict':
70 |                 return {}
71 |             else:
72 |                 return []
73 |         if ret=='dict':
74 |             ret_data = {}
75 |             for ele,count in temp[:high]:
76 |                 ret_data[ele]=count
77 |             return ret_data
78 |         else:
79 |             return temp[:high]
80 | 
81 |     def less_than(self,maxvalue,ret='list'):
82 |         temp = sorted(self.items(),key=_itemgetter(1))
83 |         low = 0
84 |         high = temp.__len__()
85 |         while(high - low > 1):
86 |             mid = (low+high) >> 1
87 |             if temp[mid][1] <= maxvalue:
88 |                 low = mid
89 |             else:
90 |                 high = mid
91 |         if temp[low][1]>maxvalue:
92 |             if ret=='dict':
93 |                 return {}
94 |             else:
95 |                 return []
96 |         if ret=='dict':
97 |             ret_data = {}
98 |             for ele,count in temp[:high]:
99 |                 ret_data[ele]=count
100 |             return ret_data
101 |         else:
102 |             return temp[:high]
103 | 
104 | if __name__ == '__main__':
105 |     # text = FI.load_pickle('./static/demo.pkl')
106 |     # text =[ x['dealed_text']['left_content'][0] for x in text]
107 |     # wc = WordCounter(text)
108 |     # print(wc.count_res.larger_than(5))
109 | 
110 |     data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
111 |     wc = WordCounter(data)
112 |     c = wc.count_res
113 |     print(sum(c.values()))
114 | 
115 | 
116 |     # wc = FI.load_pickle('./static/test.pkl')
117 | 
118 |     # print(sorted(x.items(),key=lambda x:x[1]))
119 |     # print(x)
120 | 
121 |     # c=MulCounter('abcdeabcdaffbcabag')
122 |     # print(sorted(c.items(),key=_itemgetter(1),reverse=True))
123 |     # print(c.larger_than(1))
--------------------------------------------------------------------------------
/pyword2vec.py:
--------------------------------------------------------------------------------
1 | __author__ = 'multiangle'
2 | 
3 | import math
4 | 
5 | from WordCount import WordCounter,MulCounter
6 | import File_Interface as FI
7 | from HuffmanTree import HuffmanTree
8 | 
9 | import numpy as np
10 | import jieba
11 | from sklearn import preprocessing
12 | 
13 | class Word2Vec():
14 |     def __init__(self, vec_len=15000, learn_rate=0.025, win_len=5, model='cbow'):
15 |         self.cutted_text_list = None
16 |         self.vec_len = vec_len
17 |         self.learn_rate = learn_rate
18 |         self.win_len = win_len
19 |         self.model = model
20 |         self.word_dict = None   # each element is a dict, including: word, possibility, vector, huffman code
21 |         self.huffman = None     # the object of HuffmanTree
22 | 
23 |     def Load_Word_Freq(self,word_freq_path):
24 |         # load the word frequency info
25 |         # and generate a word dict from it
26 |         if self.word_dict is not None:
27 |             raise RuntimeError('the word dict is not empty')
28 |         word_freq = FI.load_pickle(word_freq_path)
29 |         self.__Gnerate_Word_Dict(word_freq)
30 | 
31 |     def __Gnerate_Word_Dict(self,word_freq):
32 |         # generate a word dict containing, for every word: the word itself, its freq,
33 |         # its possibility, a random initial vector and a slot for the Huffman code
34 |         if not isinstance(word_freq,dict) and not isinstance(word_freq,list):
35 |             raise ValueError('the word freq info should be a dict or list')
36 | 
37 |         word_dict = {}
38 |         if isinstance(word_freq,dict):
39 |             # if word_freq is a dictionary
40 |             sum_count = sum(word_freq.values())
41 |             for word in word_freq:
42 |                 temp_dict = dict(
43 |                     word = word,
44 |                     freq = word_freq[word],
45 |                     possibility = word_freq[word]/sum_count,
46 |                     vector = np.random.random([1,self.vec_len]),
47 |                     Huffman = None
48 |                 )
49 |                 word_dict[word] = temp_dict
50 |         else:
51 |             # if word_freq is a list
52 |             freq_list = [x[1] for x in word_freq]
53 |             sum_count = sum(freq_list)
54 | 
55 |             for item in word_freq:
56 |                 temp_dict = dict(
57 |                     word = item[0],
58 |                     freq = item[1],
59 |                     possibility = item[1]/sum_count,
60 |                     vector = np.random.random([1,self.vec_len]),
61 |                     Huffman = None
62 |                 )
63 |                 word_dict[item[0]] = temp_dict
64 |         self.word_dict = word_dict
65 | 
66 |     def Import_Model(self,model_path):
67 |         model = FI.load_pickle(model_path)  # a dict {'word_dict','huffman','vec_len',...} as written by Export_Model
68 |         self.word_dict = model['word_dict']
69 |         self.huffman = model['huffman']
70 |         self.vec_len = model['vec_len']
71 |         self.learn_rate = model['learn_rate']
72 |         self.win_len = model['win_len']
73 |         self.model = model['model']
74 | 
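The private __Gnerate_Word_Dict method above is the heart of the preprocessing step: every surviving word gets its raw frequency, a probability, a randomly initialised vector and an empty slot for its Huffman code. A standalone re-implementation for illustration only, not the class method itself:

```python
# Sketch of the word-dict layout that Word2Vec works with; names are illustrative.
import numpy as np

def build_word_dict(word_freq, vec_len=50):
    total = sum(word_freq.values())
    return {
        w: dict(word=w, freq=f, possibility=f / total,
                vector=np.random.random([1, vec_len]), Huffman=None)
        for w, f in word_freq.items()
    }

d = build_word_dict({'word2vec': 5, 'huffman': 3, 'softmax': 2})
print(d['word2vec']['possibility'])   # 0.5
```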
def Export_Model(self,model_path): 76 | data=dict( 77 | word_dict = self.word_dict, 78 | huffman = self.huffman, 79 | vec_len = self.vec_len, 80 | learn_rate = self.learn_rate, 81 | win_len = self.win_len, 82 | model = self.model 83 | ) 84 | FI.save_pickle(data,model_path) 85 | 86 | def Train_Model(self,text_list): 87 | 88 | # generate the word_dict and huffman tree 89 | if self.huffman==None: 90 | # if the dict is not loaded, it will generate a new dict 91 | if self.word_dict==None : 92 | wc = WordCounter(text_list) 93 | self.__Gnerate_Word_Dict(wc.count_res.larger_than(5)) 94 | self.cutted_text_list = wc.text_list 95 | 96 | # generate a huffman tree according to the possibility of words 97 | self.huffman = HuffmanTree(self.word_dict,vec_len=self.vec_len) 98 | print('word_dict and huffman tree already generated, ready to train vector') 99 | 100 | # start to train word vector 101 | before = (self.win_len-1) >> 1 102 | after = self.win_len-1-before 103 | 104 | if self.model=='cbow': 105 | method = self.__Deal_Gram_CBOW 106 | else: 107 | method = self.__Deal_Gram_SkipGram 108 | 109 | if self.cutted_text_list: 110 | # if the text has been cutted 111 | total = self.cutted_text_list.__len__() 112 | count = 0 113 | for line in self.cutted_text_list: 114 | line_len = line.__len__() 115 | for i in range(line_len): 116 | method(line[i],line[max(0,i-before):i]+line[i+1:min(line_len,i+after+1)]) 117 | count += 1 118 | print('{c} of {d}'.format(c=count,d=total)) 119 | 120 | else: 121 | # if the text has note been cutted 122 | for line in text_list: 123 | line = list(jieba.cut(line,cut_all=False)) 124 | line_len = line.__len__() 125 | for i in range(line_len): 126 | method(line[i],line[max(0,i-before):i]+line[i+1:min(line_len,i+after+1)]) 127 | print('word vector has been generated') 128 | 129 | def __Deal_Gram_CBOW(self,word,gram_word_list): 130 | 131 | if not self.word_dict.__contains__(word): 132 | return 133 | 134 | word_huffman = self.word_dict[word]['Huffman'] 135 | gram_vector_sum = np.zeros([1,self.vec_len]) 136 | for i in range(gram_word_list.__len__())[::-1]: 137 | item = gram_word_list[i] 138 | if self.word_dict.__contains__(item): 139 | gram_vector_sum += self.word_dict[item]['vector'] 140 | else: 141 | gram_word_list.pop(i) 142 | 143 | if gram_word_list.__len__()==0: 144 | return 145 | 146 | e = self.__GoAlong_Huffman(word_huffman,gram_vector_sum,self.huffman.root) 147 | 148 | for item in gram_word_list: 149 | self.word_dict[item]['vector'] += e 150 | self.word_dict[item]['vector'] = preprocessing.normalize(self.word_dict[item]['vector']) 151 | 152 | def __Deal_Gram_SkipGram(self,word,gram_word_list): 153 | 154 | if not self.word_dict.__contains__(word): 155 | return 156 | 157 | word_vector = self.word_dict[word]['vector'] 158 | for i in range(gram_word_list.__len__())[::-1]: 159 | if not self.word_dict.__contains__(gram_word_list[i]): 160 | gram_word_list.pop(i) 161 | 162 | if gram_word_list.__len__()==0: 163 | return 164 | 165 | for u in gram_word_list: 166 | u_huffman = self.word_dict[u]['Huffman'] 167 | e = self.__GoAlong_Huffman(u_huffman,word_vector,self.huffman.root) 168 | self.word_dict[word]['vector'] += e 169 | self.word_dict[word]['vector'] = preprocessing.normalize(self.word_dict[word]['vector']) 170 | 171 | def __GoAlong_Huffman(self,word_huffman,input_vector,root): 172 | 173 | node = root 174 | e = np.zeros([1,self.vec_len]) 175 | for level in range(word_huffman.__len__()): 176 | huffman_charat = word_huffman[level] 177 | q = self.__Sigmoid(input_vector.dot(node.value.T)) 178 
| grad = self.learn_rate * (1-int(huffman_charat)-q) 179 | e += grad * node.value 180 | node.value += grad * input_vector 181 | node.value = preprocessing.normalize(node.value) 182 | if huffman_charat=='0': 183 | node = node.right 184 | else: 185 | node = node.left 186 | return e 187 | 188 | def __Sigmoid(self,value): 189 | return 1/(1+math.exp(-value)) 190 | 191 | if __name__ == '__main__': 192 | # text = FI.load_pickle('./static/demo.pkl') 193 | # text =[ x['dealed_text']['left_content'][0] for x in text] 194 | # # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects'] 195 | # wv = Word2Vec(vec_len=500) 196 | # wv.Train_Model(text) 197 | # FI.save_pickle(wv.word_dict,'./static/wv.pkl') 198 | # 199 | # data = FI.load_pickle('./static/wv.pkl') 200 | # x = {} 201 | # for key in data: 202 | # temp = data[key]['vector'] 203 | # temp = preprocessing.normalize(temp) 204 | # x[key] = temp 205 | # FI.save_pickle(x,'./static/normal_wv.pkl') 206 | 207 | x = FI.load_pickle('./static/normal_wv.pkl') 208 | def cal_simi(data,key1,key2): 209 | return data[key1].dot(data[key2].T)[0][0] 210 | keys=list(x.keys()) 211 | for key in keys: 212 | print(key,'\t',cal_simi(x,'姚明',key)) 213 | 214 | -------------------------------------------------------------------------------- /static/stop_words.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/multiangle/pyword2vec/e3c7c5ff0308d0d00501ee26356b46df2438f68f/static/stop_words.pkl -------------------------------------------------------------------------------- /static/中文停用词表(比较全面,有1208个停用词).txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/multiangle/pyword2vec/e3c7c5ff0308d0d00501ee26356b46df2438f68f/static/中文停用词表(比较全面,有1208个停用词).txt -------------------------------------------------------------------------------- /word2vec_v2.0.py: -------------------------------------------------------------------------------- 1 | __author__ = 'multiangle' 2 | 3 | import math 4 | import File_Interface as FI 5 | from operator import itemgetter as _itemgetter 6 | import numpy as np 7 | import jieba 8 | from sklearn import preprocessing 9 | from collections import Counter 10 | import numpy as np 11 | 12 | class Word2Vec(): 13 | def __init__(self, vec_len=15000, learn_rate=0.025, win_len=5, model='cbow'): 14 | self.cutted_text_list = None 15 | self.vec_len = vec_len 16 | self.learn_rate = learn_rate 17 | self.win_len = win_len 18 | self.model = model 19 | self.word_dict = None # each element is a dict, including: word,possibility,vector,huffmancode 20 | self.huffman = None # the object of HuffmanTree 21 | 22 | def Load_Word_Freq(self,word_freq_path): 23 | # load the info of word frequence 24 | # will generate a word dict 25 | if self.word_dict is not None: 26 | raise RuntimeError('the word dict is not empty') 27 | word_freq = FI.load_pickle(word_freq_path) 28 | self.__Gnerate_Word_Dict(word_freq) 29 | 30 | def __Gnerate_Word_Dict(self,word_freq): 31 | # generate a word dict 32 | # which containing the word, freq, possibility, a random initial vector and Huffman value 33 | if not isinstance(word_freq,dict) and not isinstance(word_freq,list): 34 | raise ValueError('the word freq info should be a dict or list') 35 | 36 | word_dict = {} 37 | if isinstance(word_freq,dict): 38 | # if word_freq is in type of dictionary 39 | sum_count = sum(word_freq.values()) 40 | for word 
in word_freq:
41 |                 temp_dict = dict(
42 |                     word = word,
43 |                     freq = word_freq[word],
44 |                     possibility = word_freq[word]/sum_count,
45 |                     vector = np.random.random([1,self.vec_len]),
46 |                     Huffman = None
47 |                 )
48 |                 word_dict[word] = temp_dict
49 |         else:
50 |             # if word_freq is a list
51 |             freq_list = [x[1] for x in word_freq]
52 |             sum_count = sum(freq_list)
53 | 
54 |             for item in word_freq:
55 |                 temp_dict = dict(
56 |                     word = item[0],
57 |                     freq = item[1],
58 |                     possibility = item[1]/sum_count,
59 |                     vector = np.random.random([1,self.vec_len]),
60 |                     Huffman = None
61 |                 )
62 |                 word_dict[item[0]] = temp_dict
63 |         self.word_dict = word_dict
64 | 
65 |     def Import_Model(self,model_path):
66 |         model = FI.load_pickle(model_path)  # a dict {'word_dict','huffman','vec_len',...} as written by Export_Model
67 |         self.word_dict = model['word_dict']
68 |         self.huffman = model['huffman']
69 |         self.vec_len = model['vec_len']
70 |         self.learn_rate = model['learn_rate']
71 |         self.win_len = model['win_len']
72 |         self.model = model['model']
73 | 
74 |     def Export_Model(self,model_path):
75 |         data=dict(
76 |             word_dict = self.word_dict,
77 |             huffman = self.huffman,
78 |             vec_len = self.vec_len,
79 |             learn_rate = self.learn_rate,
80 |             win_len = self.win_len,
81 |             model = self.model
82 |         )
83 |         FI.save_pickle(data,model_path)
84 | 
85 |     def Train_Model(self,text_list):
86 | 
87 |         # generate the word_dict and huffman tree
88 |         if self.huffman==None:
89 |             # if the dict is not loaded, it will generate a new dict
90 |             if self.word_dict==None :
91 |                 wc = WordCounter(text_list)
92 |                 self.__Gnerate_Word_Dict(wc.count_res.larger_than(5))
93 |                 self.cutted_text_list = wc.text_list
94 | 
95 |             # generate a huffman tree according to the possibility of words
96 |             self.huffman = HuffmanTree(self.word_dict,vec_len=self.vec_len)
97 |         print('word_dict and huffman tree already generated, ready to train vector')
98 | 
99 |         # start to train word vector
100 |         before = (self.win_len-1) >> 1
101 |         after = self.win_len-1-before
102 | 
103 |         if self.model=='cbow':
104 |             method = self.__Deal_Gram_CBOW
105 |         else:
106 |             method = self.__Deal_Gram_SkipGram
107 | 
108 |         if self.cutted_text_list:
109 |             # if the text has already been segmented
110 |             total = self.cutted_text_list.__len__()
111 |             count = 0
112 |             for line in self.cutted_text_list:
113 |                 line_len = line.__len__()
114 |                 for i in range(line_len):
115 |                     method(line[i],line[max(0,i-before):i]+line[i+1:min(line_len,i+after+1)])
116 |                 count += 1
117 |                 print('{c} of {d}'.format(c=count,d=total))
118 | 
119 |         else:
120 |             # if the text has not been segmented yet
121 |             for line in text_list:
122 |                 line = list(jieba.cut(line,cut_all=False))
123 |                 line_len = line.__len__()
124 |                 for i in range(line_len):
125 |                     method(line[i],line[max(0,i-before):i]+line[i+1:min(line_len,i+after+1)])
126 |         print('word vector has been generated')
127 | 
128 |     def __Deal_Gram_CBOW(self,word,gram_word_list):
129 | 
130 |         if not self.word_dict.__contains__(word):
131 |             return
132 | 
133 |         word_huffman = self.word_dict[word]['Huffman']
134 |         gram_vector_sum = np.zeros([1,self.vec_len])
135 |         for i in range(gram_word_list.__len__())[::-1]:
136 |             item = gram_word_list[i]
137 |             if self.word_dict.__contains__(item):
138 |                 gram_vector_sum += self.word_dict[item]['vector']
139 |             else:
140 |                 gram_word_list.pop(i)
141 | 
142 |         if gram_word_list.__len__()==0:
143 |             return
144 | 
145 |         e = self.__GoAlong_Huffman(word_huffman,gram_vector_sum,self.huffman.root)
146 | 
147 |         for item in gram_word_list:
148 |             self.word_dict[item]['vector'] += e
149 |             self.word_dict[item]['vector'] = preprocessing.normalize(self.word_dict[item]['vector'])
150 | 
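Train_Model above walks a sliding window over each segmented sentence: for the word at position i it takes up to `before` words on the left and `after` words on the right, excluding the centre word itself. The slicing is easiest to see on a toy sentence (illustration only, using the same expressions as the code above):

```python
# How the context window in Train_Model is built for win_len = 5.
line = ['I', 'like', 'natural', 'language', 'processing']
win_len = 5
before = (win_len - 1) >> 1
after = win_len - 1 - before
for i, w in enumerate(line):
    context = line[max(0, i - before):i] + line[i + 1:min(len(line), i + after + 1)]
    print(w, context)
# e.g. 'natural' -> ['I', 'like', 'language', 'processing']
```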
151 |     def __Deal_Gram_SkipGram(self,word,gram_word_list):
152 | 
153 |         if not self.word_dict.__contains__(word):
154 |             return
155 | 
156 |         word_vector = self.word_dict[word]['vector']
157 |         for i in range(gram_word_list.__len__())[::-1]:
158 |             if not self.word_dict.__contains__(gram_word_list[i]):
159 |                 gram_word_list.pop(i)
160 | 
161 |         if gram_word_list.__len__()==0:
162 |             return
163 | 
164 |         for u in gram_word_list:
165 |             u_huffman = self.word_dict[u]['Huffman']
166 |             e = self.__GoAlong_Huffman(u_huffman,word_vector,self.huffman.root)
167 |             self.word_dict[word]['vector'] += e
168 |             self.word_dict[word]['vector'] = preprocessing.normalize(self.word_dict[word]['vector'])
169 | 
170 |     def __GoAlong_Huffman(self,word_huffman,input_vector,root):
171 | 
172 |         node = root
173 |         e = np.zeros([1,self.vec_len])
174 |         for level in range(word_huffman.__len__()):
175 |             huffman_charat = word_huffman[level]
176 |             q = self.__Sigmoid(input_vector.dot(node.value.T))
177 |             grad = self.learn_rate * (1-int(huffman_charat)-q)
178 |             e += grad * node.value
179 |             node.value += grad * input_vector
180 |             node.value = preprocessing.normalize(node.value)
181 |             if huffman_charat=='0':
182 |                 node = node.right
183 |             else:
184 |                 node = node.left
185 |         return e
186 | 
187 |     def __Sigmoid(self,value):
188 |         return 1/(1+math.exp(-value))
189 | 
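__GoAlong_Huffman above is the hierarchical-softmax update: at every inner node on the word's Huffman path it computes q = sigmoid(x·θ), derives a gradient from the current Huffman bit, accumulates the error e that is later added back to the input word vectors, and updates the node vector θ. A self-contained numeric sketch of a single pass (illustration only; the vectors and the code '101' are made up):

```python
# One hierarchical-softmax pass along a Huffman path, mirroring __GoAlong_Huffman.
import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

learn_rate, vec_len = 0.025, 8
input_vector = np.random.random([1, vec_len])
path_vectors = [np.random.random([1, vec_len]) for _ in range(3)]  # one per inner node
huffman_code = '101'

e = np.zeros([1, vec_len])
for bit, theta in zip(huffman_code, path_vectors):
    q = sigmoid(input_vector.dot(theta.T))      # predicted branch probability
    grad = learn_rate * (1 - int(bit) - q)      # same gradient form as in the repo
    e += grad * theta                           # error pushed back to the input words
    theta += grad * input_vector                # inner-node vector update
print(e)
```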
190 | class HuffmanTreeNode():
191 |     def __init__(self,value,possibility):
192 |         # common part of leaf node and tree node
193 |         self.possibility = possibility
194 |         self.left = None
195 |         self.right = None
196 |         # the value of a leaf node is the word itself,
197 |         # in an inner node it is the intermediate vector
198 |         self.value = value      # the value of word
199 |         self.Huffman = ""       # store the huffman code
200 | 
201 |     def __str__(self):
202 |         return 'HuffmanTreeNode object, value: {v}, possibility: {p}, Huffman: {h}' \
203 |             .format(v=self.value,p=self.possibility,h=self.Huffman)
204 | 
205 | class HuffmanTree():
206 |     def __init__(self, word_dict, vec_len=15000):
207 |         self.vec_len = vec_len      # the length of word vector
208 |         self.root = None
209 | 
210 |         word_dict_list = list(word_dict.values())
211 |         node_list = [HuffmanTreeNode(x['word'],x['possibility']) for x in word_dict_list]
212 |         self.build_tree(node_list)
213 |         # self.build_CBT(node_list)
214 |         self.generate_huffman_code(self.root, word_dict)
215 | 
216 |     def build_tree(self,node_list):
217 |         # node_list.sort(key=lambda x:x.possibility,reverse=True)
218 |         # for i in range(node_list.__len__()-1)[::-1]:
219 |         #     top_node = self.merge(node_list[i],node_list[i+1])
220 |         #     node_list.insert(i,top_node)
221 |         # self.root = node_list[0]
222 | 
223 |         while node_list.__len__()>1:
224 |             i1 = 0  # i1 is the node with the smallest possibility
225 |             i2 = 1  # i2 is the node with the second smallest possibility
226 |             if node_list[i2].possibility < node_list[i1].possibility :
227 |                 [i1,i2] = [i2,i1]
228 |             for i in range(2,node_list.__len__()):  # find the two smallest nodes
229 |                 if node_list[i].possibility < node_list[i2].possibility :
230 |                     i2 = i
231 |                     if node_list[i2].possibility < node_list[i1].possibility :
232 |                         [i1,i2] = [i2,i1]
233 |             top_node = self.merge(node_list[i1],node_list[i2])
234 |             if i1 < i2:
235 |                 node_list.pop(i2)
236 |                 node_list.pop(i1)
237 |             elif i1 > i2:
238 |                 node_list.pop(i1)
239 |                 node_list.pop(i2)
240 |             else:
241 |                 raise RuntimeError('i1 should not be equal to i2')
242 |             node_list.insert(0,top_node)
243 |         self.root = node_list[0]
244 | 
245 |     def build_CBT(self,node_list):  # build a complete binary tree
246 |         node_list.sort(key=lambda x:x.possibility,reverse=True)
247 |         node_num = node_list.__len__()
248 |         before_start = 0
249 |         while node_num>1 :
250 |             for i in range(node_num>>1):
251 |                 top_node = self.merge(node_list[before_start+i*2],node_list[before_start+i*2+1])
252 |                 node_list.append(top_node)
253 |             if node_num%2==1:
254 |                 top_node = self.merge(node_list[before_start+i*2+2],node_list[-1])
255 |                 node_list[-1] = top_node
256 |             before_start = before_start + node_num
257 |             node_num = node_num>>1
258 |         self.root = node_list[-1]
259 | 
260 |     def generate_huffman_code(self, node, word_dict):
261 |         # # use recursion in this edition
262 |         # if node.left==None and node.right==None :
263 |         #     word = node.value
264 |         #     code = node.Huffman
265 |         #     print(word,code)
266 |         #     word_dict[word]['Huffman'] = code
267 |         #     return -1
268 |         #
269 |         # code = node.Huffman
270 |         # if code==None:
271 |         #     code = ""
272 |         # node.left.Huffman = code + "1"
273 |         # node.right.Huffman = code + "0"
274 |         # self.generate_huffman_code(node.left, word_dict)
275 |         # self.generate_huffman_code(node.right, word_dict)
276 | 
277 |         # use a stack instead of recursion in this edition
278 |         stack = [self.root]
279 |         while (stack.__len__()>0):
280 |             node = stack.pop()
281 |             # go along the left branch until a leaf is reached
282 |             while node.left or node.right :
283 |                 code = node.Huffman
284 |                 node.left.Huffman = code + "1"
285 |                 node.right.Huffman = code + "0"
286 |                 stack.append(node.right)
287 |                 node = node.left
288 |             word = node.value
289 |             code = node.Huffman
290 |             # print(word,'\t',code.__len__(),'\t',node.possibility)
291 |             word_dict[word]['Huffman'] = code
292 | 
293 |     def merge(self,node1,node2):
294 |         top_pos = node1.possibility + node2.possibility
295 |         top_node = HuffmanTreeNode(np.zeros([1,self.vec_len]), top_pos)
296 |         if node1.possibility >= node2.possibility :
297 |             top_node.left = node1
298 |             top_node.right = node2
299 |         else:
300 |             top_node.left = node2
301 |             top_node.right = node1
302 |         return top_node
303 | 
304 | class WordCounter():
305 |     # can calculate the freq of words in a text list
306 | 
307 |     # for example
308 |     # >>> data = ['Merge multiple sorted inputs into a single sorted output',
309 |     #             'The API below differs from textbook heap algorithms in two aspects']
310 |     # >>> wc = WordCounter(data)
311 |     # >>> print(wc.count_res)
312 | 
313 |     # >>> MulCounter({' ': 18, 'sorted': 2, 'single': 1, 'below': 1, 'inputs': 1, 'The': 1, 'into': 1, 'textbook': 1,
314 |     #                 'API': 1, 'algorithms': 1, 'in': 1, 'output': 1, 'heap': 1, 'differs': 1, 'two': 1, 'from': 1,
315 |     #                 'aspects': 1, 'multiple': 1, 'a': 1, 'Merge': 1})
316 | 
317 |     def __init__(self, text_list):
318 |         self.text_list = text_list
319 |         self.stop_word = self.Get_Stop_Words()
320 |         self.count_res = None
321 | 
322 |         self.Word_Count(self.text_list)
323 | 
324 |     def Get_Stop_Words(self):
325 |         ret = []
326 |         ret = FI.load_pickle('./static/stop_words.pkl')
327 |         return ret
328 | 
329 |     def Word_Count(self,text_list,cut_all=False):
330 | 
331 |         filtered_word_list = []
332 |         count = 0
333 |         for line in text_list:
334 |             res = jieba.cut(line,cut_all=cut_all)
335 |             res = list(res)
336 |             text_list[count] = res
337 |             count += 1
338 |             filtered_word_list += res
339 | 
340 |         self.count_res = MulCounter(filtered_word_list)
341 |         for word in self.stop_word:
342 |             try:
343 |                 self.count_res.pop(word)
344 |             except KeyError:
345 |                 pass
346 | 
347 | class MulCounter(Counter):
348 |     # a class extending collections.Counter,
349 |     # adding the methods larger_than and less_than
350 |     def __init__(self,element_list):
351 |         super().__init__(element_list)
352 | 
353 |     def larger_than(self,minvalue,ret='list'):
354 |         temp = sorted(self.items(),key=_itemgetter(1),reverse=True)
355 |         low = 0
356 |         high = temp.__len__()
357 |         while(high - low > 1):
358 |             mid = (low+high) >> 1
359 |             if temp[mid][1] >= minvalue:
360 |                 low = mid
361 |             else:
362 |                 high = mid
363 |         if temp[low][1] < minvalue:
364 |             if ret=='dict':
365 |                 return {}
366 |             else:
367 |                 return []
368 |         if ret=='dict':
369 |             ret_data = {}
370 |             for ele,count in temp[:high]:
371 |                 ret_data[ele]=count
372 |             return ret_data
373 |         else:
374 |             return temp[:high]
375 | 
376 |     def less_than(self,maxvalue,ret='list'):
377 |         temp = sorted(self.items(),key=_itemgetter(1))
378 |         low = 0
379 |         high = temp.__len__()
380 |         while(high - low > 1):
381 |             mid = (low+high) >> 1
382 |             if temp[mid][1] <= maxvalue:
383 |                 low = mid
384 |             else:
385 |                 high = mid
386 |         if temp[low][1]>maxvalue:
387 |             if ret=='dict':
388 |                 return {}
389 |             else:
390 |                 return []
391 |         if ret=='dict':
392 |             ret_data = {}
393 |             for ele,count in temp[:high]:
394 |                 ret_data[ele]=count
395 |             return ret_data
396 |         else:
397 |             return temp[:high]
398 | 
399 | if __name__ == '__main__':
400 |     # text = FI.load_pickle('./static/demo.pkl')
401 |     # text =[ x['dealed_text']['left_content'][0] for x in text]
402 |     data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
403 |     wv = Word2Vec(vec_len=500)
404 |     wv.Train_Model(data)
405 |     # FI.save_pickle(wv.word_dict,'./static/wv.pkl')
406 |     #
407 |     # data = FI.load_pickle('./static/wv.pkl')
408 |     # x = {}
409 |     # for key in data:
410 |     #     temp = data[key]['vector']
411 |     #     temp = preprocessing.normalize(temp)
412 |     #     x[key] = temp
413 |     # FI.save_pickle(x,'./static/normal_wv.pkl')
414 | 
415 |     # x = FI.load_pickle('./static/normal_wv.pkl')
416 |     # def cal_simi(data,key1,key2):
417 |     #     return data[key1].dot(data[key2].T)[0][0]
418 |     # keys=list(x.keys())
419 |     # for key in keys:
420 |     #     print(key,'\t',cal_simi(x,'姚明',key))
421 | 
422 | 
--------------------------------------------------------------------------------
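The commented-out part of __main__ shows the intended end-to-end use: train, save the word dict, L2-normalise every vector, and then compare words by a plain dot product, which equals cosine similarity on normalised vectors. A hedged sketch of that last step, independent of any saved model file:

```python
# Cosine similarity on normalised vectors, mirroring the cal_simi helper above.
import numpy as np
from sklearn import preprocessing

def cal_simi(data, key1, key2):
    return data[key1].dot(data[key2].T)[0][0]

vecs = {k: preprocessing.normalize(np.random.random([1, 50]))
        for k in ['姚明', '篮球', '上海']}    # random stand-ins for trained vectors
print(cal_simi(vecs, '姚明', '篮球'))
```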