├── LICENSE
├── README.md
└── word2vec.py

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Nathan Rooy

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# word2vec-from-scratch-with-python
A very simple, bare-bones, inefficient implementation of skip-gram word2vec, written from scratch with Python

blog post -> https://nathanrooy.github.io/posts/2018-03-22/word2vec-from-scratch-with-python-and-numpy/
--------------------------------------------------------------------------------

/word2vec.py:
--------------------------------------------------------------------------------
#------------------------------------------------------------------------------+
#
# Nathan A. Rooy
# Simple word2vec from scratch with Python
# 2018-FEB
#
#------------------------------------------------------------------------------+

#--- IMPORT DEPENDENCIES ------------------------------------------------------+

import numpy as np
import re
from collections import defaultdict

#--- CONSTANTS ----------------------------------------------------------------+


class word2vec():

    def __init__(self, settings):
        self.n = settings['n']
        self.eta = settings['learning_rate']
        self.epochs = settings['epochs']
        self.window = settings['window_size']


    # GENERATE TRAINING DATA
    def generate_training_data(self, corpus):

        # GENERATE WORD COUNTS
        word_counts = defaultdict(int)
        for row in corpus:
            for word in row:
                word_counts[word] += 1

        self.v_count = len(word_counts.keys())

        # GENERATE LOOKUP DICTIONARIES
        self.words_list = sorted(list(word_counts.keys()), reverse=False)
        self.word_index = dict((word, i) for i, word in enumerate(self.words_list))
        self.index_word = dict((i, word) for i, word in enumerate(self.words_list))

        training_data = []

        # CYCLE THROUGH EACH SENTENCE IN CORPUS
        for sentence in corpus:
            sent_len = len(sentence)

            # CYCLE THROUGH EACH WORD IN SENTENCE
            for i, word in enumerate(sentence):

                # ONE-HOT ENCODE THE TARGET (CENTER) WORD
                w_target = self.word2onehot(sentence[i])

                # CYCLE THROUGH CONTEXT WINDOW
                w_context = []
                for j in range(i - self.window, i + self.window + 1):
                    if j != i and j <= sent_len - 1 and j >= 0:
                        w_context.append(self.word2onehot(sentence[j]))
                training_data.append([w_target, w_context])

        # dtype=object because each sample holds a variable-length context list
        return np.array(training_data, dtype=object)


    # SOFTMAX ACTIVATION FUNCTION
    def softmax(self, x):
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)


    # CONVERT WORD TO ONE-HOT ENCODING
    def word2onehot(self, word):
        word_vec = [0 for i in range(0, self.v_count)]
        word_index = self.word_index[word]
        word_vec[word_index] = 1
        return word_vec


    # FORWARD PASS
    def forward_pass(self, x):
        h = np.dot(self.w1.T, x)    # hidden layer (embedding of the center word)
        u = np.dot(self.w2.T, h)    # output layer scores
        y_c = self.softmax(u)       # predicted context word distribution
        return y_c, h, u


    # BACKPROPAGATION
    def backprop(self, e, h, x):
        dl_dw2 = np.outer(h, e)
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))

        # UPDATE WEIGHTS
        self.w1 = self.w1 - (self.eta * dl_dw1)
        self.w2 = self.w2 - (self.eta * dl_dw2)


    # TRAIN W2V MODEL
    def train(self, training_data):
        # INITIALIZE WEIGHT MATRICES
        self.w1 = np.random.uniform(-0.8, 0.8, (self.v_count, self.n))     # embedding matrix
        self.w2 = np.random.uniform(-0.8, 0.8, (self.n, self.v_count))     # context matrix

        # CYCLE THROUGH EACH EPOCH
        for i in range(0, self.epochs):

            self.loss = 0

            # CYCLE THROUGH EACH TRAINING SAMPLE
            for w_t, w_c in training_data:

                # FORWARD PASS
                y_pred, h, u = self.forward_pass(w_t)

                # CALCULATE ERROR (summed over all context words)
                EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)

                # BACKPROPAGATION
                self.backprop(EI, h, w_t)

                # CALCULATE LOSS
                self.loss += -np.sum([u[word.index(1)] for word in w_c]) + len(w_c) * np.log(np.sum(np.exp(u)))
                #self.loss += -2*np.log(len(w_c)) - np.sum([u[word.index(1)] for word in w_c]) + (len(w_c) * np.log(np.sum(np.exp(u))))
            print('EPOCH:', i, 'LOSS:', self.loss)


    # input a word, returns its embedding vector (if available)
    def word_vec(self, word):
        w_index = self.word_index[word]
        v_w = self.w1[w_index]
        return v_w


    # input a vector, returns the nearest word(s)
    def vec_sim(self, vec, top_n):

        # CYCLE THROUGH VOCAB
        word_sim = {}
        for i in range(self.v_count):
            v_w2 = self.w1[i]
            theta_num = np.dot(vec, v_w2)
            theta_den = np.linalg.norm(vec) * np.linalg.norm(v_w2)
            theta = theta_num / theta_den       # cosine similarity

            word = self.index_word[i]
            word_sim[word] = theta

        words_sorted = sorted(word_sim.items(), key=lambda kv: kv[1], reverse=True)

        for word, sim in words_sorted[:top_n]:
            print(word, sim)


    # input a word, returns the top [n] most similar words
    def word_sim(self, word, top_n):

        w1_index = self.word_index[word]
        v_w1 = self.w1[w1_index]

        # CYCLE THROUGH VOCAB
        word_sim = {}
        for i in range(self.v_count):
            v_w2 = self.w1[i]
            theta_num = np.dot(v_w1, v_w2)
            theta_den = np.linalg.norm(v_w1) * np.linalg.norm(v_w2)
            theta = theta_num / theta_den       # cosine similarity

            word = self.index_word[i]
            word_sim[word] = theta

        words_sorted = sorted(word_sim.items(), key=lambda kv: kv[1], reverse=True)

        for word, sim in words_sorted[:top_n]:
            print(word, sim)


#--- EXAMPLE RUN --------------------------------------------------------------+

settings = {}
settings['n'] = 5                   # dimension of word embeddings
settings['window_size'] = 2         # context window +/- center word
settings['min_count'] = 0           # minimum word count
settings['epochs'] = 5000           # number of training epochs
settings['neg_samp'] = 10           # number of negative words to use during training
settings['learning_rate'] = 0.01    # learning rate
np.random.seed(0)                   # set the seed for reproducibility

corpus = [['the','quick','brown','fox','jumped','over','the','lazy','dog']]

# INITIALIZE W2V MODEL
w2v = word2vec(settings)

# generate training data
training_data = w2v.generate_training_data(corpus)

# train word2vec model
w2v.train(training_data)

#--- END ----------------------------------------------------------------------+
--------------------------------------------------------------------------------
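Once training finishes, the learned embeddings can be queried with the helper methods defined in word2vec.py. A minimal sketch of such a query session (the word 'fox' is taken from the toy corpus above; these lines are not part of the original script and would be appended after w2v.train(training_data)):

# look up the trained embedding vector for a single word from the corpus
print(w2v.word_vec('fox'))

# print the top-3 most similar words to 'fox', ranked by cosine similarity
w2v.word_sim('fox', 3)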