├── README.md
└── wordtovec.py

/README.md:
--------------------------------------------------------------------------------
# Word2Vec Implementation using Numpy

This is an implementation of Word2Vec using numpy. Click [here](https://derekchia.com/an-implementation-guide-to-word2vec-using-numpy-and-google-sheets/) for the accompanying blog post.

To see Word2Vec in action, uncomment the print functions! Also remember to change the number of `epochs` and set `training_data` to `training_data[0]` to avoid flooding your terminal. A Google Sheet implementation of Word2Vec is also available [here](https://docs.google.com/spreadsheets/d/1mgf82Ue7MmQixMm2ZqnT1oWUucj6pEcd2wDs_JgHmco/edit?usp=sharing). Have fun learning!

![Word2Vec - Skip-Gram](https://i.ibb.co/XbKnHGP/Screenshot-2018-12-03-at-8-27-46-PM.png)

![Word2Vec using Google Sheet](https://serving.photos.photobox.com/35757252841d1a139084472a6536916b53fa434b73586b3d86affd10e87d8dd73c23b9e6.jpg)

To start, run the script using:
```
python wordtovec.py
```
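If you would rather poke at it from your own script or notebook, the snippet below mirrors the driver code at the bottom of `wordtovec.py`. Note that importing `wordtovec` already runs that driver once (there is no `__main__` guard), so treat this as a sketch of the API rather than the only way to use it:

```python
from wordtovec import word2vec, settings

# One tokenised, lower-cased sentence, as in the script
corpus = [["natural", "language", "processing", "and", "machine",
           "learning", "is", "fun", "and", "exciting"]]

w2v = word2vec()                                              # reads the global `settings`
training_data = w2v.generate_training_data(settings, corpus)  # one-hot [target, context] pairs
w2v.train(training_data)                                      # prints the loss per epoch

print(w2v.word_vec("machine"))  # 10-dimensional embedding for "machine"
w2v.vec_sim("machine", 3)       # prints the 3 most similar words
```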
--------------------------------------------------------------------------------
/wordtovec.py:
--------------------------------------------------------------------------------
"""
This is an implementation of Word2Vec using numpy. Uncomment the print functions to see Word2Vec in action! Also remember to change the number of epochs and set training_data to training_data[0] to avoid flooding your terminal. A Google Sheet implementation of Word2Vec is also available here - https://docs.google.com/spreadsheets/d/1mgf82Ue7MmQixMm2ZqnT1oWUucj6pEcd2wDs_JgHmco/edit?usp=sharing

Have fun learning!

Author: Derek Chia
Email: derek@derekchia.com
"""

import numpy as np
from collections import defaultdict

## Initial weights (swap in the commented-out np.random.uniform lines in train() to initialise randomly instead)
getW1 = [[0.236, -0.962, 0.686, 0.785, -0.454, -0.833, -0.744, 0.677, -0.427, -0.066],
         [-0.907, 0.894, 0.225, 0.673, -0.579, -0.428, 0.685, 0.973, -0.070, -0.811],
         [-0.576, 0.658, -0.582, -0.112, 0.662, 0.051, -0.401, -0.921, -0.158, 0.529],
         [0.517, 0.436, 0.092, -0.835, -0.444, -0.905, 0.879, 0.303, 0.332, -0.275],
         [0.859, -0.890, 0.651, 0.185, -0.511, -0.456, 0.377, -0.274, 0.182, -0.237],
         [0.368, -0.867, -0.301, -0.222, 0.630, 0.808, 0.088, -0.902, -0.450, -0.408],
         [0.728, 0.277, 0.439, 0.138, -0.943, -0.409, 0.687, -0.215, -0.807, 0.612],
         [0.593, -0.699, 0.020, 0.142, -0.638, -0.633, 0.344, 0.868, 0.913, 0.429],
         [0.447, -0.810, -0.061, -0.495, 0.794, -0.064, -0.817, -0.408, -0.286, 0.149]]

getW2 = [[-0.868, -0.406, -0.288, -0.016, -0.560, 0.179, 0.099, 0.438, -0.551],
         [-0.395, 0.890, 0.685, -0.329, 0.218, -0.852, -0.919, 0.665, 0.968],
         [-0.128, 0.685, -0.828, 0.709, -0.420, 0.057, -0.212, 0.728, -0.690],
         [0.881, 0.238, 0.018, 0.622, 0.936, -0.442, 0.936, 0.586, -0.020],
         [-0.478, 0.240, 0.820, -0.731, 0.260, -0.989, -0.626, 0.796, -0.599],
         [0.679, 0.721, -0.111, 0.083, -0.738, 0.227, 0.560, 0.929, 0.017],
         [-0.690, 0.907, 0.464, -0.022, -0.005, -0.004, -0.425, 0.299, 0.757],
         [-0.054, 0.397, -0.017, -0.563, -0.551, 0.465, -0.596, -0.413, -0.395],
         [-0.838, 0.053, -0.160, -0.164, -0.671, 0.140, -0.149, 0.708, 0.425],
         [0.096, -0.995, -0.313, 0.881, -0.402, -0.631, -0.660, 0.184, 0.487]]

class word2vec():

    def __init__(self):
        self.n = settings['n']
        self.lr = settings['learning_rate']
        self.epochs = settings['epochs']
        self.window = settings['window_size']

    def generate_training_data(self, settings, corpus):
        # Find unique word counts using a dictionary
        word_counts = defaultdict(int)
        for row in corpus:
            for word in row:
                word_counts[word] += 1
        ##############################################################################################################################################
        # print(word_counts)
        # defaultdict(<class 'int'>, {'natural': 1, 'language': 1, 'processing': 1, 'and': 2, 'machine': 1, 'learning': 1, 'is': 1, 'fun': 1, 'exciting': 1})
        ##############################################################################################################################################

        ## How many unique words in vocab? 9
        self.v_count = len(word_counts.keys())
        #######################
        # print(self.v_count)
        # 9
        #######################

        # Generate Lookup Dictionaries (vocab)
        self.words_list = list(word_counts.keys())
        ################################################################################################
        # print(self.words_list)
        # ['natural', 'language', 'processing', 'and', 'machine', 'learning', 'is', 'fun', 'exciting']
        ################################################################################################

        # Generate word:index
        self.word_index = dict((word, i) for i, word in enumerate(self.words_list))
        ############################################################################################################################
        # print(self.word_index)
        # {'natural': 0, 'language': 1, 'processing': 2, 'and': 3, 'machine': 4, 'learning': 5, 'is': 6, 'fun': 7, 'exciting': 8}
        ############################################################################################################################

        # Generate index:word
        self.index_word = dict((i, word) for i, word in enumerate(self.words_list))
        ############################################################################################################################
        # print(self.index_word)
        # {0: 'natural', 1: 'language', 2: 'processing', 3: 'and', 4: 'machine', 5: 'learning', 6: 'is', 7: 'fun', 8: 'exciting'}
        ############################################################################################################################

        training_data = []

        # Cycle through each sentence in corpus
        for sentence in corpus:
            sent_len = len(sentence)

            # Cycle through each word in sentence
            for i, word in enumerate(sentence):
                # Convert target word to one-hot
                w_target = self.word2onehot(sentence[i])

                # Cycle through context window
                w_context = []

                # Note: window_size 2 will have range of 5 values
                for j in range(i - self.window, i + self.window + 1):
                    # Criteria for context word
                    # 1. Target word cannot be context word (j != i)
                    # 2. Index must be greater than or equal to 0 (j >= 0), otherwise the index is out of range
                    # 3. Index must be less than or equal to the length of the sentence (j <= sent_len - 1), otherwise the index is out of range
                    if j != i and j <= sent_len - 1 and j >= 0:
                        # Append the one-hot representation of word to w_context
                        w_context.append(self.word2onehot(sentence[j]))
                        # print(sentence[i], sentence[j])
                        #########################
                        # Example:
                        # natural language
                        # natural processing
                        # language natural
                        # language processing
                        # language and
                        #########################

                # training_data contains a one-hot representation of the target word and context words
                ################################################################################################
                # Example:
                # [Target] natural, [Context] language, [Context] processing
                # print(training_data)
                # [[[1, 0, 0, 0, 0, 0, 0, 0, 0], [[0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0]]]]
                ################################################################################################
                training_data.append([w_target, w_context])

        # dtype=object because the context lists have different lengths (a ragged array)
        return np.array(training_data, dtype=object)

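    # Each element of training_data is a pair [w_target, w_context]: w_target is the
    # one-hot list for the centre word and w_context is a list of one-hot vectors for
    # up to 2 * window_size surrounding words (fewer at the sentence edges).
    # For the example corpus and window_size 2, the first sample should look like this:
    # t, c = training_data[0]   # centre word 'natural'
    # print(sum(t), len(c))     # 1 2 -> one-hot target, two context words
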
    def word2onehot(self, word):
        # word_vec - initialise a blank vector
        word_vec = [0 for i in range(0, self.v_count)]  # Alternative - np.zeros(self.v_count)
        ###############################
        # print(word_vec)
        # [0, 0, 0, 0, 0, 0, 0, 0, 0]
        ###############################

        # Get ID of word from word_index
        word_index = self.word_index[word]

        # Change value from 0 to 1 according to ID of the word
        word_vec[word_index] = 1

        return word_vec

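    # The loss accumulated in train() below is the skip-gram negative log-likelihood for a
    # target word t with context set C(t):
    #   E = -sum_{c in C(t)} u_c + |C(t)| * log(sum_j exp(u_j))
    # where u is the output layer before softmax - the same two parts described in the
    # comments inside train().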
    def train(self, training_data):
        # Initialising weight matrices
        # np.random.uniform(LOW, HIGH, OUTPUT_SHAPE)
        # https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.random.uniform.html
        self.w1 = np.array(getW1)
        self.w2 = np.array(getW2)
        # self.w1 = np.random.uniform(-1, 1, (self.v_count, self.n))
        # self.w2 = np.random.uniform(-1, 1, (self.n, self.v_count))

        # Cycle through each epoch
        for i in range(self.epochs):
            # Initialise loss to 0
            self.loss = 0
            # Cycle through each training sample
            # w_t = vector for target word, w_c = vectors for context words
            for w_t, w_c in training_data:
                # Forward pass
                # 1. predicted y using softmax (y_pred) 2. matrix of hidden layer (h) 3. output layer before softmax (u)
                y_pred, h, u = self.forward_pass(w_t)
                #########################################
                # print("Vector for target word:", w_t)
                # print("W1-before backprop", self.w1)
                # print("W2-before backprop", self.w2)
                #########################################

                # Calculate error
                # 1. For a target word, calculate the difference between y_pred and each of the context words
                # 2. Sum up the differences using np.sum to give us the error for this particular target word
                EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)
                #########################
                # print("Error", EI)
                #########################

                # Backpropagation
                # We use SGD to backpropagate errors - calculate loss on the output layer
                self.backprop(EI, h, w_t)
                #########################################
                # print("W1-after backprop", self.w1)
                # print("W2-after backprop", self.w2)
                #########################################

                # Calculate loss
                # There are 2 parts to the loss function
                # Part 1: negative of the sum of the u values at the indices of the context words
                # Part 2: number of context words * log of the sum of exp(u) over all elements of the output layer before softmax (u)
                # Note: word.index(1) returns the index in the context word vector with value 1
                # Note: u[word.index(1)] returns the value of the output layer (before softmax) at that index
                self.loss += -np.sum([u[word.index(1)] for word in w_c]) + len(w_c) * np.log(np.sum(np.exp(u)))

                #############################################################
                # Break if you want to see weights after first target word
                # break
                #############################################################
            print('Epoch:', i, "Loss:", self.loss)

    def forward_pass(self, x):
        # x is the one-hot vector for the target word, shape 1x9 (v_count)
        # Multiply x by the first matrix (w1) to get the hidden layer - 1x9 dot 9x10 gives us 1x10
        h = np.dot(x, self.w1)
        # Dot product of the hidden layer with the second matrix (w2) - 1x10 dot 10x9 gives us 1x9
        u = np.dot(h, self.w2)
        # Run u through softmax to force each element into the range [0, 1] - 1x9
        y_c = self.softmax(u)
        return y_c, h, u

    def softmax(self, x):
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    def backprop(self, e, h, x):
        # https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.outer.html
        # Vector EI (e) is the sum of prediction errors across the context words for the current center word
        # Going backwards, we need to take the derivative of E with respect to w2
        # h - length 10, e - length 9, dl_dw2 - shape 10x9 (same as w2)
        # x - length 9, np.dot(self.w2, e.T) - length 10, dl_dw1 - shape 9x10 (same as w1)
        dl_dw2 = np.outer(h, e)
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))
        #########################################
        # print('Delta for w2', dl_dw2)
        # print('Hidden layer', h)
        # print('np.dot', np.dot(self.w2, e.T))
        # print('Delta for w1', dl_dw1)
        #########################################

        # Update weights
        self.w1 = self.w1 - (self.lr * dl_dw1)
        self.w2 = self.w2 - (self.lr * dl_dw2)

    # Get vector from word
    def word_vec(self, word):
        w_index = self.word_index[word]
        v_w = self.w1[w_index]
        return v_w

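    # vec_sim() below ranks every word in the vocabulary by cosine similarity to the query word:
    #   cos(a, b) = (a . b) / (||a|| * ||b||)
    # using np.dot for the numerator and np.linalg.norm for the denominator.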
    # Input a word, returns the top_n most similar words
    def vec_sim(self, word, top_n):
        v_w1 = self.word_vec(word)
        word_sim = {}

        for i in range(self.v_count):
            # Find the similarity score for each word in vocab
            v_w2 = self.w1[i]
            theta_sum = np.dot(v_w1, v_w2)
            theta_den = np.linalg.norm(v_w1) * np.linalg.norm(v_w2)
            theta = theta_sum / theta_den

            word = self.index_word[i]
            word_sim[word] = theta

        words_sorted = sorted(word_sim.items(), key=lambda kv: kv[1], reverse=True)

        for word, sim in words_sorted[:top_n]:
            print(word, sim)

#####################################################################
settings = {
    'window_size': 2,       # context window +- center word
    'n': 10,                # dimensions of word embeddings, also refers to the size of the hidden layer
    'epochs': 50,           # number of training epochs
    'learning_rate': 0.01   # learning rate
}

text = "natural language processing and machine learning is fun and exciting"

# Note the .lower() - upper and lower case do not matter in our implementation
# [['natural', 'language', 'processing', 'and', 'machine', 'learning', 'is', 'fun', 'and', 'exciting']]
corpus = [[word.lower() for word in text.split()]]

# Initialise object
w2v = word2vec()

# Numpy ndarray with one-hot representation for [target_word, context_words]
training_data = w2v.generate_training_data(settings, corpus)

# Training
w2v.train(training_data)

# Get vector for word
word = "machine"
vec = w2v.word_vec(word)
print(word, vec)

# Find similar words
w2v.vec_sim("machine", 3)

--------------------------------------------------------------------------------