├── README.md
└── wordtovec.py

/README.md:
--------------------------------------------------------------------------------
# Word2Vec Implementation using Numpy

This is an implementation of Word2Vec using numpy. Click [here](https://derekchia.com/an-implementation-guide-to-word2vec-using-numpy-and-google-sheets/) for the accompanying blog post.

To see Word2Vec in action, uncomment the print functions! Also remember to change the number of `epochs` and set `training_data` to `training_data[0]` to avoid flooding your terminal. A Google Sheet implementation of Word2Vec is also available [here](https://docs.google.com/spreadsheets/d/1mgf82Ue7MmQixMm2ZqnT1oWUucj6pEcd2wDs_JgHmco/edit?usp=sharing). Have fun learning!

![Word2Vec - Skip-Gram](https://i.ibb.co/XbKnHGP/Screenshot-2018-12-03-at-8-27-46-PM.png)

![Word2Vec using Google Sheet](https://serving.photos.photobox.com/35757252841d1a139084472a6536916b53fa434b73586b3d86affd10e87d8dd73c23b9e6.jpg)

To start, run the script using:
```
python wordtovec.py
```
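If you would rather poke at it from your own script or notebook, the snippet below mirrors the driver code at the bottom of `wordtovec.py`. Note that importing `wordtovec` already runs that driver once (there is no `__main__` guard), so treat this as a sketch of the API rather than the only way to use it:

```python
from wordtovec import word2vec, settings

# One tokenised, lower-cased sentence, as in the script
corpus = [["natural", "language", "processing", "and", "machine",
           "learning", "is", "fun", "and", "exciting"]]

w2v = word2vec()                                              # reads the global `settings`
training_data = w2v.generate_training_data(settings, corpus)  # one-hot [target, context] pairs
w2v.train(training_data)                                      # prints the loss per epoch

print(w2v.word_vec("machine"))  # 10-dimensional embedding for "machine"
w2v.vec_sim("machine", 3)       # prints the 3 most similar words
```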
--------------------------------------------------------------------------------
/wordtovec.py:
--------------------------------------------------------------------------------
"""
This is an implementation of Word2Vec using numpy. Uncomment the print functions to see Word2Vec in action! Also remember to change the number of epochs and set training_data to training_data[0] to avoid flooding your terminal. A Google Sheet implementation of Word2Vec is also available here - https://docs.google.com/spreadsheets/d/1mgf82Ue7MmQixMm2ZqnT1oWUucj6pEcd2wDs_JgHmco/edit?usp=sharing

Have fun learning!

Author: Derek Chia
Email: derek@derekchia.com
"""

import numpy as np
from collections import defaultdict

## Initial weights (swap in the commented-out np.random.uniform lines in train() to initialise randomly instead)
getW1 = [[0.236, -0.962, 0.686, 0.785, -0.454, -0.833, -0.744, 0.677, -0.427, -0.066],
         [-0.907, 0.894, 0.225, 0.673, -0.579, -0.428, 0.685, 0.973, -0.070, -0.811],
         [-0.576, 0.658, -0.582, -0.112, 0.662, 0.051, -0.401, -0.921, -0.158, 0.529],
         [0.517, 0.436, 0.092, -0.835, -0.444, -0.905, 0.879, 0.303, 0.332, -0.275],
         [0.859, -0.890, 0.651, 0.185, -0.511, -0.456, 0.377, -0.274, 0.182, -0.237],
         [0.368, -0.867, -0.301, -0.222, 0.630, 0.808, 0.088, -0.902, -0.450, -0.408],
         [0.728, 0.277, 0.439, 0.138, -0.943, -0.409, 0.687, -0.215, -0.807, 0.612],
         [0.593, -0.699, 0.020, 0.142, -0.638, -0.633, 0.344, 0.868, 0.913, 0.429],
         [0.447, -0.810, -0.061, -0.495, 0.794, -0.064, -0.817, -0.408, -0.286, 0.149]]

getW2 = [[-0.868, -0.406, -0.288, -0.016, -0.560, 0.179, 0.099, 0.438, -0.551],
         [-0.395, 0.890, 0.685, -0.329, 0.218, -0.852, -0.919, 0.665, 0.968],
         [-0.128, 0.685, -0.828, 0.709, -0.420, 0.057, -0.212, 0.728, -0.690],
         [0.881, 0.238, 0.018, 0.622, 0.936, -0.442, 0.936, 0.586, -0.020],
         [-0.478, 0.240, 0.820, -0.731, 0.260, -0.989, -0.626, 0.796, -0.599],
         [0.679, 0.721, -0.111, 0.083, -0.738, 0.227, 0.560, 0.929, 0.017],
         [-0.690, 0.907, 0.464, -0.022, -0.005, -0.004, -0.425, 0.299, 0.757],
         [-0.054, 0.397, -0.017, -0.563, -0.551, 0.465, -0.596, -0.413, -0.395],
         [-0.838, 0.053, -0.160, -0.164, -0.671, 0.140, -0.149, 0.708, 0.425],
         [0.096, -0.995, -0.313, 0.881, -0.402, -0.631, -0.660, 0.184, 0.487]]

class word2vec():

    def __init__(self):
        self.n = settings['n']
        self.lr = settings['learning_rate']
        self.epochs = settings['epochs']
        self.window = settings['window_size']

    def generate_training_data(self, settings, corpus):
        # Find unique word counts using a dictionary
        word_counts = defaultdict(int)
        for row in corpus:
            for word in row:
                word_counts[word] += 1
        ##############################################################################################################################################
        # print(word_counts)
        # defaultdict(<class 'int'>, {'natural': 1, 'language': 1, 'processing': 1, 'and': 2, 'machine': 1, 'learning': 1, 'is': 1, 'fun': 1, 'exciting': 1})
        ##############################################################################################################################################

        ## How many unique words in vocab? 9
        self.v_count = len(word_counts.keys())
        #######################
        # print(self.v_count)
        # 9
        #######################

        # Generate Lookup Dictionaries (vocab)
        self.words_list = list(word_counts.keys())
        ################################################################################################
        # print(self.words_list)
        # ['natural', 'language', 'processing', 'and', 'machine', 'learning', 'is', 'fun', 'exciting']
        ################################################################################################

        # Generate word:index
        self.word_index = dict((word, i) for i, word in enumerate(self.words_list))
        ############################################################################################################################
        # print(self.word_index)
        # {'natural': 0, 'language': 1, 'processing': 2, 'and': 3, 'machine': 4, 'learning': 5, 'is': 6, 'fun': 7, 'exciting': 8}
        ############################################################################################################################

        # Generate index:word
        self.index_word = dict((i, word) for i, word in enumerate(self.words_list))
        ############################################################################################################################
        # print(self.index_word)
        # {0: 'natural', 1: 'language', 2: 'processing', 3: 'and', 4: 'machine', 5: 'learning', 6: 'is', 7: 'fun', 8: 'exciting'}
        ############################################################################################################################

        training_data = []

        # Cycle through each sentence in corpus
        for sentence in corpus:
            sent_len = len(sentence)

            # Cycle through each word in sentence
            for i, word in enumerate(sentence):
                # Convert target word to one-hot
                w_target = self.word2onehot(sentence[i])

                # Cycle through context window
                w_context = []

                # Note: window_size 2 will have range of 5 values
                for j in range(i - self.window, i + self.window + 1):
                    # Criteria for context word
                    # 1. Target word cannot be context word (j != i)
                    # 2. Index must be greater than or equal to 0 (j >= 0), otherwise the index is out of range
                    # 3. Index must be less than or equal to the length of the sentence (j <= sent_len - 1), otherwise the index is out of range
                    if j != i and j <= sent_len - 1 and j >= 0:
                        # Append the one-hot representation of word to w_context
                        w_context.append(self.word2onehot(sentence[j]))
                        # print(sentence[i], sentence[j])
                        #########################
                        # Example:
                        # natural language
                        # natural processing
                        # language natural
                        # language processing
                        # language and
                        #########################

                # training_data contains a one-hot representation of the target word and context words
                ################################################################################################
                # Example:
                # [Target] natural, [Context] language, [Context] processing
                # print(training_data)
                # [[[1, 0, 0, 0, 0, 0, 0, 0, 0], [[0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0]]]]
                ################################################################################################
                training_data.append([w_target, w_context])

        # dtype=object because the context lists have different lengths (a ragged array)
        return np.array(training_data, dtype=object)

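    # Each element of training_data is a pair [w_target, w_context]: w_target is the
    # one-hot list for the centre word and w_context is a list of one-hot vectors for
    # up to 2 * window_size surrounding words (fewer at the sentence edges).
    # For the example corpus and window_size 2, the first sample should look like this:
    # t, c = training_data[0]   # centre word 'natural'
    # print(sum(t), len(c))     # 1 2 -> one-hot target, two context words
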
    def word2onehot(self, word):
        # word_vec - initialise a blank vector
        word_vec = [0 for i in range(0, self.v_count)]  # Alternative - np.zeros(self.v_count)
        ###############################
        # print(word_vec)
        # [0, 0, 0, 0, 0, 0, 0, 0, 0]
        ###############################

        # Get ID of word from word_index
        word_index = self.word_index[word]

        # Change value from 0 to 1 according to ID of the word
        word_vec[word_index] = 1

        return word_vec

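    # The loss accumulated in train() below is the skip-gram negative log-likelihood for a
    # target word t with context set C(t):
    #   E = -sum_{c in C(t)} u_c + |C(t)| * log(sum_j exp(u_j))
    # where u is the output layer before softmax - the same two parts described in the
    # comments inside train().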
    def train(self, training_data):
        # Initialising weight matrices
        # np.random.uniform(LOW, HIGH, OUTPUT_SHAPE)
        # https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.random.uniform.html
        self.w1 = np.array(getW1)
        self.w2 = np.array(getW2)
        # self.w1 = np.random.uniform(-1, 1, (self.v_count, self.n))
        # self.w2 = np.random.uniform(-1, 1, (self.n, self.v_count))

        # Cycle through each epoch
        for i in range(self.epochs):
            # Initialise loss to 0
            self.loss = 0
            # Cycle through each training sample
            # w_t = vector for target word, w_c = vectors for context words
            for w_t, w_c in training_data:
                # Forward pass
                # 1. predicted y using softmax (y_pred) 2. matrix of hidden layer (h) 3. output layer before softmax (u)
                y_pred, h, u = self.forward_pass(w_t)
                #########################################
                # print("Vector for target word:", w_t)
                # print("W1-before backprop", self.w1)
                # print("W2-before backprop", self.w2)
                #########################################

                # Calculate error
                # 1. For a target word, calculate the difference between y_pred and each of the context words
                # 2. Sum up the differences using np.sum to give us the error for this particular target word
                EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)
                #########################
                # print("Error", EI)
                #########################

                # Backpropagation
                # We use SGD to backpropagate errors - calculate loss on the output layer
                self.backprop(EI, h, w_t)
                #########################################
                # print("W1-after backprop", self.w1)
                # print("W2-after backprop", self.w2)
                #########################################

                # Calculate loss
                # There are 2 parts to the loss function
                # Part 1: negative of the sum of the u values at the indices of the context words
                # Part 2: number of context words * log of the sum of exp(u) over all elements of the output layer before softmax (u)
                # Note: word.index(1) returns the index in the context word vector with value 1
                # Note: u[word.index(1)] returns the value of the output layer (before softmax) at that index
                self.loss += -np.sum([u[word.index(1)] for word in w_c]) + len(w_c) * np.log(np.sum(np.exp(u)))

                #############################################################
                # Break if you want to see weights after first target word
                # break
                #############################################################
            print('Epoch:', i, "Loss:", self.loss)

    def forward_pass(self, x):
        # x is the one-hot vector for the target word, shape 1x9 (v_count)
        # Multiply x by the first matrix (w1) to get the hidden layer - 1x9 dot 9x10 gives us 1x10
        h = np.dot(x, self.w1)
        # Dot product of the hidden layer with the second matrix (w2) - 1x10 dot 10x9 gives us 1x9
        u = np.dot(h, self.w2)
        # Run u through softmax to force each element into the range [0, 1] - 1x9
        y_c = self.softmax(u)
        return y_c, h, u

    def softmax(self, x):
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    def backprop(self, e, h, x):
        # https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.outer.html
        # Vector EI (e) is the sum of prediction errors across the context words for the current center word
        # Going backwards, we need to take the derivative of E with respect to w2
        # h - length 10, e - length 9, dl_dw2 - shape 10x9 (same as w2)
        # x - length 9, np.dot(self.w2, e.T) - length 10, dl_dw1 - shape 9x10 (same as w1)
        dl_dw2 = np.outer(h, e)
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))
        #########################################
        # print('Delta for w2', dl_dw2)
        # print('Hidden layer', h)
        # print('np.dot', np.dot(self.w2, e.T))
        # print('Delta for w1', dl_dw1)
        #########################################

        # Update weights
        self.w1 = self.w1 - (self.lr * dl_dw1)
        self.w2 = self.w2 - (self.lr * dl_dw2)

    # Get vector from word
    def word_vec(self, word):
        w_index = self.word_index[word]
        v_w = self.w1[w_index]
        return v_w

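    # vec_sim() below ranks every word in the vocabulary by cosine similarity to the query word:
    #   cos(a, b) = (a . b) / (||a|| * ||b||)
    # using np.dot for the numerator and np.linalg.norm for the denominator.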
    # Input a word, returns the top_n most similar words
    def vec_sim(self, word, top_n):
        v_w1 = self.word_vec(word)
        word_sim = {}

        for i in range(self.v_count):
            # Find the similarity score for each word in vocab
            v_w2 = self.w1[i]
            theta_sum = np.dot(v_w1, v_w2)
            theta_den = np.linalg.norm(v_w1) * np.linalg.norm(v_w2)
            theta = theta_sum / theta_den

            word = self.index_word[i]
            word_sim[word] = theta

        words_sorted = sorted(word_sim.items(), key=lambda kv: kv[1], reverse=True)

        for word, sim in words_sorted[:top_n]:
            print(word, sim)

#####################################################################
settings = {
    'window_size': 2,       # context window +- center word
    'n': 10,                # dimensions of word embeddings, also refers to the size of the hidden layer
    'epochs': 50,           # number of training epochs
    'learning_rate': 0.01   # learning rate
}

text = "natural language processing and machine learning is fun and exciting"

# Note the .lower() - upper and lower case do not matter in our implementation
# [['natural', 'language', 'processing', 'and', 'machine', 'learning', 'is', 'fun', 'and', 'exciting']]
corpus = [[word.lower() for word in text.split()]]

# Initialise object
w2v = word2vec()

# Numpy ndarray with one-hot representation for [target_word, context_words]
training_data = w2v.generate_training_data(settings, corpus)

# Training
w2v.train(training_data)

# Get vector for word
word = "machine"
vec = w2v.word_vec(word)
print(word, vec)

# Find similar words
w2v.vec_sim("machine", 3)

--------------------------------------------------------------------------------