├── LICENSE
├── README.md
└── word2vec.py

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Nathan Rooy

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# word2vec-from-scratch-with-python
A very simple, bare-bones, inefficient implementation of skip-gram word2vec, written from scratch with Python

blog post -> https://nathanrooy.github.io/posts/2018-03-22/word2vec-from-scratch-with-python-and-numpy/
--------------------------------------------------------------------------------

/word2vec.py:
--------------------------------------------------------------------------------
#------------------------------------------------------------------------------+
#
# Nathan A. Rooy
# Simple word2vec from scratch with Python
# 2018-FEB
#
#------------------------------------------------------------------------------+

#--- IMPORT DEPENDENCIES ------------------------------------------------------+

import numpy as np
import re
from collections import defaultdict

#--- CONSTANTS ----------------------------------------------------------------+


class word2vec():

    def __init__(self, settings):
        self.n = settings['n']
        self.eta = settings['learning_rate']
        self.epochs = settings['epochs']
        self.window = settings['window_size']


    # GENERATE TRAINING DATA
    def generate_training_data(self, corpus):

        # GENERATE WORD COUNTS
        word_counts = defaultdict(int)
        for row in corpus:
            for word in row:
                word_counts[word] += 1

        self.v_count = len(word_counts.keys())

        # GENERATE LOOKUP DICTIONARIES
        self.words_list = sorted(list(word_counts.keys()), reverse=False)
        self.word_index = dict((word, i) for i, word in enumerate(self.words_list))
        self.index_word = dict((i, word) for i, word in enumerate(self.words_list))

        training_data = []

        # CYCLE THROUGH EACH SENTENCE IN CORPUS
        for sentence in corpus:
            sent_len = len(sentence)

            # CYCLE THROUGH EACH WORD IN SENTENCE
            for i, word in enumerate(sentence):

                # ONE-HOT ENCODE THE TARGET (CENTER) WORD
                w_target = self.word2onehot(sentence[i])

                # CYCLE THROUGH CONTEXT WINDOW
                w_context = []
                for j in range(i - self.window, i + self.window + 1):
                    if j != i and j <= sent_len - 1 and j >= 0:
                        w_context.append(self.word2onehot(sentence[j]))
                training_data.append([w_target, w_context])

        # dtype=object because each sample holds a variable-length context list
        return np.array(training_data, dtype=object)


    # SOFTMAX ACTIVATION FUNCTION
    def softmax(self, x):
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)


    # CONVERT WORD TO ONE-HOT ENCODING
    def word2onehot(self, word):
        word_vec = [0 for i in range(0, self.v_count)]
        word_index = self.word_index[word]
        word_vec[word_index] = 1
        return word_vec


    # FORWARD PASS
    def forward_pass(self, x):
        h = np.dot(self.w1.T, x)    # hidden layer (embedding of the center word)
        u = np.dot(self.w2.T, h)    # output layer scores
        y_c = self.softmax(u)       # predicted context word distribution
        return y_c, h, u


    # BACKPROPAGATION
    def backprop(self, e, h, x):
        dl_dw2 = np.outer(h, e)
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))

        # UPDATE WEIGHTS
        self.w1 = self.w1 - (self.eta * dl_dw1)
        self.w2 = self.w2 - (self.eta * dl_dw2)


    # TRAIN W2V MODEL
    def train(self, training_data):
        # INITIALIZE WEIGHT MATRICES
        self.w1 = np.random.uniform(-0.8, 0.8, (self.v_count, self.n))     # embedding matrix
        self.w2 = np.random.uniform(-0.8, 0.8, (self.n, self.v_count))     # context matrix

        # CYCLE THROUGH EACH EPOCH
        for i in range(0, self.epochs):

            self.loss = 0

            # CYCLE THROUGH EACH TRAINING SAMPLE
            for w_t, w_c in training_data:

                # FORWARD PASS
                y_pred, h, u = self.forward_pass(w_t)

                # CALCULATE ERROR (summed over all context words)
                EI = np.sum([np.subtract(y_pred, word) for word in w_c], axis=0)

                # BACKPROPAGATION
                self.backprop(EI, h, w_t)

                # CALCULATE LOSS
                self.loss += -np.sum([u[word.index(1)] for word in w_c]) + len(w_c) * np.log(np.sum(np.exp(u)))
                #self.loss += -2*np.log(len(w_c)) - np.sum([u[word.index(1)] for word in w_c]) + (len(w_c) * np.log(np.sum(np.exp(u))))
            print('EPOCH:', i, 'LOSS:', self.loss)


    # input a word, returns its embedding vector (if available)
    def word_vec(self, word):
        w_index = self.word_index[word]
        v_w = self.w1[w_index]
        return v_w


    # input a vector, returns the nearest word(s)
    def vec_sim(self, vec, top_n):

        # CYCLE THROUGH VOCAB
        word_sim = {}
        for i in range(self.v_count):
            v_w2 = self.w1[i]
            theta_num = np.dot(vec, v_w2)
            theta_den = np.linalg.norm(vec) * np.linalg.norm(v_w2)
            theta = theta_num / theta_den       # cosine similarity

            word = self.index_word[i]
            word_sim[word] = theta

        words_sorted = sorted(word_sim.items(), key=lambda kv: kv[1], reverse=True)

        for word, sim in words_sorted[:top_n]:
            print(word, sim)


    # input a word, returns the top [n] most similar words
    def word_sim(self, word, top_n):

        w1_index = self.word_index[word]
        v_w1 = self.w1[w1_index]

        # CYCLE THROUGH VOCAB
        word_sim = {}
        for i in range(self.v_count):
            v_w2 = self.w1[i]
            theta_num = np.dot(v_w1, v_w2)
            theta_den = np.linalg.norm(v_w1) * np.linalg.norm(v_w2)
            theta = theta_num / theta_den       # cosine similarity

            word = self.index_word[i]
            word_sim[word] = theta

        words_sorted = sorted(word_sim.items(), key=lambda kv: kv[1], reverse=True)

        for word, sim in words_sorted[:top_n]:
            print(word, sim)


#--- EXAMPLE RUN --------------------------------------------------------------+

settings = {}
settings['n'] = 5                   # dimension of word embeddings
settings['window_size'] = 2         # context window +/- center word
settings['min_count'] = 0           # minimum word count
settings['epochs'] = 5000           # number of training epochs
settings['neg_samp'] = 10           # number of negative words to use during training
settings['learning_rate'] = 0.01    # learning rate
np.random.seed(0)                   # set the seed for reproducibility

corpus = [['the','quick','brown','fox','jumped','over','the','lazy','dog']]

# INITIALIZE W2V MODEL
w2v = word2vec(settings)

# generate training data
training_data = w2v.generate_training_data(corpus)

# train word2vec model
w2v.train(training_data)

#--- END ----------------------------------------------------------------------+
--------------------------------------------------------------------------------
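Once training finishes, the learned embeddings can be queried with the helper methods defined in word2vec.py. A minimal sketch of such a query session (the word 'fox' is taken from the toy corpus above; these lines are not part of the original script and would be appended after w2v.train(training_data)):

# look up the trained embedding vector for a single word from the corpus
print(w2v.word_vec('fox'))

# print the top-3 most similar words to 'fox', ranked by cosine similarity
w2v.word_sim('fox', 3)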