├── LICENSE ├── Machine Translation.ipynb ├── Machine Translation.py ├── README.md └── Translation_preprocess.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Jishnu Ray Chowdhury 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Machine Translation.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # ### Loading Pre-processed Data 5 | 6 | # In[1]: 7 | 8 | 9 | import pickle 10 | import math 11 | import numpy as np 12 | 13 | 14 | with open ('translationPICKLE', 'rb') as fp: 15 | PICK = pickle.load(fp) 16 | 17 | vocab_eng = PICK[0] 18 | vocab_beng = PICK[1] 19 | vocab_len = len(vocab_beng) 20 | 21 | np_embedding_eng = PICK[2] 22 | np_embedding_beng = PICK[3] 23 | np_embedding_eng = np.asarray(np_embedding_eng,np.float32) 24 | np_embedding_beng = np.asarray(np_embedding_beng,np.float32) 25 | 26 | word_vec_dim = np_embedding_eng.shape[1] 27 | 28 | train_batch_x = PICK[4] 29 | train_batch_y = PICK[5] 30 | 31 | val_batch_x = PICK[6] 32 | val_batch_y = PICK[7] 33 | 34 | test_batch_x = PICK[8] 35 | test_batch_y = PICK[9] 36 | 37 | 38 | 39 | # ### Function for converting vector of size word_vec_dim into the closest representative english word. 40 | 41 | # In[2]: 42 | 43 | 44 | def most_similar_eucli_eng(x): 45 | xminusy = np.subtract(np_embedding_eng,x) 46 | sq_xminusy = np.square(xminusy) 47 | sum_sq_xminusy = np.sum(sq_xminusy,1) 48 | eucli_dists = np.sqrt(sum_sq_xminusy) 49 | return np.argsort(eucli_dists) 50 | 51 | def vec2word_eng(vec): # converts a given vector representation into the represented word 52 | most_similars = most_similar_eucli_eng(np.asarray(vec,np.float32)) 53 | return vocab_eng[most_similars[0]] 54 | 55 | 56 | 57 | # ### Hyperparameters and Placeholders. 58 | 59 | # In[3]: 60 | 61 | 62 | import tensorflow as tf 63 | 64 | #Hyperparamters 65 | 66 | h=8 #no. of heads 67 | N=1 #no. 
of decoder and encoder layers 68 | learning_rate=0.001 69 | epochs = 200 70 | keep_prob = tf.placeholder(tf.float32) 71 | 72 | #Placeholders 73 | 74 | x = tf.placeholder(tf.float32, [None,None,word_vec_dim]) 75 | y = tf.placeholder(tf.int32, [None,None]) 76 | 77 | output_len = tf.placeholder(tf.int32) 78 | 79 | teacher_forcing = tf.placeholder(tf.bool) 80 | 81 | tf_pad_mask = tf.placeholder(tf.float32,[None,None]) 82 | tf_illegal_position_masks = tf.placeholder(tf.float32,[None,None,None]) 83 | 84 | tf_pe_out = tf.placeholder(tf.float32,[None,None,None]) #positional codes for output 85 | 86 | 87 | # ### Model Parameters. 88 | 89 | # In[4]: 90 | 91 | 92 | 93 | # Dimensions for Q (Query),K (Keys) and V (Values) for attention layers. 94 | 95 | dqkv = 32 96 | 97 | #Parameters for attention sub-layers for all n encoders 98 | 99 | Wq_enc = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 100 | Wk_enc = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 101 | Wv_enc = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 102 | Wo_enc = tf.Variable(tf.truncated_normal(shape=[N,h*dqkv,word_vec_dim],stddev=0.01)) 103 | 104 | #Parameters for position-wise fully connected layers for n encoders 105 | 106 | d = 1024 107 | W1_enc = tf.Variable(tf.truncated_normal(shape=[N,1,1,word_vec_dim,d],stddev=0.01)) 108 | b1_enc = tf.Variable(tf.constant(0,tf.float32,shape=[N,d])) 109 | W2_enc = tf.Variable(tf.truncated_normal(shape=[N,1,1,d,word_vec_dim],stddev=0.01)) 110 | b2_enc = tf.Variable(tf.constant(0,tf.float32,shape=[N,word_vec_dim])) 111 | 112 | #Parameters for 2 attention sub-layers for all n decoders 113 | 114 | Wq_dec_1 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 115 | Wk_dec_1 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 116 | Wv_dec_1 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 117 | Wo_dec_1 = tf.Variable(tf.truncated_normal(shape=[N,h*dqkv,word_vec_dim],stddev=0.01)) 118 | Wq_dec_2 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 119 | Wk_dec_2 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 120 | Wv_dec_2 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 121 | Wo_dec_2 = tf.Variable(tf.truncated_normal(shape=[N,h*dqkv,word_vec_dim],stddev=0.01)) 122 | 123 | #Parameters for position-wise fully connected layers for n decoders 124 | 125 | d = 1024 126 | W1_dec = tf.Variable(tf.truncated_normal(shape=[N,1,1,word_vec_dim,d],stddev=0.01)) 127 | b1_dec = tf.Variable(tf.constant(0,tf.float32,shape=[N,d])) 128 | W2_dec = tf.Variable(tf.truncated_normal(shape=[N,1,1,d,word_vec_dim],stddev=0.01)) 129 | b2_dec = tf.Variable(tf.constant(0,tf.float32,shape=[N,word_vec_dim])) 130 | 131 | #Layer Normalization parameters for encoder and decoder 132 | 133 | scale_enc_1 = tf.Variable(tf.ones([N,1,1,word_vec_dim]),dtype=tf.float32) 134 | shift_enc_1 = tf.Variable(tf.zeros([N,1,1,word_vec_dim]),dtype=tf.float32) 135 | 136 | scale_enc_2 = tf.Variable(tf.ones([N,1,1,word_vec_dim]),dtype=tf.float32) 137 | shift_enc_2 = tf.Variable(tf.zeros([N,1,1,word_vec_dim]),dtype=tf.float32) 138 | 139 | #Layer Normalization parameters for decoder 140 | 141 | scale_dec_1 = tf.Variable(tf.ones([N,1,1,word_vec_dim]),dtype=tf.float32) 142 | shift_dec_1 = tf.Variable(tf.zeros([N,1,1,word_vec_dim]),dtype=tf.float32) 143 | 144 | scale_dec_2 = 
tf.Variable(tf.ones([N,1,1,word_vec_dim]),dtype=tf.float32) 145 | shift_dec_2 = tf.Variable(tf.zeros([N,1,1,word_vec_dim]),dtype=tf.float32) 146 | 147 | scale_dec_3 = tf.Variable(tf.ones([N,1,1,word_vec_dim]),dtype=tf.float32) 148 | shift_dec_3 = tf.Variable(tf.zeros([N,1,1,word_vec_dim]),dtype=tf.float32) 149 | 150 | 151 | # ### Function for generating a sequence of positional codes for positional encoding. 152 | 153 | # In[5]: 154 | 155 | 156 | def positional_encoding(seq_len,model_dimensions): 157 | pe = np.zeros((seq_len,model_dimensions,),np.float32) 158 | for pos in xrange(0,seq_len): 159 | for i in xrange(0,model_dimensions): 160 | pe[pos][i] = math.sin(pos/(10000**(2*i/model_dimensions))) 161 | return pe.reshape((seq_len,model_dimensions)) 162 | 163 | 164 | # ### Function for Layer Normalization 165 | # 166 | # https://arxiv.org/abs/1607.06450 167 | 168 | # In[6]: 169 | 170 | 171 | #modified version of def LN used here: 172 | #https://theneuralperspective.com/2016/10/27/gradient-topics/ 173 | 174 | def layer_norm(inputs,scale,shift,epsilon = 1e-5): 175 | 176 | mean, var = tf.nn.moments(inputs, [1,2], keep_dims=True) 177 | 178 | LN = tf.multiply((scale / tf.sqrt(var + epsilon)),(inputs - mean)) + shift 179 | 180 | return LN 181 | 182 | 183 | # ### Function to pre-generate masks for illegal positions. 184 | # 185 | # These masks are to be used to fill illegal positions with -infinity (or a very low value eg. -2^30). 186 | # 187 | # Illegal positions are positions of the decoder input tokens that aren't predicted at a given timestep. 188 | # 189 | # { In a transformer, the decoder input is of the same shape as the WHOLE decoder output sequence. One word for the sequence is predicted at each timestep (from left to right). So in most timesteps, the left side of the decoder input sequence will contain valid previously predicted output words, but the right side -the yet to be predicted side should contain some values that should be ignored and never attended. We make sure that they're ignored by masking it } 190 | # 191 | # So, the illegal positions depends on the total output length and the no. of predicted output tokens. 192 | # 193 | # The appropriate mask when i output tokens are predicted can be retrieved from mask[i-1] where mask is the return value from this function. The argument out_len that function takes, signifies the total length of the output. 194 | # 195 | # The masks are used to assign the value -2^30 to all positions in the tensor influenced by the illegal ones. 196 | # After going through the softmax layer, these positions become close to 0, as it should be. 197 | # 198 | # Dynamically creating masks depending on the current position\timestep (depending on which the program can know which positions are legal and which aren't) is, however, 199 | # a bit troublesome with tensorflow tf_while_loop. 200 | # 201 | # I will be pre-generating all the masks with Python native code and feed the list of all required masks to the network at each training step (output length can be different at different training steps). 202 | # 203 | 204 | # In[7]: 205 | 206 | 207 | def generate_masks_for_illegal_positions(out_len): 208 | 209 | masks=np.zeros((out_len-1,out_len,out_len),dtype=np.float32) 210 | 211 | for i in xrange(1,out_len): 212 | mask = np.zeros((out_len,out_len),dtype=np.float32) 213 | mask[i:out_len,:] = -2**30 214 | mask[:,i:out_len] = -2**30 215 | masks[i-1] = mask 216 | 217 | return masks 218 | 219 | 220 | # ### Function for Multi-Headed Attention. 
221 | # 222 | # Details: https://arxiv.org/pdf/1706.03762.pdf 223 | # 224 | # Q = Query 225 | # 226 | # K = Key 227 | # 228 | # V = Value 229 | # 230 | # d is the dimension for Q, K and V. 231 | 232 | # In[8]: 233 | 234 | 235 | 236 | def attention(Q,K,V,d,filled=0,mask=False): 237 | 238 | K = tf.transpose(K,[0,2,1]) 239 | d = tf.cast(d,tf.float32) 240 | 241 | softmax_component = tf.div(tf.matmul(Q,K),tf.sqrt(d)) 242 | 243 | if mask == True: 244 | softmax_component = softmax_component + tf_illegal_position_masks[filled-1] 245 | 246 | result = tf.matmul(tf.nn.dropout(tf.nn.softmax(softmax_component),keep_prob),V) 247 | 248 | return result 249 | 250 | 251 | def multihead_attention(Q,K,V,d,weights,filled=0,mask=False): 252 | 253 | Q_ = tf.reshape(Q,[-1,tf.shape(Q)[2]]) 254 | K_ = tf.reshape(K,[-1,tf.shape(Q)[2]]) 255 | V_ = tf.reshape(V,[-1,tf.shape(Q)[2]]) 256 | 257 | heads = tf.TensorArray(size=h,dtype=tf.float32) 258 | 259 | Wq = weights['Wq'] 260 | Wk = weights['Wk'] 261 | Wv = weights['Wv'] 262 | Wo = weights['Wo'] 263 | 264 | for i in xrange(0,h): 265 | 266 | Q_w = tf.matmul(Q_,Wq[i]) 267 | Q_w = tf.reshape(Q_w,[tf.shape(Q)[0],tf.shape(Q)[1],d]) 268 | 269 | K_w = tf.matmul(K_,Wk[i]) 270 | K_w = tf.reshape(K_w,[tf.shape(K)[0],tf.shape(K)[1],d]) 271 | 272 | V_w = tf.matmul(V_,Wv[i]) 273 | V_w = tf.reshape(V_w,[tf.shape(V)[0],tf.shape(V)[1],d]) 274 | 275 | head = attention(Q_w,K_w,V_w,d,filled,mask) 276 | 277 | heads = heads.write(i,head) 278 | 279 | heads = heads.stack() 280 | 281 | concated = heads[0] 282 | 283 | for i in xrange(1,h): 284 | concated = tf.concat([concated,heads[i]],2) 285 | 286 | concated = tf.reshape(concated,[-1,h*d]) 287 | out = tf.matmul(concated,Wo) 288 | out = tf.reshape(out,[tf.shape(heads)[1],tf.shape(heads)[2],word_vec_dim]) 289 | 290 | return out 291 | 292 | 293 | 294 | # ### Function for encoder 295 | # 296 | # More details: https://arxiv.org/pdf/1706.03762.pdf 297 | 298 | # In[9]: 299 | 300 | 301 | def encoder(x,weights,attention_weights,dqkv): 302 | 303 | W1 = weights['W1'] 304 | W2 = weights['W2'] 305 | b1 = weights['b1'] 306 | b2 = weights['b2'] 307 | 308 | scale1 = weights['scale1'] 309 | shift1 = weights['shift1'] 310 | scale2 = weights['scale2'] 311 | shift2 = weights['shift2'] 312 | 313 | # SUBLAYER 1 (MASKED MULTI HEADED SELF ATTENTION) 314 | 315 | sublayer1 = multihead_attention(x,x,x,dqkv,attention_weights) 316 | sublayer1 = tf.nn.dropout(sublayer1,keep_prob) 317 | sublayer1 = layer_norm(sublayer1 + x,scale1,shift1) 318 | 319 | sublayer1_ = tf.reshape(sublayer1,[tf.shape(sublayer1)[0],1,tf.shape(sublayer1)[1],word_vec_dim]) 320 | 321 | # SUBLAYER 2 (TWO 1x1 CONVOLUTIONAL LAYERS AKA POSITION WISE FULLY CONNECTED NETWORKS) 322 | 323 | sublayer2 = tf.nn.conv2d(sublayer1_, W1, strides=[1,1,1,1], padding='SAME') 324 | sublayer2 = tf.nn.bias_add(sublayer2,b1) 325 | sublayer2 = tf.nn.relu(sublayer2) 326 | 327 | sublayer2 = tf.nn.conv2d(sublayer2, W2, strides=[1,1,1,1], padding='SAME') 328 | sublayer2 = tf.nn.bias_add(sublayer2,b2) 329 | 330 | sublayer2 = tf.reshape(sublayer2,[tf.shape(sublayer2)[0],tf.shape(sublayer2)[2],word_vec_dim]) 331 | 332 | sublayer2 = tf.nn.dropout(sublayer2,keep_prob) 333 | sublayer2 = layer_norm(sublayer2 + sublayer1,scale2,shift2) 334 | 335 | return sublayer2 336 | 337 | 338 | # ### Function for decoder 339 | # 340 | # More details: https://arxiv.org/pdf/1706.03762.pdf 341 | 342 | # In[10]: 343 | 344 | 345 | def decoder(y,enc_out,weights,masked_attention_weights,attention_weights,dqkv,mask=False,filled=0): 346 | 347 | W1 = 
weights['W1'] 348 | W2 = weights['W2'] 349 | b1 = weights['b1'] 350 | b2 = weights['b2'] 351 | 352 | scale1 = weights['scale1'] 353 | shift1 = weights['shift1'] 354 | scale2 = weights['scale2'] 355 | shift2 = weights['shift2'] 356 | scale3 = weights['scale3'] 357 | shift3 = weights['shift3'] 358 | 359 | # SUBLAYER 1 (MASKED MULTI HEADED SELF ATTENTION) 360 | 361 | sublayer1 = multihead_attention(y,y,y,dqkv,masked_attention_weights,filled,mask) 362 | sublayer1 = tf.nn.dropout(sublayer1,keep_prob) 363 | sublayer1 = layer_norm(sublayer1 + y,scale1,shift1) 364 | 365 | # SUBLAYER 2 (MULTIHEADED ENCODER-DECODER INTERLAYER ATTENTION) 366 | 367 | sublayer2 = multihead_attention(sublayer1,enc_out,enc_out,dqkv,attention_weights) 368 | sublayer2 = tf.nn.dropout(sublayer2,keep_prob) 369 | sublayer2 = layer_norm(sublayer2 + sublayer1,scale2,shift2) 370 | 371 | # SUBLAYER 3 (TWO 1x1 CONVOLUTIONAL LAYERS AKA POSITION WISE FULLY CONNECTED NETWORKS) 372 | 373 | sublayer2_ = tf.reshape(sublayer2,[tf.shape(sublayer2)[0],1,tf.shape(sublayer2)[1],word_vec_dim]) 374 | 375 | sublayer3 = tf.nn.conv2d(sublayer2_, W1, strides=[1,1,1,1], padding='SAME') 376 | sublayer3 = tf.nn.bias_add(sublayer3,b1) 377 | sublayer3 = tf.nn.relu(sublayer3) 378 | 379 | sublayer3 = tf.nn.conv2d(sublayer3, W2, strides=[1,1,1,1], padding='SAME') 380 | sublayer3 = tf.nn.bias_add(sublayer3,b2) 381 | 382 | sublayer3 = tf.reshape(sublayer3,[tf.shape(sublayer3)[0],tf.shape(sublayer3)[2],word_vec_dim]) 383 | 384 | sublayer3 = tf.nn.dropout(sublayer3,keep_prob) 385 | sublayer3 = layer_norm(sublayer3 + sublayer2,scale3,shift3) 386 | 387 | return sublayer3 388 | 389 | 390 | # ### Function for Stacking Encoders. 391 | 392 | # In[11]: 393 | 394 | 395 | def stacked_encoders(layer_num,encoderin): 396 | 397 | for i in xrange(0,layer_num): 398 | 399 | encoder_weights = { 400 | 401 | 'W1': W1_enc[i], 402 | 'W2': W2_enc[i], 403 | 'b1': b1_enc[i], 404 | 'b2': b2_enc[i], 405 | 'scale1': scale_enc_1[i], 406 | 'shift1': shift_enc_1[i], 407 | 'scale2': scale_enc_2[i], 408 | 'shift2': shift_enc_2[i], 409 | } 410 | 411 | attention_weights = { 412 | 413 | 'Wq': Wq_enc[i], 414 | 'Wk': Wk_enc[i], 415 | 'Wv': Wv_enc[i], 416 | 'Wo': Wo_enc[i], 417 | } 418 | 419 | encoderin = encoder(encoderin,encoder_weights,attention_weights,dqkv) 420 | 421 | return encoderin 422 | 423 | 424 | 425 | # ### Function for Stacking Decoders. 
426 | 427 | # In[12]: 428 | 429 | 430 | def stacked_decoders(layer_num,decoderin,encoderout,filled): 431 | 432 | for j in xrange(0,layer_num): 433 | 434 | decoder_weights = { 435 | 436 | 'W1': W1_dec[j], 437 | 'W2': W2_dec[j], 438 | 'b1': b1_dec[j], 439 | 'b2': b2_dec[j], 440 | 'scale1': scale_dec_1[j], 441 | 'shift1': shift_dec_1[j], 442 | 'scale2': scale_dec_2[j], 443 | 'shift2': shift_dec_2[j], 444 | 'scale3': scale_dec_3[j], 445 | 'shift3': shift_dec_3[j], 446 | } 447 | 448 | masked_attention_weights = { 449 | 450 | 'Wq': Wq_dec_1[j], 451 | 'Wk': Wk_dec_1[j], 452 | 'Wv': Wv_dec_1[j], 453 | 'Wo': Wo_dec_1[j], 454 | } 455 | 456 | attention_weights = { 457 | 458 | 'Wq': Wq_dec_2[j], 459 | 'Wk': Wk_dec_2[j], 460 | 'Wv': Wv_dec_2[j], 461 | 'Wo': Wo_dec_2[j], 462 | } 463 | 464 | decoderin = decoder(decoderin,encoderout, 465 | decoder_weights, 466 | masked_attention_weights, 467 | attention_weights, 468 | dqkv, 469 | mask=True,filled=filled) 470 | return decoderin 471 | 472 | 473 | 474 | # ### predicted_embedding(): 475 | # 476 | # Given a probability distribution and an embedding matrix, this function returns the embedding of the word with the maximum probability in the given distribution. 477 | # 478 | # ### replaceSOS(): 479 | # 480 | # SOS signifies the start of sentence for the decoder. Also often represented as 'GO'. I am using an all ones vector as the first decoder input token. 481 | # In the next time step, the SOS will be forgotten, and only the context of the previously predicted output (or the target output at the previous timestep, if teacher forcing is on) will be used. 482 | # 483 | # ### add_pred_to_output_lists(): 484 | # 485 | # This function will concatenate the last predicted output into a tensor of concatenated sequence of output tokens. 486 | 487 | # In[13]: 488 | 489 | 490 | def predicted_embedding(out_prob_dist,tf_embd): 491 | out_index = tf.cast(tf.argmax(out_prob_dist,1),tf.int32) 492 | return tf.gather(tf_embd,out_index) 493 | 494 | def replaceSOS(output,out_prob_dist): 495 | return output,tf.constant(1),tf.reshape(out_prob_dist,[tf.shape(x)[0],1,vocab_len]) 496 | 497 | def add_pred_to_output_list(decoderin_part_1,output,filled,out_probs,out_prob_dist): 498 | decoderin_part_1 = tf.concat([decoderin_part_1,output],1) 499 | filled += 1 500 | out_probs = tf.concat([out_probs,tf.reshape(out_prob_dist,[tf.shape(x)[0],1,vocab_len])],1) 501 | return decoderin_part_1,filled,out_probs 502 | 503 | 504 | # ### Model Definition 505 | # 506 | # It follows the encoder-decoder paradigm. The main exception from standard encoder-decoder paradigm, is that it uses 'transformers' instead of Reccurrent networks. The decoder undergoes a sequential processing, though. 507 | # 508 | # If teacher forcing is True, the decoder is made to guess the next output from the previous words in the actual target output, else the decoder predicts the next output from the previously predicted output of the decoder. 
509 | # 510 | # Details about the model: https://arxiv.org/pdf/1706.03762.pdf 511 | 512 | # In[14]: 513 | 514 | 515 | def model(x,y,teacher_forcing=True): 516 | 517 | 518 | # NOTE: tf.shape(x)[0] == batch_size 519 | 520 | encoderin = x # (should be already positionally encoded) 521 | encoderin = tf.nn.dropout(encoderin,keep_prob) 522 | 523 | 524 | # ENCODER LAYERS 525 | 526 | encoderout = stacked_encoders(N,encoderin) 527 | 528 | 529 | # DECODER LAYERS 530 | 531 | decoderin_part_1 = tf.ones([tf.shape(x)[0],1,word_vec_dim],dtype=tf.float32) #represents SOS 532 | 533 | filled = tf.constant(1) 534 | # no. of output words that are filled 535 | # filled value is used to retrieve appropriate mask for illegal positions. 536 | 537 | 538 | tf_embd = tf.convert_to_tensor(np_embedding_beng) 539 | Wpd = tf.transpose(tf_embd) 540 | # Wpd the transpose of the output embedding matrix will be used to convert the decoder output 541 | # into a probability distribution over the output language vocabulary. 542 | 543 | out_probs = tf.zeros([tf.shape(x)[0],output_len,vocab_len],tf.float32) 544 | # out_probs will contain the list of probability distributions. 545 | 546 | #tf_while_loop since output_len will be dynamically defined during session run 547 | 548 | i=tf.constant(0) 549 | 550 | def cond(i,filled,decoderin_part_1,out_probs): 551 | return i') 648 | mask = np.ones_like((output_batch),np.float32) 649 | for i in xrange(len(mask)): 650 | for j in xrange(len(mask[i])): 651 | if output_batch[i,j]==pad_index: 652 | mask[i,j]=0 653 | return mask 654 | 655 | 656 | # ### Training ..... 657 | # 658 | # The input batch is positionally encoded before its fed to the network. 659 | 660 | # In[17]: 661 | 662 | 663 | import string 664 | import random 665 | from __future__ import print_function 666 | 667 | init = tf.global_variables_initializer() 668 | 669 | with tf.Session() as sess: # Start Tensorflow Session 670 | 671 | saver = tf.train.Saver() 672 | # Prepares variable for saving the model 673 | sess.run(init) #initialize all variables 674 | step = 0 675 | best_loss = 999 676 | display_step = 1 677 | warm_up_steps = 7000 678 | 679 | while step < epochs: 680 | 681 | batch_len = len(train_batch_x) 682 | shuffled_indices = np.arange(batch_len) 683 | np.random.shuffle(shuffled_indices) 684 | 685 | for i in xrange(0,batch_len): 686 | 687 | # Adaptive learning rate formula 688 | #learning_rate = ((word_vec_dim)**(-0.5))*min((step*batch_len+i+1)**(-0.5),(step*batch_len+i+1)*warm_up_steps**(-1.5)) 689 | 690 | sample_no = np.random.randint(0,len(train_batch_x[0])) 691 | print("\nCHOSEN SAMPLE NO.: "+str(sample_no)) 692 | 693 | if i%display_step==0: 694 | 695 | print("\nEpoch: "+str(step+1)+" Iteration: "+str(i+1)) 696 | print("\nSAMPLE TEXT:") 697 | for vec in train_batch_x[shuffled_indices[i]][sample_no]: 698 | print(vec2word_eng(vec),end=" ") 699 | print("\n") 700 | 701 | input_seq_len = len(train_batch_x[shuffled_indices[i]][0]) 702 | 703 | pe_in = positional_encoding(input_seq_len,word_vec_dim) 704 | pe_in = pe_in.reshape((1,input_seq_len,word_vec_dim)) 705 | 706 | output_seq_len = len(train_batch_y[shuffled_indices[i]][0]) 707 | 708 | 709 | 710 | illegal_position_masks = generate_masks_for_illegal_positions(output_seq_len) 711 | 712 | pe_out = positional_encoding(output_seq_len,word_vec_dim) 713 | pe_out = pe_out.reshape((output_seq_len,1,word_vec_dim)) 714 | 715 | 716 | rand = random.randint(0,2) #determines chance of using Teacher Forcing 717 | if rand==1: 718 | random_bool = True 719 | else: 720 | random_bool = False 
721 | 722 | pad_mask = create_pad_Mask(train_batch_y[shuffled_indices[i]]) 723 | 724 | # Run optimization operation (backpropagation) 725 | _,loss,out = sess.run([optimizer,cost,softmax_output], 726 | feed_dict={x: (train_batch_x[shuffled_indices[i]]+pe_in), 727 | y: train_batch_y[shuffled_indices[i]], 728 | keep_prob: 0.9, 729 | output_len: len(train_batch_y[shuffled_indices[i]][0]), 730 | tf_pad_mask: pad_mask, 731 | tf_illegal_position_masks: illegal_position_masks, 732 | tf_pe_out: pe_out, 733 | teacher_forcing: False #random_bool 734 | # feed random bool for randomized teacher forcing. 735 | }) 736 | 737 | if i%display_step==0: 738 | 739 | print("\nPREDICTED TRANSLATION OF THE SAMPLE:\n") 740 | flag = 0 741 | for array in out[sample_no]: 742 | 743 | #prediction_int = np.random.choice(range(vocab_len), p=array.ravel()) 744 | #(^use this if you want some variety) 745 | #(or use this what's below:) 746 | 747 | prediction_int = np.argmax(array) 748 | 749 | if vocab_beng[prediction_int] in string.punctuation or flag==0: 750 | print(vocab_beng[prediction_int],end='') 751 | else: 752 | print(" "+vocab_beng[prediction_int],end='') 753 | flag=1 754 | print("\n") 755 | 756 | print("ACTUAL TRANSLATION OF THE SAMPLE:\n") 757 | for index in train_batch_y[shuffled_indices[i]][sample_no]: 758 | print(vocab_beng[index],end=" ") 759 | print("\n") 760 | 761 | print("loss="+str(loss)) 762 | 763 | if(loss')] 779 | 780 | 781 | # ### Prediction. 782 | 783 | # In[21]: 784 | 785 | 786 | with tf.Session() as sess: # Begin session 787 | 788 | print('Loading pre-trained weights for the model...') 789 | saver = tf.train.Saver() 790 | saver.restore(sess, 'Model_Backup/translation_model.ckpt') 791 | sess.run(tf.global_variables()) 792 | print('\nRESTORATION COMPLETE\n') 793 | 794 | 795 | test = ['who','are','you'] # Enter tokenized text here 796 | test = map(word2vec,test) 797 | test = np.asarray(test,np.float32) 798 | test = test.reshape((1,test.shape[0],test.shape[1])) 799 | 800 | input_seq_len = test.shape[0] 801 | pe_in = positional_encoding(input_seq_len,word_vec_dim) 802 | pe_in = pe_in.reshape((1,input_seq_len,word_vec_dim)) 803 | test_pe = test+pe_in 804 | 805 | output_seq_len = int(input_seq_len+20) 806 | illegal_position_masks = generate_masks_for_illegal_positions(output_seq_len) 807 | pe_out = positional_encoding(output_seq_len,word_vec_dim) 808 | pe_out = pe_out.reshape((output_seq_len,1,word_vec_dim)) 809 | 810 | out = sess.run(softmax_output, 811 | feed_dict={x: test_pe, 812 | y: np.zeros((1,1),np.int32), 813 | # y value doesn't matter here. 814 | # feeding y, because the network graph requires y. 815 | # but its value won't actually be used in this case. 816 | keep_prob: 1, 817 | output_len: output_seq_len, 818 | tf_pe_out: pe_out, 819 | tf_illegal_position_masks: illegal_position_masks, 820 | teacher_forcing: False 821 | }) 822 | 823 | for array in out[0]: 824 | if vocab_beng[np.argmax(array)] != '': 825 | print(vocab_beng[np.argmax(array)],end=' ') 826 | 827 | 828 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Machine Translation using Transformers 4 | 5 | The model is based on: 6 | 7 | ["Attention Is All You Need" by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin. arXiv:1706.03762](https://arxiv.org/abs/1706.03762) 8 | 9 | # WARNING: 10 | 11 | This is an old code. 
I have an updated version of Transformers over here: https://github.com/JRC1995/Transformers 12 | 13 | # Preprocessing Translation Data 14 | (from Translation_preprocess.py) 15 | 16 | ### Function for expanding English contractions 17 | 18 | source: https://gist.github.com/nealrs/96342d8231b75cf4bb82 19 | 20 | 21 | ```python 22 | import numpy as np 23 | from __future__ import division 24 | import io 25 | import unicodedata 26 | import nltk 27 | from nltk import word_tokenize 28 | import string 29 | import re 30 | import random 31 | 32 | 33 | #source: https://gist.github.com/nealrs/96342d8231b75cf4bb82 34 | cList = { 35 | "ain't": "am not", 36 | "aren't": "are not", 37 | "can't": "cannot", 38 | "can't've": "cannot have", 39 | "'cause": "because", 40 | "could've": "could have", 41 | "couldn't": "could not", 42 | "couldn't've": "could not have", 43 | "didn't": "did not", 44 | "doesn't": "does not", 45 | "don't": "do not", 46 | "hadn't": "had not", 47 | "hadn't've": "had not have", 48 | "hasn't": "has not", 49 | "haven't": "have not", 50 | "he'd": "he would", 51 | "he'd've": "he would have", 52 | "he'll": "he will", 53 | "he'll've": "he will have", 54 | "he's": "he is", 55 | "how'd": "how did", 56 | "how'd'y": "how do you", 57 | "how'll": "how will", 58 | "how's": "how is", 59 | "I'd": "I would", 60 | "I'd've": "I would have", 61 | "I'll": "I will", 62 | "I'll've": "I will have", 63 | "I'm": "I am", 64 | "I've": "I have", 65 | "isn't": "is not", 66 | "it'd": "it had", 67 | "it'd've": "it would have", 68 | "it'll": "it will", 69 | "it'll've": "it will have", 70 | "it's": "it is", 71 | "let's": "let us", 72 | "ma'am": "madam", 73 | "mayn't": "may not", 74 | "might've": "might have", 75 | "mightn't": "might not", 76 | "mightn't've": "might not have", 77 | "must've": "must have", 78 | "mustn't": "must not", 79 | "mustn't've": "must not have", 80 | "needn't": "need not", 81 | "needn't've": "need not have", 82 | "o'clock": "of the clock", 83 | "oughtn't": "ought not", 84 | "oughtn't've": "ought not have", 85 | "shan't": "shall not", 86 | "sha'n't": "shall not", 87 | "shan't've": "shall not have", 88 | "she'd": "she would", 89 | "she'd've": "she would have", 90 | "she'll": "she will", 91 | "she'll've": "she will have", 92 | "she's": "she is", 93 | "should've": "should have", 94 | "shouldn't": "should not", 95 | "shouldn't've": "should not have", 96 | "so've": "so have", 97 | "so's": "so is", 98 | "that'd": "that would", 99 | "that'd've": "that would have", 100 | "that's": "that is", 101 | "there'd": "there had", 102 | "there'd've": "there would have", 103 | "there's": "there is", 104 | "they'd": "they would", 105 | "they'd've": "they would have", 106 | "they'll": "they will", 107 | "they'll've": "they will have", 108 | "they're": "they are", 109 | "they've": "they have", 110 | "to've": "to have", 111 | "wasn't": "was not", 112 | "we'd": "we had", 113 | "we'd've": "we would have", 114 | "we'll": "we will", 115 | "we'll've": "we will have", 116 | "we're": "we are", 117 | "we've": "we have", 118 | "weren't": "were not", 119 | "what'll": "what will", 120 | "what'll've": "what will have", 121 | "what're": "what are", 122 | "what's": "what is", 123 | "what've": "what have", 124 | "when's": "when is", 125 | "when've": "when have", 126 | "where'd": "where did", 127 | "where's": "where is", 128 | "where've": "where have", 129 | "who'll": "who will", 130 | "who'll've": "who will have", 131 | "who's": "who is", 132 | "who've": "who have", 133 | "why's": "why is", 134 | "why've": "why have", 135 | "will've": 
"will have", 136 | "won't": "will not", 137 | "won't've": "will not have", 138 | "would've": "would have", 139 | "wouldn't": "would not", 140 | "wouldn't've": "would not have", 141 | "y'all": "you all", 142 | "y'alls": "you alls", 143 | "y'all'd": "you all would", 144 | "y'all'd've": "you all would have", 145 | "y'all're": "you all are", 146 | "y'all've": "you all have", 147 | "you'd": "you had", 148 | "you'd've": "you would have", 149 | "you'll": "you you will", 150 | "you'll've": "you you will have", 151 | "you're": "you are", 152 | "you've": "you have" 153 | } 154 | 155 | c_re = re.compile('(%s)' % '|'.join(cList.keys())) 156 | 157 | def expandContractions(text, c_re=c_re): 158 | def replace(match): 159 | return cList[match.group(0)] 160 | return c_re.sub(replace, text) 161 | ``` 162 | 163 | ### Loading Translation Data 164 | 165 | Splitting the data into eng and beng. 166 | eng will contain the list of English lines, and beng will contain the corresponding list of Bengali lines. 167 | 168 | 169 | Source of data: http://www.manythings.org/anki/ (downloaded ben-eng) 170 | 171 | 172 | ```python 173 | filename = 'ben.txt' 174 | #Datasource: http://www.manythings.org/anki/ 175 | 176 | # http://stackoverflow.com/a/518232/2809427 177 | def unicodeToAscii(s): 178 | return ''.join( 179 | c for c in unicodedata.normalize('NFD', s) 180 | if unicodedata.category(c) != 'Mn' 181 | ) 182 | 183 | def normalizeString(s): 184 | s = unicodeToAscii(expandContractions(s.lower().strip())) 185 | s = re.sub(r"([.!?,\"])", r" ", s) 186 | return s 187 | 188 | def loaddata(filename): 189 | file = io.open(filename,'r') 190 | eng=[] 191 | beng = [] 192 | for line in file.readlines(): 193 | lang_pair = line.split('\t') 194 | lang_pair[0] = normalizeString(lang_pair[0]) 195 | lang_pair[1] = normalizeString(lang_pair[1]) 196 | eng.append(word_tokenize(lang_pair[0])) 197 | beng.append(word_tokenize(lang_pair[1])) 198 | file.close() 199 | return eng,beng 200 | 201 | eng,beng = loaddata(filename) 202 | 203 | #Example: 204 | sample = random.randint(0,len(eng)) 205 | print "Example Sample #"+str(sample)+":\n" 206 | string = "ENGLISH:" 207 | for i in xrange(0,len(eng[sample])): 208 | string+=" "+eng[sample][i] 209 | print string 210 | 211 | string = "\nBENGALI:" 212 | for i in xrange(0,len(beng[sample])): 213 | string+=" "+beng[sample][i] 214 | print string 215 | 216 | ``` 217 | 218 | Example Sample #646: 219 | 220 | ENGLISH: tom 's right 221 | 222 | BENGALI: টমই ঠিক। 223 | 224 | 225 | ### Creating separate vocabulary lists for English words and Bengali words 226 | 227 | The index of vocabulary will represent the numerical representation of the word which is stored at that index. 
228 | 229 | 230 | 231 | ```python 232 | import numpy as np 233 | 234 | vocab_eng=[] 235 | vocab_eng.append('') 236 | vocab_eng.append('') 237 | 238 | vocab_beng=[] 239 | vocab_beng.append('') 240 | vocab_beng.append('') 241 | 242 | #The index of vocab will serve as an integer representation of the word 243 | 244 | vectorized_eng = [] 245 | vectorized_beng = [] 246 | 247 | for i in xrange(len(eng)): 248 | 249 | vectorized_eng_line = [] 250 | for word in eng[i]: 251 | if word not in vocab_eng: 252 | vocab_eng.append(word) 253 | vectorized_eng_line.append(vocab_eng.index(word)) 254 | else: 255 | vectorized_eng_line.append(vocab_eng.index(word)) 256 | vectorized_eng.append(vectorized_eng_line) 257 | 258 | vectorized_beng_line = [] 259 | for word in beng[i]: 260 | if word not in vocab_beng: 261 | vocab_beng.append(word) 262 | vectorized_beng_line.append(vocab_beng.index(word)) 263 | else: 264 | vectorized_beng_line.append(vocab_beng.index(word)) 265 | vectorized_beng.append(vectorized_beng_line) 266 | 267 | 268 | ``` 269 | 270 | ### Creating training dataset for word2vec embedding 271 | 272 | if the sentence is "I am alright" 273 | 274 | then for the word 'am', the context words with window size 1 will be "I" and "alright" 275 | i.e ["I","alright"] 276 | 277 | For 'I' the context words will be "PAD" and "am" 278 | 279 | For 'alright' the context words will be "am" and "PAD" 280 | 281 | PAD represents empty and EOS represents end of sentence. 282 | 283 | Later lots of pads may be applied after the end of sentence to fit sequence length. 284 | 285 | So I also added the word PAD with context words being PADs, and PAD and EOS for embedding. 286 | 287 | (Doing what I wrote directly above, was actually unnecessary but I already did it. We don't need to consider these cases. With masking I will ignore the effect of PADs on the cost, anyway, and the model doesn't need to predict pads correctly. Predicting the EOS properly will be enough. So PAD embedding doesn't need to be taken so seriously.) 288 | 289 | In this way, first, from each sentence, I am creating a list of words, and a corresponding list of context words. 290 | I am doing the same thing for both English and Bengali lines. 
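The core of the loop below can be summarized in a few lines. Here is a minimal sketch of the window-1 context extraction for a single already-vectorized sentence; `PAD_IDX` and `EOS_IDX` are placeholder names standing in for `vocab.index('<PAD>')` and `vocab.index('<EOS>')`, and the sentence indices are made up for illustration.

```python
PAD_IDX, EOS_IDX = 0, 1   # placeholders for vocab.index('<PAD>') / vocab.index('<EOS>')

def word_context_pairs(vec_line):
    # window size 1: the context of each word is (previous word, next word),
    # with <PAD> before the first word and <EOS> after the last one.
    pairs = []
    for j, word in enumerate(vec_line):
        left = vec_line[j - 1] if j > 0 else PAD_IDX
        right = vec_line[j + 1] if j < len(vec_line) - 1 else EOS_IDX
        pairs.append((word, [left, right]))
    # the <EOS> token itself is also added, with (last word, <PAD>) as its context
    pairs.append((EOS_IDX, [vec_line[-1], PAD_IDX]))
    return pairs

print(word_context_pairs([5, 6, 7]))
# [(5, [0, 6]), (6, [5, 7]), (7, [6, 1]), (1, [7, 0])]
```
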
291 | 292 | 293 | ```python 294 | words_eng = [] 295 | contexts_eng = [] 296 | 297 | words_beng = [] 298 | contexts_beng = [] 299 | 300 | words_eng.append(vocab_eng.index('')) 301 | contexts_eng.append([vocab_eng.index(''),vocab_eng.index('')]) 302 | words_eng.append(vocab_eng.index('')) 303 | contexts_eng.append([vocab_eng.index(''),vocab_eng.index('')]) 304 | 305 | words_beng.append(vocab_beng.index('')) 306 | contexts_beng.append([vocab_beng.index(''),vocab_beng.index('')]) 307 | words_beng.append(vocab_beng.index('')) 308 | contexts_beng.append([vocab_beng.index(''),vocab_beng.index('')]) 309 | 310 | 311 | for i in xrange(len(vectorized_eng)): 312 | 313 | for j in xrange(0,len(vectorized_eng[i])): 314 | 315 | context1=0 316 | context2=0 317 | 318 | if j==0: 319 | context1 = vocab_eng.index('') 320 | if j!=len(vectorized_eng[i])-1: 321 | context2 = vectorized_eng[i][j+1] 322 | if j==len(vectorized_eng[i])-1: 323 | context2=vocab_eng.index('') 324 | if j!=0: 325 | context1 = vectorized_eng[i][j-1] 326 | if j>0 and j for training data 335 | words_eng.append(vocab_eng.index('')) 336 | context1 = vectorized_eng[i][len(vectorized_eng[i])-1] 337 | context2 = vocab_eng.index('') 338 | contexts_eng.append([context1,context2]) 339 | 340 | for j in xrange(0,len(vectorized_beng[i])): 341 | 342 | context1=0 343 | context2=0 344 | 345 | if j==0: 346 | context1 = vocab_beng.index('') 347 | if j!=len(vectorized_beng[i])-1: 348 | context2 = vectorized_beng[i][j+1] 349 | if j==len(vectorized_beng[i])-1: 350 | context2=vocab_beng.index('') 351 | if j!=0: 352 | context1 = vectorized_beng[i][j-1] 353 | if j>0 and j for training data 362 | words_beng.append(vocab_beng.index('')) 363 | context1 = vectorized_beng[i][len(vectorized_beng[i])-1] 364 | context2 = vocab_beng.index('') 365 | contexts_beng.append([context1,context2]) 366 | 367 | 368 | 369 | ``` 370 | 371 | If word = "am" and context = ["I","alright"], 372 | then, from this data I will create the following samples: 373 | 374 | input = "am" 375 | output = "I" 376 | and 377 | input = "am" 378 | label = "alright" 379 | 380 | Like this I will construct a list of all training inputs (words) and training outputs\labels (context words) 381 | 382 | embd_inputs_eng will contain all the English training inputs. 383 | embd_labels_eng will contain all the English training labels. 384 | 385 | embd_inputs_beng will contain all the Bengali training inputs. 386 | embd_labels_beng will contain all the Bengali training labels. 
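Concretely, the flattening described above turns each (word, context-list) pair into one (input, label) sample per context word. A small sketch with made-up indices (5 for "am", 4 for "I", 9 for "alright"):

```python
import numpy as np

words = [5]           # "am"
contexts = [[4, 9]]   # ["I", "alright"]

embd_inputs, embd_labels = [], []
for word, ctx in zip(words, contexts):
    for c in ctx:
        embd_inputs.append(word)   # inputs: "am", "am"
        embd_labels.append(c)      # labels: "I",  "alright"

embd_inputs = np.asarray(embd_inputs, np.int32)   # [5 5]
embd_labels = np.asarray(embd_labels, np.int32)   # [4 9]
```
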
387 | 388 | 389 | ```python 390 | embd_inputs_eng = [] 391 | embd_labels_eng = [] 392 | for i in xrange(len(contexts_eng)): 393 | for context in contexts_eng[i]: 394 | embd_inputs_eng.append(words_eng[i]) 395 | embd_labels_eng.append(context) 396 | embd_inputs_eng = np.asarray(embd_inputs_eng,np.int32) 397 | embd_labels_eng = np.asarray(embd_labels_eng,np.int32) 398 | 399 | embd_inputs_beng = [] 400 | embd_labels_beng = [] 401 | for i in xrange(len(contexts_beng)): 402 | for context in contexts_beng[i]: 403 | embd_inputs_beng.append(words_beng[i]) 404 | embd_labels_beng.append(context) 405 | embd_inputs_beng = np.asarray(embd_inputs_beng,np.int32) 406 | embd_labels_beng = np.asarray(embd_labels_beng,np.int32) 407 | 408 | ``` 409 | 410 | ### Function for generating mini-batches from the total training set 411 | 412 | 413 | ```python 414 | batch_size = 128 415 | 416 | def generate_batch(inputs,labels,batch_size): 417 | rand = random.sample((np.arange(len(inputs))),batch_size) 418 | batch_inputs=[] 419 | batch_labels=[] 420 | for i in xrange(batch_size): 421 | batch_inputs.append(inputs[int(rand[i])]) 422 | batch_labels.append(labels[int(rand[i])]) 423 | batch_inputs = np.asarray(batch_inputs,np.int32) 424 | batch_labels = np.asarray(batch_labels,np.int32) 425 | return batch_inputs,batch_labels 426 | 427 | ``` 428 | 429 | ### Preparing for word2vec embedding 430 | 431 | 432 | ```python 433 | import tensorflow as tf 434 | import math 435 | 436 | #https://www.tensorflow.org/tutorials/word2vec 437 | embedding_size = 256 438 | vocabulary_size_eng = len(vocab_eng) 439 | vocabulary_size_beng = len(vocab_beng) 440 | 441 | # Placeholders for inputs 442 | train_inputs = tf.placeholder(tf.int32, shape=[batch_size]) 443 | train_labels = tf.placeholder(tf.int32, shape=[batch_size,1]) 444 | 445 | ``` 446 | 447 | ### Training for word2vec embedding (For English words) 448 | 449 | See: https://www.tensorflow.org/tutorials/word2vec 450 | 451 | for details of word2vec and code description. 452 | 453 | Most of the word2vec code used here are from the Tensorflow tutorial. 454 | 455 | 456 | ```python 457 | embeddings_eng = tf.Variable( 458 | tf.random_uniform([vocabulary_size_eng, embedding_size], -1.0, 1.0)) 459 | 460 | nce_weights_eng = tf.Variable( 461 | tf.truncated_normal([vocabulary_size_eng, embedding_size], 462 | stddev=1.0 / math.sqrt(embedding_size))) 463 | nce_biases_eng = tf.Variable(tf.zeros([vocabulary_size_eng])) 464 | 465 | # Initializing the variables 466 | init = tf.global_variables_initializer() 467 | ``` 468 | 469 | 470 | ```python 471 | embed_eng = tf.nn.embedding_lookup(embeddings_eng, train_inputs) 472 | 473 | # Compute the NCE loss, using a sample of the negative labels each time. 474 | loss = tf.reduce_mean( 475 | tf.nn.nce_loss(weights=nce_weights_eng, 476 | biases=nce_biases_eng, 477 | labels=train_labels, 478 | inputs=embed_eng, 479 | num_sampled=10, 480 | num_classes=vocabulary_size_eng)) #num_sampled = no. of negative samples 481 | 482 | # We use the SGD optimizer. 
483 | optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss) 484 | ``` 485 | 486 | 487 | ```python 488 | 489 | with tf.Session() as sess: 490 | sess.run(init) 491 | convergence_threshold = 0.5 492 | training_iters = 500*(int((len(embd_inputs_eng))/batch_size)) 493 | step=0 494 | n=5 495 | last_n_losses = np.zeros((n),np.float32) 496 | 497 | while step=len(x): 713 | break 714 | 715 | batch_x = [] 716 | batch_y = [] 717 | 718 | max_len_x = len(sorted_x[i]) 719 | 720 | len_y= np.zeros((len(y)),np.int32) 721 | 722 | for j in xrange(i,i+batch_size): 723 | len_y[j] = len(sorted_y[j]) 724 | 725 | max_len_y = np.amax(len_y) 726 | 727 | for j in xrange(i,i+batch_size): 728 | line=[] 729 | for k1 in xrange(max_len_x+1): #+1 to include 730 | if k1==len(sorted_x[j]): 731 | line.append(np_embedding_eng[vocab_eng.index('')]) 732 | elif k1>len(sorted_x[j]): 733 | line.append(np_embedding_eng[vocab_eng.index('')]) 734 | else: 735 | line.append(np_embedding_eng[sorted_x[j][k1]]) 736 | batch_x.append(line) 737 | 738 | line=[] 739 | for k2 in xrange(max_len_y+1): #+1 to include 740 | if k2>len(sorted_y[j]): 741 | line.append(vocab_beng.index('')) 742 | elif k2==len(sorted_y[j]): 743 | line.append(vocab_beng.index('')) 744 | else: 745 | line.append(sorted_y[j][k2]) 746 | batch_y.append(line) 747 | 748 | batch_x = np.asarray(batch_x,np.float32) 749 | batch_y = np.asarray(batch_y,np.int32) 750 | 751 | batches_x.append(batch_x) 752 | batches_y.append(batch_y) 753 | 754 | i+=batch_size 755 | 756 | return batches_x,batches_y 757 | 758 | 759 | ``` 760 | 761 | ### Creating train, validation, and test batches 762 | 763 | 764 | ```python 765 | batch_size = 64 766 | 767 | train_batch_eng,train_batch_beng = bucket_and_batch(train_eng,train_beng,batch_size) 768 | 769 | val_batch_eng,val_batch_beng = bucket_and_batch(val_eng,val_beng,batch_size) 770 | 771 | test_batch_eng,test_batch_beng = bucket_and_batch(test_eng,test_beng,batch_size) 772 | 773 | ``` 774 | 775 | ### Saving processed data in another file. 776 | 777 | 778 | ```python 779 | #Saving processed data in another file. 780 | 781 | import pickle 782 | 783 | PICK = [vocab_eng,vocab_beng,np_embedding_eng,np_embedding_beng,train_batch_eng,train_batch_beng,val_batch_eng,val_batch_beng,test_batch_eng,test_batch_beng] 784 | 785 | with open('translationPICKLE', 'wb') as fp: 786 | pickle.dump(PICK, fp) 787 | 788 | ``` 789 | 790 | ### Loading Pre-processed Data 791 | (start of Machine Translation.ipynb) 792 | 793 | 794 | ```python 795 | import pickle 796 | import math 797 | import numpy as np 798 | 799 | 800 | with open ('translationPICKLE', 'rb') as fp: 801 | PICK = pickle.load(fp) 802 | 803 | vocab_eng = PICK[0] 804 | vocab_beng = PICK[1] 805 | vocab_len = len(vocab_beng) 806 | 807 | np_embedding_eng = PICK[2] 808 | np_embedding_beng = PICK[3] 809 | np_embedding_eng = np.asarray(np_embedding_eng,np.float32) 810 | np_embedding_beng = np.asarray(np_embedding_beng,np.float32) 811 | 812 | word_vec_dim = np_embedding_eng.shape[1] 813 | 814 | train_batch_x = PICK[4] 815 | train_batch_y = PICK[5] 816 | 817 | val_batch_x = PICK[6] 818 | val_batch_y = PICK[7] 819 | 820 | test_batch_x = PICK[8] 821 | test_batch_y = PICK[9] 822 | 823 | ``` 824 | 825 | ### Function for converting vector of size word_vec_dim into the closest representative english word. 
826 | 827 | 828 | ```python 829 | def most_similar_eucli_eng(x): 830 | xminusy = np.subtract(np_embedding_eng,x) 831 | sq_xminusy = np.square(xminusy) 832 | sum_sq_xminusy = np.sum(sq_xminusy,1) 833 | eucli_dists = np.sqrt(sum_sq_xminusy) 834 | return np.argsort(eucli_dists) 835 | 836 | def vec2word_eng(vec): # converts a given vector representation into the represented word 837 | most_similars = most_similar_eucli_eng(np.asarray(vec,np.float32)) 838 | return vocab_eng[most_similars[0]] 839 | 840 | ``` 841 | 842 | ### Hyperparameters and Placeholders. 843 | 844 | 845 | ```python 846 | import tensorflow as tf 847 | 848 | #Hyperparamters 849 | 850 | h=8 #no. of heads 851 | N=1 #no. of decoder and encoder layers 852 | learning_rate=0.001 853 | epochs = 200 854 | keep_prob = tf.placeholder(tf.float32) 855 | 856 | #Placeholders 857 | 858 | x = tf.placeholder(tf.float32, [None,None,word_vec_dim]) 859 | y = tf.placeholder(tf.int32, [None,None]) 860 | 861 | output_len = tf.placeholder(tf.int32) 862 | 863 | teacher_forcing = tf.placeholder(tf.bool) 864 | 865 | tf_pad_mask = tf.placeholder(tf.float32,[None,None]) 866 | tf_illegal_position_masks = tf.placeholder(tf.float32,[None,None,None]) 867 | 868 | tf_pe_out = tf.placeholder(tf.float32,[None,None,None]) #positional codes for output 869 | ``` 870 | 871 | ### Model Parameters. 872 | 873 | 874 | ```python 875 | 876 | # Dimensions for Q (Query),K (Keys) and V (Values) for attention layers. 877 | 878 | dqkv = 32 879 | 880 | #Parameters for attention sub-layers for all n encoders 881 | 882 | Wq_enc = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 883 | Wk_enc = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 884 | Wv_enc = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 885 | Wo_enc = tf.Variable(tf.truncated_normal(shape=[N,h*dqkv,word_vec_dim],stddev=0.01)) 886 | 887 | #Parameters for position-wise fully connected layers for n encoders 888 | 889 | d = 1024 890 | W1_enc = tf.Variable(tf.truncated_normal(shape=[N,1,1,word_vec_dim,d],stddev=0.01)) 891 | b1_enc = tf.Variable(tf.constant(0,tf.float32,shape=[N,d])) 892 | W2_enc = tf.Variable(tf.truncated_normal(shape=[N,1,1,d,word_vec_dim],stddev=0.01)) 893 | b2_enc = tf.Variable(tf.constant(0,tf.float32,shape=[N,word_vec_dim])) 894 | 895 | #Parameters for 2 attention sub-layers for all n decoders 896 | 897 | Wq_dec_1 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 898 | Wk_dec_1 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 899 | Wv_dec_1 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 900 | Wo_dec_1 = tf.Variable(tf.truncated_normal(shape=[N,h*dqkv,word_vec_dim],stddev=0.01)) 901 | Wq_dec_2 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 902 | Wk_dec_2 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 903 | Wv_dec_2 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 904 | Wo_dec_2 = tf.Variable(tf.truncated_normal(shape=[N,h*dqkv,word_vec_dim],stddev=0.01)) 905 | 906 | #Parameters for position-wise fully connected layers for n decoders 907 | 908 | d = 1024 909 | W1_dec = tf.Variable(tf.truncated_normal(shape=[N,1,1,word_vec_dim,d],stddev=0.01)) 910 | b1_dec = tf.Variable(tf.constant(0,tf.float32,shape=[N,d])) 911 | W2_dec = tf.Variable(tf.truncated_normal(shape=[N,1,1,d,word_vec_dim],stddev=0.01)) 912 | b2_dec = 
tf.Variable(tf.constant(0,tf.float32,shape=[N,word_vec_dim])) 913 | 914 | #Layer Normalization parameters for encoder 915 | 916 | scale_enc_1 = tf.Variable(tf.ones([N,1,1,word_vec_dim]),dtype=tf.float32) 917 | shift_enc_1 = tf.Variable(tf.zeros([N,1,1,word_vec_dim]),dtype=tf.float32) 918 | 919 | scale_enc_2 = tf.Variable(tf.ones([N,1,1,word_vec_dim]),dtype=tf.float32) 920 | shift_enc_2 = tf.Variable(tf.zeros([N,1,1,word_vec_dim]),dtype=tf.float32) 921 | 922 | #Layer Normalization parameters for decoder 923 | 924 | scale_dec_1 = tf.Variable(tf.ones([N,1,1,word_vec_dim]),dtype=tf.float32) 925 | shift_dec_1 = tf.Variable(tf.zeros([N,1,1,word_vec_dim]),dtype=tf.float32) 926 | 927 | scale_dec_2 = tf.Variable(tf.ones([N,1,1,word_vec_dim]),dtype=tf.float32) 928 | shift_dec_2 = tf.Variable(tf.zeros([N,1,1,word_vec_dim]),dtype=tf.float32) 929 | 930 | scale_dec_3 = tf.Variable(tf.ones([N,1,1,word_vec_dim]),dtype=tf.float32) 931 | shift_dec_3 = tf.Variable(tf.zeros([N,1,1,word_vec_dim]),dtype=tf.float32) 932 | ``` 933 | 934 | ### Function for generating a sequence of positional codes for positional encoding. 935 | 936 | 937 | ```python 938 | def positional_encoding(seq_len,model_dimensions): 939 | pe = np.zeros((seq_len,model_dimensions,),np.float32) 940 | for pos in xrange(0,seq_len): 941 | for i in xrange(0,model_dimensions): 942 | pe[pos][i] = math.sin(pos/(10000**(2*i/model_dimensions))) 943 | return pe.reshape((seq_len,model_dimensions)) 944 | ``` 945 | 946 | ### Function for Layer Normalization 947 | 948 | [Layer Normalization - by Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton](https://arxiv.org/abs/1607.06450) 949 | 950 | 951 | ```python 952 | 953 | def layer_norm(inputs,scale,shift,epsilon = 1e-5): 954 | 955 | mean, var = tf.nn.moments(inputs, [1,2], keep_dims=True) 956 | 957 | LN = tf.multiply((scale / tf.sqrt(var + epsilon)),(inputs - mean)) + shift 958 | 959 | return LN 960 | ``` 961 | 962 | ### Function to pre-generate masks for illegal positions. 963 | 964 | These masks are to be used to fill illegal positions with -infinity (or a very low value eg. -2^30). 965 | 966 | Illegal positions are positions of the decoder input tokens that aren't predicted at a given timestep. 967 | 968 | { In a transformer, the decoder input is of the same shape as the WHOLE decoder output sequence. One word for the sequence is predicted at each timestep (from left to right). So in most timesteps, the left side of the decoder input sequence will contain valid previously predicted output words, but the right side -the yet to be predicted side should contain some values that should be ignored and never attended. We make sure that they're ignored by masking it } 969 | 970 | So, the illegal positions depends on the total output length and the no. of predicted output tokens. 971 | 972 | The appropriate mask when i output tokens are predicted can be retrieved from mask[i-1] where mask is the return value from this function. The argument out_len that function takes, signifies the total length of the output. 973 | 974 | The masks are used to assign the value -2^30 to all positions in the tensor influenced by the illegal ones. 975 | After going through the softmax layer, these positions become close to 0, as it should be. 976 | 977 | Dynamically creating masks depending on the current position\timestep (depending on which the program can know which positions are legal and which aren't) is, however, 978 | a bit troublesome with tensorflow tf_while_loop. 
979 | 980 | I will be pre-generating all the masks with Python native code and feed the list of all required masks to the network at each training step (output length can be different at different training steps). 981 | 982 | 983 | 984 | ```python 985 | def generate_masks_for_illegal_positions(out_len): 986 | 987 | masks=np.zeros((out_len-1,out_len,out_len),dtype=np.float32) 988 | 989 | for i in xrange(1,out_len): 990 | mask = np.zeros((out_len,out_len),dtype=np.float32) 991 | mask[i:out_len,:] = -2**30 992 | mask[:,i:out_len] = -2**30 993 | masks[i-1] = mask 994 | 995 | return masks 996 | ``` 997 | 998 | ### Function for Multi-Headed Attention. 999 | 1000 | Details: https://arxiv.org/pdf/1706.03762.pdf 1001 | 1002 | Q = Query 1003 | 1004 | K = Key 1005 | 1006 | V = Value 1007 | 1008 | d is the dimension for Q, K and V. 1009 | 1010 | 1011 | ```python 1012 | 1013 | def attention(Q,K,V,d,filled=0,mask=False): 1014 | 1015 | K = tf.transpose(K,[0,2,1]) 1016 | d = tf.cast(d,tf.float32) 1017 | 1018 | softmax_component = tf.div(tf.matmul(Q,K),tf.sqrt(d)) 1019 | 1020 | if mask == True: 1021 | softmax_component = softmax_component + tf_illegal_position_masks[filled-1] 1022 | 1023 | result = tf.matmul(tf.nn.dropout(tf.nn.softmax(softmax_component),keep_prob),V) 1024 | 1025 | return result 1026 | 1027 | 1028 | def multihead_attention(Q,K,V,d,weights,filled=0,mask=False): 1029 | 1030 | Q_ = tf.reshape(Q,[-1,tf.shape(Q)[2]]) 1031 | K_ = tf.reshape(K,[-1,tf.shape(Q)[2]]) 1032 | V_ = tf.reshape(V,[-1,tf.shape(Q)[2]]) 1033 | 1034 | heads = tf.TensorArray(size=h,dtype=tf.float32) 1035 | 1036 | Wq = weights['Wq'] 1037 | Wk = weights['Wk'] 1038 | Wv = weights['Wv'] 1039 | Wo = weights['Wo'] 1040 | 1041 | for i in xrange(0,h): 1042 | 1043 | Q_w = tf.matmul(Q_,Wq[i]) 1044 | Q_w = tf.reshape(Q_w,[tf.shape(Q)[0],tf.shape(Q)[1],d]) 1045 | 1046 | K_w = tf.matmul(K_,Wk[i]) 1047 | K_w = tf.reshape(K_w,[tf.shape(K)[0],tf.shape(K)[1],d]) 1048 | 1049 | V_w = tf.matmul(V_,Wv[i]) 1050 | V_w = tf.reshape(V_w,[tf.shape(V)[0],tf.shape(V)[1],d]) 1051 | 1052 | head = attention(Q_w,K_w,V_w,d,filled,mask) 1053 | 1054 | heads = heads.write(i,head) 1055 | 1056 | heads = heads.stack() 1057 | 1058 | concated = heads[0] 1059 | 1060 | for i in xrange(1,h): 1061 | concated = tf.concat([concated,heads[i]],2) 1062 | 1063 | concated = tf.reshape(concated,[-1,h*d]) 1064 | out = tf.matmul(concated,Wo) 1065 | out = tf.reshape(out,[tf.shape(heads)[1],tf.shape(heads)[2],word_vec_dim]) 1066 | 1067 | return out 1068 | 1069 | ``` 1070 | 1071 | ### Function for encoder 1072 | 1073 | More details: https://arxiv.org/pdf/1706.03762.pdf 1074 | 1075 | 1076 | ```python 1077 | def encoder(x,weights,attention_weights,dqkv): 1078 | 1079 | W1 = weights['W1'] 1080 | W2 = weights['W2'] 1081 | b1 = weights['b1'] 1082 | b2 = weights['b2'] 1083 | 1084 | scale1 = weights['scale1'] 1085 | shift1 = weights['shift1'] 1086 | scale2 = weights['scale2'] 1087 | shift2 = weights['shift2'] 1088 | 1089 | # SUBLAYER 1 (MASKED MULTI HEADED SELF ATTENTION) 1090 | 1091 | sublayer1 = multihead_attention(x,x,x,dqkv,attention_weights) 1092 | sublayer1 = tf.nn.dropout(sublayer1,keep_prob) 1093 | sublayer1 = layer_norm(sublayer1 + x,scale1,shift1) 1094 | 1095 | sublayer1_ = tf.reshape(sublayer1,[tf.shape(sublayer1)[0],1,tf.shape(sublayer1)[1],word_vec_dim]) 1096 | 1097 | # SUBLAYER 2 (TWO 1x1 CONVOLUTIONAL LAYERS AKA POSITION WISE FULLY CONNECTED NETWORKS) 1098 | 1099 | sublayer2 = tf.nn.conv2d(sublayer1_, W1, strides=[1,1,1,1], padding='SAME') 1100 | sublayer2 = 
tf.nn.bias_add(sublayer2,b1) 1101 | sublayer2 = tf.nn.relu(sublayer2) 1102 | 1103 | sublayer2 = tf.nn.conv2d(sublayer2, W2, strides=[1,1,1,1], padding='SAME') 1104 | sublayer2 = tf.nn.bias_add(sublayer2,b2) 1105 | 1106 | sublayer2 = tf.reshape(sublayer2,[tf.shape(sublayer2)[0],tf.shape(sublayer2)[2],word_vec_dim]) 1107 | 1108 | sublayer2 = tf.nn.dropout(sublayer2,keep_prob) 1109 | sublayer2 = layer_norm(sublayer2 + sublayer1,scale2,shift2) 1110 | 1111 | return sublayer2 1112 | 1113 | ``` 1114 | 1115 | ### Function for decoder 1116 | 1117 | More details: https://arxiv.org/pdf/1706.03762.pdf 1118 | 1119 | 1120 | ```python 1121 | def decoder(y,enc_out,weights,masked_attention_weights,attention_weights,dqkv,mask=False,filled=0): 1122 | 1123 | W1 = weights['W1'] 1124 | W2 = weights['W2'] 1125 | b1 = weights['b1'] 1126 | b2 = weights['b2'] 1127 | 1128 | scale1 = weights['scale1'] 1129 | shift1 = weights['shift1'] 1130 | scale2 = weights['scale2'] 1131 | shift2 = weights['shift2'] 1132 | scale3 = weights['scale3'] 1133 | shift3 = weights['shift3'] 1134 | 1135 | # SUBLAYER 1 (MASKED MULTI HEADED SELF ATTENTION) 1136 | 1137 | sublayer1 = multihead_attention(y,y,y,dqkv,masked_attention_weights,filled,mask) 1138 | sublayer1 = tf.nn.dropout(sublayer1,keep_prob) 1139 | sublayer1 = layer_norm(sublayer1 + y,scale1,shift1) 1140 | 1141 | # SUBLAYER 2 (MULTIHEADED ENCODER-DECODER INTERLAYER ATTENTION) 1142 | 1143 | sublayer2 = multihead_attention(sublayer1,enc_out,enc_out,dqkv,attention_weights) 1144 | sublayer2 = tf.nn.dropout(sublayer2,keep_prob) 1145 | sublayer2 = layer_norm(sublayer2 + sublayer1,scale2,shift2) 1146 | 1147 | # SUBLAYER 3 (TWO 1x1 CONVOLUTIONAL LAYERS AKA POSITION WISE FULLY CONNECTED NETWORKS) 1148 | 1149 | sublayer2_ = tf.reshape(sublayer2,[tf.shape(sublayer2)[0],1,tf.shape(sublayer2)[1],word_vec_dim]) 1150 | 1151 | sublayer3 = tf.nn.conv2d(sublayer2_, W1, strides=[1,1,1,1], padding='SAME') 1152 | sublayer3 = tf.nn.bias_add(sublayer3,b1) 1153 | sublayer3 = tf.nn.relu(sublayer3) 1154 | 1155 | sublayer3 = tf.nn.conv2d(sublayer3, W2, strides=[1,1,1,1], padding='SAME') 1156 | sublayer3 = tf.nn.bias_add(sublayer3,b2) 1157 | 1158 | sublayer3 = tf.reshape(sublayer3,[tf.shape(sublayer3)[0],tf.shape(sublayer3)[2],word_vec_dim]) 1159 | 1160 | sublayer3 = tf.nn.dropout(sublayer3,keep_prob) 1161 | sublayer3 = layer_norm(sublayer3 + sublayer2,scale3,shift3) 1162 | 1163 | return sublayer3 1164 | ``` 1165 | 1166 | ### Function for Stacking Encoders. 1167 | 1168 | 1169 | ```python 1170 | def stacked_encoders(layer_num,encoderin): 1171 | 1172 | for i in xrange(0,layer_num): 1173 | 1174 | encoder_weights = { 1175 | 1176 | 'W1': W1_enc[i], 1177 | 'W2': W2_enc[i], 1178 | 'b1': b1_enc[i], 1179 | 'b2': b2_enc[i], 1180 | 'scale1': scale_enc_1[i], 1181 | 'shift1': shift_enc_1[i], 1182 | 'scale2': scale_enc_2[i], 1183 | 'shift2': shift_enc_2[i], 1184 | } 1185 | 1186 | attention_weights = { 1187 | 1188 | 'Wq': Wq_enc[i], 1189 | 'Wk': Wk_enc[i], 1190 | 'Wv': Wv_enc[i], 1191 | 'Wo': Wo_enc[i], 1192 | } 1193 | 1194 | encoderin = encoder(encoderin,encoder_weights,attention_weights,dqkv) 1195 | 1196 | return encoderin 1197 | 1198 | ``` 1199 | 1200 | ### Function for Stacking Decoders. 
1201 | 1202 | 1203 | ```python 1204 | def stacked_decoders(layer_num,decoderin,encoderout,filled): 1205 | 1206 | for j in xrange(0,layer_num): 1207 | 1208 | decoder_weights = { 1209 | 1210 | 'W1': W1_dec[j], 1211 | 'W2': W2_dec[j], 1212 | 'b1': b1_dec[j], 1213 | 'b2': b2_dec[j], 1214 | 'scale1': scale_dec_1[j], 1215 | 'shift1': shift_dec_1[j], 1216 | 'scale2': scale_dec_2[j], 1217 | 'shift2': shift_dec_2[j], 1218 | 'scale3': scale_dec_3[j], 1219 | 'shift3': shift_dec_3[j], 1220 | } 1221 | 1222 | masked_attention_weights = { 1223 | 1224 | 'Wq': Wq_dec_1[j], 1225 | 'Wk': Wk_dec_1[j], 1226 | 'Wv': Wv_dec_1[j], 1227 | 'Wo': Wo_dec_1[j], 1228 | } 1229 | 1230 | attention_weights = { 1231 | 1232 | 'Wq': Wq_dec_2[j], 1233 | 'Wk': Wk_dec_2[j], 1234 | 'Wv': Wv_dec_2[j], 1235 | 'Wo': Wo_dec_2[j], 1236 | } 1237 | 1238 | decoderin = decoder(decoderin,encoderout, 1239 | decoder_weights, 1240 | masked_attention_weights, 1241 | attention_weights, 1242 | dqkv, 1243 | mask=True,filled=filled) 1244 | return decoderin 1245 | 1246 | ``` 1247 | 1248 | ### predicted_embedding(): 1249 | 1250 | Given a probability distribution and an embedding matrix, this function returns the embedding of the word with the maximum probability in the given distribution. 1251 | 1252 | ### replaceSOS(): 1253 | 1254 | SOS signifies the start of sentence for the decoder; it is also often represented as 'GO'. I am using an all-ones vector as the first decoder input token. 1255 | At the next timestep, the SOS is discarded, and only the context of the previously predicted output (or of the target output at the previous timestep, if teacher forcing is on) is used. 1256 | 1257 | ### add_pred_to_output_list(): 1258 | 1259 | This function appends the latest predicted output to the running tensor of decoder inputs, and appends its probability distribution to the accumulated output distributions. 1260 | 1261 | 1262 | ```python 1263 | def predicted_embedding(out_prob_dist,tf_embd): 1264 | out_index = tf.cast(tf.argmax(out_prob_dist,1),tf.int32) 1265 | return tf.gather(tf_embd,out_index) 1266 | 1267 | def replaceSOS(output,out_prob_dist): 1268 | return output,tf.constant(1),tf.reshape(out_prob_dist,[tf.shape(x)[0],1,vocab_len]) 1269 | 1270 | def add_pred_to_output_list(decoderin_part_1,output,filled,out_probs,out_prob_dist): 1271 | decoderin_part_1 = tf.concat([decoderin_part_1,output],1) 1272 | filled += 1 1273 | out_probs = tf.concat([out_probs,tf.reshape(out_prob_dist,[tf.shape(x)[0],1,vocab_len])],1) 1274 | return decoderin_part_1,filled,out_probs 1275 | ``` 1276 | 1277 | ### Model Definition 1278 | 1279 | The model follows the encoder-decoder paradigm. The main departure from the standard encoder-decoder setup is that it uses Transformer layers instead of recurrent networks; the decoder still generates its output sequentially, though. 1280 | 1281 | If teacher forcing is True, the decoder predicts the next output from the previous words of the actual target output; otherwise, it predicts the next output from its own previously predicted outputs.
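To make the two decoding modes concrete, here is a toy sketch of the teacher-forcing switch in plain Python (hypothetical tokens and a dummy `decode_step`; the actual model does this inside a TensorFlow while-loop over embeddings):

```python
# Toy illustration of the teacher-forcing switch (not the notebook's graph code).
target = ['SOS', 'tom', 'is', 'here', 'EOS']   # ground-truth output tokens

def decode_step(prev_tokens):
    # stand-in for one decoder pass over everything predicted so far
    return 'PRED_' + str(len(prev_tokens))

teacher_forcing = True                          # set False for free-running decoding
decoder_inputs = ['SOS']
for t in range(1, len(target)):
    prediction = decode_step(decoder_inputs)
    next_input = target[t] if teacher_forcing else prediction
    decoder_inputs.append(next_input)
```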
1282 | 1283 | Details about the model: https://arxiv.org/pdf/1706.03762.pdf 1284 | 1285 | 1286 | ```python 1287 | def model(x,y,teacher_forcing=True): 1288 | 1289 | 1290 | # NOTE: tf.shape(x)[0] == batch_size 1291 | 1292 | encoderin = x # (should be already positionally encoded) 1293 | encoderin = tf.nn.dropout(encoderin,keep_prob) 1294 | 1295 | 1296 | # ENCODER LAYERS 1297 | 1298 | encoderout = stacked_encoders(N,encoderin) 1299 | 1300 | 1301 | # DECODER LAYERS 1302 | 1303 | decoderin_part_1 = tf.ones([tf.shape(x)[0],1,word_vec_dim],dtype=tf.float32) #represents SOS 1304 | 1305 | filled = tf.constant(1) 1306 | # no. of output words that are filled i.e already predicted - are stored in 'filled' 1307 | # filled value is used to retrieve appropriate mask for illegal positions. 1308 | 1309 | 1310 | tf_embd = tf.convert_to_tensor(np_embedding_beng) 1311 | Wpd = tf.transpose(tf_embd) 1312 | # Wpd the transpose of the output embedding matrix will be used to convert the decoder output 1313 | # into a probability distribution over the output language vocabulary. 1314 | 1315 | out_probs = tf.zeros([tf.shape(x)[0],output_len,vocab_len],tf.float32) 1316 | # out_probs will contain the list of probability distributions. 1317 | 1318 | #tf_while_loop since output_len will be dynamically defined during session run 1319 | 1320 | i=tf.constant(0) 1321 | 1322 | def cond(i,filled,decoderin_part_1,out_probs): 1323 | return i') 1420 | mask = np.ones_like((output_batch),np.float32) 1421 | for i in xrange(len(mask)): 1422 | for j in xrange(len(mask[i])): 1423 | if output_batch[i,j]==pad_index: 1424 | mask[i,j]=0 1425 | return mask 1426 | ``` 1427 | 1428 | ### Training ..... 1429 | 1430 | The input batch is positionally encoded before its fed to the network. 1431 | 1432 | 1433 | ```python 1434 | import string 1435 | import random 1436 | from __future__ import print_function 1437 | 1438 | init = tf.global_variables_initializer() 1439 | 1440 | with tf.Session() as sess: # Start Tensorflow Session 1441 | 1442 | saver = tf.train.Saver() 1443 | # Prepares variable for saving the model 1444 | sess.run(init) #initialize all variables 1445 | step = 0 1446 | best_loss = 999 1447 | display_step = 1 1448 | warm_up_steps = 7000 1449 | 1450 | while step < epochs: 1451 | 1452 | batch_len = len(train_batch_x) 1453 | shuffled_indices = np.arange(batch_len) 1454 | np.random.shuffle(shuffled_indices) 1455 | 1456 | for i in xrange(0,batch_len): 1457 | 1458 | # Adaptive learning rate formula 1459 | #learning_rate = ((word_vec_dim)**(-0.5))*min((step*batch_len+i+1)**(-0.5),(step*batch_len+i+1)*warm_up_steps**(-1.5)) 1460 | 1461 | sample_no = np.random.randint(0,len(train_batch_x[0])) 1462 | print("\nCHOSEN SAMPLE NO.: "+str(sample_no)) 1463 | 1464 | if i%display_step==0: 1465 | 1466 | print("\nEpoch: "+str(step+1)+" Iteration: "+str(i+1)) 1467 | print("\nSAMPLE TEXT:") 1468 | for vec in train_batch_x[shuffled_indices[i]][sample_no]: 1469 | print(vec2word_eng(vec),end=" ") 1470 | print("\n") 1471 | 1472 | input_seq_len = len(train_batch_x[shuffled_indices[i]][0]) 1473 | 1474 | pe_in = positional_encoding(input_seq_len,word_vec_dim) 1475 | pe_in = pe_in.reshape((1,input_seq_len,word_vec_dim)) 1476 | 1477 | output_seq_len = len(train_batch_y[shuffled_indices[i]][0]) 1478 | 1479 | 1480 | 1481 | illegal_position_masks = generate_masks_for_illegal_positions(output_seq_len) 1482 | 1483 | pe_out = positional_encoding(output_seq_len,word_vec_dim) 1484 | pe_out = pe_out.reshape((output_seq_len,1,word_vec_dim)) 1485 | 1486 | 1487 | rand = 
random.randint(0,2) #determines chance of using Teacher Forcing 1488 | if rand==1: 1489 | random_bool = True 1490 | else: 1491 | random_bool = False 1492 | 1493 | pad_mask = create_pad_Mask(train_batch_y[shuffled_indices[i]]) 1494 | 1495 | # Run optimization operation (backpropagation) 1496 | _,loss,out = sess.run([optimizer,cost,softmax_output], 1497 | feed_dict={x: (train_batch_x[shuffled_indices[i]]+pe_in), 1498 | y: train_batch_y[shuffled_indices[i]], 1499 | keep_prob: 0.9, 1500 | output_len: len(train_batch_y[shuffled_indices[i]][0]), 1501 | tf_pad_mask: pad_mask, 1502 | tf_illegal_position_masks: illegal_position_masks, 1503 | tf_pe_out: pe_out, 1504 | teacher_forcing: False #random_bool 1505 | # feed random bool for randomized teacher forcing. 1506 | }) 1507 | 1508 | if i%display_step==0: 1509 | 1510 | print("\nPREDICTED TRANSLATION OF THE SAMPLE:\n") 1511 | flag = 0 1512 | for array in out[sample_no]: 1513 | 1514 | #prediction_int = np.random.choice(range(vocab_len), p=array.ravel()) 1515 | #(^use this if you want some variety) 1516 | #(or use this what's below:) 1517 | 1518 | prediction_int = np.argmax(array) 1519 | 1520 | if vocab_beng[prediction_int] in string.punctuation or flag==0: 1521 | print(vocab_beng[prediction_int],end='') 1522 | else: 1523 | print(" "+vocab_beng[prediction_int],end='') 1524 | flag=1 1525 | print("\n") 1526 | 1527 | print("ACTUAL TRANSLATION OF THE SAMPLE:\n") 1528 | for index in train_batch_y[shuffled_indices[i]][sample_no]: 1529 | print(vocab_beng[index],end=" ") 1530 | print("\n") 1531 | 1532 | print("loss="+str(loss)) 1533 | 1534 | if(loss 1549 | 1550 | 1551 | PREDICTED TRANSLATION OF THE SAMPLE: 1552 | 1553 | দেখছে দেখছে দেখছে দেখছে দেখছে দেখছে দেখছে দেখছে দেখছে 1554 | 1555 | ACTUAL TRANSLATION OF THE SAMPLE: 1556 | 1557 | তিনি এখন লাঞচ করছেন। 1558 | 1559 | loss=297.772 1560 | 1561 | CHOSEN SAMPLE NO.: 0 1562 | 1563 | Epoch: 1 Iteration: 2 1564 | 1565 | SAMPLE TEXT: 1566 | they have got guns 1567 | 1568 | 1569 | PREDICTED TRANSLATION OF THE SAMPLE: 1570 | 1571 | ঠিকানাটা ঠিকানাটা ঠিকানাটা ঠিকানাটা ঠিকানাটা ঠিকানাটা ঠিকানাটা ঠিকানাটা 1572 | 1573 | ACTUAL TRANSLATION OF THE SAMPLE: 1574 | 1575 | ওদের কাছে বনদক রযেছে। 1576 | 1577 | loss=242.409 1578 | 1579 | CHOSEN SAMPLE NO.: 55 1580 | 1581 | Epoch: 1 Iteration: 3 1582 | 1583 | SAMPLE TEXT: 1584 | it seems that i have lost my keys 1585 | 1586 | 1587 | PREDICTED TRANSLATION OF THE SAMPLE: 1588 | 1589 | তারাতারি তারাতারি তারাতারি তারাতারি তারাতারি তারাতারি তারাতারি তারাতারি তারাতারি তারাতারি তারাতারি তারাতারি 1590 | 1591 | ACTUAL TRANSLATION OF THE SAMPLE: 1592 | 1593 | মনে হচছে আমি আমার চাবি হারিযে ফেলেছি। 1594 | 1595 | loss=358.392 1596 | . 1597 | . 1598 | . 1599 | . 
1600 | CHOSEN SAMPLE NO.: 20 1601 | 1602 | Epoch: 128 Iteration: 44 1603 | 1604 | SAMPLE TEXT: 1605 | she knows where we live 1606 | 1607 | 1608 | PREDICTED TRANSLATION OF THE SAMPLE: 1609 | 1610 | উনি জানে আমরা কোথায থাকি। 1611 | 1612 | ACTUAL TRANSLATION OF THE SAMPLE: 1613 | 1614 | তিনি জানেন আমরা কোথায থাকি। 1615 | 1616 | loss=1.27356 1617 | 1618 | CHOSEN SAMPLE NO.: 28 1619 | 1620 | Epoch: 128 Iteration: 45 1621 | 1622 | SAMPLE TEXT: 1623 | tom 's strong 1624 | 1625 | 1626 | PREDICTED TRANSLATION OF THE SAMPLE: 1627 | 1628 | টম শকতিশালী। 1629 | 1630 | ACTUAL TRANSLATION OF THE SAMPLE: 1631 | 1632 | টম শকতিশালী। 1633 | 1634 | loss=0.466606 1635 | 1636 | CHOSEN SAMPLE NO.: 8 1637 | 1638 | Epoch: 128 Iteration: 46 1639 | 1640 | SAMPLE TEXT: 1641 | stop that woman 1642 | 1643 | 1644 | PREDICTED TRANSLATION OF THE SAMPLE: 1645 | 1646 | ওই ফিরে আটকাও। 1647 | 1648 | ACTUAL TRANSLATION OF THE SAMPLE: 1649 | 1650 | ওই মহিলাটিকে থামান। 1651 | 1652 | loss=0.628224 1653 | 1654 | CHOSEN SAMPLE NO.: 30 1655 | 1656 | Epoch: 128 Iteration: 47 1657 | 1658 | SAMPLE TEXT: 1659 | do you have my book 1660 | 1661 | 1662 | PREDICTED TRANSLATION OF THE SAMPLE: 1663 | 1664 | আমার কাছে কি ভালো বইটা আছে 1665 | 1666 | ACTUAL TRANSLATION OF THE SAMPLE: 1667 | 1668 | আপনার কাছে কি আমার বইটা আছে 1669 | 1670 | loss=1.2308 1671 | 1672 | CHOSEN SAMPLE NO.: 43 1673 | 1674 | Epoch: 128 Iteration: 48 1675 | 1676 | SAMPLE TEXT: 1677 | would you like to come inside 1678 | 1679 | 1680 | PREDICTED TRANSLATION OF THE SAMPLE: 1681 | 1682 | তমি কি ভেতরে আসবেন 1683 | 1684 | ACTUAL TRANSLATION OF THE SAMPLE: 1685 | 1686 | আপনারা কি ভেতরে আসবেন 1687 | 1688 | loss=1.67444 1689 | 1690 | CHOSEN SAMPLE NO.: 0 1691 | 1692 | Epoch: 128 Iteration: 49 1693 | 1694 | SAMPLE TEXT: 1695 | are you busy tomorrow night 1696 | 1697 | 1698 | PREDICTED TRANSLATION OF THE SAMPLE: 1699 | 1700 | কোনো কি কাল আছে 1701 | 1702 | ACTUAL TRANSLATION OF THE SAMPLE: 1703 | 1704 | তই কি কাল রাতে বযসত থাকবি 1705 | 1706 | loss=0.907989 1707 | 1708 | CHOSEN SAMPLE NO.: 57 1709 | 1710 | Epoch: 128 Iteration: 50 1711 | 1712 | SAMPLE TEXT: 1713 | stand up 1714 | 1715 | 1716 | PREDICTED TRANSLATION OF THE SAMPLE: 1717 | 1718 | দাডান। 1719 | 1720 | ACTUAL TRANSLATION OF THE SAMPLE: 1721 | 1722 | দাডা 1723 | 1724 | loss=0.790484 1725 | 1726 | CHOSEN SAMPLE NO.: 21 1727 | 1728 | Epoch: 128 Iteration: 51 1729 | 1730 | SAMPLE TEXT: 1731 | tom did that himself 1732 | 1733 | 1734 | PREDICTED TRANSLATION OF THE SAMPLE: 1735 | 1736 | টম ওটা বযবসথা 1737 | 1738 | ACTUAL TRANSLATION OF THE SAMPLE: 1739 | 1740 | টম ওটা নিজেই করলো। 1741 | 1742 | loss=0.93948 1743 | 1744 | CHOSEN SAMPLE NO.: 56 1745 | 1746 | Epoch: 129 Iteration: 1 1747 | 1748 | SAMPLE TEXT: 1749 | tom is still at school 1750 | 1751 | 1752 | PREDICTED TRANSLATION OF THE SAMPLE: 1753 | 1754 | টম এখনো নামবার 1755 | 1756 | ACTUAL TRANSLATION OF THE SAMPLE: 1757 | 1758 | টম এখনো ইসকলে। 1759 | 1760 | loss=0.921481 1761 | 1762 | CHOSEN SAMPLE NO.: 15 1763 | 1764 | Epoch: 129 Iteration: 2 1765 | 1766 | SAMPLE TEXT: 1767 | were you there 1768 | 1769 | 1770 | PREDICTED TRANSLATION OF THE SAMPLE: 1771 | 1772 | তমি কি ওখানে ছিলে 1773 | 1774 | ACTUAL TRANSLATION OF THE SAMPLE: 1775 | 1776 | তমি কি ওখানে ছিলে 1777 | 1778 | loss=0.486593 1779 | 1780 | CHOSEN SAMPLE NO.: 58 1781 | 1782 | Epoch: 129 Iteration: 3 1783 | 1784 | SAMPLE TEXT: 1785 | is there a public toilet in this building 1786 | 1787 | 1788 | PREDICTED TRANSLATION OF THE SAMPLE: 1789 | 1790 | এই কি কি এই আছে আছে আছে 1791 | 1792 | ACTUAL TRANSLATION OF THE SAMPLE: 
1793 | 1794 | এই ইমারতটিতে কি কোনো সরবজনীন শৌচাগার আছে 1795 | 1796 | loss=1.76835 1797 | 1798 | CHOSEN SAMPLE NO.: 28 1799 | 1800 | Epoch: 129 Iteration: 4 1801 | 1802 | SAMPLE TEXT: 1803 | i 'm not tom 1804 | 1805 | 1806 | PREDICTED TRANSLATION OF THE SAMPLE: 1807 | 1808 | আমি এখনই নই। 1809 | 1810 | ACTUAL TRANSLATION OF THE SAMPLE: 1811 | 1812 | আমি টম নই। 1813 | 1814 | loss=0.733902 1815 | 1816 | CHOSEN SAMPLE NO.: 59 1817 | 1818 | Epoch: 129 Iteration: 5 1819 | 1820 | SAMPLE TEXT: 1821 | do you understand french 1822 | 1823 | 1824 | PREDICTED TRANSLATION OF THE SAMPLE: 1825 | 1826 | তমি কি ফরাসি ভাষা বলতে 1827 | 1828 | ACTUAL TRANSLATION OF THE SAMPLE: 1829 | 1830 | আপনি কি ফরাসি ভাষা বঝতে পারো 1831 | 1832 | loss=0.842568 1833 | 1834 | CHOSEN SAMPLE NO.: 27 1835 | 1836 | Epoch: 129 Iteration: 6 1837 | 1838 | SAMPLE TEXT: 1839 | i 'm happy to see you again 1840 | 1841 | 1842 | PREDICTED TRANSLATION OF THE SAMPLE: 1843 | 1844 | আপনাকে আবার দেখে খশি হযেছি। 1845 | 1846 | ACTUAL TRANSLATION OF THE SAMPLE: 1847 | 1848 | আপনাদেরকে আবার দেখে খশি হযেছি। 1849 | 1850 | loss=1.91991 1851 | 1852 | CHOSEN SAMPLE NO.: 53 1853 | 1854 | Epoch: 129 Iteration: 7 1855 | 1856 | SAMPLE TEXT: 1857 | i could not walk 1858 | 1859 | 1860 | PREDICTED TRANSLATION OF THE SAMPLE: 1861 | 1862 | আমি এবার পারব 1863 | 1864 | ACTUAL TRANSLATION OF THE SAMPLE: 1865 | 1866 | আমি হাটতে পারিনি। 1867 | 1868 | loss=0.91238 1869 | 1870 | CHOSEN SAMPLE NO.: 0 1871 | 1872 | Epoch: 129 Iteration: 8 1873 | 1874 | SAMPLE TEXT: 1875 | i want to be as rich as tom 1876 | 1877 | 1878 | PREDICTED TRANSLATION OF THE SAMPLE: 1879 | 1880 | আমি ওই টমের হতে হতে 1881 | 1882 | ACTUAL TRANSLATION OF THE SAMPLE: 1883 | 1884 | আমি টমের মত ধনী হতে চাই। 1885 | 1886 | loss=1.78097 1887 | 1888 | CHOSEN SAMPLE NO.: 4 1889 | 1890 | Epoch: 129 Iteration: 9 1891 | 1892 | SAMPLE TEXT: 1893 | you should eat vegetables 1894 | 1895 | 1896 | PREDICTED TRANSLATION OF THE SAMPLE: 1897 | 1898 | তোমার শাকসবজি খাওযা উচিত। উচিত। 1899 | 1900 | ACTUAL TRANSLATION OF THE SAMPLE: 1901 | 1902 | তোমার শাকসবজি খাওযা উচিত। 1903 | 1904 | loss=0.584272 1905 | 1906 | CHOSEN SAMPLE NO.: 52 1907 | 1908 | Epoch: 129 Iteration: 10 1909 | 1910 | SAMPLE TEXT: 1911 | do come again 1912 | 1913 | 1914 | PREDICTED TRANSLATION OF THE SAMPLE: 1915 | 1916 | আবার আসবে কিনত। 1917 | 1918 | ACTUAL TRANSLATION OF THE SAMPLE: 1919 | 1920 | আবার আসবে কিনত। 1921 | 1922 | loss=0.749034 1923 | 1924 | CHOSEN SAMPLE NO.: 58 1925 | 1926 | Epoch: 129 Iteration: 11 1927 | 1928 | SAMPLE TEXT: 1929 | we will scream 1930 | 1931 | 1932 | PREDICTED TRANSLATION OF THE SAMPLE: 1933 | 1934 | আমরা চেচাবো। 1935 | 1936 | ACTUAL TRANSLATION OF THE SAMPLE: 1937 | 1938 | আমরা চিৎকার করবো। 1939 | 1940 | loss=0.519659 1941 | 1942 | CHOSEN SAMPLE NO.: 18 1943 | 1944 | Epoch: 129 Iteration: 12 1945 | 1946 | SAMPLE TEXT: 1947 | do you have time 1948 | 1949 | 1950 | PREDICTED TRANSLATION OF THE SAMPLE: 1951 | 1952 | আপনার হাতে সময আছে আছে 1953 | 1954 | ACTUAL TRANSLATION OF THE SAMPLE: 1955 | 1956 | তোমার হাতে সময আছে 1957 | 1958 | loss=0.776177 1959 | 1960 | CHOSEN SAMPLE NO.: 26 1961 | 1962 | Epoch: 129 Iteration: 13 1963 | 1964 | SAMPLE TEXT: 1965 | i eat everything 1966 | . 1967 | . 1968 | . 1969 | 1970 | 1971 | 1972 | ```python 1973 | def word2vec(word): 1974 | if word in vocab_eng: 1975 | return np_embedding_eng[vocab_eng.index(word)] 1976 | else: 1977 | return np_embedding_eng[vocab_eng.index('')] 1978 | ``` 1979 | 1980 | ### Prediction. 
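At inference time the saved weights are restored and the graph is run with `teacher_forcing` set to False, so the decoder feeds back its own predictions; the output budget is simply the input length plus 20 positions, and each position is read out greedily with argmax. A toy sketch of that greedy read-out (random numbers standing in for the real `softmax_output`):

```python
# Toy sketch of the greedy read-out applied to the decoder's softmax output
# (the real `out` comes from sess.run in the next cell).
import numpy as np

toy_vocab = ['PAD', 'EOS', 'hello', 'world']      # hypothetical tiny vocabulary
toy_out = np.random.rand(5, len(toy_vocab))       # (output positions, vocab size)
decoded = [toy_vocab[int(np.argmax(row))] for row in toy_out]
```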
1981 | 1982 | 1983 | ```python 1984 | with tf.Session() as sess: # Begin session 1985 | 1986 | print('Loading pre-trained weights for the model...') 1987 | saver = tf.train.Saver() 1988 | saver.restore(sess, 'Model_Backup/translation_model.ckpt') 1989 | sess.run(tf.global_variables()) 1990 | print('\nRESTORATION COMPLETE\n') 1991 | 1992 | 1993 | test = ['who','are','you'] # Enter tokenized text here 1994 | test = map(word2vec,test) 1995 | test = np.asarray(test,np.float32) 1996 | test = test.reshape((1,test.shape[0],test.shape[1])) 1997 | 1998 | input_seq_len = test.shape[1] # sequence length is along axis 1 after the reshape above 1999 | pe_in = positional_encoding(input_seq_len,word_vec_dim) 2000 | pe_in = pe_in.reshape((1,input_seq_len,word_vec_dim)) 2001 | test_pe = test+pe_in 2002 | 2003 | output_seq_len = int(input_seq_len+20) 2004 | illegal_position_masks = generate_masks_for_illegal_positions(output_seq_len) 2005 | pe_out = positional_encoding(output_seq_len,word_vec_dim) 2006 | pe_out = pe_out.reshape((output_seq_len,1,word_vec_dim)) 2007 | 2008 | out = sess.run(softmax_output, 2009 | feed_dict={x: test_pe, 2010 | y: np.zeros((1,1),np.int32), 2011 | # y's value doesn't matter here; 2012 | # it is fed only because the network graph requires y, 2013 | # but it isn't actually used when teacher_forcing is False. 2014 | keep_prob: 1, 2015 | output_len: output_seq_len, 2016 | tf_pe_out: pe_out, 2017 | tf_illegal_position_masks: illegal_position_masks, 2018 | teacher_forcing: False 2019 | }) 2020 | 2021 | for array in out[0]: 2022 | if vocab_beng[np.argmax(array)] != '': 2023 | print(vocab_beng[np.argmax(array)],end=' ') 2024 | 2025 | 2026 | 2027 | ``` 2028 | 2029 | Loading pre-trained weights for the model... 2030 | INFO:tensorflow:Restoring parameters from Model_Backup/translation_model.ckpt 2031 | 2032 | RESTORATION COMPLETE 2033 | 2034 | আপলোড করছিল। করছিস করছিল। করছিল। করছিল। করছিস করছিস করছিস করছিস করছিল। করছিস করছিস করছিস করছিল। 2035 | 2036 | ### Some comments: 2037 | 2038 | The model seems to fit the training data well even with only 1 encoder layer and 1 decoder layer. 2039 | In fact, it seems to fit better when trained with a single encoder and decoder layer. 2040 | However, the model is most likely 'memorizing' and overfitting. I tried some predictions above, and 2041 | the results aren't good. Things will become clearer with validation, evaluation metrics and testing. 2042 | 2043 | Also, the model may not learn to generalize very well, given that there are only **4378** data samples. 2044 | Most deep learning models would probably overfit on this. 2045 | 2046 | At each timestep the decoder output has the shape batch_size x sequence_length x word_vector_dimensions. 2047 | I then sum the decoder output along the second axis, which reduces the shape to batch_size x word_vector_dimensions. 2048 | This ensures that the final output is influenced by all the vectors in the immediate decoder output. 2049 | A linear layer could achieve the same goal, but a summation seems to work - at least for fitting the data - 2050 | and a simple summation doesn't require extra parameters. 2051 | 2052 | I am not sure what the original implementation does with the decoder output before converting it linearly into a probability distribution. 2053 | 2054 | I am using the output-language word embedding matrix (say E) to convert the transformed decoder output into a probability distribution.
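A toy NumPy sketch of this read-out, with made-up sizes and `E` standing in for the Bengali embedding matrix:

```python
# Toy version of the sum-and-project read-out described above (hypothetical shapes).
import numpy as np

batch, seq_len, dim, vocab = 2, 7, 16, 50
dec_out = np.random.randn(batch, seq_len, dim)   # decoder output at one timestep
E = np.random.randn(vocab, dim)                  # output-language embedding matrix

summed = dec_out.sum(axis=1)                     # (batch, dim)
logits = summed.dot(E.T)                         # (batch, vocab); softmax gives the distribution
```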
2055 | 2056 | Probability distribution of next word = (decoder_output after summation along 2nd axis) x transpose(E) 2057 | 2058 | The paper recommended something along that line. Using embedding matrix seemed to produce much better results. 2059 | 2060 | Even though I included an option to use randomzied teacher's forcing, I kept teacher forcing off throughout this training to check if it can still fit on the training data. 2061 | 2062 | 2063 | ### TO DO 2064 | 2065 | * Evaluation (BLEU\METEOR etc.) 2066 | * Validation 2067 | * Testing 2068 | 2069 | (For now, I was just checking if the model can at least fit on the training data- whether it overfits or not is to yet to be checked) 2070 | 2071 | -------------------------------------------------------------------------------- /Translation_preprocess.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # # Preprocessing Translation Data 5 | 6 | # ### Function for expanding english contractions 7 | # 8 | # source: https://gist.github.com/nealrs/96342d8231b75cf4bb82 9 | 10 | # In[1]: 11 | 12 | 13 | import numpy as np 14 | from __future__ import division 15 | import io 16 | import unicodedata 17 | import nltk 18 | from nltk import word_tokenize 19 | import string 20 | import re 21 | import random 22 | 23 | 24 | #source: https://gist.github.com/nealrs/96342d8231b75cf4bb82 25 | cList = { 26 | "ain't": "am not", 27 | "aren't": "are not", 28 | "can't": "cannot", 29 | "can't've": "cannot have", 30 | "'cause": "because", 31 | "could've": "could have", 32 | "couldn't": "could not", 33 | "couldn't've": "could not have", 34 | "didn't": "did not", 35 | "doesn't": "does not", 36 | "don't": "do not", 37 | "hadn't": "had not", 38 | "hadn't've": "had not have", 39 | "hasn't": "has not", 40 | "haven't": "have not", 41 | "he'd": "he would", 42 | "he'd've": "he would have", 43 | "he'll": "he will", 44 | "he'll've": "he will have", 45 | "he's": "he is", 46 | "how'd": "how did", 47 | "how'd'y": "how do you", 48 | "how'll": "how will", 49 | "how's": "how is", 50 | "I'd": "I would", 51 | "I'd've": "I would have", 52 | "I'll": "I will", 53 | "I'll've": "I will have", 54 | "I'm": "I am", 55 | "I've": "I have", 56 | "isn't": "is not", 57 | "it'd": "it had", 58 | "it'd've": "it would have", 59 | "it'll": "it will", 60 | "it'll've": "it will have", 61 | "it's": "it is", 62 | "let's": "let us", 63 | "ma'am": "madam", 64 | "mayn't": "may not", 65 | "might've": "might have", 66 | "mightn't": "might not", 67 | "mightn't've": "might not have", 68 | "must've": "must have", 69 | "mustn't": "must not", 70 | "mustn't've": "must not have", 71 | "needn't": "need not", 72 | "needn't've": "need not have", 73 | "o'clock": "of the clock", 74 | "oughtn't": "ought not", 75 | "oughtn't've": "ought not have", 76 | "shan't": "shall not", 77 | "sha'n't": "shall not", 78 | "shan't've": "shall not have", 79 | "she'd": "she would", 80 | "she'd've": "she would have", 81 | "she'll": "she will", 82 | "she'll've": "she will have", 83 | "she's": "she is", 84 | "should've": "should have", 85 | "shouldn't": "should not", 86 | "shouldn't've": "should not have", 87 | "so've": "so have", 88 | "so's": "so is", 89 | "that'd": "that would", 90 | "that'd've": "that would have", 91 | "that's": "that is", 92 | "there'd": "there had", 93 | "there'd've": "there would have", 94 | "there's": "there is", 95 | "they'd": "they would", 96 | "they'd've": "they would have", 97 | "they'll": "they will", 98 | "they'll've": "they will have", 99 | "they're": 
"they are", 100 | "they've": "they have", 101 | "to've": "to have", 102 | "wasn't": "was not", 103 | "we'd": "we had", 104 | "we'd've": "we would have", 105 | "we'll": "we will", 106 | "we'll've": "we will have", 107 | "we're": "we are", 108 | "we've": "we have", 109 | "weren't": "were not", 110 | "what'll": "what will", 111 | "what'll've": "what will have", 112 | "what're": "what are", 113 | "what's": "what is", 114 | "what've": "what have", 115 | "when's": "when is", 116 | "when've": "when have", 117 | "where'd": "where did", 118 | "where's": "where is", 119 | "where've": "where have", 120 | "who'll": "who will", 121 | "who'll've": "who will have", 122 | "who's": "who is", 123 | "who've": "who have", 124 | "why's": "why is", 125 | "why've": "why have", 126 | "will've": "will have", 127 | "won't": "will not", 128 | "won't've": "will not have", 129 | "would've": "would have", 130 | "wouldn't": "would not", 131 | "wouldn't've": "would not have", 132 | "y'all": "you all", 133 | "y'alls": "you alls", 134 | "y'all'd": "you all would", 135 | "y'all'd've": "you all would have", 136 | "y'all're": "you all are", 137 | "y'all've": "you all have", 138 | "you'd": "you had", 139 | "you'd've": "you would have", 140 | "you'll": "you you will", 141 | "you'll've": "you you will have", 142 | "you're": "you are", 143 | "you've": "you have" 144 | } 145 | 146 | c_re = re.compile('(%s)' % '|'.join(cList.keys())) 147 | 148 | def expandContractions(text, c_re=c_re): 149 | def replace(match): 150 | return cList[match.group(0)] 151 | return c_re.sub(replace, text) 152 | 153 | 154 | # ### Loading Translation Data 155 | # 156 | # Splitting the data into eng and beng. 157 | # eng will contain the list of English lines, and beng will contain the corresponding list of Bengali lines. 158 | # 159 | # 160 | # Source of data: http://www.manythings.org/anki/ (downloaded ben-eng) 161 | 162 | # In[2]: 163 | 164 | 165 | filename = 'ben.txt' 166 | #Datasource: http://www.manythings.org/anki/ 167 | 168 | # http://stackoverflow.com/a/518232/2809427 169 | def unicodeToAscii(s): 170 | return ''.join( 171 | c for c in unicodedata.normalize('NFD', s) 172 | if unicodedata.category(c) != 'Mn' 173 | ) 174 | 175 | def normalizeString(s): 176 | s = unicodeToAscii(expandContractions(s.lower().strip())) 177 | s = re.sub(r"([.!?,\"])", r" ", s) 178 | return s 179 | 180 | def loaddata(filename): 181 | file = io.open(filename,'r') 182 | eng=[] 183 | beng = [] 184 | for line in file.readlines(): 185 | lang_pair = line.split('\t') 186 | lang_pair[0] = normalizeString(lang_pair[0]) 187 | lang_pair[1] = normalizeString(lang_pair[1]) 188 | eng.append(word_tokenize(lang_pair[0])) 189 | beng.append(word_tokenize(lang_pair[1])) 190 | file.close() 191 | return eng,beng 192 | 193 | eng,beng = loaddata(filename) 194 | 195 | #Example: 196 | sample = random.randint(0,len(eng)) 197 | print "Example Sample #"+str(sample)+":\n" 198 | string = "ENGLISH:" 199 | for i in xrange(0,len(eng[sample])): 200 | string+=" "+eng[sample][i] 201 | print string 202 | 203 | string = "\nBENGALI:" 204 | for i in xrange(0,len(beng[sample])): 205 | string+=" "+beng[sample][i] 206 | print string 207 | 208 | 209 | # ### Creating separate vocabulary lists for English words and Bengali words 210 | # 211 | # The index of vocabulary will represent the numerical representation of the word which is the value of vocabulary at that index. 
212 | # 213 | 214 | # In[3]: 215 | 216 | 217 | import numpy as np 218 | 219 | vocab_eng=[] 220 | vocab_eng.append('') 221 | vocab_eng.append('') 222 | 223 | vocab_beng=[] 224 | vocab_beng.append('') 225 | vocab_beng.append('') 226 | 227 | #The index of vocab will serve as an integer representation of the word 228 | 229 | vectorized_eng = [] 230 | vectorized_beng = [] 231 | 232 | for i in xrange(len(eng)): 233 | 234 | vectorized_eng_line = [] 235 | for word in eng[i]: 236 | if word not in vocab_eng: 237 | vocab_eng.append(word) 238 | vectorized_eng_line.append(vocab_eng.index(word)) 239 | else: 240 | vectorized_eng_line.append(vocab_eng.index(word)) 241 | vectorized_eng.append(vectorized_eng_line) 242 | 243 | vectorized_beng_line = [] 244 | for word in beng[i]: 245 | if word not in vocab_beng: 246 | vocab_beng.append(word) 247 | vectorized_beng_line.append(vocab_beng.index(word)) 248 | else: 249 | vectorized_beng_line.append(vocab_beng.index(word)) 250 | vectorized_beng.append(vectorized_beng_line) 251 | 252 | 253 | 254 | 255 | # ### Creating training dataset for word2vec embedding 256 | # 257 | # if the sentence is "I am alright" 258 | # 259 | # then for the word 'am', the context words with window size 1 will be "I" and "alright" 260 | # i.e ["I","alright"] 261 | # 262 | # For 'I' the context words will be "PAD" and "am" 263 | # 264 | # For 'alright' the context words will be "am" and "PAD" 265 | # 266 | # PAD represents empty and EOS represents end of sentence. 267 | # 268 | # Later lots of pads may be applied after the end of sentence to fit sequence length. 269 | # 270 | # So I also added the word PAD with context words being PADs, and PAD and EOS for embedding. 271 | # 272 | # In this way, first, from each sentence, I am creating a list of words, and corresponding list of context words. 
273 | # Doing the same thing for 274 | 275 | # In[4]: 276 | 277 | 278 | words_eng = [] 279 | contexts_eng = [] 280 | 281 | words_beng = [] 282 | contexts_beng = [] 283 | 284 | words_eng.append(vocab_eng.index('')) 285 | contexts_eng.append([vocab_eng.index(''),vocab_eng.index('')]) 286 | words_eng.append(vocab_eng.index('')) 287 | contexts_eng.append([vocab_eng.index(''),vocab_eng.index('')]) 288 | 289 | words_beng.append(vocab_beng.index('')) 290 | contexts_beng.append([vocab_beng.index(''),vocab_beng.index('')]) 291 | words_beng.append(vocab_beng.index('')) 292 | contexts_beng.append([vocab_beng.index(''),vocab_beng.index('')]) 293 | 294 | 295 | for i in xrange(len(vectorized_eng)): 296 | 297 | for j in xrange(0,len(vectorized_eng[i])): 298 | 299 | context1=0 300 | context2=0 301 | 302 | if j==0: 303 | context1 = vocab_eng.index('') 304 | if j!=len(vectorized_eng[i])-1: 305 | context2 = vectorized_eng[i][j+1] 306 | if j==len(vectorized_eng[i])-1: 307 | context2=vocab_eng.index('') 308 | if j!=0: 309 | context1 = vectorized_eng[i][j-1] 310 | if j>0 and j for training data 319 | words_eng.append(vocab_eng.index('')) 320 | context1 = vectorized_eng[i][len(vectorized_eng[i])-1] 321 | context2 = vocab_eng.index('') 322 | contexts_eng.append([context1,context2]) 323 | 324 | for j in xrange(0,len(vectorized_beng[i])): 325 | 326 | context1=0 327 | context2=0 328 | 329 | if j==0: 330 | context1 = vocab_beng.index('') 331 | if j!=len(vectorized_beng[i])-1: 332 | context2 = vectorized_beng[i][j+1] 333 | if j==len(vectorized_beng[i])-1: 334 | context2=vocab_beng.index('') 335 | if j!=0: 336 | context1 = vectorized_beng[i][j-1] 337 | if j>0 and j for training data 346 | words_beng.append(vocab_beng.index('')) 347 | context1 = vectorized_beng[i][len(vectorized_beng[i])-1] 348 | context2 = vocab_beng.index('') 349 | contexts_beng.append([context1,context2]) 350 | 351 | 352 | 353 | 354 | 355 | # If word = "am" and context = ["I","alright"], 356 | # then, here I will reconstrcut the data as: 357 | # 358 | # input = "am" 359 | # output = "I" 360 | # and 361 | # input = "am" 362 | # label = "alright" 363 | # 364 | # Like this I will construct a list of all training inputs (words) and training outputs\labels (context words) 365 | # 366 | # embd_inputs_eng will contain all the English training inputs. 367 | # embd_labels_eng will contain all the English training labels. 368 | # 369 | # embd_inputs_beng will contain all the Bengali training inputs. 370 | # embd_labels_beng will contain all the Bengali training labels. 
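# A toy version (made-up numbers, not the real lists) of the flattening done in
# the next cell: each (word, [left, right]) pair becomes two (input, label) rows.
toy_words = [5, 9]
toy_contexts = [[2, 7], [5, 11]]
toy_inputs, toy_labels = [], []
for w, ctx in zip(toy_words, toy_contexts):
    for c in ctx:
        toy_inputs.append(w)      # the centre word is repeated once per context word
        toy_labels.append(c)
# toy_inputs -> [5, 5, 9, 9] ; toy_labels -> [2, 7, 5, 11]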
371 | 372 | # In[5]: 373 | 374 | 375 | embd_inputs_eng = [] 376 | embd_labels_eng = [] 377 | for i in xrange(len(contexts_eng)): 378 | for context in contexts_eng[i]: 379 | embd_inputs_eng.append(words_eng[i]) 380 | embd_labels_eng.append(context) 381 | embd_inputs_eng = np.asarray(embd_inputs_eng,np.int32) 382 | embd_labels_eng = np.asarray(embd_labels_eng,np.int32) 383 | 384 | embd_inputs_beng = [] 385 | embd_labels_beng = [] 386 | for i in xrange(len(contexts_beng)): 387 | for context in contexts_beng[i]: 388 | embd_inputs_beng.append(words_beng[i]) 389 | embd_labels_beng.append(context) 390 | embd_inputs_beng = np.asarray(embd_inputs_beng,np.int32) 391 | embd_labels_beng = np.asarray(embd_labels_beng,np.int32) 392 | 393 | 394 | 395 | # ### Function for generating mini-batches from the total training set 396 | 397 | # In[6]: 398 | 399 | 400 | batch_size = 128 401 | 402 | def generate_batch(inputs,labels,batch_size): 403 | rand = random.sample((np.arange(len(inputs))),batch_size) 404 | batch_inputs=[] 405 | batch_labels=[] 406 | for i in xrange(batch_size): 407 | batch_inputs.append(inputs[int(rand[i])]) 408 | batch_labels.append(labels[int(rand[i])]) 409 | batch_inputs = np.asarray(batch_inputs,np.int32) 410 | batch_labels = np.asarray(batch_labels,np.int32) 411 | return batch_inputs,batch_labels 412 | 413 | 414 | 415 | # ### Preparing for word2vec embedding 416 | 417 | # In[7]: 418 | 419 | 420 | import tensorflow as tf 421 | import math 422 | 423 | #https://www.tensorflow.org/tutorials/word2vec 424 | embedding_size = 256 425 | vocabulary_size_eng = len(vocab_eng) 426 | vocabulary_size_beng = len(vocab_beng) 427 | 428 | # Placeholders for inputs 429 | train_inputs = tf.placeholder(tf.int32, shape=[batch_size]) 430 | train_labels = tf.placeholder(tf.int32, shape=[batch_size,1]) 431 | 432 | 433 | # ### Training for word2vec embedding (For English words) 434 | # 435 | # See: https://www.tensorflow.org/tutorials/word2vec 436 | # 437 | # for details of word2vec and code description 438 | 439 | # In[8]: 440 | 441 | 442 | embeddings_eng = tf.Variable( 443 | tf.random_uniform([vocabulary_size_eng, embedding_size], -1.0, 1.0)) 444 | 445 | nce_weights_eng = tf.Variable( 446 | tf.truncated_normal([vocabulary_size_eng, embedding_size], 447 | stddev=1.0 / math.sqrt(embedding_size))) 448 | nce_biases_eng = tf.Variable(tf.zeros([vocabulary_size_eng])) 449 | 450 | # Initializing the variables 451 | init = tf.global_variables_initializer() 452 | 453 | 454 | # In[9]: 455 | 456 | 457 | embed_eng = tf.nn.embedding_lookup(embeddings_eng, train_inputs) 458 | 459 | # Compute the NCE loss, using a sample of the negative labels each time. 460 | loss = tf.reduce_mean( 461 | tf.nn.nce_loss(weights=nce_weights_eng, 462 | biases=nce_biases_eng, 463 | labels=train_labels, 464 | inputs=embed_eng, 465 | num_sampled=10, 466 | num_classes=vocabulary_size_eng)) #num_sampled = no. of negative samples 467 | 468 | # We use the SGD optimizer. 
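# (Note: tf.nn.nce_loss above learns the embeddings by contrasting each true
#  context word against num_sampled randomly drawn negative words, which avoids
#  computing a full softmax over the vocabulary; the plain gradient-descent step
#  below then updates the embedding and NCE parameters.)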
469 | optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss) 470 | 471 | 472 | 473 | 474 | # In[10]: 475 | 476 | 477 | 478 | with tf.Session() as sess: 479 | sess.run(init) 480 | convergence_threshold = 0.5 481 | training_iters = 500*(int((len(embd_inputs_eng))/batch_size)) 482 | step=0 483 | n=5 484 | last_n_losses = np.zeros((n),np.float32) 485 | 486 | while step=len(x): 641 | break 642 | 643 | batch_x = [] 644 | batch_y = [] 645 | 646 | max_len_x = len(sorted_x[i]) 647 | 648 | len_y= np.zeros((len(y)),np.int32) 649 | 650 | for j in xrange(i,i+batch_size): 651 | len_y[j] = len(sorted_y[j]) 652 | 653 | max_len_y = np.amax(len_y) 654 | 655 | for j in xrange(i,i+batch_size): 656 | line=[] 657 | for k1 in xrange(max_len_x+1): #+1 to include 658 | if k1==len(sorted_x[j]): 659 | line.append(np_embedding_eng[vocab_eng.index('')]) 660 | elif k1>len(sorted_x[j]): 661 | line.append(np_embedding_eng[vocab_eng.index('')]) 662 | else: 663 | line.append(np_embedding_eng[sorted_x[j][k1]]) 664 | batch_x.append(line) 665 | 666 | line=[] 667 | for k2 in xrange(max_len_y+1): #+1 to include 668 | if k2>len(sorted_y[j]): 669 | line.append(vocab_beng.index('')) 670 | elif k2==len(sorted_y[j]): 671 | line.append(vocab_beng.index('')) 672 | else: 673 | line.append(sorted_y[j][k2]) 674 | batch_y.append(line) 675 | 676 | batch_x = np.asarray(batch_x,np.float32) 677 | batch_y = np.asarray(batch_y,np.int32) 678 | 679 | batches_x.append(batch_x) 680 | batches_y.append(batch_y) 681 | 682 | i+=batch_size 683 | 684 | return batches_x,batches_y 685 | 686 | 687 | 688 | # ### Creating train, validation and test batches 689 | 690 | # In[16]: 691 | 692 | 693 | batch_size = 64 694 | 695 | train_batch_eng,train_batch_beng = bucket_and_batch(train_eng,train_beng,batch_size) 696 | 697 | val_batch_eng,val_batch_beng = bucket_and_batch(val_eng,val_beng,batch_size) 698 | 699 | test_batch_eng,test_batch_beng = bucket_and_batch(test_eng,test_beng,batch_size) 700 | 701 | 702 | # ### Saving processed data in another file. 703 | 704 | # In[17]: 705 | 706 | 707 | #Saving processed data in another file. 708 | 709 | import pickle 710 | 711 | PICK = [vocab_eng,vocab_beng,np_embedding_eng,np_embedding_beng,train_batch_eng,train_batch_beng,val_batch_eng,val_batch_beng,test_batch_eng,test_batch_beng] 712 | 713 | with open('translationPICKLE', 'wb') as fp: 714 | pickle.dump(PICK, fp) 715 | 716 | --------------------------------------------------------------------------------