├── LICENSE ├── Machine Translation.ipynb ├── Machine Translation.py ├── README.md └── Translation_preprocess.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Jishnu Ray Chowdhury 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Machine Translation.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # ### Loading Pre-processed Data 5 | 6 | # In[1]: 7 | 8 | 9 | import pickle 10 | import math 11 | import numpy as np 12 | 13 | 14 | with open ('translationPICKLE', 'rb') as fp: 15 | PICK = pickle.load(fp) 16 | 17 | vocab_eng = PICK[0] 18 | vocab_beng = PICK[1] 19 | vocab_len = len(vocab_beng) 20 | 21 | np_embedding_eng = PICK[2] 22 | np_embedding_beng = PICK[3] 23 | np_embedding_eng = np.asarray(np_embedding_eng,np.float32) 24 | np_embedding_beng = np.asarray(np_embedding_beng,np.float32) 25 | 26 | word_vec_dim = np_embedding_eng.shape[1] 27 | 28 | train_batch_x = PICK[4] 29 | train_batch_y = PICK[5] 30 | 31 | val_batch_x = PICK[6] 32 | val_batch_y = PICK[7] 33 | 34 | test_batch_x = PICK[8] 35 | test_batch_y = PICK[9] 36 | 37 | 38 | 39 | # ### Function for converting vector of size word_vec_dim into the closest representative english word. 40 | 41 | # In[2]: 42 | 43 | 44 | def most_similar_eucli_eng(x): 45 | xminusy = np.subtract(np_embedding_eng,x) 46 | sq_xminusy = np.square(xminusy) 47 | sum_sq_xminusy = np.sum(sq_xminusy,1) 48 | eucli_dists = np.sqrt(sum_sq_xminusy) 49 | return np.argsort(eucli_dists) 50 | 51 | def vec2word_eng(vec): # converts a given vector representation into the represented word 52 | most_similars = most_similar_eucli_eng(np.asarray(vec,np.float32)) 53 | return vocab_eng[most_similars[0]] 54 | 55 | 56 | 57 | # ### Hyperparameters and Placeholders. 58 | 59 | # In[3]: 60 | 61 | 62 | import tensorflow as tf 63 | 64 | #Hyperparamters 65 | 66 | h=8 #no. of heads 67 | N=1 #no. 
of decoder and encoder layers 68 | learning_rate=0.001 69 | epochs = 200 70 | keep_prob = tf.placeholder(tf.float32) 71 | 72 | #Placeholders 73 | 74 | x = tf.placeholder(tf.float32, [None,None,word_vec_dim]) 75 | y = tf.placeholder(tf.int32, [None,None]) 76 | 77 | output_len = tf.placeholder(tf.int32) 78 | 79 | teacher_forcing = tf.placeholder(tf.bool) 80 | 81 | tf_pad_mask = tf.placeholder(tf.float32,[None,None]) 82 | tf_illegal_position_masks = tf.placeholder(tf.float32,[None,None,None]) 83 | 84 | tf_pe_out = tf.placeholder(tf.float32,[None,None,None]) #positional codes for output 85 | 86 | 87 | # ### Model Parameters. 88 | 89 | # In[4]: 90 | 91 | 92 | 93 | # Dimensions for Q (Query),K (Keys) and V (Values) for attention layers. 94 | 95 | dqkv = 32 96 | 97 | #Parameters for attention sub-layers for all n encoders 98 | 99 | Wq_enc = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 100 | Wk_enc = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 101 | Wv_enc = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 102 | Wo_enc = tf.Variable(tf.truncated_normal(shape=[N,h*dqkv,word_vec_dim],stddev=0.01)) 103 | 104 | #Parameters for position-wise fully connected layers for n encoders 105 | 106 | d = 1024 107 | W1_enc = tf.Variable(tf.truncated_normal(shape=[N,1,1,word_vec_dim,d],stddev=0.01)) 108 | b1_enc = tf.Variable(tf.constant(0,tf.float32,shape=[N,d])) 109 | W2_enc = tf.Variable(tf.truncated_normal(shape=[N,1,1,d,word_vec_dim],stddev=0.01)) 110 | b2_enc = tf.Variable(tf.constant(0,tf.float32,shape=[N,word_vec_dim])) 111 | 112 | #Parameters for 2 attention sub-layers for all n decoders 113 | 114 | Wq_dec_1 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 115 | Wk_dec_1 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 116 | Wv_dec_1 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 117 | Wo_dec_1 = tf.Variable(tf.truncated_normal(shape=[N,h*dqkv,word_vec_dim],stddev=0.01)) 118 | Wq_dec_2 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 119 | Wk_dec_2 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 120 | Wv_dec_2 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 121 | Wo_dec_2 = tf.Variable(tf.truncated_normal(shape=[N,h*dqkv,word_vec_dim],stddev=0.01)) 122 | 123 | #Parameters for position-wise fully connected layers for n decoders 124 | 125 | d = 1024 126 | W1_dec = tf.Variable(tf.truncated_normal(shape=[N,1,1,word_vec_dim,d],stddev=0.01)) 127 | b1_dec = tf.Variable(tf.constant(0,tf.float32,shape=[N,d])) 128 | W2_dec = tf.Variable(tf.truncated_normal(shape=[N,1,1,d,word_vec_dim],stddev=0.01)) 129 | b2_dec = tf.Variable(tf.constant(0,tf.float32,shape=[N,word_vec_dim])) 130 | 131 | #Layer Normalization parameters for encoder and decoder 132 | 133 | scale_enc_1 = tf.Variable(tf.ones([N,1,1,word_vec_dim]),dtype=tf.float32) 134 | shift_enc_1 = tf.Variable(tf.zeros([N,1,1,word_vec_dim]),dtype=tf.float32) 135 | 136 | scale_enc_2 = tf.Variable(tf.ones([N,1,1,word_vec_dim]),dtype=tf.float32) 137 | shift_enc_2 = tf.Variable(tf.zeros([N,1,1,word_vec_dim]),dtype=tf.float32) 138 | 139 | #Layer Normalization parameters for decoder 140 | 141 | scale_dec_1 = tf.Variable(tf.ones([N,1,1,word_vec_dim]),dtype=tf.float32) 142 | shift_dec_1 = tf.Variable(tf.zeros([N,1,1,word_vec_dim]),dtype=tf.float32) 143 | 144 | scale_dec_2 = 
tf.Variable(tf.ones([N,1,1,word_vec_dim]),dtype=tf.float32) 145 | shift_dec_2 = tf.Variable(tf.zeros([N,1,1,word_vec_dim]),dtype=tf.float32) 146 | 147 | scale_dec_3 = tf.Variable(tf.ones([N,1,1,word_vec_dim]),dtype=tf.float32) 148 | shift_dec_3 = tf.Variable(tf.zeros([N,1,1,word_vec_dim]),dtype=tf.float32) 149 | 150 | 151 | # ### Function for generating a sequence of positional codes for positional encoding. 152 | 153 | # In[5]: 154 | 155 | 156 | def positional_encoding(seq_len,model_dimensions): 157 | pe = np.zeros((seq_len,model_dimensions,),np.float32) 158 | for pos in xrange(0,seq_len): 159 | for i in xrange(0,model_dimensions): 160 | pe[pos][i] = math.sin(pos/(10000**(2*i/model_dimensions))) 161 | return pe.reshape((seq_len,model_dimensions)) 162 | 163 | 164 | # ### Function for Layer Normalization 165 | # 166 | # https://arxiv.org/abs/1607.06450 167 | 168 | # In[6]: 169 | 170 | 171 | #modified version of def LN used here: 172 | #https://theneuralperspective.com/2016/10/27/gradient-topics/ 173 | 174 | def layer_norm(inputs,scale,shift,epsilon = 1e-5): 175 | 176 | mean, var = tf.nn.moments(inputs, [1,2], keep_dims=True) 177 | 178 | LN = tf.multiply((scale / tf.sqrt(var + epsilon)),(inputs - mean)) + shift 179 | 180 | return LN 181 | 182 | 183 | # ### Function to pre-generate masks for illegal positions. 184 | # 185 | # These masks are to be used to fill illegal positions with -infinity (or a very low value eg. -2^30). 186 | # 187 | # Illegal positions are positions of the decoder input tokens that aren't predicted at a given timestep. 188 | # 189 | # { In a transformer, the decoder input is of the same shape as the WHOLE decoder output sequence. One word for the sequence is predicted at each timestep (from left to right). So in most timesteps, the left side of the decoder input sequence will contain valid previously predicted output words, but the right side -the yet to be predicted side should contain some values that should be ignored and never attended. We make sure that they're ignored by masking it } 190 | # 191 | # So, the illegal positions depends on the total output length and the no. of predicted output tokens. 192 | # 193 | # The appropriate mask when i output tokens are predicted can be retrieved from mask[i-1] where mask is the return value from this function. The argument out_len that function takes, signifies the total length of the output. 194 | # 195 | # The masks are used to assign the value -2^30 to all positions in the tensor influenced by the illegal ones. 196 | # After going through the softmax layer, these positions become close to 0, as it should be. 197 | # 198 | # Dynamically creating masks depending on the current position\timestep (depending on which the program can know which positions are legal and which aren't) is, however, 199 | # a bit troublesome with tensorflow tf_while_loop. 200 | # 201 | # I will be pre-generating all the masks with Python native code and feed the list of all required masks to the network at each training step (output length can be different at different training steps). 202 | # 203 | 204 | # In[7]: 205 | 206 | 207 | def generate_masks_for_illegal_positions(out_len): 208 | 209 | masks=np.zeros((out_len-1,out_len,out_len),dtype=np.float32) 210 | 211 | for i in xrange(1,out_len): 212 | mask = np.zeros((out_len,out_len),dtype=np.float32) 213 | mask[i:out_len,:] = -2**30 214 | mask[:,i:out_len] = -2**30 215 | masks[i-1] = mask 216 | 217 | return masks 218 | 219 | 220 | # ### Function for Multi-Headed Attention. 
221 | # 222 | # Details: https://arxiv.org/pdf/1706.03762.pdf 223 | # 224 | # Q = Query 225 | # 226 | # K = Key 227 | # 228 | # V = Value 229 | # 230 | # d is the dimension for Q, K and V. 231 | 232 | # In[8]: 233 | 234 | 235 | 236 | def attention(Q,K,V,d,filled=0,mask=False): 237 | 238 | K = tf.transpose(K,[0,2,1]) 239 | d = tf.cast(d,tf.float32) 240 | 241 | softmax_component = tf.div(tf.matmul(Q,K),tf.sqrt(d)) 242 | 243 | if mask == True: 244 | softmax_component = softmax_component + tf_illegal_position_masks[filled-1] 245 | 246 | result = tf.matmul(tf.nn.dropout(tf.nn.softmax(softmax_component),keep_prob),V) 247 | 248 | return result 249 | 250 | 251 | def multihead_attention(Q,K,V,d,weights,filled=0,mask=False): 252 | 253 | Q_ = tf.reshape(Q,[-1,tf.shape(Q)[2]]) 254 | K_ = tf.reshape(K,[-1,tf.shape(Q)[2]]) 255 | V_ = tf.reshape(V,[-1,tf.shape(Q)[2]]) 256 | 257 | heads = tf.TensorArray(size=h,dtype=tf.float32) 258 | 259 | Wq = weights['Wq'] 260 | Wk = weights['Wk'] 261 | Wv = weights['Wv'] 262 | Wo = weights['Wo'] 263 | 264 | for i in xrange(0,h): 265 | 266 | Q_w = tf.matmul(Q_,Wq[i]) 267 | Q_w = tf.reshape(Q_w,[tf.shape(Q)[0],tf.shape(Q)[1],d]) 268 | 269 | K_w = tf.matmul(K_,Wk[i]) 270 | K_w = tf.reshape(K_w,[tf.shape(K)[0],tf.shape(K)[1],d]) 271 | 272 | V_w = tf.matmul(V_,Wv[i]) 273 | V_w = tf.reshape(V_w,[tf.shape(V)[0],tf.shape(V)[1],d]) 274 | 275 | head = attention(Q_w,K_w,V_w,d,filled,mask) 276 | 277 | heads = heads.write(i,head) 278 | 279 | heads = heads.stack() 280 | 281 | concated = heads[0] 282 | 283 | for i in xrange(1,h): 284 | concated = tf.concat([concated,heads[i]],2) 285 | 286 | concated = tf.reshape(concated,[-1,h*d]) 287 | out = tf.matmul(concated,Wo) 288 | out = tf.reshape(out,[tf.shape(heads)[1],tf.shape(heads)[2],word_vec_dim]) 289 | 290 | return out 291 | 292 | 293 | 294 | # ### Function for encoder 295 | # 296 | # More details: https://arxiv.org/pdf/1706.03762.pdf 297 | 298 | # In[9]: 299 | 300 | 301 | def encoder(x,weights,attention_weights,dqkv): 302 | 303 | W1 = weights['W1'] 304 | W2 = weights['W2'] 305 | b1 = weights['b1'] 306 | b2 = weights['b2'] 307 | 308 | scale1 = weights['scale1'] 309 | shift1 = weights['shift1'] 310 | scale2 = weights['scale2'] 311 | shift2 = weights['shift2'] 312 | 313 | # SUBLAYER 1 (MASKED MULTI HEADED SELF ATTENTION) 314 | 315 | sublayer1 = multihead_attention(x,x,x,dqkv,attention_weights) 316 | sublayer1 = tf.nn.dropout(sublayer1,keep_prob) 317 | sublayer1 = layer_norm(sublayer1 + x,scale1,shift1) 318 | 319 | sublayer1_ = tf.reshape(sublayer1,[tf.shape(sublayer1)[0],1,tf.shape(sublayer1)[1],word_vec_dim]) 320 | 321 | # SUBLAYER 2 (TWO 1x1 CONVOLUTIONAL LAYERS AKA POSITION WISE FULLY CONNECTED NETWORKS) 322 | 323 | sublayer2 = tf.nn.conv2d(sublayer1_, W1, strides=[1,1,1,1], padding='SAME') 324 | sublayer2 = tf.nn.bias_add(sublayer2,b1) 325 | sublayer2 = tf.nn.relu(sublayer2) 326 | 327 | sublayer2 = tf.nn.conv2d(sublayer2, W2, strides=[1,1,1,1], padding='SAME') 328 | sublayer2 = tf.nn.bias_add(sublayer2,b2) 329 | 330 | sublayer2 = tf.reshape(sublayer2,[tf.shape(sublayer2)[0],tf.shape(sublayer2)[2],word_vec_dim]) 331 | 332 | sublayer2 = tf.nn.dropout(sublayer2,keep_prob) 333 | sublayer2 = layer_norm(sublayer2 + sublayer1,scale2,shift2) 334 | 335 | return sublayer2 336 | 337 | 338 | # ### Function for decoder 339 | # 340 | # More details: https://arxiv.org/pdf/1706.03762.pdf 341 | 342 | # In[10]: 343 | 344 | 345 | def decoder(y,enc_out,weights,masked_attention_weights,attention_weights,dqkv,mask=False,filled=0): 346 | 347 | W1 = 
weights['W1'] 348 | W2 = weights['W2'] 349 | b1 = weights['b1'] 350 | b2 = weights['b2'] 351 | 352 | scale1 = weights['scale1'] 353 | shift1 = weights['shift1'] 354 | scale2 = weights['scale2'] 355 | shift2 = weights['shift2'] 356 | scale3 = weights['scale3'] 357 | shift3 = weights['shift3'] 358 | 359 | # SUBLAYER 1 (MASKED MULTI HEADED SELF ATTENTION) 360 | 361 | sublayer1 = multihead_attention(y,y,y,dqkv,masked_attention_weights,filled,mask) 362 | sublayer1 = tf.nn.dropout(sublayer1,keep_prob) 363 | sublayer1 = layer_norm(sublayer1 + y,scale1,shift1) 364 | 365 | # SUBLAYER 2 (MULTIHEADED ENCODER-DECODER INTERLAYER ATTENTION) 366 | 367 | sublayer2 = multihead_attention(sublayer1,enc_out,enc_out,dqkv,attention_weights) 368 | sublayer2 = tf.nn.dropout(sublayer2,keep_prob) 369 | sublayer2 = layer_norm(sublayer2 + sublayer1,scale2,shift2) 370 | 371 | # SUBLAYER 3 (TWO 1x1 CONVOLUTIONAL LAYERS AKA POSITION WISE FULLY CONNECTED NETWORKS) 372 | 373 | sublayer2_ = tf.reshape(sublayer2,[tf.shape(sublayer2)[0],1,tf.shape(sublayer2)[1],word_vec_dim]) 374 | 375 | sublayer3 = tf.nn.conv2d(sublayer2_, W1, strides=[1,1,1,1], padding='SAME') 376 | sublayer3 = tf.nn.bias_add(sublayer3,b1) 377 | sublayer3 = tf.nn.relu(sublayer3) 378 | 379 | sublayer3 = tf.nn.conv2d(sublayer3, W2, strides=[1,1,1,1], padding='SAME') 380 | sublayer3 = tf.nn.bias_add(sublayer3,b2) 381 | 382 | sublayer3 = tf.reshape(sublayer3,[tf.shape(sublayer3)[0],tf.shape(sublayer3)[2],word_vec_dim]) 383 | 384 | sublayer3 = tf.nn.dropout(sublayer3,keep_prob) 385 | sublayer3 = layer_norm(sublayer3 + sublayer2,scale3,shift3) 386 | 387 | return sublayer3 388 | 389 | 390 | # ### Function for Stacking Encoders. 391 | 392 | # In[11]: 393 | 394 | 395 | def stacked_encoders(layer_num,encoderin): 396 | 397 | for i in xrange(0,layer_num): 398 | 399 | encoder_weights = { 400 | 401 | 'W1': W1_enc[i], 402 | 'W2': W2_enc[i], 403 | 'b1': b1_enc[i], 404 | 'b2': b2_enc[i], 405 | 'scale1': scale_enc_1[i], 406 | 'shift1': shift_enc_1[i], 407 | 'scale2': scale_enc_2[i], 408 | 'shift2': shift_enc_2[i], 409 | } 410 | 411 | attention_weights = { 412 | 413 | 'Wq': Wq_enc[i], 414 | 'Wk': Wk_enc[i], 415 | 'Wv': Wv_enc[i], 416 | 'Wo': Wo_enc[i], 417 | } 418 | 419 | encoderin = encoder(encoderin,encoder_weights,attention_weights,dqkv) 420 | 421 | return encoderin 422 | 423 | 424 | 425 | # ### Function for Stacking Decoders. 
426 | 427 | # In[12]: 428 | 429 | 430 | def stacked_decoders(layer_num,decoderin,encoderout,filled): 431 | 432 | for j in xrange(0,layer_num): 433 | 434 | decoder_weights = { 435 | 436 | 'W1': W1_dec[j], 437 | 'W2': W2_dec[j], 438 | 'b1': b1_dec[j], 439 | 'b2': b2_dec[j], 440 | 'scale1': scale_dec_1[j], 441 | 'shift1': shift_dec_1[j], 442 | 'scale2': scale_dec_2[j], 443 | 'shift2': shift_dec_2[j], 444 | 'scale3': scale_dec_3[j], 445 | 'shift3': shift_dec_3[j], 446 | } 447 | 448 | masked_attention_weights = { 449 | 450 | 'Wq': Wq_dec_1[j], 451 | 'Wk': Wk_dec_1[j], 452 | 'Wv': Wv_dec_1[j], 453 | 'Wo': Wo_dec_1[j], 454 | } 455 | 456 | attention_weights = { 457 | 458 | 'Wq': Wq_dec_2[j], 459 | 'Wk': Wk_dec_2[j], 460 | 'Wv': Wv_dec_2[j], 461 | 'Wo': Wo_dec_2[j], 462 | } 463 | 464 | decoderin = decoder(decoderin,encoderout, 465 | decoder_weights, 466 | masked_attention_weights, 467 | attention_weights, 468 | dqkv, 469 | mask=True,filled=filled) 470 | return decoderin 471 | 472 | 473 | 474 | # ### predicted_embedding(): 475 | # 476 | # Given a probability distribution and an embedding matrix, this function returns the embedding of the word with the maximum probability in the given distribution. 477 | # 478 | # ### replaceSOS(): 479 | # 480 | # SOS signifies the start of sentence for the decoder. Also often represented as 'GO'. I am using an all ones vector as the first decoder input token. 481 | # In the next time step, the SOS will be forgotten, and only the context of the previously predicted output (or the target output at the previous timestep, if teacher forcing is on) will be used. 482 | # 483 | # ### add_pred_to_output_lists(): 484 | # 485 | # This function will concatenate the last predicted output into a tensor of concatenated sequence of output tokens. 486 | 487 | # In[13]: 488 | 489 | 490 | def predicted_embedding(out_prob_dist,tf_embd): 491 | out_index = tf.cast(tf.argmax(out_prob_dist,1),tf.int32) 492 | return tf.gather(tf_embd,out_index) 493 | 494 | def replaceSOS(output,out_prob_dist): 495 | return output,tf.constant(1),tf.reshape(out_prob_dist,[tf.shape(x)[0],1,vocab_len]) 496 | 497 | def add_pred_to_output_list(decoderin_part_1,output,filled,out_probs,out_prob_dist): 498 | decoderin_part_1 = tf.concat([decoderin_part_1,output],1) 499 | filled += 1 500 | out_probs = tf.concat([out_probs,tf.reshape(out_prob_dist,[tf.shape(x)[0],1,vocab_len])],1) 501 | return decoderin_part_1,filled,out_probs 502 | 503 | 504 | # ### Model Definition 505 | # 506 | # It follows the encoder-decoder paradigm. The main exception from standard encoder-decoder paradigm, is that it uses 'transformers' instead of Reccurrent networks. The decoder undergoes a sequential processing, though. 507 | # 508 | # If teacher forcing is True, the decoder is made to guess the next output from the previous words in the actual target output, else the decoder predicts the next output from the previously predicted output of the decoder. 
509 | # 510 | # Details about the model: https://arxiv.org/pdf/1706.03762.pdf 511 | 512 | # In[14]: 513 | 514 | 515 | def model(x,y,teacher_forcing=True): 516 | 517 | 518 | # NOTE: tf.shape(x)[0] == batch_size 519 | 520 | encoderin = x # (should be already positionally encoded) 521 | encoderin = tf.nn.dropout(encoderin,keep_prob) 522 | 523 | 524 | # ENCODER LAYERS 525 | 526 | encoderout = stacked_encoders(N,encoderin) 527 | 528 | 529 | # DECODER LAYERS 530 | 531 | decoderin_part_1 = tf.ones([tf.shape(x)[0],1,word_vec_dim],dtype=tf.float32) #represents SOS 532 | 533 | filled = tf.constant(1) 534 | # no. of output words that are filled 535 | # filled value is used to retrieve appropriate mask for illegal positions. 536 | 537 | 538 | tf_embd = tf.convert_to_tensor(np_embedding_beng) 539 | Wpd = tf.transpose(tf_embd) 540 | # Wpd the transpose of the output embedding matrix will be used to convert the decoder output 541 | # into a probability distribution over the output language vocabulary. 542 | 543 | out_probs = tf.zeros([tf.shape(x)[0],output_len,vocab_len],tf.float32) 544 | # out_probs will contain the list of probability distributions. 545 | 546 | #tf_while_loop since output_len will be dynamically defined during session run 547 | 548 | i=tf.constant(0) 549 | 550 | def cond(i,filled,decoderin_part_1,out_probs): 551 | return i') 648 | mask = np.ones_like((output_batch),np.float32) 649 | for i in xrange(len(mask)): 650 | for j in xrange(len(mask[i])): 651 | if output_batch[i,j]==pad_index: 652 | mask[i,j]=0 653 | return mask 654 | 655 | 656 | # ### Training ..... 657 | # 658 | # The input batch is positionally encoded before its fed to the network. 659 | 660 | # In[17]: 661 | 662 | 663 | import string 664 | import random 665 | from __future__ import print_function 666 | 667 | init = tf.global_variables_initializer() 668 | 669 | with tf.Session() as sess: # Start Tensorflow Session 670 | 671 | saver = tf.train.Saver() 672 | # Prepares variable for saving the model 673 | sess.run(init) #initialize all variables 674 | step = 0 675 | best_loss = 999 676 | display_step = 1 677 | warm_up_steps = 7000 678 | 679 | while step < epochs: 680 | 681 | batch_len = len(train_batch_x) 682 | shuffled_indices = np.arange(batch_len) 683 | np.random.shuffle(shuffled_indices) 684 | 685 | for i in xrange(0,batch_len): 686 | 687 | # Adaptive learning rate formula 688 | #learning_rate = ((word_vec_dim)**(-0.5))*min((step*batch_len+i+1)**(-0.5),(step*batch_len+i+1)*warm_up_steps**(-1.5)) 689 | 690 | sample_no = np.random.randint(0,len(train_batch_x[0])) 691 | print("\nCHOSEN SAMPLE NO.: "+str(sample_no)) 692 | 693 | if i%display_step==0: 694 | 695 | print("\nEpoch: "+str(step+1)+" Iteration: "+str(i+1)) 696 | print("\nSAMPLE TEXT:") 697 | for vec in train_batch_x[shuffled_indices[i]][sample_no]: 698 | print(vec2word_eng(vec),end=" ") 699 | print("\n") 700 | 701 | input_seq_len = len(train_batch_x[shuffled_indices[i]][0]) 702 | 703 | pe_in = positional_encoding(input_seq_len,word_vec_dim) 704 | pe_in = pe_in.reshape((1,input_seq_len,word_vec_dim)) 705 | 706 | output_seq_len = len(train_batch_y[shuffled_indices[i]][0]) 707 | 708 | 709 | 710 | illegal_position_masks = generate_masks_for_illegal_positions(output_seq_len) 711 | 712 | pe_out = positional_encoding(output_seq_len,word_vec_dim) 713 | pe_out = pe_out.reshape((output_seq_len,1,word_vec_dim)) 714 | 715 | 716 | rand = random.randint(0,2) #determines chance of using Teacher Forcing 717 | if rand==1: 718 | random_bool = True 719 | else: 720 | random_bool = False 
721 | 722 | pad_mask = create_pad_Mask(train_batch_y[shuffled_indices[i]]) 723 | 724 | # Run optimization operation (backpropagation) 725 | _,loss,out = sess.run([optimizer,cost,softmax_output], 726 | feed_dict={x: (train_batch_x[shuffled_indices[i]]+pe_in), 727 | y: train_batch_y[shuffled_indices[i]], 728 | keep_prob: 0.9, 729 | output_len: len(train_batch_y[shuffled_indices[i]][0]), 730 | tf_pad_mask: pad_mask, 731 | tf_illegal_position_masks: illegal_position_masks, 732 | tf_pe_out: pe_out, 733 | teacher_forcing: False #random_bool 734 | # feed random bool for randomized teacher forcing. 735 | }) 736 | 737 | if i%display_step==0: 738 | 739 | print("\nPREDICTED TRANSLATION OF THE SAMPLE:\n") 740 | flag = 0 741 | for array in out[sample_no]: 742 | 743 | #prediction_int = np.random.choice(range(vocab_len), p=array.ravel()) 744 | #(^use this if you want some variety) 745 | #(or use this what's below:) 746 | 747 | prediction_int = np.argmax(array) 748 | 749 | if vocab_beng[prediction_int] in string.punctuation or flag==0: 750 | print(vocab_beng[prediction_int],end='') 751 | else: 752 | print(" "+vocab_beng[prediction_int],end='') 753 | flag=1 754 | print("\n") 755 | 756 | print("ACTUAL TRANSLATION OF THE SAMPLE:\n") 757 | for index in train_batch_y[shuffled_indices[i]][sample_no]: 758 | print(vocab_beng[index],end=" ") 759 | print("\n") 760 | 761 | print("loss="+str(loss)) 762 | 763 | if(loss')] 779 | 780 | 781 | # ### Prediction. 782 | 783 | # In[21]: 784 | 785 | 786 | with tf.Session() as sess: # Begin session 787 | 788 | print('Loading pre-trained weights for the model...') 789 | saver = tf.train.Saver() 790 | saver.restore(sess, 'Model_Backup/translation_model.ckpt') 791 | sess.run(tf.global_variables()) 792 | print('\nRESTORATION COMPLETE\n') 793 | 794 | 795 | test = ['who','are','you'] # Enter tokenized text here 796 | test = map(word2vec,test) 797 | test = np.asarray(test,np.float32) 798 | test = test.reshape((1,test.shape[0],test.shape[1])) 799 | 800 | input_seq_len = test.shape[0] 801 | pe_in = positional_encoding(input_seq_len,word_vec_dim) 802 | pe_in = pe_in.reshape((1,input_seq_len,word_vec_dim)) 803 | test_pe = test+pe_in 804 | 805 | output_seq_len = int(input_seq_len+20) 806 | illegal_position_masks = generate_masks_for_illegal_positions(output_seq_len) 807 | pe_out = positional_encoding(output_seq_len,word_vec_dim) 808 | pe_out = pe_out.reshape((output_seq_len,1,word_vec_dim)) 809 | 810 | out = sess.run(softmax_output, 811 | feed_dict={x: test_pe, 812 | y: np.zeros((1,1),np.int32), 813 | # y value doesn't matter here. 814 | # feeding y, because the network graph requires y. 815 | # but its value won't actually be used in this case. 816 | keep_prob: 1, 817 | output_len: output_seq_len, 818 | tf_pe_out: pe_out, 819 | tf_illegal_position_masks: illegal_position_masks, 820 | teacher_forcing: False 821 | }) 822 | 823 | for array in out[0]: 824 | if vocab_beng[np.argmax(array)] != '': 825 | print(vocab_beng[np.argmax(array)],end=' ') 826 | 827 | 828 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Machine Translation using Transformers 4 | 5 | The model is based on: 6 | 7 | ["Attention Is All You Need" by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin. arXiv:1706.03762](https://arxiv.org/abs/1706.03762) 8 | 9 | # WARNING: 10 | 11 | This is an old code. 
I have an updated version of Transformers over here: https://github.com/JRC1995/Transformers 12 | 13 | # Preprocessing Translation Data 14 | (from Translation_preprocess.py) 15 | 16 | ### Function for expanding English contractions 17 | 18 | source: https://gist.github.com/nealrs/96342d8231b75cf4bb82 19 | 20 | 21 | ```python 22 | import numpy as np 23 | from __future__ import division 24 | import io 25 | import unicodedata 26 | import nltk 27 | from nltk import word_tokenize 28 | import string 29 | import re 30 | import random 31 | 32 | 33 | #source: https://gist.github.com/nealrs/96342d8231b75cf4bb82 34 | cList = { 35 | "ain't": "am not", 36 | "aren't": "are not", 37 | "can't": "cannot", 38 | "can't've": "cannot have", 39 | "'cause": "because", 40 | "could've": "could have", 41 | "couldn't": "could not", 42 | "couldn't've": "could not have", 43 | "didn't": "did not", 44 | "doesn't": "does not", 45 | "don't": "do not", 46 | "hadn't": "had not", 47 | "hadn't've": "had not have", 48 | "hasn't": "has not", 49 | "haven't": "have not", 50 | "he'd": "he would", 51 | "he'd've": "he would have", 52 | "he'll": "he will", 53 | "he'll've": "he will have", 54 | "he's": "he is", 55 | "how'd": "how did", 56 | "how'd'y": "how do you", 57 | "how'll": "how will", 58 | "how's": "how is", 59 | "I'd": "I would", 60 | "I'd've": "I would have", 61 | "I'll": "I will", 62 | "I'll've": "I will have", 63 | "I'm": "I am", 64 | "I've": "I have", 65 | "isn't": "is not", 66 | "it'd": "it had", 67 | "it'd've": "it would have", 68 | "it'll": "it will", 69 | "it'll've": "it will have", 70 | "it's": "it is", 71 | "let's": "let us", 72 | "ma'am": "madam", 73 | "mayn't": "may not", 74 | "might've": "might have", 75 | "mightn't": "might not", 76 | "mightn't've": "might not have", 77 | "must've": "must have", 78 | "mustn't": "must not", 79 | "mustn't've": "must not have", 80 | "needn't": "need not", 81 | "needn't've": "need not have", 82 | "o'clock": "of the clock", 83 | "oughtn't": "ought not", 84 | "oughtn't've": "ought not have", 85 | "shan't": "shall not", 86 | "sha'n't": "shall not", 87 | "shan't've": "shall not have", 88 | "she'd": "she would", 89 | "she'd've": "she would have", 90 | "she'll": "she will", 91 | "she'll've": "she will have", 92 | "she's": "she is", 93 | "should've": "should have", 94 | "shouldn't": "should not", 95 | "shouldn't've": "should not have", 96 | "so've": "so have", 97 | "so's": "so is", 98 | "that'd": "that would", 99 | "that'd've": "that would have", 100 | "that's": "that is", 101 | "there'd": "there had", 102 | "there'd've": "there would have", 103 | "there's": "there is", 104 | "they'd": "they would", 105 | "they'd've": "they would have", 106 | "they'll": "they will", 107 | "they'll've": "they will have", 108 | "they're": "they are", 109 | "they've": "they have", 110 | "to've": "to have", 111 | "wasn't": "was not", 112 | "we'd": "we had", 113 | "we'd've": "we would have", 114 | "we'll": "we will", 115 | "we'll've": "we will have", 116 | "we're": "we are", 117 | "we've": "we have", 118 | "weren't": "were not", 119 | "what'll": "what will", 120 | "what'll've": "what will have", 121 | "what're": "what are", 122 | "what's": "what is", 123 | "what've": "what have", 124 | "when's": "when is", 125 | "when've": "when have", 126 | "where'd": "where did", 127 | "where's": "where is", 128 | "where've": "where have", 129 | "who'll": "who will", 130 | "who'll've": "who will have", 131 | "who's": "who is", 132 | "who've": "who have", 133 | "why's": "why is", 134 | "why've": "why have", 135 | "will've": 
"will have", 136 | "won't": "will not", 137 | "won't've": "will not have", 138 | "would've": "would have", 139 | "wouldn't": "would not", 140 | "wouldn't've": "would not have", 141 | "y'all": "you all", 142 | "y'alls": "you alls", 143 | "y'all'd": "you all would", 144 | "y'all'd've": "you all would have", 145 | "y'all're": "you all are", 146 | "y'all've": "you all have", 147 | "you'd": "you had", 148 | "you'd've": "you would have", 149 | "you'll": "you you will", 150 | "you'll've": "you you will have", 151 | "you're": "you are", 152 | "you've": "you have" 153 | } 154 | 155 | c_re = re.compile('(%s)' % '|'.join(cList.keys())) 156 | 157 | def expandContractions(text, c_re=c_re): 158 | def replace(match): 159 | return cList[match.group(0)] 160 | return c_re.sub(replace, text) 161 | ``` 162 | 163 | ### Loading Translation Data 164 | 165 | Splitting the data into eng and beng. 166 | eng will contain the list of English lines, and beng will contain the corresponding list of Bengali lines. 167 | 168 | 169 | Source of data: http://www.manythings.org/anki/ (downloaded ben-eng) 170 | 171 | 172 | ```python 173 | filename = 'ben.txt' 174 | #Datasource: http://www.manythings.org/anki/ 175 | 176 | # http://stackoverflow.com/a/518232/2809427 177 | def unicodeToAscii(s): 178 | return ''.join( 179 | c for c in unicodedata.normalize('NFD', s) 180 | if unicodedata.category(c) != 'Mn' 181 | ) 182 | 183 | def normalizeString(s): 184 | s = unicodeToAscii(expandContractions(s.lower().strip())) 185 | s = re.sub(r"([.!?,\"])", r" ", s) 186 | return s 187 | 188 | def loaddata(filename): 189 | file = io.open(filename,'r') 190 | eng=[] 191 | beng = [] 192 | for line in file.readlines(): 193 | lang_pair = line.split('\t') 194 | lang_pair[0] = normalizeString(lang_pair[0]) 195 | lang_pair[1] = normalizeString(lang_pair[1]) 196 | eng.append(word_tokenize(lang_pair[0])) 197 | beng.append(word_tokenize(lang_pair[1])) 198 | file.close() 199 | return eng,beng 200 | 201 | eng,beng = loaddata(filename) 202 | 203 | #Example: 204 | sample = random.randint(0,len(eng)) 205 | print "Example Sample #"+str(sample)+":\n" 206 | string = "ENGLISH:" 207 | for i in xrange(0,len(eng[sample])): 208 | string+=" "+eng[sample][i] 209 | print string 210 | 211 | string = "\nBENGALI:" 212 | for i in xrange(0,len(beng[sample])): 213 | string+=" "+beng[sample][i] 214 | print string 215 | 216 | ``` 217 | 218 | Example Sample #646: 219 | 220 | ENGLISH: tom 's right 221 | 222 | BENGALI: টমই ঠিক। 223 | 224 | 225 | ### Creating separate vocabulary lists for English words and Bengali words 226 | 227 | The index of vocabulary will represent the numerical representation of the word which is stored at that index. 
228 | 229 | 230 | 231 | ```python 232 | import numpy as np 233 | 234 | vocab_eng=[] 235 | vocab_eng.append('') 236 | vocab_eng.append('') 237 | 238 | vocab_beng=[] 239 | vocab_beng.append('') 240 | vocab_beng.append('') 241 | 242 | #The index of vocab will serve as an integer representation of the word 243 | 244 | vectorized_eng = [] 245 | vectorized_beng = [] 246 | 247 | for i in xrange(len(eng)): 248 | 249 | vectorized_eng_line = [] 250 | for word in eng[i]: 251 | if word not in vocab_eng: 252 | vocab_eng.append(word) 253 | vectorized_eng_line.append(vocab_eng.index(word)) 254 | else: 255 | vectorized_eng_line.append(vocab_eng.index(word)) 256 | vectorized_eng.append(vectorized_eng_line) 257 | 258 | vectorized_beng_line = [] 259 | for word in beng[i]: 260 | if word not in vocab_beng: 261 | vocab_beng.append(word) 262 | vectorized_beng_line.append(vocab_beng.index(word)) 263 | else: 264 | vectorized_beng_line.append(vocab_beng.index(word)) 265 | vectorized_beng.append(vectorized_beng_line) 266 | 267 | 268 | ``` 269 | 270 | ### Creating training dataset for word2vec embedding 271 | 272 | if the sentence is "I am alright" 273 | 274 | then for the word 'am', the context words with window size 1 will be "I" and "alright" 275 | i.e ["I","alright"] 276 | 277 | For 'I' the context words will be "PAD" and "am" 278 | 279 | For 'alright' the context words will be "am" and "PAD" 280 | 281 | PAD represents empty and EOS represents end of sentence. 282 | 283 | Later lots of pads may be applied after the end of sentence to fit sequence length. 284 | 285 | So I also added the word PAD with context words being PADs, and PAD and EOS for embedding. 286 | 287 | (Doing what I wrote directly above, was actually unnecessary but I already did it. We don't need to consider these cases. With masking I will ignore the effect of PADs on the cost, anyway, and the model doesn't need to predict pads correctly. Predicting the EOS properly will be enough. So PAD embedding doesn't need to be taken so seriously.) 288 | 289 | In this way, first, from each sentence, I am creating a list of words, and a corresponding list of context words. 290 | I am doing the same thing for both English and Bengali lines. 
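The core of the loop below can be summarized in a few lines. Here is a minimal sketch of the window-1 context extraction for a single already-vectorized sentence; `PAD_IDX` and `EOS_IDX` are placeholder names standing in for `vocab.index('<PAD>')` and `vocab.index('<EOS>')`, and the sentence indices are made up for illustration.

```python
PAD_IDX, EOS_IDX = 0, 1   # placeholders for vocab.index('<PAD>') / vocab.index('<EOS>')

def word_context_pairs(vec_line):
    # window size 1: the context of each word is (previous word, next word),
    # with <PAD> before the first word and <EOS> after the last one.
    pairs = []
    for j, word in enumerate(vec_line):
        left = vec_line[j - 1] if j > 0 else PAD_IDX
        right = vec_line[j + 1] if j < len(vec_line) - 1 else EOS_IDX
        pairs.append((word, [left, right]))
    # the <EOS> token itself is also added, with (last word, <PAD>) as its context
    pairs.append((EOS_IDX, [vec_line[-1], PAD_IDX]))
    return pairs

print(word_context_pairs([5, 6, 7]))
# [(5, [0, 6]), (6, [5, 7]), (7, [6, 1]), (1, [7, 0])]
```
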
291 | 292 | 293 | ```python 294 | words_eng = [] 295 | contexts_eng = [] 296 | 297 | words_beng = [] 298 | contexts_beng = [] 299 | 300 | words_eng.append(vocab_eng.index('')) 301 | contexts_eng.append([vocab_eng.index(''),vocab_eng.index('')]) 302 | words_eng.append(vocab_eng.index('')) 303 | contexts_eng.append([vocab_eng.index(''),vocab_eng.index('')]) 304 | 305 | words_beng.append(vocab_beng.index('')) 306 | contexts_beng.append([vocab_beng.index(''),vocab_beng.index('')]) 307 | words_beng.append(vocab_beng.index('')) 308 | contexts_beng.append([vocab_beng.index(''),vocab_beng.index('')]) 309 | 310 | 311 | for i in xrange(len(vectorized_eng)): 312 | 313 | for j in xrange(0,len(vectorized_eng[i])): 314 | 315 | context1=0 316 | context2=0 317 | 318 | if j==0: 319 | context1 = vocab_eng.index('') 320 | if j!=len(vectorized_eng[i])-1: 321 | context2 = vectorized_eng[i][j+1] 322 | if j==len(vectorized_eng[i])-1: 323 | context2=vocab_eng.index('') 324 | if j!=0: 325 | context1 = vectorized_eng[i][j-1] 326 | if j>0 and j for training data 335 | words_eng.append(vocab_eng.index('')) 336 | context1 = vectorized_eng[i][len(vectorized_eng[i])-1] 337 | context2 = vocab_eng.index('') 338 | contexts_eng.append([context1,context2]) 339 | 340 | for j in xrange(0,len(vectorized_beng[i])): 341 | 342 | context1=0 343 | context2=0 344 | 345 | if j==0: 346 | context1 = vocab_beng.index('') 347 | if j!=len(vectorized_beng[i])-1: 348 | context2 = vectorized_beng[i][j+1] 349 | if j==len(vectorized_beng[i])-1: 350 | context2=vocab_beng.index('') 351 | if j!=0: 352 | context1 = vectorized_beng[i][j-1] 353 | if j>0 and j for training data 362 | words_beng.append(vocab_beng.index('')) 363 | context1 = vectorized_beng[i][len(vectorized_beng[i])-1] 364 | context2 = vocab_beng.index('') 365 | contexts_beng.append([context1,context2]) 366 | 367 | 368 | 369 | ``` 370 | 371 | If word = "am" and context = ["I","alright"], 372 | then, from this data I will create the following samples: 373 | 374 | input = "am" 375 | output = "I" 376 | and 377 | input = "am" 378 | label = "alright" 379 | 380 | Like this I will construct a list of all training inputs (words) and training outputs\labels (context words) 381 | 382 | embd_inputs_eng will contain all the English training inputs. 383 | embd_labels_eng will contain all the English training labels. 384 | 385 | embd_inputs_beng will contain all the Bengali training inputs. 386 | embd_labels_beng will contain all the Bengali training labels. 
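Concretely, the flattening described above turns each (word, context-list) pair into one (input, label) sample per context word. A small sketch with made-up indices (5 for "am", 4 for "I", 9 for "alright"):

```python
import numpy as np

words = [5]           # "am"
contexts = [[4, 9]]   # ["I", "alright"]

embd_inputs, embd_labels = [], []
for word, ctx in zip(words, contexts):
    for c in ctx:
        embd_inputs.append(word)   # inputs: "am", "am"
        embd_labels.append(c)      # labels: "I",  "alright"

embd_inputs = np.asarray(embd_inputs, np.int32)   # [5 5]
embd_labels = np.asarray(embd_labels, np.int32)   # [4 9]
```
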
387 | 388 | 389 | ```python 390 | embd_inputs_eng = [] 391 | embd_labels_eng = [] 392 | for i in xrange(len(contexts_eng)): 393 | for context in contexts_eng[i]: 394 | embd_inputs_eng.append(words_eng[i]) 395 | embd_labels_eng.append(context) 396 | embd_inputs_eng = np.asarray(embd_inputs_eng,np.int32) 397 | embd_labels_eng = np.asarray(embd_labels_eng,np.int32) 398 | 399 | embd_inputs_beng = [] 400 | embd_labels_beng = [] 401 | for i in xrange(len(contexts_beng)): 402 | for context in contexts_beng[i]: 403 | embd_inputs_beng.append(words_beng[i]) 404 | embd_labels_beng.append(context) 405 | embd_inputs_beng = np.asarray(embd_inputs_beng,np.int32) 406 | embd_labels_beng = np.asarray(embd_labels_beng,np.int32) 407 | 408 | ``` 409 | 410 | ### Function for generating mini-batches from the total training set 411 | 412 | 413 | ```python 414 | batch_size = 128 415 | 416 | def generate_batch(inputs,labels,batch_size): 417 | rand = random.sample((np.arange(len(inputs))),batch_size) 418 | batch_inputs=[] 419 | batch_labels=[] 420 | for i in xrange(batch_size): 421 | batch_inputs.append(inputs[int(rand[i])]) 422 | batch_labels.append(labels[int(rand[i])]) 423 | batch_inputs = np.asarray(batch_inputs,np.int32) 424 | batch_labels = np.asarray(batch_labels,np.int32) 425 | return batch_inputs,batch_labels 426 | 427 | ``` 428 | 429 | ### Preparing for word2vec embedding 430 | 431 | 432 | ```python 433 | import tensorflow as tf 434 | import math 435 | 436 | #https://www.tensorflow.org/tutorials/word2vec 437 | embedding_size = 256 438 | vocabulary_size_eng = len(vocab_eng) 439 | vocabulary_size_beng = len(vocab_beng) 440 | 441 | # Placeholders for inputs 442 | train_inputs = tf.placeholder(tf.int32, shape=[batch_size]) 443 | train_labels = tf.placeholder(tf.int32, shape=[batch_size,1]) 444 | 445 | ``` 446 | 447 | ### Training for word2vec embedding (For English words) 448 | 449 | See: https://www.tensorflow.org/tutorials/word2vec 450 | 451 | for details of word2vec and code description. 452 | 453 | Most of the word2vec code used here are from the Tensorflow tutorial. 454 | 455 | 456 | ```python 457 | embeddings_eng = tf.Variable( 458 | tf.random_uniform([vocabulary_size_eng, embedding_size], -1.0, 1.0)) 459 | 460 | nce_weights_eng = tf.Variable( 461 | tf.truncated_normal([vocabulary_size_eng, embedding_size], 462 | stddev=1.0 / math.sqrt(embedding_size))) 463 | nce_biases_eng = tf.Variable(tf.zeros([vocabulary_size_eng])) 464 | 465 | # Initializing the variables 466 | init = tf.global_variables_initializer() 467 | ``` 468 | 469 | 470 | ```python 471 | embed_eng = tf.nn.embedding_lookup(embeddings_eng, train_inputs) 472 | 473 | # Compute the NCE loss, using a sample of the negative labels each time. 474 | loss = tf.reduce_mean( 475 | tf.nn.nce_loss(weights=nce_weights_eng, 476 | biases=nce_biases_eng, 477 | labels=train_labels, 478 | inputs=embed_eng, 479 | num_sampled=10, 480 | num_classes=vocabulary_size_eng)) #num_sampled = no. of negative samples 481 | 482 | # We use the SGD optimizer. 
483 | optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss) 484 | ``` 485 | 486 | 487 | ```python 488 | 489 | with tf.Session() as sess: 490 | sess.run(init) 491 | convergence_threshold = 0.5 492 | training_iters = 500*(int((len(embd_inputs_eng))/batch_size)) 493 | step=0 494 | n=5 495 | last_n_losses = np.zeros((n),np.float32) 496 | 497 | while step=len(x): 713 | break 714 | 715 | batch_x = [] 716 | batch_y = [] 717 | 718 | max_len_x = len(sorted_x[i]) 719 | 720 | len_y= np.zeros((len(y)),np.int32) 721 | 722 | for j in xrange(i,i+batch_size): 723 | len_y[j] = len(sorted_y[j]) 724 | 725 | max_len_y = np.amax(len_y) 726 | 727 | for j in xrange(i,i+batch_size): 728 | line=[] 729 | for k1 in xrange(max_len_x+1): #+1 to include 730 | if k1==len(sorted_x[j]): 731 | line.append(np_embedding_eng[vocab_eng.index('')]) 732 | elif k1>len(sorted_x[j]): 733 | line.append(np_embedding_eng[vocab_eng.index('')]) 734 | else: 735 | line.append(np_embedding_eng[sorted_x[j][k1]]) 736 | batch_x.append(line) 737 | 738 | line=[] 739 | for k2 in xrange(max_len_y+1): #+1 to include 740 | if k2>len(sorted_y[j]): 741 | line.append(vocab_beng.index('')) 742 | elif k2==len(sorted_y[j]): 743 | line.append(vocab_beng.index('')) 744 | else: 745 | line.append(sorted_y[j][k2]) 746 | batch_y.append(line) 747 | 748 | batch_x = np.asarray(batch_x,np.float32) 749 | batch_y = np.asarray(batch_y,np.int32) 750 | 751 | batches_x.append(batch_x) 752 | batches_y.append(batch_y) 753 | 754 | i+=batch_size 755 | 756 | return batches_x,batches_y 757 | 758 | 759 | ``` 760 | 761 | ### Creating train, validation, and test batches 762 | 763 | 764 | ```python 765 | batch_size = 64 766 | 767 | train_batch_eng,train_batch_beng = bucket_and_batch(train_eng,train_beng,batch_size) 768 | 769 | val_batch_eng,val_batch_beng = bucket_and_batch(val_eng,val_beng,batch_size) 770 | 771 | test_batch_eng,test_batch_beng = bucket_and_batch(test_eng,test_beng,batch_size) 772 | 773 | ``` 774 | 775 | ### Saving processed data in another file. 776 | 777 | 778 | ```python 779 | #Saving processed data in another file. 780 | 781 | import pickle 782 | 783 | PICK = [vocab_eng,vocab_beng,np_embedding_eng,np_embedding_beng,train_batch_eng,train_batch_beng,val_batch_eng,val_batch_beng,test_batch_eng,test_batch_beng] 784 | 785 | with open('translationPICKLE', 'wb') as fp: 786 | pickle.dump(PICK, fp) 787 | 788 | ``` 789 | 790 | ### Loading Pre-processed Data 791 | (start of Machine Translation.ipynb) 792 | 793 | 794 | ```python 795 | import pickle 796 | import math 797 | import numpy as np 798 | 799 | 800 | with open ('translationPICKLE', 'rb') as fp: 801 | PICK = pickle.load(fp) 802 | 803 | vocab_eng = PICK[0] 804 | vocab_beng = PICK[1] 805 | vocab_len = len(vocab_beng) 806 | 807 | np_embedding_eng = PICK[2] 808 | np_embedding_beng = PICK[3] 809 | np_embedding_eng = np.asarray(np_embedding_eng,np.float32) 810 | np_embedding_beng = np.asarray(np_embedding_beng,np.float32) 811 | 812 | word_vec_dim = np_embedding_eng.shape[1] 813 | 814 | train_batch_x = PICK[4] 815 | train_batch_y = PICK[5] 816 | 817 | val_batch_x = PICK[6] 818 | val_batch_y = PICK[7] 819 | 820 | test_batch_x = PICK[8] 821 | test_batch_y = PICK[9] 822 | 823 | ``` 824 | 825 | ### Function for converting vector of size word_vec_dim into the closest representative english word. 
826 | 827 | 828 | ```python 829 | def most_similar_eucli_eng(x): 830 | xminusy = np.subtract(np_embedding_eng,x) 831 | sq_xminusy = np.square(xminusy) 832 | sum_sq_xminusy = np.sum(sq_xminusy,1) 833 | eucli_dists = np.sqrt(sum_sq_xminusy) 834 | return np.argsort(eucli_dists) 835 | 836 | def vec2word_eng(vec): # converts a given vector representation into the represented word 837 | most_similars = most_similar_eucli_eng(np.asarray(vec,np.float32)) 838 | return vocab_eng[most_similars[0]] 839 | 840 | ``` 841 | 842 | ### Hyperparameters and Placeholders. 843 | 844 | 845 | ```python 846 | import tensorflow as tf 847 | 848 | #Hyperparamters 849 | 850 | h=8 #no. of heads 851 | N=1 #no. of decoder and encoder layers 852 | learning_rate=0.001 853 | epochs = 200 854 | keep_prob = tf.placeholder(tf.float32) 855 | 856 | #Placeholders 857 | 858 | x = tf.placeholder(tf.float32, [None,None,word_vec_dim]) 859 | y = tf.placeholder(tf.int32, [None,None]) 860 | 861 | output_len = tf.placeholder(tf.int32) 862 | 863 | teacher_forcing = tf.placeholder(tf.bool) 864 | 865 | tf_pad_mask = tf.placeholder(tf.float32,[None,None]) 866 | tf_illegal_position_masks = tf.placeholder(tf.float32,[None,None,None]) 867 | 868 | tf_pe_out = tf.placeholder(tf.float32,[None,None,None]) #positional codes for output 869 | ``` 870 | 871 | ### Model Parameters. 872 | 873 | 874 | ```python 875 | 876 | # Dimensions for Q (Query),K (Keys) and V (Values) for attention layers. 877 | 878 | dqkv = 32 879 | 880 | #Parameters for attention sub-layers for all n encoders 881 | 882 | Wq_enc = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 883 | Wk_enc = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 884 | Wv_enc = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 885 | Wo_enc = tf.Variable(tf.truncated_normal(shape=[N,h*dqkv,word_vec_dim],stddev=0.01)) 886 | 887 | #Parameters for position-wise fully connected layers for n encoders 888 | 889 | d = 1024 890 | W1_enc = tf.Variable(tf.truncated_normal(shape=[N,1,1,word_vec_dim,d],stddev=0.01)) 891 | b1_enc = tf.Variable(tf.constant(0,tf.float32,shape=[N,d])) 892 | W2_enc = tf.Variable(tf.truncated_normal(shape=[N,1,1,d,word_vec_dim],stddev=0.01)) 893 | b2_enc = tf.Variable(tf.constant(0,tf.float32,shape=[N,word_vec_dim])) 894 | 895 | #Parameters for 2 attention sub-layers for all n decoders 896 | 897 | Wq_dec_1 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 898 | Wk_dec_1 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 899 | Wv_dec_1 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 900 | Wo_dec_1 = tf.Variable(tf.truncated_normal(shape=[N,h*dqkv,word_vec_dim],stddev=0.01)) 901 | Wq_dec_2 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 902 | Wk_dec_2 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 903 | Wv_dec_2 = tf.Variable(tf.truncated_normal(shape=[N,h,word_vec_dim,dqkv],stddev=0.01)) 904 | Wo_dec_2 = tf.Variable(tf.truncated_normal(shape=[N,h*dqkv,word_vec_dim],stddev=0.01)) 905 | 906 | #Parameters for position-wise fully connected layers for n decoders 907 | 908 | d = 1024 909 | W1_dec = tf.Variable(tf.truncated_normal(shape=[N,1,1,word_vec_dim,d],stddev=0.01)) 910 | b1_dec = tf.Variable(tf.constant(0,tf.float32,shape=[N,d])) 911 | W2_dec = tf.Variable(tf.truncated_normal(shape=[N,1,1,d,word_vec_dim],stddev=0.01)) 912 | b2_dec = 
tf.Variable(tf.constant(0,tf.float32,shape=[N,word_vec_dim])) 913 | 914 | #Layer Normalization parameters for encoder 915 | 916 | scale_enc_1 = tf.Variable(tf.ones([N,1,1,word_vec_dim]),dtype=tf.float32) 917 | shift_enc_1 = tf.Variable(tf.zeros([N,1,1,word_vec_dim]),dtype=tf.float32) 918 | 919 | scale_enc_2 = tf.Variable(tf.ones([N,1,1,word_vec_dim]),dtype=tf.float32) 920 | shift_enc_2 = tf.Variable(tf.zeros([N,1,1,word_vec_dim]),dtype=tf.float32) 921 | 922 | #Layer Normalization parameters for decoder 923 | 924 | scale_dec_1 = tf.Variable(tf.ones([N,1,1,word_vec_dim]),dtype=tf.float32) 925 | shift_dec_1 = tf.Variable(tf.zeros([N,1,1,word_vec_dim]),dtype=tf.float32) 926 | 927 | scale_dec_2 = tf.Variable(tf.ones([N,1,1,word_vec_dim]),dtype=tf.float32) 928 | shift_dec_2 = tf.Variable(tf.zeros([N,1,1,word_vec_dim]),dtype=tf.float32) 929 | 930 | scale_dec_3 = tf.Variable(tf.ones([N,1,1,word_vec_dim]),dtype=tf.float32) 931 | shift_dec_3 = tf.Variable(tf.zeros([N,1,1,word_vec_dim]),dtype=tf.float32) 932 | ``` 933 | 934 | ### Function for generating a sequence of positional codes for positional encoding. 935 | 936 | 937 | ```python 938 | def positional_encoding(seq_len,model_dimensions): 939 | pe = np.zeros((seq_len,model_dimensions,),np.float32) 940 | for pos in xrange(0,seq_len): 941 | for i in xrange(0,model_dimensions): 942 | pe[pos][i] = math.sin(pos/(10000**(2*i/model_dimensions))) 943 | return pe.reshape((seq_len,model_dimensions)) 944 | ``` 945 | 946 | ### Function for Layer Normalization 947 | 948 | [Layer Normalization - by Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton](https://arxiv.org/abs/1607.06450) 949 | 950 | 951 | ```python 952 | 953 | def layer_norm(inputs,scale,shift,epsilon = 1e-5): 954 | 955 | mean, var = tf.nn.moments(inputs, [1,2], keep_dims=True) 956 | 957 | LN = tf.multiply((scale / tf.sqrt(var + epsilon)),(inputs - mean)) + shift 958 | 959 | return LN 960 | ``` 961 | 962 | ### Function to pre-generate masks for illegal positions. 963 | 964 | These masks are to be used to fill illegal positions with -infinity (or a very low value eg. -2^30). 965 | 966 | Illegal positions are positions of the decoder input tokens that aren't predicted at a given timestep. 967 | 968 | { In a transformer, the decoder input is of the same shape as the WHOLE decoder output sequence. One word for the sequence is predicted at each timestep (from left to right). So in most timesteps, the left side of the decoder input sequence will contain valid previously predicted output words, but the right side -the yet to be predicted side should contain some values that should be ignored and never attended. We make sure that they're ignored by masking it } 969 | 970 | So, the illegal positions depends on the total output length and the no. of predicted output tokens. 971 | 972 | The appropriate mask when i output tokens are predicted can be retrieved from mask[i-1] where mask is the return value from this function. The argument out_len that function takes, signifies the total length of the output. 973 | 974 | The masks are used to assign the value -2^30 to all positions in the tensor influenced by the illegal ones. 975 | After going through the softmax layer, these positions become close to 0, as it should be. 976 | 977 | Dynamically creating masks depending on the current position\timestep (depending on which the program can know which positions are legal and which aren't) is, however, 978 | a bit troublesome with tensorflow tf_while_loop. 
979 | 980 | I will be pre-generating all the masks with Python native code and feed the list of all required masks to the network at each training step (output length can be different at different training steps). 981 | 982 | 983 | 984 | ```python 985 | def generate_masks_for_illegal_positions(out_len): 986 | 987 | masks=np.zeros((out_len-1,out_len,out_len),dtype=np.float32) 988 | 989 | for i in xrange(1,out_len): 990 | mask = np.zeros((out_len,out_len),dtype=np.float32) 991 | mask[i:out_len,:] = -2**30 992 | mask[:,i:out_len] = -2**30 993 | masks[i-1] = mask 994 | 995 | return masks 996 | ``` 997 | 998 | ### Function for Multi-Headed Attention. 999 | 1000 | Details: https://arxiv.org/pdf/1706.03762.pdf 1001 | 1002 | Q = Query 1003 | 1004 | K = Key 1005 | 1006 | V = Value 1007 | 1008 | d is the dimension for Q, K and V. 1009 | 1010 | 1011 | ```python 1012 | 1013 | def attention(Q,K,V,d,filled=0,mask=False): 1014 | 1015 | K = tf.transpose(K,[0,2,1]) 1016 | d = tf.cast(d,tf.float32) 1017 | 1018 | softmax_component = tf.div(tf.matmul(Q,K),tf.sqrt(d)) 1019 | 1020 | if mask == True: 1021 | softmax_component = softmax_component + tf_illegal_position_masks[filled-1] 1022 | 1023 | result = tf.matmul(tf.nn.dropout(tf.nn.softmax(softmax_component),keep_prob),V) 1024 | 1025 | return result 1026 | 1027 | 1028 | def multihead_attention(Q,K,V,d,weights,filled=0,mask=False): 1029 | 1030 | Q_ = tf.reshape(Q,[-1,tf.shape(Q)[2]]) 1031 | K_ = tf.reshape(K,[-1,tf.shape(Q)[2]]) 1032 | V_ = tf.reshape(V,[-1,tf.shape(Q)[2]]) 1033 | 1034 | heads = tf.TensorArray(size=h,dtype=tf.float32) 1035 | 1036 | Wq = weights['Wq'] 1037 | Wk = weights['Wk'] 1038 | Wv = weights['Wv'] 1039 | Wo = weights['Wo'] 1040 | 1041 | for i in xrange(0,h): 1042 | 1043 | Q_w = tf.matmul(Q_,Wq[i]) 1044 | Q_w = tf.reshape(Q_w,[tf.shape(Q)[0],tf.shape(Q)[1],d]) 1045 | 1046 | K_w = tf.matmul(K_,Wk[i]) 1047 | K_w = tf.reshape(K_w,[tf.shape(K)[0],tf.shape(K)[1],d]) 1048 | 1049 | V_w = tf.matmul(V_,Wv[i]) 1050 | V_w = tf.reshape(V_w,[tf.shape(V)[0],tf.shape(V)[1],d]) 1051 | 1052 | head = attention(Q_w,K_w,V_w,d,filled,mask) 1053 | 1054 | heads = heads.write(i,head) 1055 | 1056 | heads = heads.stack() 1057 | 1058 | concated = heads[0] 1059 | 1060 | for i in xrange(1,h): 1061 | concated = tf.concat([concated,heads[i]],2) 1062 | 1063 | concated = tf.reshape(concated,[-1,h*d]) 1064 | out = tf.matmul(concated,Wo) 1065 | out = tf.reshape(out,[tf.shape(heads)[1],tf.shape(heads)[2],word_vec_dim]) 1066 | 1067 | return out 1068 | 1069 | ``` 1070 | 1071 | ### Function for encoder 1072 | 1073 | More details: https://arxiv.org/pdf/1706.03762.pdf 1074 | 1075 | 1076 | ```python 1077 | def encoder(x,weights,attention_weights,dqkv): 1078 | 1079 | W1 = weights['W1'] 1080 | W2 = weights['W2'] 1081 | b1 = weights['b1'] 1082 | b2 = weights['b2'] 1083 | 1084 | scale1 = weights['scale1'] 1085 | shift1 = weights['shift1'] 1086 | scale2 = weights['scale2'] 1087 | shift2 = weights['shift2'] 1088 | 1089 | # SUBLAYER 1 (MASKED MULTI HEADED SELF ATTENTION) 1090 | 1091 | sublayer1 = multihead_attention(x,x,x,dqkv,attention_weights) 1092 | sublayer1 = tf.nn.dropout(sublayer1,keep_prob) 1093 | sublayer1 = layer_norm(sublayer1 + x,scale1,shift1) 1094 | 1095 | sublayer1_ = tf.reshape(sublayer1,[tf.shape(sublayer1)[0],1,tf.shape(sublayer1)[1],word_vec_dim]) 1096 | 1097 | # SUBLAYER 2 (TWO 1x1 CONVOLUTIONAL LAYERS AKA POSITION WISE FULLY CONNECTED NETWORKS) 1098 | 1099 | sublayer2 = tf.nn.conv2d(sublayer1_, W1, strides=[1,1,1,1], padding='SAME') 1100 | sublayer2 = 
tf.nn.bias_add(sublayer2,b1) 1101 | sublayer2 = tf.nn.relu(sublayer2) 1102 | 1103 | sublayer2 = tf.nn.conv2d(sublayer2, W2, strides=[1,1,1,1], padding='SAME') 1104 | sublayer2 = tf.nn.bias_add(sublayer2,b2) 1105 | 1106 | sublayer2 = tf.reshape(sublayer2,[tf.shape(sublayer2)[0],tf.shape(sublayer2)[2],word_vec_dim]) 1107 | 1108 | sublayer2 = tf.nn.dropout(sublayer2,keep_prob) 1109 | sublayer2 = layer_norm(sublayer2 + sublayer1,scale2,shift2) 1110 | 1111 | return sublayer2 1112 | 1113 | ``` 1114 | 1115 | ### Function for decoder 1116 | 1117 | More details: https://arxiv.org/pdf/1706.03762.pdf 1118 | 1119 | 1120 | ```python 1121 | def decoder(y,enc_out,weights,masked_attention_weights,attention_weights,dqkv,mask=False,filled=0): 1122 | 1123 | W1 = weights['W1'] 1124 | W2 = weights['W2'] 1125 | b1 = weights['b1'] 1126 | b2 = weights['b2'] 1127 | 1128 | scale1 = weights['scale1'] 1129 | shift1 = weights['shift1'] 1130 | scale2 = weights['scale2'] 1131 | shift2 = weights['shift2'] 1132 | scale3 = weights['scale3'] 1133 | shift3 = weights['shift3'] 1134 | 1135 | # SUBLAYER 1 (MASKED MULTI HEADED SELF ATTENTION) 1136 | 1137 | sublayer1 = multihead_attention(y,y,y,dqkv,masked_attention_weights,filled,mask) 1138 | sublayer1 = tf.nn.dropout(sublayer1,keep_prob) 1139 | sublayer1 = layer_norm(sublayer1 + y,scale1,shift1) 1140 | 1141 | # SUBLAYER 2 (MULTIHEADED ENCODER-DECODER INTERLAYER ATTENTION) 1142 | 1143 | sublayer2 = multihead_attention(sublayer1,enc_out,enc_out,dqkv,attention_weights) 1144 | sublayer2 = tf.nn.dropout(sublayer2,keep_prob) 1145 | sublayer2 = layer_norm(sublayer2 + sublayer1,scale2,shift2) 1146 | 1147 | # SUBLAYER 3 (TWO 1x1 CONVOLUTIONAL LAYERS AKA POSITION WISE FULLY CONNECTED NETWORKS) 1148 | 1149 | sublayer2_ = tf.reshape(sublayer2,[tf.shape(sublayer2)[0],1,tf.shape(sublayer2)[1],word_vec_dim]) 1150 | 1151 | sublayer3 = tf.nn.conv2d(sublayer2_, W1, strides=[1,1,1,1], padding='SAME') 1152 | sublayer3 = tf.nn.bias_add(sublayer3,b1) 1153 | sublayer3 = tf.nn.relu(sublayer3) 1154 | 1155 | sublayer3 = tf.nn.conv2d(sublayer3, W2, strides=[1,1,1,1], padding='SAME') 1156 | sublayer3 = tf.nn.bias_add(sublayer3,b2) 1157 | 1158 | sublayer3 = tf.reshape(sublayer3,[tf.shape(sublayer3)[0],tf.shape(sublayer3)[2],word_vec_dim]) 1159 | 1160 | sublayer3 = tf.nn.dropout(sublayer3,keep_prob) 1161 | sublayer3 = layer_norm(sublayer3 + sublayer2,scale3,shift3) 1162 | 1163 | return sublayer3 1164 | ``` 1165 | 1166 | ### Function for Stacking Encoders. 1167 | 1168 | 1169 | ```python 1170 | def stacked_encoders(layer_num,encoderin): 1171 | 1172 | for i in xrange(0,layer_num): 1173 | 1174 | encoder_weights = { 1175 | 1176 | 'W1': W1_enc[i], 1177 | 'W2': W2_enc[i], 1178 | 'b1': b1_enc[i], 1179 | 'b2': b2_enc[i], 1180 | 'scale1': scale_enc_1[i], 1181 | 'shift1': shift_enc_1[i], 1182 | 'scale2': scale_enc_2[i], 1183 | 'shift2': shift_enc_2[i], 1184 | } 1185 | 1186 | attention_weights = { 1187 | 1188 | 'Wq': Wq_enc[i], 1189 | 'Wk': Wk_enc[i], 1190 | 'Wv': Wv_enc[i], 1191 | 'Wo': Wo_enc[i], 1192 | } 1193 | 1194 | encoderin = encoder(encoderin,encoder_weights,attention_weights,dqkv) 1195 | 1196 | return encoderin 1197 | 1198 | ``` 1199 | 1200 | ### Function for Stacking Decoders. 
1201 | 1202 | 1203 | ```python 1204 | def stacked_decoders(layer_num,decoderin,encoderout,filled): 1205 | 1206 | for j in xrange(0,layer_num): 1207 | 1208 | decoder_weights = { 1209 | 1210 | 'W1': W1_dec[j], 1211 | 'W2': W2_dec[j], 1212 | 'b1': b1_dec[j], 1213 | 'b2': b2_dec[j], 1214 | 'scale1': scale_dec_1[j], 1215 | 'shift1': shift_dec_1[j], 1216 | 'scale2': scale_dec_2[j], 1217 | 'shift2': shift_dec_2[j], 1218 | 'scale3': scale_dec_3[j], 1219 | 'shift3': shift_dec_3[j], 1220 | } 1221 | 1222 | masked_attention_weights = { 1223 | 1224 | 'Wq': Wq_dec_1[j], 1225 | 'Wk': Wk_dec_1[j], 1226 | 'Wv': Wv_dec_1[j], 1227 | 'Wo': Wo_dec_1[j], 1228 | } 1229 | 1230 | attention_weights = { 1231 | 1232 | 'Wq': Wq_dec_2[j], 1233 | 'Wk': Wk_dec_2[j], 1234 | 'Wv': Wv_dec_2[j], 1235 | 'Wo': Wo_dec_2[j], 1236 | } 1237 | 1238 | decoderin = decoder(decoderin,encoderout, 1239 | decoder_weights, 1240 | masked_attention_weights, 1241 | attention_weights, 1242 | dqkv, 1243 | mask=True,filled=filled) 1244 | return decoderin 1245 | 1246 | ``` 1247 | 1248 | ### predicted_embedding(): 1249 | 1250 | Given a probability distribution and an embedding matrix, this function returns the embedding of the word with the maximum probability in the given distribution. 1251 | 1252 | ### replaceSOS(): 1253 | 1254 | SOS signifies the start of sentence for the decoder; it is also often represented as 'GO'. I am using an all-ones vector as the first decoder input token. 1255 | At the next timestep, the SOS is discarded, and only the context of the previously predicted output (or of the target output at the previous timestep, if teacher forcing is on) is used. 1256 | 1257 | ### add_pred_to_output_list(): 1258 | 1259 | This function appends the latest predicted output to the running tensor of decoder inputs, and appends its probability distribution to the accumulated output distributions. 1260 | 1261 | 1262 | ```python 1263 | def predicted_embedding(out_prob_dist,tf_embd): 1264 | out_index = tf.cast(tf.argmax(out_prob_dist,1),tf.int32) 1265 | return tf.gather(tf_embd,out_index) 1266 | 1267 | def replaceSOS(output,out_prob_dist): 1268 | return output,tf.constant(1),tf.reshape(out_prob_dist,[tf.shape(x)[0],1,vocab_len]) 1269 | 1270 | def add_pred_to_output_list(decoderin_part_1,output,filled,out_probs,out_prob_dist): 1271 | decoderin_part_1 = tf.concat([decoderin_part_1,output],1) 1272 | filled += 1 1273 | out_probs = tf.concat([out_probs,tf.reshape(out_prob_dist,[tf.shape(x)[0],1,vocab_len])],1) 1274 | return decoderin_part_1,filled,out_probs 1275 | ``` 1276 | 1277 | ### Model Definition 1278 | 1279 | The model follows the encoder-decoder paradigm. The main departure from the standard encoder-decoder setup is that it uses Transformer layers instead of recurrent networks; the decoder still generates its output sequentially, though. 1280 | 1281 | If teacher forcing is True, the decoder predicts the next output from the previous words of the actual target output; otherwise, it predicts the next output from its own previously predicted outputs.
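To make the two decoding modes concrete, here is a toy sketch of the teacher-forcing switch in plain Python (hypothetical tokens and a dummy `decode_step`; the actual model does this inside a TensorFlow while-loop over embeddings):

```python
# Toy illustration of the teacher-forcing switch (not the notebook's graph code).
target = ['SOS', 'tom', 'is', 'here', 'EOS']   # ground-truth output tokens

def decode_step(prev_tokens):
    # stand-in for one decoder pass over everything predicted so far
    return 'PRED_' + str(len(prev_tokens))

teacher_forcing = True                          # set False for free-running decoding
decoder_inputs = ['SOS']
for t in range(1, len(target)):
    prediction = decode_step(decoder_inputs)
    next_input = target[t] if teacher_forcing else prediction
    decoder_inputs.append(next_input)
```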
1282 | 1283 | Details about the model: https://arxiv.org/pdf/1706.03762.pdf 1284 | 1285 | 1286 | ```python 1287 | def model(x,y,teacher_forcing=True): 1288 | 1289 | 1290 | # NOTE: tf.shape(x)[0] == batch_size 1291 | 1292 | encoderin = x # (should be already positionally encoded) 1293 | encoderin = tf.nn.dropout(encoderin,keep_prob) 1294 | 1295 | 1296 | # ENCODER LAYERS 1297 | 1298 | encoderout = stacked_encoders(N,encoderin) 1299 | 1300 | 1301 | # DECODER LAYERS 1302 | 1303 | decoderin_part_1 = tf.ones([tf.shape(x)[0],1,word_vec_dim],dtype=tf.float32) #represents SOS 1304 | 1305 | filled = tf.constant(1) 1306 | # no. of output words that are filled i.e already predicted - are stored in 'filled' 1307 | # filled value is used to retrieve appropriate mask for illegal positions. 1308 | 1309 | 1310 | tf_embd = tf.convert_to_tensor(np_embedding_beng) 1311 | Wpd = tf.transpose(tf_embd) 1312 | # Wpd the transpose of the output embedding matrix will be used to convert the decoder output 1313 | # into a probability distribution over the output language vocabulary. 1314 | 1315 | out_probs = tf.zeros([tf.shape(x)[0],output_len,vocab_len],tf.float32) 1316 | # out_probs will contain the list of probability distributions. 1317 | 1318 | #tf_while_loop since output_len will be dynamically defined during session run 1319 | 1320 | i=tf.constant(0) 1321 | 1322 | def cond(i,filled,decoderin_part_1,out_probs): 1323 | return i') 1420 | mask = np.ones_like((output_batch),np.float32) 1421 | for i in xrange(len(mask)): 1422 | for j in xrange(len(mask[i])): 1423 | if output_batch[i,j]==pad_index: 1424 | mask[i,j]=0 1425 | return mask 1426 | ``` 1427 | 1428 | ### Training ..... 1429 | 1430 | The input batch is positionally encoded before its fed to the network. 1431 | 1432 | 1433 | ```python 1434 | import string 1435 | import random 1436 | from __future__ import print_function 1437 | 1438 | init = tf.global_variables_initializer() 1439 | 1440 | with tf.Session() as sess: # Start Tensorflow Session 1441 | 1442 | saver = tf.train.Saver() 1443 | # Prepares variable for saving the model 1444 | sess.run(init) #initialize all variables 1445 | step = 0 1446 | best_loss = 999 1447 | display_step = 1 1448 | warm_up_steps = 7000 1449 | 1450 | while step < epochs: 1451 | 1452 | batch_len = len(train_batch_x) 1453 | shuffled_indices = np.arange(batch_len) 1454 | np.random.shuffle(shuffled_indices) 1455 | 1456 | for i in xrange(0,batch_len): 1457 | 1458 | # Adaptive learning rate formula 1459 | #learning_rate = ((word_vec_dim)**(-0.5))*min((step*batch_len+i+1)**(-0.5),(step*batch_len+i+1)*warm_up_steps**(-1.5)) 1460 | 1461 | sample_no = np.random.randint(0,len(train_batch_x[0])) 1462 | print("\nCHOSEN SAMPLE NO.: "+str(sample_no)) 1463 | 1464 | if i%display_step==0: 1465 | 1466 | print("\nEpoch: "+str(step+1)+" Iteration: "+str(i+1)) 1467 | print("\nSAMPLE TEXT:") 1468 | for vec in train_batch_x[shuffled_indices[i]][sample_no]: 1469 | print(vec2word_eng(vec),end=" ") 1470 | print("\n") 1471 | 1472 | input_seq_len = len(train_batch_x[shuffled_indices[i]][0]) 1473 | 1474 | pe_in = positional_encoding(input_seq_len,word_vec_dim) 1475 | pe_in = pe_in.reshape((1,input_seq_len,word_vec_dim)) 1476 | 1477 | output_seq_len = len(train_batch_y[shuffled_indices[i]][0]) 1478 | 1479 | 1480 | 1481 | illegal_position_masks = generate_masks_for_illegal_positions(output_seq_len) 1482 | 1483 | pe_out = positional_encoding(output_seq_len,word_vec_dim) 1484 | pe_out = pe_out.reshape((output_seq_len,1,word_vec_dim)) 1485 | 1486 | 1487 | rand = 
random.randint(0,2) #determines chance of using Teacher Forcing 1488 | if rand==1: 1489 | random_bool = True 1490 | else: 1491 | random_bool = False 1492 | 1493 | pad_mask = create_pad_Mask(train_batch_y[shuffled_indices[i]]) 1494 | 1495 | # Run optimization operation (backpropagation) 1496 | _,loss,out = sess.run([optimizer,cost,softmax_output], 1497 | feed_dict={x: (train_batch_x[shuffled_indices[i]]+pe_in), 1498 | y: train_batch_y[shuffled_indices[i]], 1499 | keep_prob: 0.9, 1500 | output_len: len(train_batch_y[shuffled_indices[i]][0]), 1501 | tf_pad_mask: pad_mask, 1502 | tf_illegal_position_masks: illegal_position_masks, 1503 | tf_pe_out: pe_out, 1504 | teacher_forcing: False #random_bool 1505 | # feed random bool for randomized teacher forcing. 1506 | }) 1507 | 1508 | if i%display_step==0: 1509 | 1510 | print("\nPREDICTED TRANSLATION OF THE SAMPLE:\n") 1511 | flag = 0 1512 | for array in out[sample_no]: 1513 | 1514 | #prediction_int = np.random.choice(range(vocab_len), p=array.ravel()) 1515 | #(^use this if you want some variety) 1516 | #(or use this what's below:) 1517 | 1518 | prediction_int = np.argmax(array) 1519 | 1520 | if vocab_beng[prediction_int] in string.punctuation or flag==0: 1521 | print(vocab_beng[prediction_int],end='') 1522 | else: 1523 | print(" "+vocab_beng[prediction_int],end='') 1524 | flag=1 1525 | print("\n") 1526 | 1527 | print("ACTUAL TRANSLATION OF THE SAMPLE:\n") 1528 | for index in train_batch_y[shuffled_indices[i]][sample_no]: 1529 | print(vocab_beng[index],end=" ") 1530 | print("\n") 1531 | 1532 | print("loss="+str(loss)) 1533 | 1534 | if(loss 1549 | 1550 | 1551 | PREDICTED TRANSLATION OF THE SAMPLE: 1552 | 1553 | দেখছে দেখছে দেখছে দেখছে দেখছে দেখছে দেখছে দেখছে দেখছে 1554 | 1555 | ACTUAL TRANSLATION OF THE SAMPLE: 1556 | 1557 | তিনি এখন লাঞচ করছেন। 1558 | 1559 | loss=297.772 1560 | 1561 | CHOSEN SAMPLE NO.: 0 1562 | 1563 | Epoch: 1 Iteration: 2 1564 | 1565 | SAMPLE TEXT: 1566 | they have got guns 1567 | 1568 | 1569 | PREDICTED TRANSLATION OF THE SAMPLE: 1570 | 1571 | ঠিকানাটা ঠিকানাটা ঠিকানাটা ঠিকানাটা ঠিকানাটা ঠিকানাটা ঠিকানাটা ঠিকানাটা 1572 | 1573 | ACTUAL TRANSLATION OF THE SAMPLE: 1574 | 1575 | ওদের কাছে বনদক রযেছে। 1576 | 1577 | loss=242.409 1578 | 1579 | CHOSEN SAMPLE NO.: 55 1580 | 1581 | Epoch: 1 Iteration: 3 1582 | 1583 | SAMPLE TEXT: 1584 | it seems that i have lost my keys 1585 | 1586 | 1587 | PREDICTED TRANSLATION OF THE SAMPLE: 1588 | 1589 | তারাতারি তারাতারি তারাতারি তারাতারি তারাতারি তারাতারি তারাতারি তারাতারি তারাতারি তারাতারি তারাতারি তারাতারি 1590 | 1591 | ACTUAL TRANSLATION OF THE SAMPLE: 1592 | 1593 | মনে হচছে আমি আমার চাবি হারিযে ফেলেছি। 1594 | 1595 | loss=358.392 1596 | . 1597 | . 1598 | . 1599 | . 
1600 | CHOSEN SAMPLE NO.: 20 1601 | 1602 | Epoch: 128 Iteration: 44 1603 | 1604 | SAMPLE TEXT: 1605 | she knows where we live 1606 | 1607 | 1608 | PREDICTED TRANSLATION OF THE SAMPLE: 1609 | 1610 | উনি জানে আমরা কোথায থাকি। 1611 | 1612 | ACTUAL TRANSLATION OF THE SAMPLE: 1613 | 1614 | তিনি জানেন আমরা কোথায থাকি। 1615 | 1616 | loss=1.27356 1617 | 1618 | CHOSEN SAMPLE NO.: 28 1619 | 1620 | Epoch: 128 Iteration: 45 1621 | 1622 | SAMPLE TEXT: 1623 | tom 's strong 1624 | 1625 | 1626 | PREDICTED TRANSLATION OF THE SAMPLE: 1627 | 1628 | টম শকতিশালী। 1629 | 1630 | ACTUAL TRANSLATION OF THE SAMPLE: 1631 | 1632 | টম শকতিশালী। 1633 | 1634 | loss=0.466606 1635 | 1636 | CHOSEN SAMPLE NO.: 8 1637 | 1638 | Epoch: 128 Iteration: 46 1639 | 1640 | SAMPLE TEXT: 1641 | stop that woman 1642 | 1643 | 1644 | PREDICTED TRANSLATION OF THE SAMPLE: 1645 | 1646 | ওই ফিরে আটকাও। 1647 | 1648 | ACTUAL TRANSLATION OF THE SAMPLE: 1649 | 1650 | ওই মহিলাটিকে থামান। 1651 | 1652 | loss=0.628224 1653 | 1654 | CHOSEN SAMPLE NO.: 30 1655 | 1656 | Epoch: 128 Iteration: 47 1657 | 1658 | SAMPLE TEXT: 1659 | do you have my book 1660 | 1661 | 1662 | PREDICTED TRANSLATION OF THE SAMPLE: 1663 | 1664 | আমার কাছে কি ভালো বইটা আছে 1665 | 1666 | ACTUAL TRANSLATION OF THE SAMPLE: 1667 | 1668 | আপনার কাছে কি আমার বইটা আছে 1669 | 1670 | loss=1.2308 1671 | 1672 | CHOSEN SAMPLE NO.: 43 1673 | 1674 | Epoch: 128 Iteration: 48 1675 | 1676 | SAMPLE TEXT: 1677 | would you like to come inside 1678 | 1679 | 1680 | PREDICTED TRANSLATION OF THE SAMPLE: 1681 | 1682 | তমি কি ভেতরে আসবেন 1683 | 1684 | ACTUAL TRANSLATION OF THE SAMPLE: 1685 | 1686 | আপনারা কি ভেতরে আসবেন 1687 | 1688 | loss=1.67444 1689 | 1690 | CHOSEN SAMPLE NO.: 0 1691 | 1692 | Epoch: 128 Iteration: 49 1693 | 1694 | SAMPLE TEXT: 1695 | are you busy tomorrow night 1696 | 1697 | 1698 | PREDICTED TRANSLATION OF THE SAMPLE: 1699 | 1700 | কোনো কি কাল আছে 1701 | 1702 | ACTUAL TRANSLATION OF THE SAMPLE: 1703 | 1704 | তই কি কাল রাতে বযসত থাকবি 1705 | 1706 | loss=0.907989 1707 | 1708 | CHOSEN SAMPLE NO.: 57 1709 | 1710 | Epoch: 128 Iteration: 50 1711 | 1712 | SAMPLE TEXT: 1713 | stand up 1714 | 1715 | 1716 | PREDICTED TRANSLATION OF THE SAMPLE: 1717 | 1718 | দাডান। 1719 | 1720 | ACTUAL TRANSLATION OF THE SAMPLE: 1721 | 1722 | দাডা 1723 | 1724 | loss=0.790484 1725 | 1726 | CHOSEN SAMPLE NO.: 21 1727 | 1728 | Epoch: 128 Iteration: 51 1729 | 1730 | SAMPLE TEXT: 1731 | tom did that himself 1732 | 1733 | 1734 | PREDICTED TRANSLATION OF THE SAMPLE: 1735 | 1736 | টম ওটা বযবসথা 1737 | 1738 | ACTUAL TRANSLATION OF THE SAMPLE: 1739 | 1740 | টম ওটা নিজেই করলো। 1741 | 1742 | loss=0.93948 1743 | 1744 | CHOSEN SAMPLE NO.: 56 1745 | 1746 | Epoch: 129 Iteration: 1 1747 | 1748 | SAMPLE TEXT: 1749 | tom is still at school 1750 | 1751 | 1752 | PREDICTED TRANSLATION OF THE SAMPLE: 1753 | 1754 | টম এখনো নামবার 1755 | 1756 | ACTUAL TRANSLATION OF THE SAMPLE: 1757 | 1758 | টম এখনো ইসকলে। 1759 | 1760 | loss=0.921481 1761 | 1762 | CHOSEN SAMPLE NO.: 15 1763 | 1764 | Epoch: 129 Iteration: 2 1765 | 1766 | SAMPLE TEXT: 1767 | were you there 1768 | 1769 | 1770 | PREDICTED TRANSLATION OF THE SAMPLE: 1771 | 1772 | তমি কি ওখানে ছিলে 1773 | 1774 | ACTUAL TRANSLATION OF THE SAMPLE: 1775 | 1776 | তমি কি ওখানে ছিলে 1777 | 1778 | loss=0.486593 1779 | 1780 | CHOSEN SAMPLE NO.: 58 1781 | 1782 | Epoch: 129 Iteration: 3 1783 | 1784 | SAMPLE TEXT: 1785 | is there a public toilet in this building 1786 | 1787 | 1788 | PREDICTED TRANSLATION OF THE SAMPLE: 1789 | 1790 | এই কি কি এই আছে আছে আছে 1791 | 1792 | ACTUAL TRANSLATION OF THE SAMPLE: 
1793 | 1794 | এই ইমারতটিতে কি কোনো সরবজনীন শৌচাগার আছে 1795 | 1796 | loss=1.76835 1797 | 1798 | CHOSEN SAMPLE NO.: 28 1799 | 1800 | Epoch: 129 Iteration: 4 1801 | 1802 | SAMPLE TEXT: 1803 | i 'm not tom 1804 | 1805 | 1806 | PREDICTED TRANSLATION OF THE SAMPLE: 1807 | 1808 | আমি এখনই নই। 1809 | 1810 | ACTUAL TRANSLATION OF THE SAMPLE: 1811 | 1812 | আমি টম নই। 1813 | 1814 | loss=0.733902 1815 | 1816 | CHOSEN SAMPLE NO.: 59 1817 | 1818 | Epoch: 129 Iteration: 5 1819 | 1820 | SAMPLE TEXT: 1821 | do you understand french 1822 | 1823 | 1824 | PREDICTED TRANSLATION OF THE SAMPLE: 1825 | 1826 | তমি কি ফরাসি ভাষা বলতে 1827 | 1828 | ACTUAL TRANSLATION OF THE SAMPLE: 1829 | 1830 | আপনি কি ফরাসি ভাষা বঝতে পারো 1831 | 1832 | loss=0.842568 1833 | 1834 | CHOSEN SAMPLE NO.: 27 1835 | 1836 | Epoch: 129 Iteration: 6 1837 | 1838 | SAMPLE TEXT: 1839 | i 'm happy to see you again 1840 | 1841 | 1842 | PREDICTED TRANSLATION OF THE SAMPLE: 1843 | 1844 | আপনাকে আবার দেখে খশি হযেছি। 1845 | 1846 | ACTUAL TRANSLATION OF THE SAMPLE: 1847 | 1848 | আপনাদেরকে আবার দেখে খশি হযেছি। 1849 | 1850 | loss=1.91991 1851 | 1852 | CHOSEN SAMPLE NO.: 53 1853 | 1854 | Epoch: 129 Iteration: 7 1855 | 1856 | SAMPLE TEXT: 1857 | i could not walk 1858 | 1859 | 1860 | PREDICTED TRANSLATION OF THE SAMPLE: 1861 | 1862 | আমি এবার পারব 1863 | 1864 | ACTUAL TRANSLATION OF THE SAMPLE: 1865 | 1866 | আমি হাটতে পারিনি। 1867 | 1868 | loss=0.91238 1869 | 1870 | CHOSEN SAMPLE NO.: 0 1871 | 1872 | Epoch: 129 Iteration: 8 1873 | 1874 | SAMPLE TEXT: 1875 | i want to be as rich as tom 1876 | 1877 | 1878 | PREDICTED TRANSLATION OF THE SAMPLE: 1879 | 1880 | আমি ওই টমের হতে হতে 1881 | 1882 | ACTUAL TRANSLATION OF THE SAMPLE: 1883 | 1884 | আমি টমের মত ধনী হতে চাই। 1885 | 1886 | loss=1.78097 1887 | 1888 | CHOSEN SAMPLE NO.: 4 1889 | 1890 | Epoch: 129 Iteration: 9 1891 | 1892 | SAMPLE TEXT: 1893 | you should eat vegetables 1894 | 1895 | 1896 | PREDICTED TRANSLATION OF THE SAMPLE: 1897 | 1898 | তোমার শাকসবজি খাওযা উচিত। উচিত। 1899 | 1900 | ACTUAL TRANSLATION OF THE SAMPLE: 1901 | 1902 | তোমার শাকসবজি খাওযা উচিত। 1903 | 1904 | loss=0.584272 1905 | 1906 | CHOSEN SAMPLE NO.: 52 1907 | 1908 | Epoch: 129 Iteration: 10 1909 | 1910 | SAMPLE TEXT: 1911 | do come again 1912 | 1913 | 1914 | PREDICTED TRANSLATION OF THE SAMPLE: 1915 | 1916 | আবার আসবে কিনত। 1917 | 1918 | ACTUAL TRANSLATION OF THE SAMPLE: 1919 | 1920 | আবার আসবে কিনত। 1921 | 1922 | loss=0.749034 1923 | 1924 | CHOSEN SAMPLE NO.: 58 1925 | 1926 | Epoch: 129 Iteration: 11 1927 | 1928 | SAMPLE TEXT: 1929 | we will scream 1930 | 1931 | 1932 | PREDICTED TRANSLATION OF THE SAMPLE: 1933 | 1934 | আমরা চেচাবো। 1935 | 1936 | ACTUAL TRANSLATION OF THE SAMPLE: 1937 | 1938 | আমরা চিৎকার করবো। 1939 | 1940 | loss=0.519659 1941 | 1942 | CHOSEN SAMPLE NO.: 18 1943 | 1944 | Epoch: 129 Iteration: 12 1945 | 1946 | SAMPLE TEXT: 1947 | do you have time 1948 | 1949 | 1950 | PREDICTED TRANSLATION OF THE SAMPLE: 1951 | 1952 | আপনার হাতে সময আছে আছে 1953 | 1954 | ACTUAL TRANSLATION OF THE SAMPLE: 1955 | 1956 | তোমার হাতে সময আছে 1957 | 1958 | loss=0.776177 1959 | 1960 | CHOSEN SAMPLE NO.: 26 1961 | 1962 | Epoch: 129 Iteration: 13 1963 | 1964 | SAMPLE TEXT: 1965 | i eat everything 1966 | . 1967 | . 1968 | . 1969 | 1970 | 1971 | 1972 | ```python 1973 | def word2vec(word): 1974 | if word in vocab_eng: 1975 | return np_embedding_eng[vocab_eng.index(word)] 1976 | else: 1977 | return np_embedding_eng[vocab_eng.index('')] 1978 | ``` 1979 | 1980 | ### Prediction. 
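At inference time the saved weights are restored and the graph is run with `teacher_forcing` set to False, so the decoder feeds back its own predictions; the output budget is simply the input length plus 20 positions, and each position is read out greedily with argmax. A toy sketch of that greedy read-out (random numbers standing in for the real `softmax_output`):

```python
# Toy sketch of the greedy read-out applied to the decoder's softmax output
# (the real `out` comes from sess.run in the next cell).
import numpy as np

toy_vocab = ['PAD', 'EOS', 'hello', 'world']      # hypothetical tiny vocabulary
toy_out = np.random.rand(5, len(toy_vocab))       # (output positions, vocab size)
decoded = [toy_vocab[int(np.argmax(row))] for row in toy_out]
```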
1981 | 1982 | 1983 | ```python 1984 | with tf.Session() as sess: # Begin session 1985 | 1986 | print('Loading pre-trained weights for the model...') 1987 | saver = tf.train.Saver() 1988 | saver.restore(sess, 'Model_Backup/translation_model.ckpt') 1989 | sess.run(tf.global_variables()) 1990 | print('\nRESTORATION COMPLETE\n') 1991 | 1992 | 1993 | test = ['who','are','you'] # Enter tokenized text here 1994 | test = map(word2vec,test) 1995 | test = np.asarray(test,np.float32) 1996 | test = test.reshape((1,test.shape[0],test.shape[1])) 1997 | 1998 | input_seq_len = test.shape[1] # sequence length is along axis 1 after the reshape above 1999 | pe_in = positional_encoding(input_seq_len,word_vec_dim) 2000 | pe_in = pe_in.reshape((1,input_seq_len,word_vec_dim)) 2001 | test_pe = test+pe_in 2002 | 2003 | output_seq_len = int(input_seq_len+20) 2004 | illegal_position_masks = generate_masks_for_illegal_positions(output_seq_len) 2005 | pe_out = positional_encoding(output_seq_len,word_vec_dim) 2006 | pe_out = pe_out.reshape((output_seq_len,1,word_vec_dim)) 2007 | 2008 | out = sess.run(softmax_output, 2009 | feed_dict={x: test_pe, 2010 | y: np.zeros((1,1),np.int32), 2011 | # y's value doesn't matter here; 2012 | # it is fed only because the network graph requires y, 2013 | # but it isn't actually used when teacher_forcing is False. 2014 | keep_prob: 1, 2015 | output_len: output_seq_len, 2016 | tf_pe_out: pe_out, 2017 | tf_illegal_position_masks: illegal_position_masks, 2018 | teacher_forcing: False 2019 | }) 2020 | 2021 | for array in out[0]: 2022 | if vocab_beng[np.argmax(array)] != '': 2023 | print(vocab_beng[np.argmax(array)],end=' ') 2024 | 2025 | 2026 | 2027 | ``` 2028 | 2029 | Loading pre-trained weights for the model... 2030 | INFO:tensorflow:Restoring parameters from Model_Backup/translation_model.ckpt 2031 | 2032 | RESTORATION COMPLETE 2033 | 2034 | আপলোড করছিল। করছিস করছিল। করছিল। করছিল। করছিস করছিস করছিস করছিস করছিল। করছিস করছিস করছিস করছিল। 2035 | 2036 | ### Some comments: 2037 | 2038 | The model seems to fit the training data well even with only 1 encoder layer and 1 decoder layer. 2039 | In fact, it seems to fit better when trained with a single encoder and decoder layer. 2040 | However, the model is most likely 'memorizing' and overfitting. I tried some predictions above, and 2041 | the results aren't good. Things will become clearer with validation, evaluation metrics and testing. 2042 | 2043 | Also, the model may not learn to generalize very well, given that there are only **4378** data samples. 2044 | Most deep learning models would probably overfit on this. 2045 | 2046 | At each timestep the decoder output has the shape batch_size x sequence_length x word_vector_dimensions. 2047 | I then sum the decoder output along the second axis, which reduces the shape to batch_size x word_vector_dimensions. 2048 | This ensures that the final output is influenced by all the vectors in the immediate decoder output. 2049 | A linear layer could achieve the same goal, but a summation seems to work - at least for fitting the data - 2050 | and a simple summation doesn't require extra parameters. 2051 | 2052 | I am not sure what the original implementation does with the decoder output before converting it linearly into a probability distribution. 2053 | 2054 | I am using the output-language word embedding matrix (say E) to convert the transformed decoder output into a probability distribution.
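A toy NumPy sketch of this read-out, with made-up sizes and `E` standing in for the Bengali embedding matrix:

```python
# Toy version of the sum-and-project read-out described above (hypothetical shapes).
import numpy as np

batch, seq_len, dim, vocab = 2, 7, 16, 50
dec_out = np.random.randn(batch, seq_len, dim)   # decoder output at one timestep
E = np.random.randn(vocab, dim)                  # output-language embedding matrix

summed = dec_out.sum(axis=1)                     # (batch, dim)
logits = summed.dot(E.T)                         # (batch, vocab); softmax gives the distribution
```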
2055 | 2056 | Probability distribution of next word = (decoder_output after summation along 2nd axis) x transpose(E) 2057 | 2058 | The paper recommended something along that line. Using embedding matrix seemed to produce much better results. 2059 | 2060 | Even though I included an option to use randomzied teacher's forcing, I kept teacher forcing off throughout this training to check if it can still fit on the training data. 2061 | 2062 | 2063 | ### TO DO 2064 | 2065 | * Evaluation (BLEU\METEOR etc.) 2066 | * Validation 2067 | * Testing 2068 | 2069 | (For now, I was just checking if the model can at least fit on the training data- whether it overfits or not is to yet to be checked) 2070 | 2071 | -------------------------------------------------------------------------------- /Translation_preprocess.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # # Preprocessing Translation Data 5 | 6 | # ### Function for expanding english contractions 7 | # 8 | # source: https://gist.github.com/nealrs/96342d8231b75cf4bb82 9 | 10 | # In[1]: 11 | 12 | 13 | import numpy as np 14 | from __future__ import division 15 | import io 16 | import unicodedata 17 | import nltk 18 | from nltk import word_tokenize 19 | import string 20 | import re 21 | import random 22 | 23 | 24 | #source: https://gist.github.com/nealrs/96342d8231b75cf4bb82 25 | cList = { 26 | "ain't": "am not", 27 | "aren't": "are not", 28 | "can't": "cannot", 29 | "can't've": "cannot have", 30 | "'cause": "because", 31 | "could've": "could have", 32 | "couldn't": "could not", 33 | "couldn't've": "could not have", 34 | "didn't": "did not", 35 | "doesn't": "does not", 36 | "don't": "do not", 37 | "hadn't": "had not", 38 | "hadn't've": "had not have", 39 | "hasn't": "has not", 40 | "haven't": "have not", 41 | "he'd": "he would", 42 | "he'd've": "he would have", 43 | "he'll": "he will", 44 | "he'll've": "he will have", 45 | "he's": "he is", 46 | "how'd": "how did", 47 | "how'd'y": "how do you", 48 | "how'll": "how will", 49 | "how's": "how is", 50 | "I'd": "I would", 51 | "I'd've": "I would have", 52 | "I'll": "I will", 53 | "I'll've": "I will have", 54 | "I'm": "I am", 55 | "I've": "I have", 56 | "isn't": "is not", 57 | "it'd": "it had", 58 | "it'd've": "it would have", 59 | "it'll": "it will", 60 | "it'll've": "it will have", 61 | "it's": "it is", 62 | "let's": "let us", 63 | "ma'am": "madam", 64 | "mayn't": "may not", 65 | "might've": "might have", 66 | "mightn't": "might not", 67 | "mightn't've": "might not have", 68 | "must've": "must have", 69 | "mustn't": "must not", 70 | "mustn't've": "must not have", 71 | "needn't": "need not", 72 | "needn't've": "need not have", 73 | "o'clock": "of the clock", 74 | "oughtn't": "ought not", 75 | "oughtn't've": "ought not have", 76 | "shan't": "shall not", 77 | "sha'n't": "shall not", 78 | "shan't've": "shall not have", 79 | "she'd": "she would", 80 | "she'd've": "she would have", 81 | "she'll": "she will", 82 | "she'll've": "she will have", 83 | "she's": "she is", 84 | "should've": "should have", 85 | "shouldn't": "should not", 86 | "shouldn't've": "should not have", 87 | "so've": "so have", 88 | "so's": "so is", 89 | "that'd": "that would", 90 | "that'd've": "that would have", 91 | "that's": "that is", 92 | "there'd": "there had", 93 | "there'd've": "there would have", 94 | "there's": "there is", 95 | "they'd": "they would", 96 | "they'd've": "they would have", 97 | "they'll": "they will", 98 | "they'll've": "they will have", 99 | "they're": 
"they are", 100 | "they've": "they have", 101 | "to've": "to have", 102 | "wasn't": "was not", 103 | "we'd": "we had", 104 | "we'd've": "we would have", 105 | "we'll": "we will", 106 | "we'll've": "we will have", 107 | "we're": "we are", 108 | "we've": "we have", 109 | "weren't": "were not", 110 | "what'll": "what will", 111 | "what'll've": "what will have", 112 | "what're": "what are", 113 | "what's": "what is", 114 | "what've": "what have", 115 | "when's": "when is", 116 | "when've": "when have", 117 | "where'd": "where did", 118 | "where's": "where is", 119 | "where've": "where have", 120 | "who'll": "who will", 121 | "who'll've": "who will have", 122 | "who's": "who is", 123 | "who've": "who have", 124 | "why's": "why is", 125 | "why've": "why have", 126 | "will've": "will have", 127 | "won't": "will not", 128 | "won't've": "will not have", 129 | "would've": "would have", 130 | "wouldn't": "would not", 131 | "wouldn't've": "would not have", 132 | "y'all": "you all", 133 | "y'alls": "you alls", 134 | "y'all'd": "you all would", 135 | "y'all'd've": "you all would have", 136 | "y'all're": "you all are", 137 | "y'all've": "you all have", 138 | "you'd": "you had", 139 | "you'd've": "you would have", 140 | "you'll": "you you will", 141 | "you'll've": "you you will have", 142 | "you're": "you are", 143 | "you've": "you have" 144 | } 145 | 146 | c_re = re.compile('(%s)' % '|'.join(cList.keys())) 147 | 148 | def expandContractions(text, c_re=c_re): 149 | def replace(match): 150 | return cList[match.group(0)] 151 | return c_re.sub(replace, text) 152 | 153 | 154 | # ### Loading Translation Data 155 | # 156 | # Splitting the data into eng and beng. 157 | # eng will contain the list of English lines, and beng will contain the corresponding list of Bengali lines. 158 | # 159 | # 160 | # Source of data: http://www.manythings.org/anki/ (downloaded ben-eng) 161 | 162 | # In[2]: 163 | 164 | 165 | filename = 'ben.txt' 166 | #Datasource: http://www.manythings.org/anki/ 167 | 168 | # http://stackoverflow.com/a/518232/2809427 169 | def unicodeToAscii(s): 170 | return ''.join( 171 | c for c in unicodedata.normalize('NFD', s) 172 | if unicodedata.category(c) != 'Mn' 173 | ) 174 | 175 | def normalizeString(s): 176 | s = unicodeToAscii(expandContractions(s.lower().strip())) 177 | s = re.sub(r"([.!?,\"])", r" ", s) 178 | return s 179 | 180 | def loaddata(filename): 181 | file = io.open(filename,'r') 182 | eng=[] 183 | beng = [] 184 | for line in file.readlines(): 185 | lang_pair = line.split('\t') 186 | lang_pair[0] = normalizeString(lang_pair[0]) 187 | lang_pair[1] = normalizeString(lang_pair[1]) 188 | eng.append(word_tokenize(lang_pair[0])) 189 | beng.append(word_tokenize(lang_pair[1])) 190 | file.close() 191 | return eng,beng 192 | 193 | eng,beng = loaddata(filename) 194 | 195 | #Example: 196 | sample = random.randint(0,len(eng)) 197 | print "Example Sample #"+str(sample)+":\n" 198 | string = "ENGLISH:" 199 | for i in xrange(0,len(eng[sample])): 200 | string+=" "+eng[sample][i] 201 | print string 202 | 203 | string = "\nBENGALI:" 204 | for i in xrange(0,len(beng[sample])): 205 | string+=" "+beng[sample][i] 206 | print string 207 | 208 | 209 | # ### Creating separate vocabulary lists for English words and Bengali words 210 | # 211 | # The index of vocabulary will represent the numerical representation of the word which is the value of vocabulary at that index. 
212 | # 213 | 214 | # In[3]: 215 | 216 | 217 | import numpy as np 218 | 219 | vocab_eng=[] 220 | vocab_eng.append('') 221 | vocab_eng.append('') 222 | 223 | vocab_beng=[] 224 | vocab_beng.append('') 225 | vocab_beng.append('') 226 | 227 | #The index of vocab will serve as an integer representation of the word 228 | 229 | vectorized_eng = [] 230 | vectorized_beng = [] 231 | 232 | for i in xrange(len(eng)): 233 | 234 | vectorized_eng_line = [] 235 | for word in eng[i]: 236 | if word not in vocab_eng: 237 | vocab_eng.append(word) 238 | vectorized_eng_line.append(vocab_eng.index(word)) 239 | else: 240 | vectorized_eng_line.append(vocab_eng.index(word)) 241 | vectorized_eng.append(vectorized_eng_line) 242 | 243 | vectorized_beng_line = [] 244 | for word in beng[i]: 245 | if word not in vocab_beng: 246 | vocab_beng.append(word) 247 | vectorized_beng_line.append(vocab_beng.index(word)) 248 | else: 249 | vectorized_beng_line.append(vocab_beng.index(word)) 250 | vectorized_beng.append(vectorized_beng_line) 251 | 252 | 253 | 254 | 255 | # ### Creating training dataset for word2vec embedding 256 | # 257 | # if the sentence is "I am alright" 258 | # 259 | # then for the word 'am', the context words with window size 1 will be "I" and "alright" 260 | # i.e ["I","alright"] 261 | # 262 | # For 'I' the context words will be "PAD" and "am" 263 | # 264 | # For 'alright' the context words will be "am" and "PAD" 265 | # 266 | # PAD represents empty and EOS represents end of sentence. 267 | # 268 | # Later lots of pads may be applied after the end of sentence to fit sequence length. 269 | # 270 | # So I also added the word PAD with context words being PADs, and PAD and EOS for embedding. 271 | # 272 | # In this way, first, from each sentence, I am creating a list of words, and corresponding list of context words. 
273 | # Doing the same thing for 274 | 275 | # In[4]: 276 | 277 | 278 | words_eng = [] 279 | contexts_eng = [] 280 | 281 | words_beng = [] 282 | contexts_beng = [] 283 | 284 | words_eng.append(vocab_eng.index('')) 285 | contexts_eng.append([vocab_eng.index(''),vocab_eng.index('')]) 286 | words_eng.append(vocab_eng.index('')) 287 | contexts_eng.append([vocab_eng.index(''),vocab_eng.index('')]) 288 | 289 | words_beng.append(vocab_beng.index('')) 290 | contexts_beng.append([vocab_beng.index(''),vocab_beng.index('')]) 291 | words_beng.append(vocab_beng.index('')) 292 | contexts_beng.append([vocab_beng.index(''),vocab_beng.index('')]) 293 | 294 | 295 | for i in xrange(len(vectorized_eng)): 296 | 297 | for j in xrange(0,len(vectorized_eng[i])): 298 | 299 | context1=0 300 | context2=0 301 | 302 | if j==0: 303 | context1 = vocab_eng.index('') 304 | if j!=len(vectorized_eng[i])-1: 305 | context2 = vectorized_eng[i][j+1] 306 | if j==len(vectorized_eng[i])-1: 307 | context2=vocab_eng.index('') 308 | if j!=0: 309 | context1 = vectorized_eng[i][j-1] 310 | if j>0 and j for training data 319 | words_eng.append(vocab_eng.index('')) 320 | context1 = vectorized_eng[i][len(vectorized_eng[i])-1] 321 | context2 = vocab_eng.index('') 322 | contexts_eng.append([context1,context2]) 323 | 324 | for j in xrange(0,len(vectorized_beng[i])): 325 | 326 | context1=0 327 | context2=0 328 | 329 | if j==0: 330 | context1 = vocab_beng.index('') 331 | if j!=len(vectorized_beng[i])-1: 332 | context2 = vectorized_beng[i][j+1] 333 | if j==len(vectorized_beng[i])-1: 334 | context2=vocab_beng.index('') 335 | if j!=0: 336 | context1 = vectorized_beng[i][j-1] 337 | if j>0 and j for training data 346 | words_beng.append(vocab_beng.index('')) 347 | context1 = vectorized_beng[i][len(vectorized_beng[i])-1] 348 | context2 = vocab_beng.index('') 349 | contexts_beng.append([context1,context2]) 350 | 351 | 352 | 353 | 354 | 355 | # If word = "am" and context = ["I","alright"], 356 | # then, here I will reconstrcut the data as: 357 | # 358 | # input = "am" 359 | # output = "I" 360 | # and 361 | # input = "am" 362 | # label = "alright" 363 | # 364 | # Like this I will construct a list of all training inputs (words) and training outputs\labels (context words) 365 | # 366 | # embd_inputs_eng will contain all the English training inputs. 367 | # embd_labels_eng will contain all the English training labels. 368 | # 369 | # embd_inputs_beng will contain all the Bengali training inputs. 370 | # embd_labels_beng will contain all the Bengali training labels. 
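# A toy version (made-up numbers, not the real lists) of the flattening done in
# the next cell: each (word, [left, right]) pair becomes two (input, label) rows.
toy_words = [5, 9]
toy_contexts = [[2, 7], [5, 11]]
toy_inputs, toy_labels = [], []
for w, ctx in zip(toy_words, toy_contexts):
    for c in ctx:
        toy_inputs.append(w)      # the centre word is repeated once per context word
        toy_labels.append(c)
# toy_inputs -> [5, 5, 9, 9] ; toy_labels -> [2, 7, 5, 11]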
371 | 372 | # In[5]: 373 | 374 | 375 | embd_inputs_eng = [] 376 | embd_labels_eng = [] 377 | for i in xrange(len(contexts_eng)): 378 | for context in contexts_eng[i]: 379 | embd_inputs_eng.append(words_eng[i]) 380 | embd_labels_eng.append(context) 381 | embd_inputs_eng = np.asarray(embd_inputs_eng,np.int32) 382 | embd_labels_eng = np.asarray(embd_labels_eng,np.int32) 383 | 384 | embd_inputs_beng = [] 385 | embd_labels_beng = [] 386 | for i in xrange(len(contexts_beng)): 387 | for context in contexts_beng[i]: 388 | embd_inputs_beng.append(words_beng[i]) 389 | embd_labels_beng.append(context) 390 | embd_inputs_beng = np.asarray(embd_inputs_beng,np.int32) 391 | embd_labels_beng = np.asarray(embd_labels_beng,np.int32) 392 | 393 | 394 | 395 | # ### Function for generating mini-batches from the total training set 396 | 397 | # In[6]: 398 | 399 | 400 | batch_size = 128 401 | 402 | def generate_batch(inputs,labels,batch_size): 403 | rand = random.sample((np.arange(len(inputs))),batch_size) 404 | batch_inputs=[] 405 | batch_labels=[] 406 | for i in xrange(batch_size): 407 | batch_inputs.append(inputs[int(rand[i])]) 408 | batch_labels.append(labels[int(rand[i])]) 409 | batch_inputs = np.asarray(batch_inputs,np.int32) 410 | batch_labels = np.asarray(batch_labels,np.int32) 411 | return batch_inputs,batch_labels 412 | 413 | 414 | 415 | # ### Preparing for word2vec embedding 416 | 417 | # In[7]: 418 | 419 | 420 | import tensorflow as tf 421 | import math 422 | 423 | #https://www.tensorflow.org/tutorials/word2vec 424 | embedding_size = 256 425 | vocabulary_size_eng = len(vocab_eng) 426 | vocabulary_size_beng = len(vocab_beng) 427 | 428 | # Placeholders for inputs 429 | train_inputs = tf.placeholder(tf.int32, shape=[batch_size]) 430 | train_labels = tf.placeholder(tf.int32, shape=[batch_size,1]) 431 | 432 | 433 | # ### Training for word2vec embedding (For English words) 434 | # 435 | # See: https://www.tensorflow.org/tutorials/word2vec 436 | # 437 | # for details of word2vec and code description 438 | 439 | # In[8]: 440 | 441 | 442 | embeddings_eng = tf.Variable( 443 | tf.random_uniform([vocabulary_size_eng, embedding_size], -1.0, 1.0)) 444 | 445 | nce_weights_eng = tf.Variable( 446 | tf.truncated_normal([vocabulary_size_eng, embedding_size], 447 | stddev=1.0 / math.sqrt(embedding_size))) 448 | nce_biases_eng = tf.Variable(tf.zeros([vocabulary_size_eng])) 449 | 450 | # Initializing the variables 451 | init = tf.global_variables_initializer() 452 | 453 | 454 | # In[9]: 455 | 456 | 457 | embed_eng = tf.nn.embedding_lookup(embeddings_eng, train_inputs) 458 | 459 | # Compute the NCE loss, using a sample of the negative labels each time. 460 | loss = tf.reduce_mean( 461 | tf.nn.nce_loss(weights=nce_weights_eng, 462 | biases=nce_biases_eng, 463 | labels=train_labels, 464 | inputs=embed_eng, 465 | num_sampled=10, 466 | num_classes=vocabulary_size_eng)) #num_sampled = no. of negative samples 467 | 468 | # We use the SGD optimizer. 
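# (Note: tf.nn.nce_loss above learns the embeddings by contrasting each true
#  context word against num_sampled randomly drawn negative words, which avoids
#  computing a full softmax over the vocabulary; the plain gradient-descent step
#  below then updates the embedding and NCE parameters.)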
469 | optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss) 470 | 471 | 472 | 473 | 474 | # In[10]: 475 | 476 | 477 | 478 | with tf.Session() as sess: 479 | sess.run(init) 480 | convergence_threshold = 0.5 481 | training_iters = 500*(int((len(embd_inputs_eng))/batch_size)) 482 | step=0 483 | n=5 484 | last_n_losses = np.zeros((n),np.float32) 485 | 486 | while step=len(x): 641 | break 642 | 643 | batch_x = [] 644 | batch_y = [] 645 | 646 | max_len_x = len(sorted_x[i]) 647 | 648 | len_y= np.zeros((len(y)),np.int32) 649 | 650 | for j in xrange(i,i+batch_size): 651 | len_y[j] = len(sorted_y[j]) 652 | 653 | max_len_y = np.amax(len_y) 654 | 655 | for j in xrange(i,i+batch_size): 656 | line=[] 657 | for k1 in xrange(max_len_x+1): #+1 to include 658 | if k1==len(sorted_x[j]): 659 | line.append(np_embedding_eng[vocab_eng.index('')]) 660 | elif k1>len(sorted_x[j]): 661 | line.append(np_embedding_eng[vocab_eng.index('')]) 662 | else: 663 | line.append(np_embedding_eng[sorted_x[j][k1]]) 664 | batch_x.append(line) 665 | 666 | line=[] 667 | for k2 in xrange(max_len_y+1): #+1 to include 668 | if k2>len(sorted_y[j]): 669 | line.append(vocab_beng.index('')) 670 | elif k2==len(sorted_y[j]): 671 | line.append(vocab_beng.index('')) 672 | else: 673 | line.append(sorted_y[j][k2]) 674 | batch_y.append(line) 675 | 676 | batch_x = np.asarray(batch_x,np.float32) 677 | batch_y = np.asarray(batch_y,np.int32) 678 | 679 | batches_x.append(batch_x) 680 | batches_y.append(batch_y) 681 | 682 | i+=batch_size 683 | 684 | return batches_x,batches_y 685 | 686 | 687 | 688 | # ### Creating train, validation and test batches 689 | 690 | # In[16]: 691 | 692 | 693 | batch_size = 64 694 | 695 | train_batch_eng,train_batch_beng = bucket_and_batch(train_eng,train_beng,batch_size) 696 | 697 | val_batch_eng,val_batch_beng = bucket_and_batch(val_eng,val_beng,batch_size) 698 | 699 | test_batch_eng,test_batch_beng = bucket_and_batch(test_eng,test_beng,batch_size) 700 | 701 | 702 | # ### Saving processed data in another file. 703 | 704 | # In[17]: 705 | 706 | 707 | #Saving processed data in another file. 708 | 709 | import pickle 710 | 711 | PICK = [vocab_eng,vocab_beng,np_embedding_eng,np_embedding_beng,train_batch_eng,train_batch_beng,val_batch_eng,val_batch_beng,test_batch_eng,test_batch_beng] 712 | 713 | with open('translationPICKLE', 'wb') as fp: 714 | pickle.dump(PICK, fp) 715 | 716 | --------------------------------------------------------------------------------