├── references ├── code │ ├── __init__.py │ ├── vanilla-gan.py │ └── seq2seq_translation_tutorial.py ├── __init__.py └── papers │ ├── 1707.07328.pdf │ ├── WORDS OR CHARACTERS _ FINE-GRAINED GATING.pdf │ ├── deep reinforcement learning for dialogue generation.pdf │ ├── Semi-Supervised QA with Generative Domain-Adaptive Nets.pdf │ └── learning cooperative visual dialog agents with deep reinforcement learning.pdf ├── .gitignore ├── src ├── __init__.py ├── util │ ├── util.pyc │ ├── __init__.py │ ├── data_proc.pyc │ ├── masked_cross_entropy.py │ ├── util.py │ ├── test.py │ └── data_proc.py ├── GAN_model │ ├── __init__.py │ ├── GAN_main.py │ └── GAN_model.py ├── G_c_a_sep │ ├── __init__.py │ ├── G_eval.py │ ├── G_main.py │ ├── G_c_a_sep.py │ └── G_train.py ├── D_baseline │ ├── D_eval.pyc │ ├── D_train.pyc │ ├── __init__.py │ ├── D_baseline_model.pyc │ ├── D_model.py │ ├── D_eval.py │ ├── D_main.py │ └── D_train.py ├── G_baseline │ ├── __init__.py │ ├── G_main.py │ ├── G_train.py │ ├── G_eval.py │ └── G_model.py └── model_zoo.py └── .idea ├── GAN-QA.iml └── misc.xml /references/code/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | 3 | .gitignore 4 | .idea -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division -------------------------------------------------------------------------------- /src/util/util.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/src/util/util.pyc -------------------------------------------------------------------------------- /src/util/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division -------------------------------------------------------------------------------- /references/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division -------------------------------------------------------------------------------- /src/GAN_model/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division -------------------------------------------------------------------------------- /src/G_c_a_sep/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division -------------------------------------------------------------------------------- /src/util/data_proc.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/src/util/data_proc.pyc -------------------------------------------------------------------------------- /src/D_baseline/D_eval.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/src/D_baseline/D_eval.pyc -------------------------------------------------------------------------------- /src/D_baseline/D_train.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/src/D_baseline/D_train.pyc -------------------------------------------------------------------------------- /src/D_baseline/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division -------------------------------------------------------------------------------- /src/G_baseline/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division -------------------------------------------------------------------------------- /references/papers/1707.07328.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/references/papers/1707.07328.pdf -------------------------------------------------------------------------------- /src/D_baseline/D_baseline_model.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/src/D_baseline/D_baseline_model.pyc -------------------------------------------------------------------------------- /references/papers/WORDS OR CHARACTERS _ FINE-GRAINED GATING.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/references/papers/WORDS OR CHARACTERS _ FINE-GRAINED GATING.pdf -------------------------------------------------------------------------------- /references/papers/deep reinforcement learning for dialogue generation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/references/papers/deep reinforcement learning for dialogue generation.pdf -------------------------------------------------------------------------------- /references/papers/Semi-Supervised QA with Generative Domain-Adaptive Nets.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/references/papers/Semi-Supervised QA with Generative Domain-Adaptive Nets.pdf -------------------------------------------------------------------------------- /references/papers/learning cooperative visual dialog agents with deep reinforcement learning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/references/papers/learning cooperative visual dialog agents with deep reinforcement learning.pdf -------------------------------------------------------------------------------- /.idea/GAN-QA.iml: -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- --------------------------------------------------------------------------------
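The tree above splits the project into a generator (G_baseline, G_c_a_sep), a discriminator (D_baseline), a combined GAN (GAN_model), and shared utilities (util). As a rough orientation before the sources themselves, the sketch below shows how the two models are typically constructed; the constructor signatures and hyper-parameters are copied from G_baseline/G_main.py and D_baseline/D_main.py further down, and the data_proc helpers are assumed to behave exactly as they are used there. This is an illustrative sketch, not a file from the repository.

# orientation sketch (illustrative): wiring the generator and discriminator as the *_main.py scripts do
from data_proc import readGlove, read_raw_squad, tokenize_squad, count_effective_num_tokens, generate_look_up_table
from G_model import G   # src/G_baseline/G_model.py
from D_model import D   # src/D_baseline/D_model.py

# GloVe vectors and tokenized SQuAD (context, question, answer) triples
embeddings_index, embeddings_size = readGlove('glove.6B/glove.6B.100d.txt')
triplets = tokenize_squad(read_raw_squad('squad/train-v1.1.json'), embeddings_index)
effective_tokens, effective_num_tokens = count_effective_num_tokens(triplets, embeddings_index)
word2index, index2word = generate_look_up_table(effective_tokens, effective_num_tokens)

# generator: RNN encoder over (context + answer) embeddings, attention decoder over the vocabulary
generator = G(embeddings_size, 256, 1, 2,                        # encoder: input size, hidden size, layers, directions
              embeddings_size, 256, effective_num_tokens, 1, 2,  # decoder: input size, hidden size, vocab size, layers, directions
              batch_size=5)
# discriminator: RNN encoder over a concatenated (context + question + answer) sequence, MLP -> P(real)
discriminator = D(embeddings_size, 256, 1, 1,                    # encoder: input size, hidden size, layers, directions
                  64, 1, 1, True,                                # MLP: hidden size, attention weights, output size, use_attn
                  batch_size=100)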
/src/D_baseline/D_model.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | import os 4 | sys.path.append(os.path.abspath(__file__ + "/../../")) 5 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 6 | 7 | # FIXME: spacy has some problem with torch. need to import spacy first. therefore import data_proc first. 8 | from data_proc import * 9 | from model_zoo import * 10 | 11 | import torch 12 | import torch.nn as nn 13 | 14 | use_cuda = torch.cuda.is_available() 15 | 16 | ###################################################################### 17 | # The Encoder 18 | # ----------- 19 | # FIXME: not sure if __name__ is to be used. 20 | # if __name__ == '__main__': 21 | 22 | class D(nn.Module): 23 | 24 | def __init__(self, enc_input_size, enc_hidden_size, enc_n_layers, num_directions, 25 | mlp_hidden_size, num_attn_weights, mlp_output_size, use_attn, 26 | batch_size): 27 | # super constructor 28 | super(D, self).__init__() 29 | 30 | self.encoder = EncoderRNN(enc_input_size, enc_hidden_size, batch_size, enc_n_layers, num_directions) 31 | self.mlp = MLP(mlp_hidden_size, mlp_output_size, self.encoder, num_attn_weights, use_attn = True) 32 | 33 | 34 | def forward(self, inputs, seq_lens, hidden=None): 35 | # input size = (seq len, batch size, word embedding dimension) 36 | 37 | # encoding 38 | # outputs dim (seq_len, batch size, hidden_size*num_directions) 39 | encoder_outputs, encoder_hidden = self.encoder(inputs, seq_lens) 40 | 41 | # MLP 42 | out = self.mlp(encoder_outputs) 43 | 44 | return out 45 | 46 | 47 | def backward(self, out, labels, criterion, optimizer): 48 | loss = criterion(out, labels) 49 | loss.backward() 50 | optimizer.step() 51 | return loss 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /src/D_baseline/D_eval.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | import os 4 | sys.path.append(os.path.abspath(__file__ + "/../../")) 5 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 6 | 7 | from data_proc import * 8 | from D_model import * 9 | 10 | import torch 11 | from torch.autograd import Variable 12 | 13 | use_cuda = torch.cuda.is_available() 14 | 15 | 16 | def evaluate(discriminator, triplets, 17 | word2index, embeddings_index, embeddings_size, 18 | eval_batch_size=10): 19 | 20 | # prepare batch 21 | training_batch, seq_lens, fake_training_batch, fake_seq_lens = get_random_batch(triplets, eval_batch_size, with_fake=True) 22 | # concat the context_ans batch with the question batch 23 | # each element in the training batch is context + question + answer 24 | training_batch, _, seq_lens = prepare_batch_var(training_batch, seq_lens, fake_training_batch, fake_seq_lens, 25 | eval_batch_size, word2index, embeddings_index, embeddings_size, 26 | mode = ['word'], concat_opt='cqa', with_fake=True) 27 | 28 | train_input = Variable(training_batch[0].cuda()) if use_cuda else Variable( 29 | training_batch[0]) # embeddings vectors, size = [seq len x batch size x embedding dim] 30 | true_labels = Variable(torch.FloatTensor(training_batch[-1]).cuda()) if use_cuda else Variable( 31 | torch.FloatTensor(training_batch[-1])) 32 | 33 | # pass through discriminator model 34 | outputs = discriminator.forward(train_input, true_labels, seq_lens[0]) 35 | 36 | # get label predictions from model & compare the number of correct predictions 37 | pred_labels = torch.zeros(outputs.size()) 38 | num_correct_pred = 0 39 
| for i in range(outputs.size(0)): 40 | pred_labels[i] = 0 if outputs.data[i][0] <= 0.5 else 1 41 | if pred_labels[i][0] == true_labels[i].data[0]: 42 | num_correct_pred += 1 43 | 44 | print('percentage of correct predictions (True/False): ' + 45 | str(float(num_correct_pred)/float(outputs.size(0))*100) + '%.\n') 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /src/util/masked_cross_entropy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional 3 | from torch.autograd import Variable 4 | 5 | def sequence_mask(sequence_length, max_len=None): 6 | if max_len is None: 7 | max_len = sequence_length.data.max() 8 | batch_size = sequence_length.size(0) 9 | seq_range = torch.range(0, max_len - 1).long() 10 | seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) 11 | seq_range_expand = Variable(seq_range_expand) 12 | if sequence_length.is_cuda: 13 | seq_range_expand = seq_range_expand.cuda() 14 | seq_length_expand = (sequence_length.unsqueeze(1) 15 | .expand_as(seq_range_expand)) 16 | return seq_range_expand < seq_length_expand 17 | 18 | 19 | def masked_cross_entropy(logits, target, length): 20 | length = Variable(torch.LongTensor(length)).cuda() 21 | 22 | """ 23 | Args: 24 | logits: A Variable containing a FloatTensor of size 25 | (batch, max_len, num_classes) which contains the 26 | unnormalized probability for each class. 27 | target: A Variable containing a LongTensor of size 28 | (batch, max_len) which contains the index of the true 29 | class for each corresponding step. 30 | length: A Variable containing a LongTensor of size (batch,) 31 | which contains the length of each data in a batch. 32 | Returns: 33 | loss: An average loss value masked by the length. 
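        Example (illustrative; shapes follow the argument description above, and
        the tensors are placed on the GPU because `length` is moved to CUDA
        unconditionally below):
            logits = Variable(torch.randn(2, 5, 10).cuda())                  # batch=2, max_len=5, num_classes=10
            target = Variable(torch.LongTensor(2, 5).random_(0, 10).cuda())  # gold class index per step
            loss = masked_cross_entropy(logits, target, [5, 3])              # second sequence scored on its first 3 steps only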
34 | """ 35 | 36 | # logits_flat: (batch * max_len, num_classes) 37 | logits_flat = logits.view(-1, logits.size(-1)) 38 | # log_probs_flat: (batch * max_len, num_classes) 39 | log_probs_flat = functional.log_softmax(logits_flat) 40 | # target_flat: (batch * max_len, 1) 41 | target_flat = target.view(-1, 1) 42 | # losses_flat: (batch * max_len, 1) 43 | losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat) 44 | # losses: (batch, max_len) 45 | losses = losses_flat.view(*target.size()) 46 | # mask: (batch, max_len) 47 | mask = sequence_mask(sequence_length=length, max_len=target.size(1)) 48 | losses = losses * mask.float() 49 | loss = losses.sum() / length.float().sum() 50 | return loss -------------------------------------------------------------------------------- /src/D_baseline/D_main.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function 3 | from __future__ import division 4 | 5 | import sys 6 | import os 7 | sys.path.append(os.path.abspath(__file__ + "/../../")) 8 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 9 | from data_proc import * 10 | 11 | from D_model import * 12 | from D_train import * 13 | from D_eval import * 14 | import numpy as np 15 | 16 | from torch import optim 17 | 18 | use_cuda = torch.cuda.is_available() 19 | 20 | 21 | ######### set paths 22 | # TODO: to run properly, change the following paths and filenames 23 | # default values for the dataset and the path to the project/dataset 24 | dataset = 'squad' 25 | f_name = 'dev-v1.1.json' 26 | path_to_dataset = os.path.abspath(__file__ + '/../../../../') + '/data/' 27 | path_to_data = path_to_dataset + dataset + '/' + f_name 28 | GLOVE_DIR = path_to_dataset + 'glove.6B/' 29 | # path for experiment outputs 30 | # exp_name = 'QG_seq2seq_baseline' 31 | path_to_exp_out = os.path.abspath(__file__ + '/../../../../') + '/exp_results_D_temp/' 32 | loss_f = 'loss_temp.txt' 33 | sample_out_f = 'sample_outputs_temp.txt' 34 | path_to_loss_f = path_to_exp_out + '/' + loss_f 35 | path_to_sample_out_f = path_to_exp_out + '/' + sample_out_f 36 | 37 | 38 | ######### first load the pretrained word embeddings 39 | path_to_glove = os.path.join(GLOVE_DIR, 'glove.6B.50d.txt') 40 | embeddings_index, embeddings_size = readGlove(path_to_glove) 41 | 42 | 43 | ######### read corpus 44 | raw_triplets = read_raw_squad(path_to_data) 45 | triplets = tokenize_squad(raw_triplets, embeddings_index) 46 | 47 | # find max length of context, question, answer, respectively 48 | max_len_c, max_len_q, max_len_a = max_length(triplets) 49 | 50 | ######### corpus preprocessing 51 | # words that do not appear in embeddings, etc 52 | 53 | ## find all unique tokens in the data (should be a subset of the number of embeddings) 54 | effective_tokens, effective_num_tokens = count_effective_num_tokens(triplets, embeddings_index) 55 | print('effective number of tokens: ' + str(effective_num_tokens)) 56 | print('expected initial loss: ' + str(-np.log(1/float(effective_num_tokens))) + '\n') 57 | # build word2index dictionary and index2word dictionary 58 | word2index, index2word = generate_look_up_table(effective_tokens, effective_num_tokens) 59 | 60 | 61 | ######### set up model 62 | enc_hidden_size = 256 63 | enc_n_layers = 1 64 | num_directions = 1 65 | mlp_hidden_size = 64 66 | mlp_output_size = 1 67 | num_attn_weights = 1 # 1000 68 | use_attn = True 69 | batch_size = 100 70 | enc_lr = 0.01 71 | mlp_lr = 0.01 72 | learning_rate = 0.001 73 | discriminator = 
D(embeddings_size, enc_hidden_size, enc_n_layers, num_directions, 74 | mlp_hidden_size, num_attn_weights, mlp_output_size, use_attn, 75 | batch_size) 76 | if use_cuda: 77 | discriminator = discriminator.cuda() 78 | 79 | criterion = nn.BCELoss() 80 | optimizer = optim.Adam(discriminator.parameters(), lr=learning_rate) 81 | 82 | 83 | ######### start training 84 | to_file = False 85 | train(discriminator, criterion, optimizer, batch_size, embeddings_size, 86 | embeddings_index, word2index, index2word, triplets, 87 | to_file, path_to_loss_f, path_to_sample_out_f, path_to_exp_out, 88 | n_iters=3000, print_every=100, plot_every=1) 89 | 90 | 91 | # save the final model 92 | # if to_file: 93 | # torch.save(encoder, path_to_exp_out+'/encoder.pth') 94 | # torch.save(mlp, path_to_exp_out+'/mlp.pth') 95 | 96 | 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /src/G_c_a_sep/G_eval.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.abspath(__file__ + "/../../")) 4 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 5 | from data_proc import * 6 | 7 | import torch 8 | from torch.autograd import Variable 9 | 10 | use_cuda = torch.cuda.is_available() 11 | 12 | 13 | # max_length constrains the maximum length of the generated question 14 | def evaluate(generator, triplets, embeddings_index, embeddings_size, word2index, index2word, max_length, 15 | to_file = False, sample_out_f = None): 16 | 17 | # prepare test input 18 | batch_size = 1 19 | training, seq_lens = get_random_batch(triplets, batch_size) 20 | context_words = training[0] 21 | answer_words = training[2] 22 | question_words = training[1] 23 | training, _, seq_lens = prepare_batch_var(training, seq_lens, batch_size, 24 | word2index, embeddings_index, embeddings_size) 25 | inputs = [] 26 | for var in training: 27 | if not isinstance(var, list): 28 | inputs.append(Variable(var.cuda())) if use_cuda else inputs.append(Variable(var)) 29 | # NOTE not currently appending start and end index to inputs because model does not use them 30 | # else: 31 | # inputs.append(Variable(inputs)) 32 | 33 | inputs_q = None 34 | 35 | all_decoder_outputs = generator.forward(inputs, seq_lens, batch_size, max_length, 36 | embeddings_index, embeddings_size, word2index, index2word, 37 | teacher_forcing_ratio=0) 38 | 39 | decoded_sentences = [] 40 | decoded_words = [] 41 | for b in range(batch_size): 42 | # get the word token and add to the list of words 43 | for di in range(max_length): 44 | # top value and index of every batch 45 | topv, topi = all_decoder_outputs[di,b].data.topk(1) 46 | ni = topi[0] 47 | if (ni == word2index['EOS']) or (ni == word2index['PAD']): 48 | decoded_words.append('EOS') 49 | # decoder_attentions[di] = decoder_attention[0].data 50 | break 51 | else: 52 | decoded_words.append(index2word[ni]) 53 | decoded_sentences.append(decoded_words) 54 | 55 | # print results 56 | if not to_file: 57 | print('context > ' + ' '.join(context_words[0]).encode('utf-8').strip()) 58 | print('answer > ' + ' '.join(answer_words[0]).encode('utf-8').strip()) 59 | print('question > ' + ' '.join(question_words[0]).encode('utf-8').strip()) 60 | # true_q = [] 61 | # for i in range(seq_lens[1][0]): 62 | # true_q.append(index2word[inputs_q[i][0].data[0]]) 63 | # print('question with padding> ' + ' '.join(true_q)) 64 | print('generated question > ' + ' '.join(decoded_words)) 65 | return decoded_words 66 | else: 67 | 
sample_out_f.write(unicode('context > ' + ' '.join(context_words[0]) + '\n')) 68 | sample_out_f.write(unicode('answer > ' + ' '.join(answer_words[0]) + '\n')) 69 | sample_out_f.write(unicode('question > ' + ' '.join(question_words[0]) + '\n')) 70 | sample_out_f.write(unicode('generated question > ' + ' '.join(decoded_words) + '\n')) 71 | 72 | # TODO: uncomment the following return if you want to record the decoder outputs in file 73 | # (note: need to modify this function call in G_train.py) 74 | # return decoded_sentences 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /src/G_baseline/G_main.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division 3 | 4 | import sys 5 | import os 6 | sys.path.append(os.path.abspath(__file__ + "/../../")) 7 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 8 | 9 | from G_train import * 10 | from G_model import * 11 | import numpy as np 12 | 13 | global use_cuda 14 | use_cuda = torch.cuda.is_available() 15 | teacher_forcing_ratio = 0.75 # default in original code is 0.5 16 | 17 | 18 | ######### set paths 19 | # TODO: to run properly, change the following paths and filenames 20 | # default values for the dataset and the path to the project/dataset 21 | dataset = 'squad' 22 | f_name = 'train-v1.1.json' 23 | path_to_dataset = '/home/jack/Documents/QA_QG/data/' 24 | path_to_data = path_to_dataset + dataset + '/' + f_name 25 | GLOVE_DIR = path_to_dataset + 'glove.6B/' 26 | 27 | 28 | ######### first load the pretrained word embeddings 29 | path_to_glove = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt') 30 | embeddings_index, embeddings_size = readGlove(path_to_glove) 31 | 32 | 33 | ######### read corpus 34 | raw_triplets = read_raw_squad(path_to_data) 35 | triplets = tokenize_squad(raw_triplets, embeddings_index) 36 | 37 | # find max length of context, question, answer, respectively 38 | # max_len_c, max_len_q, max_len_a = max_length(triplets) 39 | 40 | ######### corpus preprocessing 41 | # words that do not appear in embeddings, etc 42 | 43 | ## find all unique tokens in the data (should be a subset of the number of embeddings) 44 | effective_tokens, effective_num_tokens = count_effective_num_tokens(triplets, embeddings_index) 45 | print('effective number of tokens: ' + str(effective_num_tokens)) 46 | print('expected initial loss: ' + str(-np.log(1/float(effective_num_tokens))) + '\n') 47 | # build word2index dictionary and index2word dictionary 48 | word2index, index2word = generate_look_up_table(effective_tokens, effective_num_tokens) 49 | 50 | 51 | print('reading and preprocessing data complete.') 52 | print('found %s unique tokens in the intersection of corpus and word embeddings.' 
% effective_num_tokens) 53 | if use_cuda: 54 | print('GPU ready.') 55 | print('') 56 | print('start training...') 57 | print('') 58 | 59 | 60 | ######### set up model 61 | enc_hidden_size = 256 62 | enc_n_layers = 1 63 | enc_num_directions = 2 64 | dec_hidden_size = 256 65 | dec_n_layers = 1 66 | dec_num_directions = 2 67 | batch_size = 5 68 | learning_rate = 0.0005 69 | 70 | generator = G(embeddings_size, enc_hidden_size, enc_n_layers, enc_num_directions, 71 | embeddings_size, dec_hidden_size, effective_num_tokens, dec_n_layers, dec_num_directions, 72 | batch_size) 73 | 74 | if use_cuda: 75 | generator = generator.cuda() 76 | 77 | optimizer = optim.Adam(generator.parameters(), lr=learning_rate) 78 | criterion = nn.NLLLoss() 79 | 80 | # max_length of generated question 81 | max_length = 100 82 | to_file = False 83 | 84 | # open the files 85 | if to_file: 86 | exp_name = 'G_pretrain_exp_0827' 87 | path_to_exp_out = '/home/jack/Documents/QA_QG/exp_results_temp/' 88 | if not os.path.exists(path_to_exp_out+exp_name): 89 | os.mkdir(path_to_exp_out+exp_name) 90 | loss_f = 'loss_temp.txt' 91 | sample_out_f = 'sample_outputs_temp.txt' 92 | path_to_loss_f = path_to_exp_out + exp_name + '/' + loss_f 93 | path_to_sample_out_f = path_to_exp_out + exp_name + '/' + sample_out_f 94 | loss_f = open(path_to_loss_f,'w+') 95 | sample_out_f = open(path_to_sample_out_f, 'w+') 96 | else: 97 | loss_f = None 98 | sample_out_f = None 99 | 100 | trainIters(generator, optimizer, batch_size, embeddings_size, 101 | embeddings_index, word2index, index2word, max_length, triplets, teacher_forcing_ratio, 102 | to_file, loss_f, sample_out_f, 103 | n_iters = 1, print_every=1, plot_every=1) 104 | 105 | # save the final model 106 | if to_file: 107 | torch.save(generator, path_to_exp_out + exp_name +'/generator_temp.pth') 108 | 109 | 110 | -------------------------------------------------------------------------------- /src/D_baseline/D_train.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function 3 | from __future__ import division 4 | 5 | import sys 6 | import os 7 | import time 8 | sys.path.append(os.path.abspath(__file__ + "/../../")) 9 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 10 | from data_proc import * 11 | # FIXME: had some problem importing util.py; importing successful but 12 | # functions cannot be called (NameError: global name XXX is not defined) 13 | # fast solution: copied asMinutes and timeSince functions here 14 | from util import * 15 | 16 | import torch 17 | from torch.autograd import Variable 18 | from D_eval import * 19 | 20 | use_cuda = torch.cuda.is_available() 21 | 22 | import time 23 | import math 24 | 25 | # FIXME: added these two functions because import util does not seem to work (see above) 26 | def asMinutes(s): 27 | m = math.floor(s / 60) 28 | s -= m * 60 29 | return '%dm %ds' % (m, s) 30 | 31 | def timeSince(since, percent): 32 | now = time.time() 33 | s = now - since 34 | es = s / (percent) 35 | rs = es - s 36 | return '%s (- %s)' % (asMinutes(s), asMinutes(rs)) 37 | 38 | 39 | ###################################################################### 40 | # Training the Model 41 | # context = input_variable 42 | def train(discriminator, criterion, optimizer, batch_size, embeddings_size, 43 | embeddings_index, word2index, index2word, triplets, 44 | to_file, path_to_loss_f, path_to_sample_out_f, path_to_exp_out, 45 | n_iters=10, print_every=10, plot_every=100): 46 | 47 | begin_time = time.time() 48 | 
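# Overview of the loop below: each iteration draws a batch mixing real and fake
# (context, question, answer) triples via get_random_batch(..., with_fake=True), embeds and
# concatenates every example as context + question + answer (concat_opt='cqa'), and updates
# the discriminator against the real/fake labels that prepare_batch_var returns as the
# last element of the batch.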
49 | # open the files 50 | if to_file: 51 | loss_f = open(path_to_loss_f,'w+') 52 | sample_out_f = open(path_to_sample_out_f, 'w+') 53 | 54 | # plot_losses = [] 55 | print_loss_total = 0 # Reset every print_every 56 | plot_loss_total = 0 # Reset every plot_every 57 | 58 | print() 59 | 60 | for iter in range(1, n_iters + 1): 61 | 62 | # prepare batch 63 | training_batch, seq_lens, fake_training_batch, fake_seq_lens = get_random_batch(triplets, batch_size, with_fake=True) 64 | # concat the context_ans batch with the question batch 65 | # each element in the training batch is context + question + answer 66 | training_batch, _, seq_lens = prepare_batch_var(training_batch, seq_lens, fake_training_batch, fake_seq_lens, 67 | batch_size, word2index, embeddings_index, embeddings_size, 68 | mode = ['word'], concat_opt='cqa', with_fake=True) 69 | 70 | train_input = Variable(training_batch[0].cuda()) if use_cuda else Variable(training_batch[0]) # embeddings vectors, size = [seq len x batch size x embedding dim] 71 | # the labels are the last element of training_batch; see prepare_batch_var in data_proc.py for detail 72 | train_label = Variable(torch.FloatTensor(training_batch[-1]).cuda()) if use_cuda else Variable(torch.FloatTensor(training_batch[-1])) 73 | 74 | optimizer.zero_grad() 75 | loss = 0 76 | outputs = discriminator.forward(train_input, seq_lens[0]) 77 | loss += discriminator.backward(outputs, train_label, criterion, optimizer) 78 | 79 | print_loss_total += loss.data[0] 80 | plot_loss_total += loss.data[0] 81 | 82 | # log on console 83 | if iter % print_every == 0: 84 | print_loss_avg = print_loss_total / print_every 85 | print_loss_total = 0 86 | print('%s (%d %d%%) %.4f' % (timeSince(begin_time, iter / float(n_iters)), 87 | iter, iter / n_iters * 100, print_loss_avg)) 88 | evaluate(discriminator, triplets, word2index, embeddings_index, embeddings_size, eval_batch_size=100) 89 | print('-------------------------------') 90 | print('-------------------------------') 91 | print() 92 | 93 | # save error to file for plotting later 94 | if iter % plot_every == 0: 95 | plot_loss_avg = plot_loss_total / plot_every 96 | # plot_losses.append(plot_loss_avg) 97 | plot_loss_total = 0 98 | if to_file: 99 | loss_f.write(unicode(plot_loss_avg)) 100 | loss_f.write(unicode('\n')) 101 | 102 | # showPlot(plot_losses) 103 | if to_file: 104 | loss_f.close() 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /src/util/util.py: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | # Plotting results 3 | # ---------------- 4 | # 5 | # Plotting is done with matplotlib, using the array of loss values 6 | # ``plot_losses`` saved while training. 7 | # 8 | import matplotlib 9 | matplotlib.use('Agg') 10 | import matplotlib.pyplot as plt 11 | import matplotlib.ticker as ticker 12 | import numpy as np 13 | import difflib 14 | 15 | 16 | def showPlot(points): 17 | plt.figure() 18 | fig, ax = plt.subplots() 19 | # this locator puts ticks at regular intervals 20 | loc = ticker.MultipleLocator(base=0.2) 21 | ax.yaxis.set_major_locator(loc) 22 | plt.plot(points) 23 | 24 | 25 | 26 | def extract(v): 27 | return v.data.storage().tolist() 28 | 29 | 30 | 31 | ###################################################################### 32 | # This is a helper function to print time elapsed and estimated time 33 | # remaining given the current time and progress %. 
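# e.g. timeSince(since, 0.25) with 192 s already elapsed returns '3m 12s (- 9m 36s)':
# 3m 12s spent, an estimated 9m 36s left to reach 100%.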
34 | # 35 | 36 | import time 37 | import math 38 | 39 | def asMinutes(s): 40 | m = math.floor(s / 60) 41 | s -= m * 60 42 | return '%dm %ds' % (m, s) 43 | 44 | def timeSince(since, percent): 45 | now = time.time() 46 | s = now - since 47 | es = s / (percent) 48 | rs = es - s 49 | return '%s (- %s)' % (asMinutes(s), asMinutes(rs)) 50 | 51 | 52 | 53 | ###################################################################### 54 | # show loss function 55 | def plotLoss(loss_f, plot_every, save_path=None, from_file=True, f_name='loss.png', title='training loss'): 56 | if from_file: 57 | loss_vec = [] 58 | with open(loss_f) as f: 59 | content = f.readlines() 60 | content = [x.strip() for x in content] # list of every line, each a string 61 | for line in content: 62 | try: 63 | loss_vec.append(float(line)) 64 | except ValueError: 65 | pass 66 | else: 67 | loss_vec = loss_f 68 | # plot 69 | plt.figure() 70 | plt.title(title) 71 | plt.xlabel('training iterations') 72 | plt.ylabel('loss') 73 | plt.grid() 74 | plt.plot([x*plot_every for x in range(1, len(loss_vec)+1)], loss_vec) 75 | if save_path == None: 76 | plt.savefig(f_name) 77 | else: 78 | plt.savefig(save_path + '/' + f_name) 79 | 80 | # test 81 | # from util import * 82 | # plotLoss('../../../exp_results_temp/G_c_a_sep_pretrain_exp_0902/loss_temp.txt', 30) 83 | 84 | 85 | ###################################################################### 86 | # check if the generated question already exist in the corpus 87 | def generated_q_novelty(triplets, generated_q): 88 | # input - tokenized triplets, each one a list of strings 89 | # input - generated question 90 | # output - a similarity score vector for each of the questions in the triplets 91 | scores = [] 92 | if not (isinstance(generated_q, str) or isinstance(generated_q, unicode)): 93 | generated_q = ' '.join(generated_q) 94 | for idx in range(len(triplets)): 95 | q = ' '.join(triplets[idx][1]) 96 | scores.append(difflib.SequenceMatcher(None, generated_q, q).ratio) 97 | return np.array(scores) 98 | # test 99 | 100 | 101 | # ###################################################################### 102 | # # For a better viewing experience we will do the extra work of adding axes 103 | # # and labels: 104 | # # 105 | # def showAttention(input_sentence, output_words, attentions): 106 | # # Set up figure with colorbar 107 | # fig = plt.figure() 108 | # ax = fig.add_subplot(111) 109 | # cax = ax.matshow(attentions.numpy(), cmap='bone') 110 | # fig.colorbar(cax) 111 | 112 | # # Set up axes 113 | # ax.set_xticklabels([''] + input_sentence.split(' ') + 114 | # [''], rotation=90) 115 | # ax.set_yticklabels([''] + output_words) 116 | 117 | # # Show label at every tick 118 | # ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) 119 | # ax.yaxis.set_major_locator(ticker.MultipleLocator(1)) 120 | 121 | # plt.show() 122 | 123 | 124 | # def evaluateAndShowAttention(input_sentence): 125 | # output_words, attentions = evaluate( 126 | # encoder1, attn_decoder1, input_sentence) 127 | # print('input =', input_sentence) 128 | # print('output =', ' '.join(output_words)) 129 | # showAttention(input_sentence, output_words, attentions) 130 | 131 | 132 | -------------------------------------------------------------------------------- /src/G_c_a_sep/G_main.py: -------------------------------------------------------------------------------- 1 | # from __future__ import print_function 2 | # from __future__ import division 3 | 4 | import sys 5 | import os 6 | sys.path.append(os.path.abspath(__file__ + "/../../")) 7 | 8 | from 
G_train import * 9 | from G_c_a_sep import * 10 | # import numpy as np 11 | from torch import optim 12 | 13 | global use_cuda 14 | use_cuda = torch.cuda.is_available() 15 | teacher_forcing_ratio = 0.75 # default in original code is 0.5 16 | 17 | 18 | ######### set paths 19 | # TODO: to run properly, change the following paths and filenames 20 | # default values for the dataset and the path to the project/dataset 21 | dataset = 'squad' 22 | f_name = 'train-v1.1.json' 23 | path_to_dataset = '/home/jack/Documents/QA_QG/data/' 24 | path_to_data = path_to_dataset + dataset + '/' + f_name 25 | GLOVE_DIR = path_to_dataset + 'glove.6B/' 26 | 27 | 28 | ######### first load the pretrained word embeddings 29 | path_to_glove = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt') 30 | embeddings_index, embeddings_size = readGlove(path_to_glove) 31 | 32 | 33 | ######### read corpus - only the sentence containing the answer as context 34 | # raw_triplets = read_raw_squad(path_to_data) 35 | # sent_c_triplets = get_ans_sentence(raw_triplets) 36 | # sent_c_triplets = tokenize_squad(sent_c_triplets, embeddings_index, opt='sent') 37 | import pickle 38 | load_path = '/home/jack/Documents/QA_QG/data/processed/' 39 | # triplets = pickle.load(open(load_path+'triplets.txt', 'rb')) 40 | # sent_c_triplets = pickle.load(open(load_path+'sent_c_triplets.txt', 'rb')) 41 | windowed_c_triplets_30_noEOS = pickle.load(open(load_path+'windowed_c_triplets_30_noEOS.txt', 'rb')) 42 | # triplets = sent_c_triplets 43 | triplets = windowed_c_triplets_30_noEOS 44 | 45 | # find max length of context, question, answer, respectively 46 | # max_len_c, max_len_q, max_len_a = max_length(triplets) 47 | 48 | ######### corpus preprocessing 49 | # words that do not appear in embeddings, etc 50 | 51 | ## find all unique tokens in the data (should be a subset of the number of embeddings) 52 | effective_tokens, effective_num_tokens = count_effective_num_tokens(triplets, embeddings_index) 53 | print('effective number of tokens: ' + str(effective_num_tokens)) 54 | print('expected initial loss: ' + str(-np.log(1/float(effective_num_tokens))) + '\n') 55 | # build word2index dictionary and index2word dictionary 56 | word2index, index2word = generate_look_up_table(effective_tokens, effective_num_tokens) 57 | 58 | 59 | print('reading and preprocessing data complete.') 60 | print('found %s unique tokens in the intersection of corpus and word embeddings.' 
% effective_num_tokens) 61 | if use_cuda: 62 | print('GPU ready.') 63 | print('') 64 | print('start training...') 65 | print('') 66 | 67 | 68 | ######### set up model 69 | enc_hidden_size = 256 70 | enc_n_layers = 1 71 | enc_num_directions = 2 72 | dec_hidden_size = 256 73 | dec_n_layers = 1 74 | dec_num_directions = 2 75 | batch_size = 5 76 | learning_rate = 0.001 77 | 78 | generator = G(embeddings_size, enc_hidden_size, enc_n_layers, enc_num_directions, 79 | embeddings_size, dec_hidden_size, effective_num_tokens, dec_n_layers, dec_num_directions, 80 | batch_size) 81 | 82 | if use_cuda: 83 | generator = generator.cuda() 84 | 85 | optimizer = optim.Adam(generator.parameters(), lr=learning_rate) 86 | criterion = nn.NLLLoss() 87 | 88 | # max_length of generated question 89 | max_length = 100 90 | to_file = True 91 | 92 | # open the files 93 | if to_file: 94 | exp_name = 'G_c_a_sep_pretrain_exp_windowed_c_noEOS_0911' 95 | path_to_exp_out = '/home/jack/Documents/QA_QG/exp_results_temp/' 96 | if not os.path.exists(path_to_exp_out+exp_name): 97 | os.mkdir(path_to_exp_out+exp_name) 98 | loss_f = 'loss_temp.txt' 99 | sample_out_f = 'sample_outputs_temp.txt' 100 | path_to_loss_f = path_to_exp_out + exp_name + '/' + loss_f 101 | path_to_sample_out_f = path_to_exp_out + exp_name + '/' + sample_out_f 102 | loss_f = open(path_to_loss_f,'w+') 103 | sample_out_f = open(path_to_sample_out_f, 'w+') 104 | else: 105 | loss_f = None 106 | sample_out_f = None 107 | path_to_exp_out = None 108 | 109 | trainIters(generator, optimizer, batch_size, embeddings_size, 110 | embeddings_index, word2index, index2word, max_length, triplets, teacher_forcing_ratio, 111 | to_file, loss_f, sample_out_f, path_to_exp_out, 112 | n_iters=30000, print_every=300, plot_every=30, checkpoint_every=6000) 113 | 114 | # save the final model 115 | if to_file: 116 | torch.save(generator, path_to_exp_out + exp_name +'/generator_temp.pth.tar') 117 | 118 | 119 | -------------------------------------------------------------------------------- /src/G_baseline/G_train.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function 3 | from __future__ import division 4 | 5 | import sys 6 | import os 7 | sys.path.append(os.path.abspath(__file__ + "/../../")) 8 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 9 | from data_proc import * 10 | # FIXME: had some problem importing util.py; importing successful but 11 | # functions cannot be called (NameError: global name XXX is not defined) 12 | # fast solution: copied asMinutes and timeSince functions herefrom util import * 13 | from G_eval import * 14 | 15 | import torch 16 | import torch.nn as nn 17 | from torch import optim 18 | from torch.autograd import Variable 19 | import torch.nn.functional as F 20 | import time 21 | 22 | use_cuda = torch.cuda.is_available() 23 | 24 | 25 | import time 26 | import math 27 | 28 | # FIXME: added these two functions because import util does not seem to work (see above) 29 | def asMinutes(s): 30 | m = math.floor(s / 60) 31 | s -= m * 60 32 | return '%dm %ds' % (m, s) 33 | 34 | def timeSince(since, percent): 35 | now = time.time() 36 | s = now - since 37 | es = s / (percent) 38 | rs = es - s 39 | return '%s (- %s)' % (asMinutes(s), asMinutes(rs)) 40 | 41 | 42 | 43 | def trainIters(generator, optimizer, batch_size, embeddings_size, 44 | embeddings_index, word2index, index2word, max_length, triplets, teacher_forcing_ratio, 45 | to_file, loss_f, sample_out_f, 46 | n_iters=5, 
print_every=10, plot_every=100): 47 | 48 | begin_time = time.time() 49 | 50 | # plot_losses = [] 51 | print_loss_total = 0 # Reset every print_every 52 | plot_loss_total = 0 # Reset every plot_every 53 | 54 | print() 55 | 56 | for iter in range(1, n_iters + 1): 57 | 58 | # prepare batch 59 | training_batch, seq_lens = get_random_batch(triplets, batch_size) 60 | training_batch, _, seq_lens = prepare_batch_var(training_batch, seq_lens, batch_size, word2index, embeddings_index, embeddings_size, use_cuda=1, mode=['word', 'index'], concat_opt='ca') 61 | inputs_ca = Variable(training_batch[0].cuda()) if use_cuda else Variable(training_batch[0]) # embeddings vectors, size = [seq len x batch size x embedding dim] 62 | inputs_q = Variable(training_batch[1].cuda()) if use_cuda else Variable(training_batch[1]) # represented as indices, size = [seq len x batch size] 63 | 64 | max_c_a_len = max(seq_lens[0]) # max seq length of context + ans combined 65 | max_q_len = max(seq_lens[1]) # max seq length of question 66 | 67 | optimizer.zero_grad() 68 | loss = 0 69 | all_decoder_outputs = generator.forward(inputs_ca, inputs_q, seq_lens[0], batch_size, max_q_len, 70 | embeddings_index, embeddings_size, word2index, index2word, 71 | teacher_forcing_ratio) 72 | loss += generator.backward(all_decoder_outputs, inputs_q, seq_lens[1], optimizer) 73 | 74 | print_loss_total += loss.data[0] 75 | plot_loss_total += loss.data[0] 76 | 77 | if iter % print_every == 0: 78 | print_loss_avg = print_loss_total / print_every 79 | print_loss_total = 0 80 | print('%s (%d %d%%) %.4f' % (timeSince(begin_time, iter / float(n_iters)), 81 | iter, iter / n_iters * 100, print_loss_avg)) 82 | print('---sample generated question---') 83 | # sample a triple and print the generated question 84 | evaluate(generator, triplets, embeddings_index, embeddings_size, word2index, index2word, max_length) 85 | print('-------------------------------') 86 | print('-------------------------------') 87 | print() 88 | 89 | if iter % plot_every == 0: 90 | plot_loss_avg = plot_loss_total / plot_every 91 | # plot_losses.append(plot_loss_avg) 92 | plot_loss_total = 0 93 | if to_file: 94 | loss_f.write(unicode('%s (%d %d%%)\n' % (timeSince(begin_time, iter / float(n_iters)), iter, float(iter) / float(n_iters) * 100))) 95 | loss_f.write(unicode(plot_loss_avg)) 96 | loss_f.write(unicode('\n')) 97 | sample_out_f.write(unicode('%s (%d %d%%)\n' % (timeSince(begin_time, iter / float(n_iters)), iter, float(iter) / float(n_iters) * 100))) 98 | evaluate(generator, triplets, embeddings_index, embeddings_size, word2index, index2word, max_length, to_file, sample_out_f) 99 | sample_out_f.write(unicode('\n')) 100 | 101 | 102 | 103 | # showPlot(plot_losses) 104 | if to_file: 105 | loss_f.close() 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /src/G_baseline/G_eval.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.abspath(__file__ + "/../../")) 4 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 5 | from data_proc import * 6 | from util import * 7 | 8 | import torch 9 | import torch.nn as nn 10 | from torch import optim 11 | from torch.autograd import Variable 12 | import torch.nn.functional as F 13 | import time 14 | 15 | use_cuda = torch.cuda.is_available() 16 | 17 | 18 | # max_length constrains the maximum length of the generated question 19 | def evaluate(generator, triplets, embeddings_index, 
embeddings_size, word2index, index2word, max_length, 20 | to_file = False, sample_out_f = None): 21 | 22 | # prepare test input 23 | batch_size = 1 24 | training, seq_lens = get_random_batch(triplets, batch_size) 25 | context_words = training[0] 26 | answer_words = training[2] 27 | question_words = training[1] 28 | training, _, seq_lens = prepare_batch_var(training, seq_lens, batch_size, word2index, embeddings_index, embeddings_size, mode=['word', 'index'], concat_opt='ca') 29 | inputs_ca = Variable(training[0].cuda()) if use_cuda else Variable(training[0]) # embeddings vectors, size = [seq len x batch size x embedding dim] 30 | # inputs_q = Variable(training[1].cuda()) if use_cuda else Variable(training[1]) # represented as indices, size = [seq len x batch size] 31 | inputs_q = None 32 | 33 | all_decoder_outputs = generator.forward(inputs_ca, inputs_q, seq_lens[0], batch_size, max_length, 34 | embeddings_index, embeddings_size, word2index, index2word, 35 | teacher_forcing_ratio=0) 36 | 37 | decoded_sentences = [] 38 | decoded_words = [] 39 | for b in range(batch_size): 40 | # get the word token and add to the list of words 41 | for di in range(max_length): 42 | # top value and index of every batch 43 | topv, topi = all_decoder_outputs[di,b].data.topk(1) 44 | ni = topi[0] 45 | if (ni == word2index['EOS']) or (ni == word2index['PAD']): 46 | decoded_words.append('EOS') 47 | # decoder_attentions[di] = decoder_attention[0].data 48 | break 49 | else: 50 | decoded_words.append(index2word[ni]) 51 | decoded_sentences.append(decoded_words) 52 | 53 | # print results 54 | if not to_file: 55 | print('context > ' + ' '.join(context_words[0]).encode('utf-8').strip()) 56 | print('answer > ' + ' '.join(answer_words[0]).encode('utf-8').strip()) 57 | print('question > ' + ' '.join(question_words[0]).encode('utf-8').strip()) 58 | # true_q = [] 59 | # for i in range(seq_lens[1][0]): 60 | # true_q.append(index2word[inputs_q[i][0].data[0]]) 61 | # print('question with padding> ' + ' '.join(true_q)) 62 | print('generated question > ' + ' '.join(decoded_words)) 63 | else: 64 | sample_out_f.write(unicode('context > ' + ' '.join(context_words[0]) + '\n')) 65 | sample_out_f.write(unicode('answer > ' + ' '.join(answer_words[0]) + '\n')) 66 | sample_out_f.write(unicode('question > ' + ' '.join(question_words[0]) + '\n')) 67 | sample_out_f.write(unicode('generated question > ' + ' '.join(decoded_words) + '\n')) 68 | 69 | # TODO: uncomment the following return if you want to record the decoder outputs in file 70 | # (note: need to modify this function call in G_train.py) 71 | # return decoded_sentences 72 | 73 | 74 | def G_sampler(generator, ca, embeddings_index, embeddings_size, word2index, index2word, max_length): 75 | # NOTE currently only generate one question at a time. 
multiple questions not yet supported 76 | 77 | var = torch.FloatTensor(len(ca), embeddings_size) 78 | for j in range(len(ca)): 79 | var[j] = embeddings_index[ca[j]] 80 | var = var.unsqueeze(1) 81 | if use_cuda: 82 | var = Variable(var.cuda()) 83 | else: 84 | var = Variable(var) 85 | 86 | decoder_output = generator.forward(var, None, [len(ca)], 1, max_length, 87 | embeddings_index, embeddings_size, word2index, index2word, 88 | teacher_forcing_ratio=0).detach() 89 | decoder_output = decoder_output.squeeze(1) 90 | 91 | decoded_words = [] 92 | for di in range(max_length): 93 | # top value and index of every batch 94 | topv, topi = decoder_output[di].data.topk(1) 95 | ni = topi[0] 96 | if (ni == word2index['EOS']) or (ni == word2index['PAD']): 97 | decoded_words.append('EOS') 98 | # decoder_attentions[di] = decoder_attention[0].data 99 | break 100 | else: 101 | decoded_words.append(index2word[ni]) 102 | 103 | return decoded_words 104 | 105 | -------------------------------------------------------------------------------- /src/G_baseline/G_model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import random 4 | sys.path.append(os.path.abspath(__file__ + "/../../")) 5 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 6 | 7 | from model_zoo import * 8 | from masked_cross_entropy import * 9 | import torch 10 | import torch.nn as nn 11 | from torch.autograd import Variable 12 | 13 | use_cuda = torch.cuda.is_available() 14 | 15 | 16 | class G(nn.Module): 17 | def __init__(self, enc_input_size, enc_hidden_size, enc_n_layers, enc_num_directions, 18 | dec_input_size, dec_hidden_size, output_size, dec_n_layers, dec_num_directions, 19 | batch_size, use_attn=True): 20 | super(G, self).__init__() 21 | self.encoder = EncoderRNN(enc_input_size, enc_hidden_size, batch_size, enc_n_layers, enc_num_directions) 22 | if use_attn: 23 | self.decoder = AttnDecoderRNN(dec_input_size, dec_hidden_size, output_size, self.encoder, 24 | dec_n_layers, dec_num_directions) 25 | else: 26 | # TODO: complete case when not using attention (add decoder class in model zoo) 27 | pass 28 | 29 | 30 | def forward(self, inputs_ca, inputs_q, seq_lens, batch_size, max_q_len, 31 | embeddings_index, embeddings_size, word2index, index2word, teacher_forcing_ratio): 32 | # context encoding 33 | # output size: (seq_len, batch, hidden_size) 34 | # hidden size: (num_layers, batch, hidden_size) 35 | # the collection of all hidden states per batch is of size (seq_len, batch, hidden_size * num_directions) 36 | encoder_hiddens, encoder_hidden = self.encoder(inputs_ca, seq_lens, None) 37 | 38 | print(type(encoder_hiddens.data)) 39 | print(encoder_hiddens.size()) 40 | 41 | # decoder 42 | # prepare decoder inputs as word embeddings in a batch 43 | # decoder_input size: (1, batch size, embedding size); first dim is 1 because only one time step; 44 | # nee to have a 3D tensor for input to nn.GRU module 45 | decoder_input = Variable(embeddings_index['SOS'].repeat(batch_size, 1).unsqueeze(0)) 46 | # init all decoder outputs 47 | all_decoder_outputs = Variable(torch.zeros(max_q_len, batch_size, self.decoder.output_size)) 48 | if use_cuda: 49 | decoder_input = decoder_input.cuda() 50 | all_decoder_outputs = all_decoder_outputs.cuda() 51 | 52 | use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False 53 | 54 | if use_teacher_forcing: 55 | # Teacher forcing: Feed the target as the next input 56 | for di in range(max_q_len): 57 | decoder_output, 
decoder_hidden, decoder_attention = self.decoder( 58 | decoder_input, encoder_hiddens, embeddings_index) 59 | 60 | all_decoder_outputs[di] = decoder_output 61 | 62 | # change next time step input to current target output, in embedding format 63 | decoder_input = Variable(torch.FloatTensor(1, batch_size, embeddings_size).cuda()) if use_cuda else \ 64 | Variable(torch.FloatTensor(1, batch_size, embeddings_size)) 65 | for b in range(batch_size): 66 | decoder_input[0, b] = embeddings_index[index2word[inputs_q[di, b].data[0]]].cuda() if use_cuda else \ 67 | embeddings_index[index2word[inputs_q[di, b].data[0]]] # Teacher forcing 68 | 69 | else: 70 | # Without teacher forcing: use its own predictions as the next input 71 | for di in range(max_q_len): 72 | decoder_output, decoder_hidden, decoder_attention = self.decoder( 73 | decoder_input, encoder_hiddens, embeddings_index) 74 | 75 | all_decoder_outputs[di] = decoder_output 76 | 77 | # top value and index of every batch 78 | # size of both topv, topi = (batch size, 1) 79 | topv, topi = decoder_output.data.topk(1) 80 | 81 | # get the output word for every batch 82 | decoder_input = Variable(torch.FloatTensor(1, batch_size, embeddings_size).cuda()) if use_cuda else \ 83 | Variable(torch.FloatTensor(1, batch_size, embeddings_size)) 84 | for b in range(batch_size): 85 | decoder_input[0, b] = embeddings_index[index2word[topi[0][0]]].cuda() if use_cuda else \ 86 | embeddings_index[index2word[topi[0][0]]] 87 | 88 | return all_decoder_outputs 89 | 90 | 91 | def backward(self, out, labels, true_lens, optimizer): 92 | loss = masked_cross_entropy( 93 | out.transpose(0, 1).contiguous(), # -> batch x seq 94 | labels.transpose(0, 1).contiguous(), # -> batch x seq 95 | true_lens 96 | ) 97 | loss.backward() 98 | optimizer.step() 99 | return loss 100 | -------------------------------------------------------------------------------- /src/G_c_a_sep/G_c_a_sep.py: -------------------------------------------------------------------------------- 1 | # the encoder in the generator process the context and answer separately. 2 | 3 | import sys 4 | import os 5 | import random 6 | sys.path.append(os.path.abspath(__file__ + "/../../")) 7 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 8 | 9 | from model_zoo import * 10 | from masked_cross_entropy import * 11 | import torch 12 | import torch.nn as nn 13 | from torch.autograd import Variable 14 | 15 | use_cuda = torch.cuda.is_available() 16 | 17 | 18 | class G(nn.Module): 19 | def __init__(self, enc_input_size, enc_hidden_size, enc_n_layers, enc_num_directions, 20 | dec_input_size, dec_hidden_size, output_size, dec_n_layers, dec_num_directions, 21 | batch_size, use_attn=True): 22 | super(G, self).__init__() 23 | self.c_encoder = EncoderRNN(enc_input_size, enc_hidden_size, batch_size, enc_n_layers, enc_num_directions) 24 | self.a_encoder = EncoderRNN(enc_input_size, enc_hidden_size, batch_size, enc_n_layers, enc_num_directions) 25 | if use_attn: 26 | self.decoder = AttnDecoderRNN(dec_input_size, dec_hidden_size, output_size, self.a_encoder, 27 | dec_n_layers, dec_num_directions) 28 | else: 29 | # TODO: complete case when not using attention (add decoder class in model zoo) 30 | pass 31 | 32 | 33 | def forward(self, inputs, seq_lens, batch_size, max_q_len, 34 | embeddings_index, embeddings_size, word2index, index2word, teacher_forcing_ratio): 35 | # inputs is a collection of c, a, q. 
index by 0,2,1 36 | # output size: (seq_len, batch, hidden_size) 37 | # hidden size: (num_layers, batch, hidden_size) 38 | # the collection of all hidden states per batch is of size (seq_len, batch, hidden_size * num_directions) 39 | c_encoder_hiddens, c_encoder_hidden = self.c_encoder(inputs[0], seq_lens[0]) 40 | a_encoder_hiddens, a_encoder_hidden = self.a_encoder(inputs[2], seq_lens[2]) 41 | 42 | # TODO: the below code of how to use/combine hidden states from context/answer can be changed 43 | encoder_hiddens = torch.cat((c_encoder_hiddens, a_encoder_hiddens), 0) # concat along the first dimension (seq len) 44 | 45 | # decoder 46 | # prepare decoder inputs as word embeddings in a batch 47 | # decoder_input size: (1, batch size, embedding size); first dim is 1 because only one time step; 48 | # nee to have a 3D tensor for input to nn.GRU module 49 | decoder_input = Variable(embeddings_index['SOS'].repeat(batch_size, 1).unsqueeze(0)) 50 | # init all decoder outputs 51 | all_decoder_outputs = Variable(torch.zeros(max_q_len, batch_size, self.decoder.output_size)) 52 | if use_cuda: 53 | decoder_input = decoder_input.cuda() 54 | all_decoder_outputs = all_decoder_outputs.cuda() 55 | 56 | use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False 57 | 58 | if use_teacher_forcing: 59 | # Teacher forcing: Feed the target as the next input 60 | for di in range(max_q_len): 61 | decoder_output, decoder_hidden, decoder_attention = self.decoder( 62 | decoder_input, c_encoder_hiddens, embeddings_index) 63 | 64 | all_decoder_outputs[di] = decoder_output 65 | 66 | # change next time step input to current target output, in embedding format 67 | decoder_input = Variable(torch.FloatTensor(1, batch_size, embeddings_size).cuda()) if use_cuda else \ 68 | Variable(torch.FloatTensor(1, batch_size, embeddings_size)) 69 | for b in range(batch_size): 70 | decoder_input[0, b] = embeddings_index[index2word[inputs[1][di, b].data[0]]].cuda() if use_cuda else \ 71 | embeddings_index[index2word[inputs[1][di, b].data[0]]] # Teacher forcing 72 | 73 | else: 74 | # Without teacher forcing: use its own predictions as the next input 75 | for di in range(max_q_len): 76 | decoder_output, decoder_hidden, decoder_attention = self.decoder( 77 | decoder_input, encoder_hiddens, embeddings_index) 78 | 79 | all_decoder_outputs[di] = decoder_output 80 | 81 | # top value and index of every batch 82 | # size of both topv, topi = (batch size, 1) 83 | topv, topi = decoder_output.data.topk(1) 84 | 85 | # get the output word for every batch 86 | decoder_input = Variable(torch.FloatTensor(1, batch_size, embeddings_size).cuda()) if use_cuda else \ 87 | Variable(torch.FloatTensor(1, batch_size, embeddings_size)) 88 | for b in range(batch_size): 89 | decoder_input[0, b] = embeddings_index[index2word[topi[0][0]]].cuda() if use_cuda else \ 90 | embeddings_index[index2word[topi[0][0]]] 91 | 92 | return all_decoder_outputs 93 | 94 | 95 | def backward(self, out, labels, true_lens, optimizer): 96 | loss = masked_cross_entropy( 97 | out.transpose(0, 1).contiguous(), # -> batch x seq 98 | labels.transpose(0, 1).contiguous(), # -> batch x seq 99 | true_lens 100 | ) 101 | loss.backward() 102 | optimizer.step() 103 | return loss 104 | -------------------------------------------------------------------------------- /references/code/vanilla-gan.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # code from 
https://github.com/devnag/pytorch-generative-adversarial-networks/blob/master/gan_pytorch.py 4 | 5 | # Generative Adversarial Networks (GAN) example in PyTorch. 6 | # See related blog post at https://medium.com/@devnag/generative-adversarial-networks-gans-in-50-lines-of-code-pytorch-e81b79659e3f#.sch4xgsa9 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | import torch.optim as optim 12 | from torch.autograd import Variable 13 | 14 | # Data params 15 | data_mean = 4 16 | data_stddev = 1.25 17 | 18 | # Model params 19 | g_input_size = 1 # Random noise dimension coming into generator, per output vector 20 | g_hidden_size = 50 # Generator complexity 21 | g_output_size = 1 # size of generated output vector 22 | d_input_size = 100 # Minibatch size - cardinality of distributions 23 | d_hidden_size = 50 # Discriminator complexity 24 | d_output_size = 1 # Single dimension for 'real' vs. 'fake' 25 | minibatch_size = d_input_size 26 | 27 | d_learning_rate = 2e-4 # 2e-4 28 | g_learning_rate = 2e-4 29 | optim_betas = (0.9, 0.999) 30 | num_epochs = 30000 31 | print_interval = 200 32 | d_steps = 1 # 'k' steps in the original GAN paper. Can put the discriminator on higher training freq than generator 33 | g_steps = 1 34 | 35 | # ### Uncomment only one of these 36 | #(name, preprocess, d_input_func) = ("Raw data", lambda data: data, lambda x: x) 37 | (name, preprocess, d_input_func) = ("Data and variances", lambda data: decorate_with_diffs(data, 2.0), lambda x: x * 2) 38 | 39 | print("Using data [%s]" % (name)) 40 | 41 | # ##### DATA: Target data and generator input data 42 | 43 | def get_distribution_sampler(mu, sigma): 44 | return lambda n: torch.Tensor(np.random.normal(mu, sigma, (1, n))) # Gaussian 45 | 46 | def get_generator_input_sampler(): 47 | return lambda m, n: torch.rand(m, n) # Uniform-dist data into generator, _NOT_ Gaussian 48 | 49 | # ##### MODELS: Generator model and discriminator model 50 | 51 | class Generator(nn.Module): 52 | def __init__(self, input_size, hidden_size, output_size): 53 | super(Generator, self).__init__() 54 | self.map1 = nn.Linear(input_size, hidden_size) 55 | self.map2 = nn.Linear(hidden_size, hidden_size) 56 | self.map3 = nn.Linear(hidden_size, output_size) 57 | 58 | def forward(self, x): 59 | x = F.elu(self.map1(x)) 60 | x = F.sigmoid(self.map2(x)) 61 | return self.map3(x) 62 | 63 | class Discriminator(nn.Module): 64 | def __init__(self, input_size, hidden_size, output_size): 65 | super(Discriminator, self).__init__() 66 | self.map1 = nn.Linear(input_size, hidden_size) 67 | self.map2 = nn.Linear(hidden_size, hidden_size) 68 | self.map3 = nn.Linear(hidden_size, output_size) 69 | 70 | def forward(self, x): 71 | x = F.elu(self.map1(x)) 72 | x = F.elu(self.map2(x)) 73 | return F.sigmoid(self.map3(x)) 74 | 75 | def extract(v): 76 | return v.data.storage().tolist() 77 | 78 | def stats(d): 79 | return [np.mean(d), np.std(d)] 80 | 81 | def decorate_with_diffs(data, exponent): 82 | mean = torch.mean(data.data, 1) 83 | mean_broadcast = torch.mul(torch.ones(data.size()), mean.tolist()[0][0]) 84 | diffs = torch.pow(data - Variable(mean_broadcast), exponent) 85 | return torch.cat([data, diffs], 1) 86 | 87 | d_sampler = get_distribution_sampler(data_mean, data_stddev) 88 | gi_sampler = get_generator_input_sampler() 89 | G = Generator(input_size=g_input_size, hidden_size=g_hidden_size, output_size=g_output_size) 90 | D = Discriminator(input_size=d_input_func(d_input_size), hidden_size=d_hidden_size, output_size=d_output_size) 
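# ##### TRAINING: BCE loss and Adam for both nets; each epoch runs d_steps of D updates
# (real batch labeled 1, detached fake batch labeled 0) followed by g_steps of G updates,
# where G is trained to make D output 1 on its own samples.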
91 | criterion = nn.BCELoss() # Binary cross entropy: http://pytorch.org/docs/nn.html#bceloss 92 | d_optimizer = optim.Adam(D.parameters(), lr=d_learning_rate, betas=optim_betas) 93 | g_optimizer = optim.Adam(G.parameters(), lr=g_learning_rate, betas=optim_betas) 94 | 95 | for epoch in range(num_epochs): 96 | for d_index in range(d_steps): 97 | # 1. Train D on real+fake 98 | D.zero_grad() 99 | 100 | # 1A: Train D on real 101 | d_real_data = Variable(d_sampler(d_input_size)) 102 | d_real_decision = D(preprocess(d_real_data)) 103 | d_real_error = criterion(d_real_decision, Variable(torch.ones(1))) # ones = true 104 | d_real_error.backward() # compute/store gradients, but don't change params 105 | 106 | # 1B: Train D on fake 107 | d_gen_input = Variable(gi_sampler(minibatch_size, g_input_size)) 108 | d_fake_data = G(d_gen_input).detach() # detach to avoid training G on these labels 109 | d_fake_decision = D(preprocess(d_fake_data.t())) 110 | d_fake_error = criterion(d_fake_decision, Variable(torch.zeros(1))) # zeros = fake 111 | d_fake_error.backward() 112 | d_optimizer.step() # Only optimizes D's parameters; changes based on stored gradients from backward() 113 | 114 | for g_index in range(g_steps): 115 | # 2. Train G on D's response (but DO NOT train D on these labels) 116 | G.zero_grad() 117 | 118 | gen_input = Variable(gi_sampler(minibatch_size, g_input_size)) 119 | g_fake_data = G(gen_input) 120 | dg_fake_decision = D(preprocess(g_fake_data.t())) 121 | g_error = criterion(dg_fake_decision, Variable(torch.ones(1))) # we want to fool, so pretend it's all genuine 122 | 123 | g_error.backward() 124 | g_optimizer.step() # Only optimizes G's parameters 125 | 126 | if epoch % print_interval == 0: 127 | print("%s: D: %s/%s G: %s (Real: %s, Fake: %s) " % (epoch, 128 | extract(d_real_error)[0], 129 | extract(d_fake_error)[0], 130 | extract(g_error)[0], 131 | stats(extract(d_real_data)), 132 | stats(extract(d_fake_data)))) -------------------------------------------------------------------------------- /src/GAN_model/GAN_main.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division 3 | 4 | import sys, os 5 | # sys.path.append(os.path.abspath(__file__ + "/../../")) 6 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 7 | # sys.path.append(os.path.abspath(__file__ + "/../../") + '/G_c_a_sep') 8 | # sys.path.append(os.path.abspath(__file__ + "/../../") + '/D_baseline') 9 | 10 | from data_proc import * 11 | # from G_train import * 12 | # from G_c_a_sep import * 13 | from GAN_model import * 14 | import numpy as np 15 | 16 | from torch import optim 17 | 18 | global use_cuda 19 | use_cuda = torch.cuda.is_available() 20 | teacher_forcing_ratio = 0.5 # default in original code is 0.5 21 | 22 | 23 | ######### set paths 24 | # TODO: to run properly, change the following paths and filenames 25 | # path variables 26 | path_to_dataset = '/home/jack/Documents/QA_QG/data/' # path to original dataset 27 | load_path = '/home/jack/Documents/QA_QG/data/processed/' # path to processed dataset 28 | G_path = '/home/jack/Documents/QA_QG/exp_results_temp/G_c_a_sep_pretrain_exp_0902(2)/generator_temp.pth' # path to saved generator model 29 | path_to_exp = '/home/jack/Documents/QA_QG/exp_results_temp/' # path to experiment folder 30 | 31 | 32 | # default values for the dataset and the path to the project/dataset 33 | dataset = 'squad' 34 | f_name = 'train-v1.1.json' 35 | path_to_data = path_to_dataset + 
dataset + '/' + f_name 36 | GLOVE_DIR = path_to_dataset + 'glove.6B/' 37 | 38 | ######### first load the pretrained word embeddings 39 | path_to_glove = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt') 40 | embeddings_index, embeddings_size = readGlove(path_to_glove) 41 | 42 | 43 | import pickle 44 | # triplets = pickle.load(open(load_path+'triplets.txt', 'rb')) 45 | sent_c_triplets = pickle.load(open(load_path+'sent_c_triplets.txt', 'rb')) 46 | # windowed_c_triplets_10 = pickle.load(open(load_path+'windowed_c_triplets_10.txt', 'rb')) 47 | triplets = sent_c_triplets 48 | # ######### read corpus 49 | # raw_triplets = read_raw_squad(path_to_data) 50 | # triplets = tokenize_squad(raw_triplets, embeddings_index) 51 | 52 | # # find max length of context, question, answer, respectively 53 | # max_len_c, max_len_q, max_len_a = max_length(triplets) 54 | 55 | ######### corpus preprocessing 56 | # words that do not appear in embeddings, etc 57 | 58 | ## find all unique tokens in the data (should be a subset of the number of embeddings) 59 | effective_tokens, effective_num_tokens = count_effective_num_tokens(triplets, embeddings_index) 60 | print('effective number of tokens: ' + str(effective_num_tokens)) 61 | print('expected initial loss: ' + str(-np.log(1/float(effective_num_tokens))) + '\n') 62 | # build word2index dictionary and index2word dictionary 63 | word2index, index2word = generate_look_up_table(effective_tokens, effective_num_tokens) 64 | 65 | 66 | print('reading and preprocessing data complete.') 67 | print('found %s unique tokens in the intersection of corpus and word embeddings.' % effective_num_tokens) 68 | if use_cuda: 69 | print('GPU ready.') 70 | print('') 71 | print('start training...') 72 | print('') 73 | 74 | 75 | ######### set up model 76 | G_enc_input_size = embeddings_size 77 | G_enc_hidden_size = 256 78 | G_enc_n_layers = 1 79 | G_enc_num_directions = 1 80 | G_dec_input_size = embeddings_size 81 | G_dec_hidden_size = 256 82 | G_output_size = effective_num_tokens 83 | G_dec_n_layers = 1 84 | G_dec_num_directions = 1 85 | D_enc_input_size = embeddings_size 86 | D_enc_hidden_size = 256 87 | D_enc_n_layers = 1 88 | D_num_directions = 1 89 | D_mlp_hidden_size = 64 90 | D_num_attn_weights = 1 91 | D_mlp_output_size = 1 92 | use_attn = True 93 | batch_size = 5 94 | 95 | 96 | vanilla_gan = GAN_model(G_enc_input_size, G_enc_hidden_size, G_enc_n_layers, G_enc_num_directions, 97 | G_dec_input_size, G_dec_hidden_size, G_output_size, G_dec_n_layers, G_dec_num_directions, 98 | D_enc_input_size, D_enc_hidden_size, D_enc_n_layers, D_num_directions, 99 | D_mlp_hidden_size, D_num_attn_weights, D_mlp_output_size, 100 | use_attn, batch_size, G_path=G_path, pretrain=True) 101 | if use_cuda: 102 | vanilla_gan = vanilla_gan.cuda() 103 | 104 | learning_rate = 1e-3 105 | d_optimizer = optim.Adam(vanilla_gan.D.parameters(), lr=learning_rate) 106 | g_optimizer = optim.Adam(vanilla_gan.G.parameters(), lr=learning_rate) 107 | criterion = nn.BCELoss() 108 | 109 | # max_length of generated question 110 | max_len = 100 111 | to_file = True 112 | print_every = 500 113 | plot_every = 50 114 | checkpoint_every = 2000 115 | n_iters = 10000 116 | d_steps = 1 117 | g_steps = 5 118 | 119 | # open the files 120 | exp_name = 'GAN_0911' 121 | path_to_exp_out = path_to_exp + exp_name 122 | if to_file: 123 | if not os.path.exists(path_to_exp_out): 124 | os.mkdir(path_to_exp_out) 125 | loss_f = 'loss_temp.txt' 126 | sample_out_f = 'sample_outputs_temp.txt' 127 | path_to_loss_f = path_to_exp_out + '/' + loss_f 128 | 
path_to_sample_out_f = path_to_exp_out + '/' + sample_out_f 129 | loss_f = open(path_to_loss_f,'w+') 130 | sample_out_f = open(path_to_sample_out_f, 'w+') 131 | # else: 132 | # loss_f = None 133 | # sample_out_f = None 134 | # path_to_exp_out = None 135 | 136 | # # load a pre-trained model 137 | # model_fname = 'checkpoint_iter_1.pth.tar' 138 | # path_to_model = path_to_exp_out + '/' + model_fname 139 | # checkpoint = torch.load(path_to_model) 140 | # vanilla_gan.D.load_state_dict(checkpoint['d_state_dict']) 141 | # vanilla_gan.G.load_state_dict(checkpoint['g_state_dict']) 142 | # d_optimizer.load_state_dict(checkpoint['d_optimizer']) 143 | # g_optimizer.load_state_dict(checkpoint['g_optimizer']) 144 | 145 | # train 146 | vanilla_gan.train(triplets, n_iters, d_steps, d_optimizer, g_steps, g_optimizer, batch_size, max_len, 147 | criterion, word2index, index2word, embeddings_index, embeddings_size, print_every, plot_every, checkpoint_every, 148 | to_file=to_file, loss_f=loss_f, sample_out_f=sample_out_f, path_to_exp_out=path_to_exp_out) 149 | 150 | if to_file: 151 | loss_f.close() 152 | sample_out_f.close() 153 | torch.save(vanilla_gan, path_to_exp_out + exp_name + '/GAN_model.pth.tar') -------------------------------------------------------------------------------- /src/G_c_a_sep/G_train.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | import os 4 | sys.path.append(os.path.abspath(__file__ + "/../../")) 5 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 6 | # from util import timeSince, asMinutes, plotLoss 7 | from data_proc import * 8 | # FIXME: had some problem importing util.py; importing successful but 9 | # functions cannot be called (NameError: global name XXX is not defined) 10 | # fast solution: copied asMinutes and timeSince functions herefrom util import * 11 | from G_eval import * 12 | 13 | import torch 14 | from torch.autograd import Variable 15 | 16 | use_cuda = torch.cuda.is_available() 17 | 18 | 19 | ######################################################################################################################## 20 | import matplotlib 21 | matplotlib.use('Agg') 22 | import matplotlib.pyplot as plt 23 | import time 24 | import math 25 | 26 | def asMinutes(s): 27 | m = math.floor(s / 60) 28 | s -= m * 60 29 | return '%dm %ds' % (m, s) 30 | 31 | def timeSince(since, percent): 32 | now = time.time() 33 | s = now - since 34 | es = s / (percent) 35 | rs = es - s 36 | return '%s (- %s)' % (asMinutes(s), asMinutes(rs)) 37 | 38 | 39 | 40 | ###################################################################### 41 | # show loss function 42 | def plotLoss(loss_f, plot_every, save_path=None, from_file=True, f_name='loss.png', title='training loss'): 43 | if from_file: 44 | loss_vec = [] 45 | with open(loss_f) as f: 46 | content = f.readlines() 47 | content = [x.strip() for x in content] # list of every line, each a string 48 | for line in content: 49 | try: 50 | loss_vec.append(float(line)) 51 | except ValueError: 52 | pass 53 | else: 54 | loss_vec = loss_f 55 | # plot 56 | plt.figure() 57 | plt.title(title) 58 | plt.xlabel('training iterations') 59 | plt.ylabel('loss') 60 | plt.grid() 61 | plt.plot([x*plot_every for x in range(1, len(loss_vec)+1)], loss_vec) 62 | if save_path == None: 63 | plt.savefig(f_name) 64 | else: 65 | plt.savefig(save_path + '/' + f_name) 66 | ######################################################################################################################## 67 | 68 | 69 | def 
trainIters(generator, optimizer, batch_size, embeddings_size, 70 | embeddings_index, word2index, index2word, max_length, triplets, teacher_forcing_ratio, 71 | to_file, loss_f, sample_out_f, path_to_exp_out, 72 | n_iters=1, print_every=1, plot_every=1, checkpoint_every=1): 73 | 74 | begin_time = time.time() 75 | 76 | # plot_losses = [] 77 | print_loss_total = 0 # Reset every print_every 78 | plot_loss_total = 0 # Reset every plot_every 79 | plot_loss_avgs = [] 80 | 81 | print() 82 | 83 | for iter in range(1, n_iters + 1): 84 | 85 | # prepare batch 86 | training_batch, seq_lens = get_random_batch(triplets, batch_size) 87 | training_batch, _, seq_lens = prepare_batch_var( 88 | training_batch, seq_lens, batch_size, word2index, embeddings_index, embeddings_size) 89 | 90 | # print(type(training_batch)) 91 | # print(type(training_batch[0])) 92 | 93 | # prepare inputs (load to cuda) 94 | inputs = [] 95 | for var in training_batch: 96 | if not isinstance(var, list): 97 | inputs.append(Variable(var.cuda())) if use_cuda else inputs.append(Variable(var)) 98 | # NOTE not currently appending start and end index to inputs because model does not use them. 99 | # NOTE if want to apend, make sure these are changed from list to LongTensor 100 | # else: 101 | # inputs.append(Variable(var)) 102 | 103 | max_c_a_len = max(seq_lens[0]) # max seq length of context + ans combined 104 | max_q_len = max(seq_lens[1]) # max seq length of question 105 | 106 | optimizer.zero_grad() 107 | loss = 0 108 | all_decoder_outputs = generator.forward(inputs, seq_lens, batch_size, max_q_len, 109 | embeddings_index, embeddings_size, word2index, index2word, 110 | teacher_forcing_ratio) 111 | loss += generator.backward(all_decoder_outputs, inputs[1], seq_lens[1], optimizer) 112 | 113 | print_loss_total += loss.data[0] 114 | plot_loss_total += loss.data[0] 115 | 116 | if iter % print_every == 0: 117 | print_loss_avg = print_loss_total / print_every 118 | print_loss_total = 0 119 | print('%s (%d %d%%) %.4f' % (timeSince(begin_time, iter / float(n_iters)), 120 | iter, iter / n_iters * 100, print_loss_avg)) 121 | print('---sample generated question---') 122 | # sample a triple and print the generated question 123 | evaluate(generator, triplets, embeddings_index, embeddings_size, word2index, index2word, max_length) 124 | print('-------------------------------') 125 | print('-------------------------------') 126 | print() 127 | if to_file: 128 | sample_out_f.write(unicode('%s (%d %d%%)\n' % (timeSince(begin_time, iter / float(n_iters)), iter, float(iter) / float(n_iters) * 100))) 129 | evaluate(generator, triplets, embeddings_index, embeddings_size, word2index, index2word, max_length, to_file, sample_out_f) 130 | sample_out_f.write(unicode('\n')) 131 | if iter % plot_every == 0: 132 | plot_loss_avg = plot_loss_total / plot_every 133 | plot_loss_avgs.append(plot_loss_avg) 134 | # plot_losses.append(plot_loss_avg) 135 | plot_loss_total = 0 136 | if to_file: 137 | loss_f.write(unicode('%s (%d %d%%)\n' % (timeSince(begin_time, iter / float(n_iters)), iter, float(iter) / float(n_iters) * 100))) 138 | loss_f.write(unicode(plot_loss_avg)) 139 | loss_f.write(unicode('\n')) 140 | if to_file and ((iter % checkpoint_every == 0) or (iter == n_iters)): 141 | checkpoint_fname = 'checkpoint_iter_' + str(iter) + '.pth.tar' 142 | state = { 143 | 'iteration': iter + 1, 144 | 'g_state_dict': generator.state_dict(), 145 | 'g_optimizer' : optimizer.state_dict(), 146 | } 147 | torch.save(state, path_to_exp_out+'/'+checkpoint_fname) 148 | 
plotLoss(plot_loss_avgs, plot_every, save_path=path_to_exp_out, f_name='d_loss_itr_'+str(iter)+'.png', 149 | title='training loss', from_file=False) 150 | 151 | # showPlot(plot_losses) 152 | if to_file: 153 | loss_f.close() 154 | 155 | 156 | 157 | -------------------------------------------------------------------------------- /src/util/test.py: -------------------------------------------------------------------------------- 1 | # various test cases 2 | 3 | # load model 4 | import sys, os 5 | __file__ = '/home/jack/Documents/QA_QG/GAN-QA/src/util/' 6 | sys.path.append(os.path.abspath(__file__)) 7 | import data_proc 8 | reload(data_proc) 9 | from data_proc import * 10 | import util 11 | reload(util) 12 | from util import * 13 | sys.path.append(os.path.abspath(__file__ + "/../../")) 14 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/D_baseline') 15 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/G_baseline') 16 | from G_model import * 17 | from model_zoo import * 18 | from G_eval import * 19 | import torch 20 | import numpy as np 21 | 22 | global use_cuda 23 | use_cuda = torch.cuda.is_available() 24 | 25 | ###################################################################### 26 | ###################################################################### 27 | # test for various util functions 28 | # uncomment this for much of the later unit tests in this file 29 | ######### set paths 30 | # TODO: to run properly, change the following paths and filenames 31 | # default values for the dataset and the path to the project/dataset 32 | dataset = 'squad' 33 | f_name = 'train-v1.1.json' 34 | path_to_dataset = '/home/jack/Documents/QA_QG/data/' 35 | path_to_data = path_to_dataset + dataset + '/' + f_name 36 | GLOVE_DIR = path_to_dataset + 'glove.6B/' 37 | # path for experiment outputs 38 | # exp_name = 'QG_seq2seq_baseline' 39 | path_to_exp_out = '/home/jack/Documents/QA_QG/exp_results_temp/' 40 | loss_f = 'loss_temp.txt' 41 | sample_out_f = 'sample_outputs_temp.txt' 42 | path_to_loss_f = path_to_exp_out + '/' + loss_f 43 | path_to_sample_out_f = path_to_exp_out + '/' + sample_out_f 44 | 45 | ######### first load the pretrained word embeddings 46 | path_to_glove = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt') 47 | embeddings_index, embeddings_size = readGlove(path_to_glove) 48 | 49 | ######### read corpus 50 | raw_triplets = read_raw_squad(path_to_data) 51 | 52 | # # test of windowed triplets 53 | # window_size = 10 54 | # test_idx = 250 55 | # windowed_c_triplets_10 = get_windowed_ans(raw_triplets, window_size) 56 | # print(raw_triplets[test_idx][0]) 57 | # print(raw_triplets[test_idx][2]) 58 | # print(windowed_c_triplets[0][0]) 59 | 60 | # test of selecting the sentence containing answer from context 61 | # test_idx = 0 62 | sent_window = 1 63 | sent_c_triplets, unmatch = get_ans_sentence(raw_triplets) 64 | # print(raw_triplets[test_idx][0]) 65 | # print(raw_triplets[test_idx][2]) 66 | # print('ans start idx: %d' % raw_triplets[test_idx][3]) 67 | # print('ans end idx: %d' % raw_triplets[test_idx][4]) 68 | # print(sent_c_triplets[0][0]) 69 | # windowed_c_triplets_10_noEOS = tokenize_squad(windowed_c_triplets_10, embeddings_index, opt='window', a_EOS=False, c_EOS=False) 70 | # triplets = windowed_c_triplets_30_noEOS 71 | # windowed_c_triplets_10_noEOS = tokenize_squad(windowed_c_triplets_10_noEOS, embeddings_index, opt='window') 72 | # sent_c_triplets = tokenize_squad(sent_c_triplets, embeddings_index, opt='sent') 73 | # triplets = tokenize_squad(raw_triplets, embeddings_index) 74 
| 75 | # print(raw_triplets[test_idx][0]) 76 | # print(' '.join(triplets[test_idx][0])) 77 | # print(raw_triplets[test_idx][1]) 78 | # print(' '.join(triplets[test_idx][1])) 79 | # print(raw_triplets[test_idx][2]) 80 | # print(' '.join(triplets[test_idx][2])) 81 | 82 | # # save to files 83 | # import pickle 84 | # save_path = '/home/jack/Documents/QA_QG/data/processed/' 85 | # if not os.path.exists(save_path): 86 | # os.mkdir(save_path) 87 | # with open(save_path+'windowed_c_triplets_10_noEOS.txt', 'wb') as fp: 88 | # pickle.dump(windowed_c_triplets_10_noEOS, fp) 89 | # with open(save_path+'sent_c_triplets.txt', 'wb') as fp: 90 | # pickle.dump(sent_c_triplets, fp) 91 | # with open(save_path+'triplets.txt', 'wb') as fp: 92 | # pickle.dump(triplets, fp) 93 | 94 | # # test pickle load 95 | import pickle 96 | load_path = '/home/jack/Documents/QA_QG/data/processed/' 97 | # triplets = pickle.load(open(load_path+'triplets.txt', 'rb')) 98 | sent_c_triplets = pickle.load(open(load_path+'sent_c_triplets.txt', 'rb')) 99 | # windowed_c_triplets_10 = pickle.load(open(load_path+'windowed_c_triplets_10.txt', 'rb')) 100 | 101 | # # find max length of context, question, answer, respectively 102 | # # max_len_c, max_len_q, max_len_a = max_length(triplets) 103 | # 104 | # effective_tokens, effective_num_tokens = count_effective_num_tokens(triplets, embeddings_index) 105 | # # print('effective number of tokens: ' + str(effective_num_tokens)) 106 | # # print('expected initial loss: ' + str(-np.log(1/float(effective_num_tokens))) + '\n') 107 | # # # build word2index dictionary and index2word dictionary 108 | # word2index, index2word = generate_look_up_table(effective_tokens, effective_num_tokens) 109 | 110 | # test similarity test 111 | q = 'what is the language spoken in germany ? 
EOS' 112 | scores = generated_q_novelty(sent_c_triplets, q) 113 | idx = np.argpartition(scores, -10)[-10:] 114 | scores[idx] 115 | for i in idx: 116 | print(sent_c_triplets[i][1]) 117 | 118 | ###################################################################### 119 | ###################################################################### 120 | # test case of get_random_batch and prepare_batch_var functions in data_proc.py 121 | # (uncomment code below to test) 122 | # test and time 123 | # to run this test, you need to have these things ready: 124 | # 1) triplet processed by tokenize_squad, 125 | # 2) embeddings_index 126 | # 3) a mini batch processed by get_random_batch 127 | # batch_size = 500 128 | # start = time.time() 129 | # batch, seq_lens, fake_batch, fake_seq_lens = get_random_batch(triplets, batch_size, with_fake=True) 130 | # batch, seq_lens = get_random_batch(triplets, batch_size) 131 | # 132 | # # temp, temp_orig, seq_lens_cqa = prepare_batch_var(batch, seq_lens, fake_batch, fake_seq_lens, batch_size, word2index, embeddings_index, embeddings_size, 133 | # # mode = ['word', 'index'], concat_opt='cqa', with_fake=True) 134 | # batch_vars, batch_paddings, seq_lens = prepare_batch_var(batch, seq_lens, batch_size, word2index, embeddings_index, embeddings_size) 135 | 136 | # end = time.time() 137 | # print('time elapsed: ' + str(end-start)) 138 | # # the following check if the batched data matches with the original data 139 | # batch_idx = random.choice(range(batch_size)) 140 | # print(batch_idx) 141 | # 142 | # print('context > ', ' '.join(temp_orig[0][batch_idx])) 143 | # print('question > ', ' '.join(temp_orig[1][batch_idx])) 144 | # print('answer > ', ' '.join(temp_orig[2][batch_idx])) 145 | # 146 | # idx = batch[0].index(temp_orig[0][batch_idx]) 147 | # print('context > ', ' '.join(batch[0][idx])) 148 | # print('question > ', ' '.join(batch[1][idx])) 149 | # print('answer > ', ' '.join(batch[2][idx])) 150 | 151 | # seq_idx = random.choice(range(min(seq_lens[0]))) 152 | # print(seq_idx) 153 | # word1 = embeddings_index[batch[0][seq_lens[0].index(heapq.nlargest(batch_idx, seq_lens[0])[-1])][seq_idx]] 154 | # word2 = temp[0][seq_idx, batch_idx,] 155 | # set(word1) == set(word2.data.cpu()) 156 | 157 | 158 | ###################################################################### 159 | ###################################################################### 160 | # # test case to load the G model and sample from G 161 | # teacher_forcing_ratio = 0.5 # default in original code is 0.5 162 | 163 | # # param for G 164 | # enc_hidden_size = 256 165 | # enc_n_layers = 1 166 | # enc_num_directions = 1 167 | # dec_hidden_size = 256 168 | # dec_n_layers = 1 169 | # dec_num_directions = 1 170 | # batch_size = 5 171 | # learning_rate = 0.0005 172 | 173 | # generator = G(embeddings_size, enc_hidden_size, enc_n_layers, enc_num_directions, 174 | # embeddings_size, dec_hidden_size, effective_num_tokens, dec_n_layers, dec_num_directions, 175 | # batch_size) 176 | # if use_cuda: 177 | # generator = generator.cuda() 178 | 179 | # # prepare G input 180 | # training_batch, seq_lens = get_random_batch(triplets, batch_size) 181 | # ca = training_batch[0][0] + training_batch[2][0] 182 | # # sample from G 183 | # max_len = 100 184 | # sample_q = G_sampler(generator, ca, embeddings_index, embeddings_size, word2index, index2word, max_len) 185 | # print(' '.join(sample_q)) 186 | -------------------------------------------------------------------------------- /src/model_zoo.py: 
--------------------------------------------------------------------------------
1 | 
2 | import sys
3 | import os
4 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util')
5 | from data_proc import *
6 | 
7 | import torch
8 | import torch.nn as nn
9 | from torch.autograd import Variable
10 | import torch.nn.functional as F
11 | 
12 | use_cuda = torch.cuda.is_available()
13 | 
14 | ######################################################################
15 | # The Encoder
16 | # -----------
17 | class EncoderRNN(nn.Module):
18 |     # output is the same dimension as input (dimension defined by the external word embedding model)
19 |     def __init__(self, input_size, hidden_size, batch_size, n_layers=1, num_directions=1):
20 |         super(EncoderRNN, self).__init__()
21 |         self.n_layers = n_layers
22 |         self.hidden_size = hidden_size
23 |         self.input_size = input_size
24 |         self.num_directions = num_directions
25 |         self.batch_size = batch_size
26 |         # print('batch size is: %d' % batch_size)
27 | 
28 |         if self.num_directions == 1:
29 |             self.gru = nn.GRU(input_size, hidden_size, n_layers, bidirectional=False)
30 |         elif self.num_directions == 2:
31 |             self.gru = nn.GRU(input_size, hidden_size, n_layers, bidirectional=True)
32 |         else:
33 |             raise Exception('input num_directions is wrong - needs to be either 1 or 2')
34 | 
35 |     def forward(self, input, seq_lens, hidden=None):
36 | 
37 |         # # prepare encoder input
38 |         # if self.batch_size > 1:
39 |         #     # to see how pack_padded_sequence works, take a look here (this is a wrong example): https://goo.gl/oN9uc9
40 |         #     input = nn.utils.rnn.pack_padded_sequence(input, seq_lens)
41 |         #     # input = pack_sequence(input, seq_lens)
42 | 
43 |         # input is a matrix of size [max seq len x batch size x embedding dimension]
44 |         encoder_outputs, hidden = self.gru(input, hidden)
45 | 
46 |         # # unpack the sequence
47 |         # # size of unpacked sequence: (seq_len, batch size, hidden_size*num_directions)
48 |         # if self.batch_size > 1:
49 |         #     encoder_outputs, output_lens = torch.nn.utils.rnn.pad_packed_sequence(encoder_outputs)
50 | 
51 |         # FIXME: do I need to sum the encoder_outputs when the network is bidirectional?
52 |         # e.g. outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
53 | 
54 |         return encoder_outputs, hidden
55 | 
56 | 
57 | ######################################################################
58 | # Vanilla Decoder
59 | # ^^^^^^^^^^^^^^^^^
60 | # TODO: take another look at the decoder implementation; there might be some errors
61 | class DecoderRNN(nn.Module):
62 |     def __init__(self, input_size, hidden_size, output_size, n_layers=1, num_directions=1, dropout_p=0.1):
63 |         super(DecoderRNN, self).__init__()
64 |         self.input_size = input_size
65 |         self.hidden_size = hidden_size
66 |         self.output_size = output_size
67 |         self.n_layers = n_layers
68 |         self.dropout_p = dropout_p
69 |         self.bidi = True if num_directions == 2 else False
70 | 
71 |         # recurrent model
72 |         self.dropout = nn.Dropout(self.dropout_p)
73 |         self.gru = nn.GRU(self.input_size, self.hidden_size, num_layers=self.n_layers, bidirectional=self.bidi)
74 |         self.out = nn.Linear(self.hidden_size, self.output_size)
75 | 
76 |     # forward for each time step.
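    # --- Editor's note (hedged sketch, not original code): the per-step design means the
    # caller drives the time loop itself, choosing at every step between the gold token
    # and the model's own previous prediction. A typical (hypothetical) loop looks like:
    #
    #   decoder_input = sos_embedding.view(1, batch_size, -1)   # placeholder start-token embedding
    #   for t in range(max_target_len):
    #       output, hidden = decoder(decoder_input, encoder_hidden, embeddings_index, hidden)
    #       use_gold = random.random() < teacher_forcing_ratio
    #       next_token = target_seq[t] if use_gold else output.topk(1)[1]
    #       decoder_input = embed(next_token).view(1, batch_size, -1)
    #
    #   (sos_embedding, target_seq, embed() and teacher_forcing_ratio are placeholders,
    #   not names defined in this module.)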
77 | # need to do this because of teacher forcing at each time step 78 | def forward(self, input, encoder_hidden, embeddings_index, hidden=None): 79 | 80 | # get the output 81 | # hidden: (num_layers * num_directions, batch, hidden_size) 82 | # note: for each time step, output and hidden are the same 83 | output, hidden = self.gru(input, hidden) 84 | 85 | # if bidirectional, sum decoder hidden states of both directions 86 | if self.bidi: 87 | hidden = hidden[2*self.n_layer - 1] + hidden[2*self.n_layer] 88 | hidden = hidden.unsqueeze(0) 89 | 90 | # output size: (batch size, vocab size) 91 | output = F.log_softmax(self.out(output)) 92 | 93 | return output, hidden 94 | 95 | 96 | ###################################################################### 97 | # Attention Decoder 98 | # ^^^^^^^^^^^^^^^^^ 99 | # TODO: take another look at the attn implementation; there might be some errors 100 | class AttnDecoderRNN(nn.Module): 101 | def __init__(self, input_size, hidden_size, output_size, encoder, n_layers=1, num_directions=1, dropout_p=0.1): 102 | super(AttnDecoderRNN, self).__init__() 103 | self.input_size = input_size 104 | self.hidden_size = hidden_size 105 | self.output_size = output_size 106 | self.n_layers = n_layers 107 | self.dropout_p = dropout_p 108 | self.num_directions = num_directions 109 | 110 | # recurrent model 111 | self.dropout = nn.Dropout(self.dropout_p) 112 | self.gru = nn.GRU(self.input_size, self.hidden_size) 113 | self.out = nn.Linear(self.hidden_size + encoder.num_directions * encoder.hidden_size, self.output_size) 114 | 115 | # attention mechanism 116 | self.attn = nn.Linear(self.hidden_size + encoder.num_directions * encoder.hidden_size, self.hidden_size) 117 | 118 | # forward for each time step. 119 | # need to do this because of teacher forcing at each time step 120 | def forward(self, input, encoder_outputs, embeddings_index, hidden=None): 121 | 122 | # get the output 123 | # hidden: (num_layers * num_directions, batch, hidden_size) 124 | # note: for each time step, output and hidden are the same 125 | output, hidden = self.gru(input, hidden) 126 | 127 | # # unpack the sequence 128 | # # decoder_outputs size (seq len, batch, hidden_size * num_directions) 129 | # # --> collection of hidden states at every time step 130 | # TODO: should figure out how to do this in a batch 131 | # current implementation is one token at a time using a forloop 132 | # decoder_outputs, output_lens = torch.nn.utils.rnn.pad_packed_sequence(decoder_outputs) 133 | 134 | # init attention weights 135 | # length = batch_size x encoder output lens 136 | attn_weights = Variable(torch.zeros(encoder_outputs.size(1), encoder_outputs.size(0))) 137 | if use_cuda: 138 | attn_weights = attn_weights.cuda() 139 | 140 | for b in range(encoder_outputs.size(1)): 141 | # copy the decoder output at the present time step to N rows, where N = num encoder outputs 142 | # first dimension of append = first dimension of encoder_outputs[:,b] = seq_len of encoder 143 | # the scores for calculating attention weights of all encoder outputs for one time step of decoder output 144 | for i in range(encoder_outputs.size(0)): 145 | attn_weights[b,i] = hidden[:,b].squeeze(0).dot(self.attn(torch.cat((hidden[:,b], encoder_outputs[i,b].unsqueeze(0)), 1)).t()) 146 | # Below is an alternative implementation using matrices instead of for loop 147 | # not sure which one is more space efficient 148 | # (the out of memory error most likely comes from here) 149 | # attn_weights[i,b] = torch.mm(hidden[:, b], 150 | # 
self.attn(torch.cat((append, encoder_outputs[:, b]), 1)).t()) 151 | 152 | attn_weights = F.softmax(attn_weights) 153 | 154 | # input to bmm: 155 | # weights size: (batch size, 1, seq_len) 156 | # hidden states size: (seq_len, batch, hidden_size * num_directions) 157 | # transpose hidden state size: (batch, seq len, hidden_size * num_directions) 158 | # output size: (batch size, 1, hidden_size * num_directions) 159 | context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs.transpose(0, 1)) 160 | 161 | # calculate 162 | decoder_output = torch.cat((hidden.squeeze(0), context.squeeze(1)), 1) 163 | 164 | # output size: (batch size, vocab size) 165 | decoder_output = F.log_softmax(self.out(decoder_output)) 166 | 167 | return decoder_output, hidden, attn_weights 168 | 169 | 170 | ###################################################################### 171 | # multi-layer perceptron 172 | # ^^^^^^^^^^^^^^^^^^^^^^ 173 | # code adapted from pytorch tutorial 174 | class MLP(nn.Module): 175 | # FIXME: the number of attention weights here is hard coded for tensor multiplication instead of using for loops 176 | def __init__(self, hidden_size, output_size, encoder, num_attn_weights, use_attn = True): 177 | # maximum input length it can take (for attention mechanism) 178 | super(MLP, self).__init__() 179 | self.hidden_size = hidden_size 180 | self.use_attn = use_attn 181 | self.num_attn_weights = num_attn_weights 182 | self.output_size = output_size 183 | 184 | # fully connected layers (2) and non-linearity 185 | self.layer1 = nn.Linear(encoder.num_directions * encoder.hidden_size, self.hidden_size) 186 | self.relu = nn.ReLU() 187 | self.layer2 = nn.Linear(self.hidden_size, self.output_size) 188 | self.sigmoid = nn.Sigmoid() 189 | 190 | # attention 191 | if self.use_attn: 192 | self.tanh = nn.Tanh() 193 | self.attn = nn.Linear(encoder.hidden_size*encoder.num_directions, self.num_attn_weights) 194 | 195 | def forward(self, inputs): 196 | # inputs size (seq len, batch size, hidden size * num directions) 197 | # if use attention, the output vector is a weighted combination of input hidden states 198 | # if not use attention, the output vector is simply a feedforward network operated on input's last hidden state 199 | # TODO: write the attn function into another module??? 
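        # --- Editor's note (hedged, not original code): the branch below implements a
        # learned attention pooling of the encoder states: each time step gets a score
        # from self.attn, the scores are softmaxed over the sequence, and the context is
        # the weighted sum of hidden states. With num_attn_weights == 1 an equivalent
        # vectorized form (avoiding the per-batch loop) would be roughly:
        #
        #   scores = self.attn(inputs).squeeze(2).t()            # (batch, seq_len)
        #   attn_weights = F.softmax(scores)                     # softmax over time steps
        #   context = torch.bmm(attn_weights.unsqueeze(1),       # (batch, 1, seq_len)
        #                       inputs.transpose(0, 1)).squeeze(1)
        #
        # This is only a sketch of an equivalent computation, not a change to the code below.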
200 | if self.use_attn: 201 | 202 | # reshape input to be 2D tensor instead of 3D 203 | seq_len = inputs.size(0) 204 | batch_size = inputs.size(1) 205 | inputs_for_attn_calc = inputs.view(-1, inputs.size(-1)) 206 | 207 | attn_weights = Variable(torch.zeros(inputs.size(1), inputs.size(0))) 208 | if use_cuda: 209 | attn_weights = attn_weights.cuda() 210 | 211 | # calculate attention weight for each output time step 212 | # remember encoder_outputs size: (seq_len, batch, hidden_size * num_directions) 213 | # for each token in the decoder output sequences: 214 | for b in range(inputs.size(1)): 215 | # the scores for calculating attention weights of all encoder outputs for one time step of decoder output 216 | attn_weights[b] = self.attn(inputs[:, b]).t() 217 | 218 | attn_weights = F.softmax(attn_weights) 219 | 220 | # input to bmm: 221 | # weights size: (batch size, 1, seq_len) 222 | # hidden states size: (seq_len, batch, hidden_size * num_directions) 223 | # transpose hidden state size: (batch, seq len, hidden_size * num_directions) 224 | # output size: (batch size, 1, hidden_size * num_directions) 225 | context = torch.bmm(attn_weights.unsqueeze(1), inputs.transpose(0, 1)).squeeze(1) 226 | else: 227 | context = torch.sum( inputs.transpose(0,1), 1 ).squeeze(1) 228 | 229 | # feedforward 230 | out = self.layer1(context) 231 | out = self.relu(out) 232 | out = self.layer2(out) 233 | out = self.sigmoid(out) 234 | 235 | return out 236 | -------------------------------------------------------------------------------- /src/GAN_model/GAN_model.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 3 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/G_c_a_sep') 4 | # sys.path.append(os.path.abspath(__file__ + "/../../") + '/G_baseline/') 5 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/D_baseline') 6 | from util import asMinutes, timeSince 7 | from data_proc import * 8 | from G_c_a_sep import G 9 | from G_eval import * 10 | # from G_model import G 11 | from D_model import * 12 | 13 | import torch 14 | import torch.nn as nn 15 | import numpy as np 16 | import torch.autograd as autograd 17 | from torch.autograd import Variable 18 | 19 | ################################################################## 20 | 21 | use_cuda = torch.cuda.is_available() 22 | if use_cuda: 23 | gpu = 0 24 | 25 | def to_var(x): 26 | if use_cuda: 27 | x = x.cuda() 28 | return Variable(x) 29 | 30 | 31 | class GAN_model(nn.Module): 32 | def __init__(self, G_enc_input_size, G_enc_hidden_size, G_enc_n_layers, G_enc_num_directions, 33 | G_dec_input_size, G_dec_hidden_size, G_output_size, G_dec_n_layers, G_dec_num_directions, 34 | D_enc_input_size, D_enc_hidden_size, D_enc_n_layers, D_num_directions, 35 | D_mlp_hidden_size, D_num_attn_weights, D_mlp_output_size, 36 | use_attn, batch_size, pretrain=False, G_path=None 37 | ): 38 | 39 | super(GAN_model, self).__init__() 40 | 41 | self.G = G(G_enc_input_size, G_enc_hidden_size, G_enc_n_layers, G_enc_num_directions, G_dec_input_size, 42 | G_dec_hidden_size, G_output_size, G_dec_n_layers, G_dec_num_directions, batch_size) 43 | if pretrain: 44 | # load the G model from G_path 45 | self.G = torch.load(G_path) 46 | 47 | self.D = D(D_enc_input_size, D_enc_hidden_size, D_enc_n_layers, D_num_directions, D_mlp_hidden_size, 48 | D_num_attn_weights, D_mlp_output_size, use_attn, batch_size) 49 | 50 | def train(self, triplets, n_iters, d_steps, d_optimizer, 
g_steps, g_optimizer, batch_size, max_len, 51 | criterion, word2index, index2word, embeddings_index, embeddings_size, print_every, plot_every, checkpoint_every, 52 | to_file=False, loss_f=None, sample_out_f=None, path_to_exp_out=None): 53 | # criterion is for both G and D 54 | 55 | # record start time for logging 56 | begin_time = time.time() 57 | print_d_loss_total = 0 # Reset every print_every 58 | plot_d_loss_total = 0 # Reset every plot_every 59 | print_g_loss_total = 0 # Reset every print_every 60 | plot_g_loss_total = 0 # Reset every plot_every 61 | plot_d_loss_avgs = [] 62 | plot_g_loss_avgs = [] 63 | 64 | for iter in range(1, n_iters + 1): 65 | 66 | # train D 67 | for d_train_idx in range(d_steps): 68 | # 1. Train D on real+fake 69 | self.D.zero_grad() 70 | 71 | # 1A: Train D on real 72 | # get data 73 | # prepare batch 74 | training_batch, seq_lens = get_random_batch(triplets, batch_size) 75 | # concat the context_ans batch with the question batch 76 | # each element in the training batch is context + question + answer 77 | cqa_batch, _, cqa_lens = prepare_batch_var(training_batch, seq_lens, 78 | batch_size, word2index, embeddings_index, 79 | embeddings_size, mode=['word'], concat_opt='cqa') 80 | 81 | train_input = Variable(cqa_batch[0].cuda()) if use_cuda else Variable( 82 | cqa_batch[0]) # embeddings vectors, size = [seq len x batch size x embedding dim] 83 | 84 | d_real_decision = self.D.forward(train_input, cqa_lens[0]) 85 | real_target = Variable(torch.FloatTensor([1]*batch_size)).cuda() if use_cuda else \ 86 | Variable(torch.FloatTensor([1]*batch_size)) 87 | d_real_error = criterion(d_real_decision, real_target) # ones = true 88 | d_real_error.backward() # compute/store gradients, but don't change params 89 | 90 | # 1B: Train D on fake 91 | fake_cqa_batch, fake_cqa_lens = prepare_fake_batch_var(self.G, training_batch, max_len, batch_size, 92 | word2index, index2word, embeddings_index, 93 | embeddings_size, mode = ('word')) 94 | 95 | # # sanity check: rpepare fake batch and prepare batch have the same order 96 | # print(fake_cqa_batch[0][12] == cqa_batch[0][12]) 97 | 98 | d_fake_data = Variable(fake_cqa_batch[0].cuda()) if use_cuda else Variable(fake_cqa_batch[0]) 99 | d_fake_decision = self.D.forward(d_fake_data, fake_cqa_lens[0]) 100 | fake_target = Variable(torch.FloatTensor([0]*batch_size)).cuda() if use_cuda else \ 101 | Variable(torch.FloatTensor([0]*batch_size)) 102 | # d_fake_error = criterion(d_fake_decision, fake_target) # zeros = fake 103 | # d_fake_error.backward() 104 | # d_optimizer.step() 105 | 106 | # accumulate loss 107 | # FIXME I dont think below implementation works for batch version 108 | d_error = torch.mean(d_fake_decision) - torch.mean(d_real_decision) # W_GAN loss 109 | # d_error = -torch.mean(self.log(1 - d_fake_decision)) - torch.mean(self.log(d_real_decision)) # GAN loss 110 | d_error.backward() 111 | d_optimizer.step() 112 | 113 | # d_error = d_real_error + d_fake_error 114 | 115 | # train G 116 | for g_train_idx in range(g_steps): 117 | self.G.zero_grad() 118 | 119 | # conditional data for generator 120 | training_batch, seq_lens = get_random_batch(triplets, batch_size) 121 | fake_cqa_batch, fake_cqa_lens = prepare_fake_batch_var(self.G, training_batch, max_len, batch_size, 122 | word2index, index2word, embeddings_index, 123 | embeddings_size, mode=('word'), detach=False) 124 | g_fake_data = Variable(fake_cqa_batch[0].cuda()) if use_cuda else Variable(fake_cqa_batch[0]) 125 | dg_fake_decision = self.D.forward(g_fake_data, fake_cqa_lens[0]) 126 | 
target = Variable(torch.FloatTensor([1]*batch_size).cuda()) if use_cuda else \ 127 | Variable(torch.FloatTensor([1]*batch_size)) 128 | # g_error = criterion(dg_fake_decision, target) 129 | g_error = -torch.mean(dg_fake_decision) # wgan loss 130 | # G_error = -torch.mean(self.log(dg_fake_decision)) # gan loss 131 | g_error.backward() 132 | g_optimizer.step() # Only optimizes G's parameters 133 | 134 | # log error 135 | print_d_loss_total += d_error.data[0] 136 | print_g_loss_total += g_error.data[0] 137 | plot_d_loss_total += d_error.data[0] 138 | plot_g_loss_total += g_error.data[0] 139 | if iter % print_every == 0: 140 | print_d_loss_avg = print_d_loss_total / print_every 141 | print_g_loss_avg = print_g_loss_total / print_every 142 | print_d_loss_total = 0 143 | print_g_loss_total = 0 144 | 145 | if not to_file: 146 | print('%s (%d %d%%)' % (timeSince(begin_time, iter / float(n_iters)), iter, iter / n_iters * 100)) 147 | # print("errors: D: real-%s/fake-%s G: %s " % ( d_real_error.data[0], d_fake_error.data[0], g_error.data[0]) ) 148 | print("errors: D: %s G: %s " % (print_d_loss_avg, print_g_loss_avg)) 149 | print('---sample generated question---') 150 | # sample a triple and print the generated question 151 | evaluate(self.G, triplets, embeddings_index, embeddings_size, word2index, index2word, max_len) 152 | else: 153 | sample_out_f.write(unicode('%s (%d %d%%)\n' % (timeSince(begin_time, iter / float(n_iters)), iter, float(iter) / float(n_iters) * 100))) 154 | evaluate(self.G, triplets, embeddings_index, embeddings_size, word2index, index2word, max_len, 155 | to_file, sample_out_f) 156 | sample_out_f.write(unicode('\n')) 157 | 158 | if iter % plot_every == 0: 159 | plot_d_loss_avg = plot_d_loss_total / plot_every 160 | plot_d_loss_avgs.append(plot_d_loss_avg) 161 | plot_g_loss_avg = plot_g_loss_total / plot_every 162 | plot_g_loss_avgs.append(plot_g_loss_avg) 163 | plot_d_loss_total = 0 164 | plot_g_loss_total = 0 165 | 166 | if to_file: 167 | loss_f.write(unicode('%s (%d %d%%)\n' % (timeSince(begin_time, iter / float(n_iters)), iter, float(iter) / float(n_iters) * 100))) 168 | loss_f.write(unicode("errors: D: %s G: %s " % (print_d_loss_avg, print_g_loss_avg))) 169 | loss_f.write(unicode('\n')) 170 | 171 | if (iter % checkpoint_every == 0) or (iter == n_iters): 172 | checkpoint_fname = 'checkpoint_iter_' + str(iter) + '.pth.tar' 173 | state = { 174 | 'iteration': iter + 1, 175 | 'd_state_dict': self.D.state_dict(), 176 | 'g_state_dict': self.G.state_dict(), 177 | 'd_optimizer' : d_optimizer.state_dict(), 178 | 'g_optimizer' : g_optimizer.state_dict(), 179 | } 180 | torch.save(state, path_to_exp_out+'/'+checkpoint_fname) 181 | plotLoss(plot_d_loss_avgs, plot_every, save_path=path_to_exp_out, f_name='d_loss_itr_'+str(iter)+'.png', 182 | title='training loss D (monitoring purpose)', from_file=False) 183 | plotLoss(plot_g_loss_avgs, plot_every, save_path=path_to_exp_out, f_name='g_loss_itr_'+str(iter)+'.png', 184 | title='training loss G (monitoring purpose)', from_file=False) 185 | 186 | # def train(self, **kwargs): 187 | # pass 188 | # 189 | # def test(self): 190 | # pass 191 | 192 | # L2 loss instead of Binary cross entropy loss (this is optional for stable training) 193 | # FIXME: is L2 loss the same as MSELoss in torch loss module? 194 | # FIXME: these losses don't work with minibatch yet? 
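    # --- Editor's note (hedged, not original code) ---
    # On the FIXMEs above: torch.mean((x - 1) ** 2) in the LSGAN branches computes the
    # same quantity as nn.MSELoss() evaluated against a target tensor of ones, so using
    # MSELoss there would be an equivalent formulation. For the WGAN objective actually
    # used in train(), the batched losses reduce to one-liners over the critic scores:
    #
    #   d_loss = d_fake_scores.mean() - d_real_scores.mean() + lmd * grad_penalty
    #   g_loss = -dg_fake_scores.mean()
    #
    # (d_fake_scores / d_real_scores / dg_fake_scores are placeholder names for the
    # critic outputs on fake and real batches, not variables defined in this class.)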
195 | def loss(self, D_real, D_fake, gen_params, disc_params, cond_real_data, cond_fake_data, mode, lr=None): 196 | mode = mode.lower() 197 | if mode == 'gan': 198 | G_loss = -torch.mean(self.log(D_fake)) 199 | # FIXME G_loss.backward() 200 | D_loss = -torch.mean(self.log(1 - D_fake)) - torch.mean(self.log(D_real)) 201 | # FIXME D_loss.backward() 202 | metric = -D_loss / 2 + np.log(2) # JS divergence 203 | 204 | G_solver = torch.optim.Adam(gen_params, lr=lr if lr else 1e-3) 205 | D_solver = torch.optim.Adam(disc_params, lr=lr if lr else 1e-3) 206 | 207 | elif mode == 'lsgan-1': 208 | G_loss = torch.mean(D_fake ** 2) 209 | D_loss = torch.mean((D_real - 1) ** 2) 210 | metric = 0 # TBD 211 | 212 | G_solver = torch.optim.Adam(gen_params, lr=lr if lr else 1e-3) 213 | D_solver = torch.optim.Adam(disc_params, lr=lr if lr else 1e-3) 214 | 215 | elif mode == 'lsgan-2': 216 | G_loss = torch.mean((D_fake - 1) ** 2) 217 | D_loss = torch.mean((D_real - 1) ** 2) + torch.mean(D_fake ** 2) 218 | metric = D_loss / 2 # Pearson Chi-Square divergence 219 | 220 | G_solver = torch.optim.Adam(gen_params, lr=lr if lr else 1e-3) 221 | D_solver = torch.optim.Adam(disc_params, lr=lr if lr else 1e-3) 222 | 223 | elif mode == 'wgan': 224 | G_loss = -torch.mean(D_fake) 225 | D_loss = torch.mean(D_fake) - torch.mean(D_real) 226 | metric = -D_loss # Earth-mover distance 227 | 228 | grad_penalty = self.cal_grad_penalty(cond_real_data, cond_fake_data) 229 | D_loss += self.lmd * grad_penalty 230 | 231 | G_solver = torch.optim.Adam(gen_params, lr=lr if lr else 1e-3) 232 | D_solver = torch.optim.Adam(disc_params, lr=lr if lr else 1e-3) 233 | 234 | elif mode == 'bgan': 235 | G_loss = 0.5 * torch.mean((self.log(D_fake) - self.log(1 - D_fake)) ** 2) 236 | D_loss = -torch.mean(self.log(D_real) + self.log(1 - D_fake)) 237 | metric = 0 # TBD 238 | G_solver = torch.optim.Adam(gen_params, lr=lr if lr else 1e-3) 239 | D_solver = torch.optim.Adam(disc_params, lr=lr if lr else 1e-3) 240 | 241 | else: 242 | raise ValueError('Unknown mode: {}'.format(mode)) 243 | 244 | return G_loss, D_loss, metric, G_solver, D_solver 245 | 246 | def cal_grad_penalty(self, cond_real_data, cond_fake_data): 247 | epsilon = to_var(torch.rand(self.batch_size, 1)) 248 | epsilon = epsilon.expand(cond_real_data.size()) 249 | 250 | data_diff = cond_fake_data - cond_real_data 251 | cond_interp_data = cond_real_data + epsilon * data_diff 252 | disc_interp = self.D(self.d_net, cond_interp_data, reuse=True) # TODO: change the arguments 253 | 254 | grad_interp = autograd.grad(outputs=disc_interp, inputs=cond_interp_data, 255 | grad_outputs=torch.ones(disc_interp.size()).cuda( 256 | gpu) if use_cuda else torch.ones( 257 | disc_interp.size()), 258 | create_graph=True, retain_graph=True, only_inputs=True)[0] 259 | 260 | grad_interp_flat = grad_interp.view([self.batch_size, -1]) 261 | slope = grad_interp_flat.norm(p=2, dim=1) 262 | 263 | grad_penalty = torch.mean((slope - 1.) 
** 2) 264 | return grad_penalty 265 | 266 | 267 | 268 | # same context and answer as in the real batch, but generated question 269 | def prepare_fake_batch_var(generator, batch, max_len, batch_size, word2index, index2word, 270 | embeddings_index, embeddings_size, sort=False, mode = ('word'), detach=True, concat=None): 271 | 272 | batch_vars = [] 273 | batch_var_orig = [] 274 | 275 | cqa = [] 276 | cqa_len = [] 277 | labels = torch.LongTensor([0] * batch_size) # all fake labels, thus all 0's 278 | for b in range(batch_size): 279 | if concat=='ca': 280 | ca = batch[0][b] + batch[2][b] 281 | fake_q_sample = G_sampler(generator, ca, embeddings_index, embeddings_size, word2index, index2word, max_len, detach=detach, concat=concat) 282 | elif concat==None: 283 | inputs = [batch[0][b], batch[1][b], batch[2][b]] 284 | fake_q_sample = G_sampler(generator, inputs, embeddings_index, embeddings_size, word2index, index2word, max_len, detach=detach) 285 | cqa.append(batch[0][b] + fake_q_sample + batch[2][b]) 286 | cqa_len.append(len(batch[0][b] + fake_q_sample + batch[2][b])) 287 | 288 | batch = [cqa, batch[3], batch[4], labels] 289 | seq_lens = [cqa_len] 290 | 291 | # sort this batch_var in descending order according to the values of the lengths of the first element in batch 292 | num_batch = len(batch) 293 | 294 | if sort: 295 | all = batch + seq_lens 296 | all = sorted(zip(*all), key=lambda p: len(p[0]), reverse=True) 297 | all = zip(*all) 298 | batch = all[0:num_batch] 299 | seq_lens = all[num_batch:] 300 | batch_orig = batch 301 | 302 | for b in range(num_batch): 303 | 304 | batch_var = batch[b] 305 | 306 | # if element in batch is float, i.e. indices, then do nothing 307 | if isinstance(batch_var[0], int): 308 | batch_var = list(batch_var) 309 | pass 310 | else: 311 | # pad each context, question, answer to their respective max length 312 | if mode[b] == 'index': 313 | batch_padded = [pad_sequence(s, max(seq_lens[b]), word2index, mode='index') for s in batch_var] 314 | else: 315 | batch_padded = [pad_sequence(s, max(seq_lens[b]), word2index) for s in batch_var] 316 | 317 | # init variable matrices 318 | if mode[b] == 'index': 319 | batch_var = torch.LongTensor(max(seq_lens[b]), batch_size) # long tensor for module loss criterion 320 | else: 321 | batch_var = torch.FloatTensor(max(seq_lens[b]), batch_size, embeddings_size) 322 | 323 | # FIXME: very stupid embedded for loop implementation 324 | for i in range(batch_size): 325 | for j in range(max(seq_lens[b])): 326 | if mode[b] == 'index': 327 | batch_var[j, i] = batch_padded[i][j] 328 | else: 329 | batch_var[j, i,] = embeddings_index[batch_padded[i][j]] 330 | 331 | batch_vars.append(batch_var) 332 | 333 | # the second output is for debugging purpose 334 | if sort: 335 | return batch_vars, batch_orig, seq_lens 336 | else: 337 | return batch_vars, seq_lens 338 | 339 | 340 | # function to sample generator output 341 | def G_sampler(generator, input, embeddings_index, embeddings_size, word2index, index2word, max_length, concat=None, detach=True): 342 | # NOTE currently only generate one question at a time. 
multiple questions not yet supported
343 | 
344 |     if concat == 'ca':
345 |         var = torch.FloatTensor(len(input), embeddings_size)
346 |         for j in range(len(input)):
347 |             var[j] = embeddings_index[input[j]]
348 |         var = var.unsqueeze(1)
349 |         if use_cuda:
350 |             var = Variable(var.cuda())
351 |         else:
352 |             var = Variable(var)
353 | 
354 |         decoder_output = generator.forward(var, None, [len(input)], 1, max_length,
355 |                                            embeddings_index, embeddings_size, word2index, index2word,
356 |                                            teacher_forcing_ratio=0).detach()
357 |         decoder_output = decoder_output.squeeze(1)
358 |     elif concat == None:
359 |         # NOTE: hardcode indices of c, q, a, in the line - for i in range(0,3)
360 |         inputs = []
361 |         for i in range(0,3):
362 |             # print(input[i])
363 |             var = torch.FloatTensor(len(input[i]), embeddings_size)
364 |             for j in range(len(input[i])):
365 |                 var[j] = embeddings_index[input[i][j]]
366 |             var = var.unsqueeze(1)
367 |             if use_cuda:
368 |                 var = Variable(var.cuda())
369 |             else:
370 |                 var = Variable(var)
371 |             inputs.append(var)
372 | 
373 |         decoder_output = generator.forward(inputs, [len(x) for x in input], 1, max_length,
374 |                                            embeddings_index, embeddings_size, word2index, index2word,
375 |                                            teacher_forcing_ratio=0)
376 |         if detach:
377 |             decoder_output = decoder_output.detach()
378 |         decoder_output = decoder_output.squeeze(1)
379 | 
380 | 
381 | 
382 |     decoded_words = []
383 |     for di in range(max_length):
384 |         # top value and index of every batch
385 |         topv, topi = decoder_output[di].data.topk(1)
386 |         ni = topi[0]
387 |         if (ni == word2index['EOS']) or (ni == word2index['PAD']):
388 |             decoded_words.append('EOS')
389 |             # decoder_attentions[di] = decoder_attention[0].data
390 |             break
391 |         else:
392 |             decoded_words.append(index2word[ni])
393 | 
394 |     return decoded_words
395 | 
--------------------------------------------------------------------------------
/src/util/data_proc.py:
--------------------------------------------------------------------------------
1 | #-----------------------------------------------------------------------------------------------#
2 | #-----------------------------------------------------------------------------------------------#
3 | # data loading helper functions
4 | #-----------------------------------------------------------------------------------------------#
5 | #-----------------------------------------------------------------------------------------------#
6 | from __future__ import unicode_literals, print_function, division
7 | from io import open
8 | import unicodedata
9 | import random
10 | 
11 | # import spacy
12 | from spacy.en import English
13 | spacynlp = English()
14 | 
15 | import torch
16 | from torch.autograd import Variable
17 | 
18 | # FIXME: import spacy again below to avoid an error encountered when importing torch and spacy
19 | # it seems that spacy needs to be imported before torch. However, on the Baylor cluster,
20 | # you need to import spacy again here for it to actually be imported without error.
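# --- Editor's note (hedged, not original code): the repeated import below is an
# environment-specific workaround (spacy needing to be imported before torch on one
# cluster). If the spacy 1.x `spacy.en.English` entry point is unavailable in a newer
# environment, the commonly used equivalent is `spacy.load('en')`; a defensive variant
# of the same workaround could therefore look like:
#
#   try:
#       from spacy.en import English
#       spacynlp = English()
#   except ImportError:
#       import spacy
#       spacynlp = spacy.load('en')   # fallback for environments without spacy.en
#
# Both forms assume an English model is installed; the fallback is illustrative only.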
21 | from spacy.en import English 22 | spacynlp = English() 23 | 24 | import json 25 | import numpy as np 26 | 27 | # import sys, os 28 | # sys.path.append(os.path.abspath(__file__ + "/../../") + '/G_baseline') 29 | # from G_eval import * 30 | 31 | 32 | ###################################################################### 33 | # The files are all in Unicode, to simplify we will turn Unicode 34 | # characters to ASCII, make everything lowercase 35 | # 36 | 37 | # Turn a Unicode string to plain ASCII, thanks to 38 | # http://stackoverflow.com/a/518232/2809427 39 | def unicodeToAscii(s): 40 | return ''.join( 41 | c for c in unicodedata.normalize('NFD', s) 42 | if unicodedata.category(c) != 'Mn' 43 | ) 44 | 45 | # Lowercase, trim, and remove non-letter characters 46 | def normalizeString(s): 47 | s = unicodeToAscii(s.lower().strip()) 48 | # s = re.sub(r"([.!?])", r" \1", s) 49 | # s = re.sub(r"[^a-zA-Z.!?]+", r" ", s) 50 | return s 51 | 52 | 53 | 54 | ###################################################################### 55 | # read GLOVE word embeddings 56 | def readGlove(path_to_data): 57 | embeddings_index = {} 58 | f = open(path_to_data) 59 | for line in f: 60 | values = line.split() 61 | word = values[0] 62 | coefs = np.asarray(values[1:], dtype='float32') 63 | coefs = torch.from_numpy(coefs) 64 | embeddings_index[word] = coefs 65 | f.close() 66 | 67 | print('Found %s word vectors.' % len(embeddings_index)) 68 | 69 | # get dimension from a random sample in the dict 70 | embeddings_size = random.sample( embeddings_index.items(), 1 )[0][1].size(-1) 71 | print('dimension of word embeddings: ' + str(embeddings_size)) 72 | 73 | SOS_token = -torch.ones(embeddings_size) # start of sentence token, all zerons 74 | EOS_token = torch.ones(embeddings_size) # end of sentence token, all ones 75 | UNK_token = torch.ones(embeddings_size) + torch.ones(embeddings_size) # these choices are pretty random 76 | PAD_token = torch.zeros(embeddings_size) 77 | 78 | # add special tokens to the embeddings 79 | embeddings_index['SOS'] = SOS_token 80 | embeddings_index['EOS'] = EOS_token 81 | embeddings_index['UNK'] = UNK_token 82 | embeddings_index['PAD'] = PAD_token 83 | 84 | return embeddings_index, embeddings_size 85 | 86 | 87 | ###################################################################### 88 | # read data specific for SQUAD dataset 89 | 90 | def read_raw_squad(path_to_data, normalize=True): 91 | # output (context, question, answer, ans_start_idx, ans_end_idx) triplets 92 | print("Reading dataset...") 93 | triplets = [] 94 | with open(path_to_data) as f: 95 | train = json.load(f) 96 | train = train['data'] 97 | for s in range(0, len(train)): 98 | samples = train[s]['paragraphs'] 99 | for p in range(0, len(samples)): 100 | context = samples[p]['context'] 101 | qas = samples[p]['qas'] 102 | for i in range(0, len(qas)): 103 | # print('current s,p,i are: ' + str(s)+str(p)+str(i)) 104 | answers = qas[i]['answers'] 105 | question = qas[i]['question'] 106 | for a in range(0, len(answers)): 107 | ans_text = answers[a]['text'] 108 | ans_start_idx = answers[a]['answer_start'] 109 | ans_end_idx = ans_start_idx + len(ans_text) 110 | 111 | if normalize: 112 | # turn from unicode to ascii and lower case everything 113 | context = unicodeToAscii(context) 114 | question = unicodeToAscii(question) 115 | ans_text = unicodeToAscii(ans_text) 116 | 117 | triplets.append((context, question, ans_text, ans_start_idx, ans_end_idx)) 118 | return triplets 119 | 120 | 121 | # helper function to tokenize the raw squad data 122 | # 
e.g. the context is read as a string; this function produces a list of word tokens from context string 123 | # and return as the processed tuple (context, question, ans_text, ans_start_idx, ans_end_idx) 124 | # the first three are lists, the last two are LongTensor 125 | def tokenize_squad(squad, embeddings_index, opt='raw', c_EOS=True, a_EOS=True): 126 | tokenized_triplets = [] 127 | if opt == 'raw': 128 | for triple in squad: 129 | tokenized_triplets.append( ( tokenize_sentence(triple[0], embeddings_index, EOS=c_EOS), 130 | tokenize_sentence(triple[1], embeddings_index), 131 | tokenize_sentence(triple[2], embeddings_index, EOS=a_EOS), 132 | triple[3], 133 | triple[4] ) ) 134 | elif opt == 'window': 135 | for triple in squad: 136 | tokenized_triplets.append( ( tokenize_sentence(triple[0], embeddings_index, spacy=False, EOS=c_EOS), 137 | tokenize_sentence(triple[1], embeddings_index), 138 | tokenize_sentence(triple[2], embeddings_index, spacy=False, EOS=a_EOS), 139 | triple[3], 140 | triple[4] ) ) 141 | elif opt == 'sent': 142 | for triple in squad: 143 | tokenized_triplets.append( ( tokenize_sentence(triple[0], embeddings_index, spacy=False, EOS=c_EOS), 144 | tokenize_sentence(triple[1], embeddings_index), 145 | tokenize_sentence(triple[2], embeddings_index, EOS=a_EOS), 146 | triple[3], 147 | triple[4] ) ) 148 | else: 149 | raise Exception('unknown option. should be one of "raw", "window", or "sent".') 150 | return tokenized_triplets 151 | 152 | 153 | # helper function to get the sentence of where the answer appear in the context 154 | # based on tokenized_squad, first element in output 155 | # output seq of tokens only from the answer sentence (same format as element in tokenize_squad output) 156 | def get_ans_sentence(raw_squad, sent_window=0): 157 | 158 | sent_c_triplets = [] # now each context in 159 | unmatch = [] # for debug 160 | for t in range(len(raw_squad)): 161 | sent = None 162 | c = raw_squad[t][0] 163 | a = raw_squad[t][2] 164 | sent_c = list(spacynlp(c).sents) 165 | tokenized_a = spacynlp.tokenizer(a) 166 | # sanity check 167 | # if len(sent_c) == 1: 168 | # print('WARNING: sentence segmentation may not work in this triple') 169 | # print(sent_c) 170 | # print(tokenized_c) 171 | ans_start_idx = raw_squad[t][3] 172 | ans_end_idx = raw_squad[t][4] 173 | 174 | # print(ans_start_idx) 175 | # print(ans_end_idx) 176 | 177 | idx = 0 178 | for s in sent_c: 179 | print(idx) 180 | # print('currenet index: %d' % idx) 181 | if idx <= ans_start_idx and idx+len(s.string)>=ans_end_idx: 182 | # print('enter if statement') 183 | # print(s) 184 | sent = s 185 | # print(sent_c.index(sent)) 186 | # if isinstance(sent, unicode): 187 | # raise Exception('unicode detected, where expecting spacy span object.') 188 | if tokenized_a[0].string not in sent.string: 189 | # print('c') 190 | # print(idx) 191 | # print(idx+len(s.string)) 192 | # print(ans_start_idx) 193 | # print(ans_end_idx) 194 | print(type(tokenized_a[0])) 195 | print(type(sent)) 196 | unmatch.append(t) 197 | # raise Exception('answer token not in current sentence') 198 | break 199 | else: 200 | idx += len(s.string) 201 | 202 | try: 203 | idx_temp = sent_c.index(sent) 204 | except: 205 | 206 | print(sent_c) 207 | print(sent) 208 | print(tokenized_a) 209 | print('\n') 210 | unmatch.append(t) 211 | 212 | #TODO: multiple sentences as context 213 | if sent_window > 0: 214 | ans_sent_idx = sent_c.index(sent) 215 | # print(ans_sent_idx) 216 | for i in range(1,sent_window): 217 | if ans_sent_idx-i > 0 and ans_sent_idx+i < len(sent_c): 218 | 
sent = [sent_c[ans_sent_idx-i], sent, sent_c[ans_sent_idx+i]] 219 | elif ans_sent_idx-1 <= 0 and ans_sent_idx+1 < len(sent_c): 220 | sent = [sent, sent_c[ans_sent_idx+i]] 221 | elif ans_sent_idx-1 > 0 and ans_sent_idx+1 >= len(sent_c): 222 | sent = [sent_c[ans_sent_idx-i], sent] 223 | sent_c_triplets.append( ( sent, raw_squad[t][1], raw_squad[t][2], raw_squad[t][3], raw_squad[t][4] ) ) 224 | 225 | return sent_c_triplets, set(unmatch) 226 | 227 | 228 | # helper function to get a window of tokens around the answer 229 | # similar to get_ans_sentence; only difference is the span of tokens 230 | # NOTE: here the number of window operates on crude tokens: there's = one token. 231 | # in proc_tokenized_sent, there's = 3 tokens. therefore, the actual 232 | # number of tokens before and after the answer may exceed the set window size 233 | def get_windowed_ans(raw_squad, window_size): 234 | 235 | windowed_c_triplets = [] 236 | 237 | for triple in raw_squad: 238 | c = triple[0] 239 | a = triple[2] 240 | tokenized_c = spacynlp.tokenizer(c) 241 | # sanity check 242 | # print(tokenized_c) 243 | tokenized_a = spacynlp.tokenizer(a) 244 | ans_start_idx = triple[3] 245 | ans_end_idx = triple[4] 246 | c_sub = c[:ans_start_idx] 247 | # print('first token in answer = %s' % tokenized_a[0]) 248 | 249 | # find the start token of the answer in context 250 | idx = 0 251 | t = 0 252 | for token in tokenized_c: 253 | if idx+c_sub.count(' ') == ans_start_idx and unicode(token) == unicode(tokenized_a[0]): 254 | break 255 | else: 256 | idx += len(token) 257 | t += 1 258 | if t < window_size: 259 | left_window = 0 260 | else: 261 | left_window = t - window_size 262 | if t + window_size + len(tokenized_a) > len(tokenized_c): 263 | right_window = len(tokenized_c) 264 | else: 265 | right_window = t + window_size + len(tokenized_a) 266 | 267 | windowed_c = tokenized_c[left_window:right_window] 268 | # # sanity check 269 | # if tokenized_a[0] not in windowed_c: 270 | # print('ERROR: windowed context does not contain answer token') 271 | 272 | windowed_c_triplets.append( ( windowed_c , triple[1], tokenized_a, triple[3], triple[4] ) ) 273 | 274 | return windowed_c_triplets 275 | 276 | 277 | def annotate_context_w_ans(raw_squad): 278 | pass 279 | 280 | 281 | 282 | 283 | # turns a sentence into individual tokens 284 | # this function takes care of word tokens that does not appear in pre trained embeddings 285 | # solution is to turn those word tokens into 'UNK' 286 | def tokenize_sentence(sentence, data_tokens, spacy=True, EOS=True): 287 | if spacy: 288 | tokenized_sentence = spacynlp.tokenizer(sentence) 289 | else: 290 | tokenized_sentence = sentence 291 | # # an additional preprocessing step to separate words and non-words when they appear together 292 | proc_tokenized_sentence = post_proc_tokenize_sentence(tokenized_sentence) 293 | 294 | token_num = len(proc_tokenized_sentence) 295 | 296 | var = [] 297 | 298 | for t in range(0, token_num): 299 | # the first if loop only for experimental use to aviod large vocab size 300 | if proc_tokenized_sentence[t] not in data_tokens: 301 | var.append('UNK') 302 | else: 303 | var.append(proc_tokenized_sentence[t]) 304 | 305 | if EOS: 306 | var.append('EOS') 307 | return var 308 | 309 | 310 | # helper function for post processing tokenizer 311 | # separate all punctuations into single tokens 312 | # e.g. 
"(they're)" --> "they", "'", "re" 313 | # outputs a list of strings 314 | def post_proc_tokenize_sentence(tokenized_sentence): 315 | proc_tokenized_sentence = [] 316 | for t in range(0, len(tokenized_sentence)): 317 | # try: 318 | # token = tokenized_sentence[t].string.lower().strip() 319 | # except: 320 | # print(tokenized_sentence) 321 | token = tokenized_sentence[t].string.lower().strip() 322 | # first check if the string is number or alphabet only 323 | if token.isdigit() or token.isalpha(): 324 | proc_tokenized_sentence.append(token) 325 | # sepatate this token into substrings of only words, numbers, or individual symbols 326 | else: 327 | index = -1 328 | for s in range(0, len(token)): 329 | if s > index: 330 | if token[s].isdigit(): 331 | # print('find digit') 332 | for i in range(s,len(token)): 333 | if (not token[i].isdigit()): 334 | proc_tokenized_sentence.append(token[s:i]) 335 | index = i-1 336 | break 337 | elif (token[i].isdigit()) and (i == len(token)-1): 338 | proc_tokenized_sentence.append(token[s:i+1]) 339 | index = i 340 | break 341 | elif token[s].isalpha(): 342 | # print('find alphabet') 343 | for i in range(s,len(token)): 344 | if (not token[i].isalpha()): 345 | proc_tokenized_sentence.append(token[s:i]) 346 | index = i-1 347 | break 348 | elif (token[i].isalpha()) and (i == len(token)-1): 349 | proc_tokenized_sentence.append(token[s:i+1]) 350 | index = i 351 | break 352 | else: 353 | # print('find symbol') 354 | proc_tokenized_sentence.append(token[s]) 355 | index += 1 356 | # print(index) 357 | return proc_tokenized_sentence 358 | # test 359 | # x = post_proc_tokenizer(spacynlp.tokenizer(u'mid-1960s')) 360 | 361 | 362 | # # find the max length of context, answer, and question 363 | # def max_length(triplets): 364 | 365 | # max_len_c = 0 366 | # max_len_q = 0 367 | # max_len_a = 0 368 | 369 | # for triple in triplets: 370 | # len_c = len(triple[0]) 371 | # len_q = len(triple[1]) 372 | # len_a = len(triple[2]) 373 | # if len_c > max_len_c: 374 | # max_len_c = len_c 375 | # if len_q > max_len_q: 376 | # max_len_q = len_q 377 | # if len_a > max_len_a: 378 | # max_len_a = len_a 379 | 380 | # return max_len_c, max_len_q, max_len_a 381 | 382 | 383 | ###################################################################### 384 | # count the number of tokens in both the word embeddings and the corpus 385 | def count_effective_num_tokens(triplets, embeddings_index, sos_eos = True): 386 | ## find all unique tokens in the data (should be a subset of the number of embeddings) 387 | data_tokens = [] 388 | for triple in triplets: 389 | data_tokens += triple[0] + triple[1] + triple[2] 390 | data_tokens = list(set(data_tokens)) # find unique 391 | if sos_eos: 392 | data_tokens = ['SOS', 'EOS', 'UNK', 'PAD'] + data_tokens 393 | else: 394 | data_tokens = ['UNK', 'PAD'] 395 | 396 | effective_tokens = list(set(data_tokens).intersection(embeddings_index.keys())) 397 | effective_num_tokens = len(effective_tokens) 398 | 399 | return effective_tokens, effective_num_tokens 400 | 401 | 402 | ###################################################################### 403 | # generate word index and index word look up tables 404 | def generate_look_up_table(effective_tokens, effective_num_tokens, use_cuda = True): 405 | word2index = {} 406 | index2word = {} 407 | for i in range(effective_num_tokens): 408 | index2word[i] = effective_tokens[i] 409 | word2index[effective_tokens[i]] = i 410 | return word2index, index2word 411 | 412 | 413 | 
###################################################################### 414 | # prepare minibatch of data 415 | # output is (contexts, questions, answers, answer_start_idxs, answer_end_idxs) 416 | # each is of dimension [batch_size x their respective max length] 417 | def get_random_batch(triplets, batch_size, with_fake = False): 418 | 419 | # init values 420 | contexts = [] 421 | questions = [] 422 | answers = [] 423 | ans_start_idxs = [] 424 | ans_end_idxs = [] 425 | 426 | # inside this forloop, all word tokens are turned into their respective index according to word2index lookup table 427 | for i in range(batch_size): 428 | triple = random.choice(triplets) 429 | contexts.append(triple[0]) 430 | questions.append( triple[1] ) 431 | answers.append(triple[2]) 432 | ans_start_idxs.append( triple[3] ) 433 | ans_end_idxs.append( triple[4] ) 434 | 435 | # get lengths of each context, question, answer in their respective arrays 436 | context_lens = [len(s) for s in contexts] 437 | question_lens = [len(s) for s in questions] 438 | answer_lens = [len(s) for s in answers] 439 | 440 | if with_fake: 441 | idx = int(batch_size/2) 442 | return [contexts[:idx], questions[:idx], answers[:idx], ans_start_idxs[:idx], ans_end_idxs[:idx]], \ 443 | [context_lens[:idx], question_lens[:idx], answer_lens[:idx]],\ 444 | [contexts[idx:], questions, answers[idx:], ans_start_idxs[idx:], ans_end_idxs[idx:]], \ 445 | [context_lens[idx:], question_lens[idx:], answer_lens[idx:]] 446 | else: 447 | return [contexts, questions, answers, ans_start_idxs, ans_end_idxs], \ 448 | [context_lens, question_lens, answer_lens] 449 | 450 | 451 | # - prepare batch training data 452 | # - training_batch contains five pieces of data. The first three with size [batch size x max seq len], 453 | # - the last two with size [batch size]. 454 | # - seq_lens contains lengths of the first three sequences, each of size [batch size] 455 | # - the output would be matrices of size [max seq len x batch size x embedding size] 456 | # - if question is represented as index, then its size is [max seq len x batch size] --> this is transpose of the input 457 | # from get_random_batch in order to fit NLLLoss function (indexing and selecting the whole batch of a single token) is 458 | # easier. e.g. 
you can do question[i] which selects the whole sequence of the first dimension 459 | def prepare_batch_var(batch, seq_lens, batch_size, word2index, embeddings_index, embeddings_size, 460 | use_cuda=1, sort=False, mode=('word', 'index', 'word'), concat_opt=None, 461 | with_fake=False, fake_batch=None, fake_seq_lens=None): 462 | 463 | batch_vars = [] 464 | batch_var_orig = [] 465 | batch_paddings = [] 466 | 467 | if with_fake: 468 | batch_size = int(batch_size/2) 469 | fake_q = fake_batch[1] 470 | fake_q_lens = fake_seq_lens[1] 471 | 472 | #TODO (for different applications): change the below code (before for loop) to concat different portions of the batch_triplets 473 | if concat_opt == None: 474 | pass 475 | 476 | elif concat_opt == 'ca': 477 | ca = [] 478 | ca_len = [] 479 | for b in range(batch_size): 480 | ca.append(batch[0][b] + batch[2][b]) 481 | ca_len.append(len(batch[0][b] + batch[2][b])) 482 | batch = [ca, batch[1], batch[3], batch[4]] 483 | seq_lens = [ca_len] + seq_lens 484 | 485 | elif concat_opt == 'qa': 486 | pass 487 | 488 | # FIXME: only this following elif implemented fake question 489 | elif concat_opt == 'cqa': 490 | cqa = [] 491 | cqa_len = [] 492 | labels = [] 493 | for b in range(batch_size): 494 | cqa.append(batch[0][b] + batch[1][b] + batch[2][b]) # append real 495 | cqa_len.append(len(batch[0][b] + batch[1][b] + batch[2][b])) # append real 496 | labels.append(1) 497 | if with_fake: # append fake 498 | fake_q_sample = random.sample(fake_q,1)[0] 499 | cqa.append(batch[0][b] + fake_q_sample + batch[2][b]) 500 | cqa_len.append(len(batch[0][b] + fake_q_sample + batch[2][b])) 501 | labels.append(0) 502 | if with_fake: 503 | batch = [cqa, batch[3]+fake_batch[3], batch[4]+fake_batch[4], labels] 504 | else: 505 | batch = [cqa, batch[3], batch[4]] 506 | seq_lens = [cqa_len] 507 | elif concat_opt == 'qca': 508 | pass 509 | 510 | else: 511 | raise ValueError('not a valid concat option.') 512 | 513 | num_batch = len(batch) 514 | # sort this batch_var in descending order according to the values of the lengths of the first element in batch 515 | if sort: 516 | all = batch + seq_lens 517 | all = sorted(zip(*all), key=lambda p: len(p[0]), reverse=True) 518 | all = zip(*all) 519 | batch = all[0:num_batch] 520 | seq_lens = all[num_batch:] 521 | batch_orig = batch 522 | 523 | # get bacth size back to 2x if with fake 524 | if with_fake: 525 | batch_size = batch_size * 2 526 | 527 | for b in range(num_batch): 528 | 529 | batch_var = batch[b] 530 | 531 | # if element in batch is float, i.e. 
indices, then do nothing 532 | if isinstance(batch_var[0], int): 533 | batch_var = list(batch_var) 534 | pass 535 | else: 536 | # pad each context, question, answer to their respective max length 537 | if mode[b] == 'index': 538 | batch_padded = [pad_sequence(s, max(seq_lens[b]), word2index, mode='index') for s in batch_var] 539 | else: 540 | batch_padded = [pad_sequence(s, max(seq_lens[b]), word2index) for s in batch_var] 541 | 542 | # init variable matrices 543 | if mode[b] == 'index': 544 | batch_var = torch.LongTensor(max(seq_lens[b]), batch_size) # long tensor for module loss criterion 545 | else: 546 | batch_var = torch.FloatTensor(max(seq_lens[b]), batch_size, embeddings_size) 547 | 548 | # FIXME: very stupid embedded for loop implementation 549 | for i in range(batch_size): 550 | for j in range(max(seq_lens[b])): 551 | if mode[b] == 'index': 552 | batch_var[j, i] = batch_padded[i][j] 553 | else: 554 | batch_var[j, i,] = embeddings_index[batch_padded[i][j]] 555 | 556 | batch_vars.append(batch_var) 557 | batch_paddings.append(batch_padded) 558 | 559 | # the second output is for debugging purpose 560 | return batch_vars, batch_paddings, seq_lens 561 | 562 | # helper function to zero pad context, question, answer to their respective maximum length 563 | def pad_sequence(s, max_len, word2index, mode = 'word'): 564 | if mode == 'word': 565 | return s + ['PAD' for i in range(max_len - len(s))] 566 | elif mode == 'index': 567 | return [word2index[i] for i in s] + [word2index['PAD'] for i in range(max_len - len(s))] 568 | 569 | 570 | ###################################################################### 571 | # TODO: need a function to sample some (c, q, a) triplets from the generator 572 | def sample_generated_triples(triplets, G, batch_size): 573 | 574 | # should return the same thing as get_random_batch with with_fake = False 575 | return None 576 | 577 | 578 | ###################################################################### 579 | # test function for examining the output of the batch 580 | # primarily see whether the context, question, answer triplets make sense 581 | def print_batch(batch, batch_size, index2word): 582 | idx = random.choice(range(batch_size)) 583 | context = [ index2word[i] for i in batch[0][idx,] ] 584 | question = [ index2word[i] for i in batch[1][idx,] ] 585 | answer = [ index2word[i] for i in batch[2][idx,] ] 586 | return (' '.join(context), ' '.join(question), ' '.join(answer)) 587 | 588 | 589 | 590 | -------------------------------------------------------------------------------- /references/code/seq2seq_translation_tutorial.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Translation with a Sequence to Sequence Network and Attention 4 | ************************************************************* 5 | **Author**: `Sean Robertson `_ 6 | 7 | In this project we will be teaching a neural network to translate from 8 | French to English. 9 | 10 | :: 11 | 12 | [KEY: > input, = target, < output] 13 | 14 | > il est en train de peindre un tableau . 15 | = he is painting a picture . 16 | < he is painting a picture . 17 | 18 | > pourquoi ne pas essayer ce vin delicieux ? 19 | = why not try that delicious wine ? 20 | < why not try that delicious wine ? 21 | 22 | > elle n est pas poete mais romanciere . 23 | = she is not a poet but a novelist . 24 | < she not not a poet but a novelist . 25 | 26 | > vous etes trop maigre . 27 | = you re too skinny . 28 | < you re all alone . 29 | 30 | ... 
to varying degrees of success. 31 | 32 | This is made possible by the simple but powerful idea of the `sequence 33 | to sequence network `__, in which two 34 | recurrent neural networks work together to transform one sequence to 35 | another. An encoder network condenses an input sequence into a vector, 36 | and a decoder network unfolds that vector into a new sequence. 37 | 38 | .. figure:: /_static/img/seq-seq-images/seq2seq.png 39 | :alt: 40 | 41 | To improve upon this model we'll use an `attention 42 | mechanism `__, which lets the decoder 43 | learn to focus over a specific range of the input sequence. 44 | 45 | **Recommended Reading:** 46 | 47 | I assume you have at least installed PyTorch, know Python, and 48 | understand Tensors: 49 | 50 | - http://pytorch.org/ For installation instructions 51 | - :doc:`/beginner/deep_learning_60min_blitz` to get started with PyTorch in general 52 | - :doc:`/beginner/pytorch_with_examples` for a wide and deep overview 53 | - :doc:`/beginner/former_torchies_tutorial` if you are former Lua Torch user 54 | 55 | 56 | It would also be useful to know about Sequence to Sequence networks and 57 | how they work: 58 | 59 | - `Learning Phrase Representations using RNN Encoder-Decoder for 60 | Statistical Machine Translation `__ 61 | - `Sequence to Sequence Learning with Neural 62 | Networks `__ 63 | - `Neural Machine Translation by Jointly Learning to Align and 64 | Translate `__ 65 | - `A Neural Conversational Model `__ 66 | 67 | You will also find the previous tutorials on 68 | :doc:`/intermediate/char_rnn_classification_tutorial` 69 | and :doc:`/intermediate/char_rnn_generation_tutorial` 70 | helpful as those concepts are very similar to the Encoder and Decoder 71 | models, respectively. 72 | 73 | And for more, read the papers that introduced these topics: 74 | 75 | - `Learning Phrase Representations using RNN Encoder-Decoder for 76 | Statistical Machine Translation `__ 77 | - `Sequence to Sequence Learning with Neural 78 | Networks `__ 79 | - `Neural Machine Translation by Jointly Learning to Align and 80 | Translate `__ 81 | - `A Neural Conversational Model `__ 82 | 83 | 84 | **Requirements** 85 | """ 86 | from __future__ import unicode_literals, print_function, division 87 | from io import open 88 | import unicodedata 89 | import string 90 | import re 91 | import random 92 | 93 | import torch 94 | import torch.nn as nn 95 | from torch.autograd import Variable 96 | from torch import optim 97 | import torch.nn.functional as F 98 | 99 | use_cuda = torch.cuda.is_available() 100 | 101 | ###################################################################### 102 | # Loading data files 103 | # ================== 104 | # 105 | # The data for this project is a set of many thousands of English to 106 | # French translation pairs. 107 | # 108 | # `This question on Open Data Stack 109 | # Exchange `__ 110 | # pointed me to the open translation site http://tatoeba.org/ which has 111 | # downloads available at http://tatoeba.org/eng/downloads - and better 112 | # yet, someone did the extra work of splitting language pairs into 113 | # individual text files here: http://www.manythings.org/anki/ 114 | # 115 | # The English to French pairs are too big to include in the repo, so 116 | # download to ``data/eng-fra.txt`` before continuing. The file is a tab 117 | # separated list of translation pairs: 118 | # 119 | # :: 120 | # 121 | # I am cold. Je suis froid. 122 | # 123 | # .. 
Note:: 124 | # Download the data from 125 | # `here `_ 126 | # and extract it to the current directory. 127 | 128 | ###################################################################### 129 | # Similar to the character encoding used in the character-level RNN 130 | # tutorials, we will be representing each word in a language as a one-hot 131 | # vector, or giant vector of zeros except for a single one (at the index 132 | # of the word). Compared to the dozens of characters that might exist in a 133 | # language, there are many many more words, so the encoding vector is much 134 | # larger. We will however cheat a bit and trim the data to only use a few 135 | # thousand words per language. 136 | # 137 | # .. figure:: /_static/img/seq-seq-images/word-encoding.png 138 | # :alt: 139 | # 140 | # 141 | 142 | 143 | ###################################################################### 144 | # We'll need a unique index per word to use as the inputs and targets of 145 | # the networks later. To keep track of all this we will use a helper class 146 | # called ``Lang`` which has word → index (``word2index``) and index → word 147 | # (``index2word``) dictionaries, as well as a count of each word 148 | # ``word2count`` to use to later replace rare words. 149 | # 150 | 151 | SOS_token = 0 152 | EOS_token = 1 153 | 154 | 155 | class Lang: 156 | def __init__(self, name): 157 | self.name = name 158 | self.word2index = {} 159 | self.word2count = {} 160 | self.index2word = {0: "SOS", 1: "EOS"} 161 | self.n_words = 2 # Count SOS and EOS 162 | 163 | def addSentence(self, sentence): 164 | for word in sentence.split(' '): 165 | self.addWord(word) 166 | 167 | def addWord(self, word): 168 | if word not in self.word2index: 169 | self.word2index[word] = self.n_words 170 | self.word2count[word] = 1 171 | self.index2word[self.n_words] = word 172 | self.n_words += 1 173 | else: 174 | self.word2count[word] += 1 175 | 176 | 177 | ###################################################################### 178 | # The files are all in Unicode, to simplify we will turn Unicode 179 | # characters to ASCII, make everything lowercase, and trim most 180 | # punctuation. 181 | # 182 | 183 | # Turn a Unicode string to plain ASCII, thanks to 184 | # http://stackoverflow.com/a/518232/2809427 185 | def unicodeToAscii(s): 186 | return ''.join( 187 | c for c in unicodedata.normalize('NFD', s) 188 | if unicodedata.category(c) != 'Mn' 189 | ) 190 | 191 | 192 | # Lowercase, trim, and remove non-letter characters 193 | 194 | 195 | def normalizeString(s): 196 | s = unicodeToAscii(s.lower().strip()) 197 | s = re.sub(r"([.!?])", r" \1", s) 198 | s = re.sub(r"[^a-zA-Z.!?]+", r" ", s) 199 | return s 200 | 201 | 202 | ###################################################################### 203 | # To read the data file we will split the file into lines, and then split 204 | # lines into pairs. The files are all English → Other Language, so if we 205 | # want to translate from Other Language → English I added the ``reverse`` 206 | # flag to reverse the pairs. 207 | # 208 | 209 | def readLangs(lang1, lang2, reverse=False): 210 | print("Reading lines...") 211 | 212 | # Read the file and split into lines 213 | lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8'). 
\ 214 | read().strip().split('\n') 215 | 216 | # Split every line into pairs and normalize 217 | pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines] 218 | 219 | # Reverse pairs, make Lang instances 220 | if reverse: 221 | pairs = [list(reversed(p)) for p in pairs] 222 | input_lang = Lang(lang2) 223 | output_lang = Lang(lang1) 224 | else: 225 | input_lang = Lang(lang1) 226 | output_lang = Lang(lang2) 227 | 228 | return input_lang, output_lang, pairs 229 | 230 | 231 | ###################################################################### 232 | # Since there are a *lot* of example sentences and we want to train 233 | # something quickly, we'll trim the data set to only relatively short and 234 | # simple sentences. Here the maximum length is 10 words (that includes 235 | # ending punctuation) and we're filtering to sentences that translate to 236 | # the form "I am" or "He is" etc. (accounting for apostrophes replaced 237 | # earlier). 238 | # 239 | 240 | MAX_LENGTH = 10 241 | 242 | eng_prefixes = ( 243 | "i am ", "i m ", 244 | "he is", "he s ", 245 | "she is", "she s", 246 | "you are", "you re ", 247 | "we are", "we re ", 248 | "they are", "they re " 249 | ) 250 | 251 | 252 | def filterPair(p): 253 | return len(p[0].split(' ')) < MAX_LENGTH and \ 254 | len(p[1].split(' ')) < MAX_LENGTH and \ 255 | p[1].startswith(eng_prefixes) 256 | 257 | 258 | def filterPairs(pairs): 259 | return [pair for pair in pairs if filterPair(pair)] 260 | 261 | 262 | ###################################################################### 263 | # The full process for preparing the data is: 264 | # 265 | # - Read text file and split into lines, split lines into pairs 266 | # - Normalize text, filter by length and content 267 | # - Make word lists from sentences in pairs 268 | # 269 | 270 | def prepareData(lang1, lang2, reverse=False): 271 | input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse) 272 | print("Read %s sentence pairs" % len(pairs)) 273 | pairs = filterPairs(pairs) 274 | print("Trimmed to %s sentence pairs" % len(pairs)) 275 | print("Counting words...") 276 | for pair in pairs: 277 | input_lang.addSentence(pair[0]) 278 | output_lang.addSentence(pair[1]) 279 | print("Counted words:") 280 | print(input_lang.name, input_lang.n_words) 281 | print(output_lang.name, output_lang.n_words) 282 | return input_lang, output_lang, pairs 283 | 284 | 285 | input_lang, output_lang, pairs = prepareData('eng', 'fra', True) 286 | print(random.choice(pairs)) 287 | 288 | 289 | ###################################################################### 290 | # The Seq2Seq Model 291 | # ================= 292 | # 293 | # A Recurrent Neural Network, or RNN, is a network that operates on a 294 | # sequence and uses its own output as input for subsequent steps. 295 | # 296 | # A `Sequence to Sequence network `__, or 297 | # seq2seq network, or `Encoder Decoder 298 | # network `__, is a model 299 | # consisting of two RNNs called the encoder and decoder. The encoder reads 300 | # an input sequence and outputs a single vector, and the decoder reads 301 | # that vector to produce an output sequence. 302 | # 303 | # .. figure:: /_static/img/seq-seq-images/seq2seq.png 304 | # :alt: 305 | # 306 | # Unlike sequence prediction with a single RNN, where every input 307 | # corresponds to an output, the seq2seq model frees us from sequence 308 | # length and order, which makes it ideal for translation between two 309 | # languages. 
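# Editor's note (shape sketch, matching the ``EncoderRNN`` and ``DecoderRNN``
# classes defined further below; sizes are illustrative):
#
#     encoder: n input word indices  ->  one context vector of size (1, 1, hidden_size)
#     decoder: that context vector   ->  m output word indices
#
# n and m need not be equal, which is what "frees us from sequence length and
# order" means in practice.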
310 | # 311 | # Consider the sentence "Je ne suis pas le chat noir" → "I am not the 312 | # black cat". Most of the words in the input sentence have a direct 313 | # translation in the output sentence, but are in slightly different 314 | # orders, e.g. "chat noir" and "black cat". Because of the "ne/pas" 315 | # construction there is also one more word in the input sentence. It would 316 | # be difficult to produce a correct translation directly from the sequence 317 | # of input words. 318 | # 319 | # With a seq2seq model the encoder creates a single vector which, in the 320 | # ideal case, encodes the "meaning" of the input sequence into a single 321 | # vector — a single point in some N dimensional space of sentences. 322 | # 323 | 324 | 325 | ###################################################################### 326 | # The Encoder 327 | # ----------- 328 | # 329 | # The encoder of a seq2seq network is a RNN that outputs some value for 330 | # every word from the input sentence. For every input word the encoder 331 | # outputs a vector and a hidden state, and uses the hidden state for the 332 | # next input word. 333 | # 334 | # .. figure:: /_static/img/seq-seq-images/encoder-network.png 335 | # :alt: 336 | # 337 | # 338 | 339 | class EncoderRNN(nn.Module): 340 | def __init__(self, input_size, hidden_size, n_layers=1): 341 | super(EncoderRNN, self).__init__() 342 | self.n_layers = n_layers 343 | self.hidden_size = hidden_size 344 | 345 | self.embedding = nn.Embedding(input_size, hidden_size) 346 | self.gru = nn.GRU(hidden_size, hidden_size) 347 | 348 | def forward(self, input, hidden): 349 | embedded = self.embedding(input).view(1, 1, -1) 350 | output = embedded 351 | for i in range(self.n_layers): 352 | output, hidden = self.gru(output, hidden) 353 | return output, hidden 354 | 355 | def initHidden(self): 356 | result = Variable(torch.zeros(1, 1, self.hidden_size)) 357 | if use_cuda: 358 | return result.cuda() 359 | else: 360 | return result 361 | 362 | 363 | ###################################################################### 364 | # The Decoder 365 | # ----------- 366 | # 367 | # The decoder is another RNN that takes the encoder output vector(s) and 368 | # outputs a sequence of words to create the translation. 369 | # 370 | 371 | 372 | ###################################################################### 373 | # Simple Decoder 374 | # ^^^^^^^^^^^^^^ 375 | # 376 | # In the simplest seq2seq decoder we use only last output of the encoder. 377 | # This last output is sometimes called the *context vector* as it encodes 378 | # context from the entire sequence. This context vector is used as the 379 | # initial hidden state of the decoder. 380 | # 381 | # At every step of decoding, the decoder is given an input token and 382 | # hidden state. The initial input token is the start-of-string ```` 383 | # token, and the first hidden state is the context vector (the encoder's 384 | # last hidden state). 385 | # 386 | # .. 
figure:: /_static/img/seq-seq-images/decoder-network.png 387 | # :alt: 388 | # 389 | # 390 | 391 | class DecoderRNN(nn.Module): 392 | def __init__(self, hidden_size, output_size, n_layers=1): 393 | super(DecoderRNN, self).__init__() 394 | self.n_layers = n_layers 395 | self.hidden_size = hidden_size 396 | 397 | self.embedding = nn.Embedding(output_size, hidden_size) 398 | self.gru = nn.GRU(hidden_size, hidden_size) 399 | self.out = nn.Linear(hidden_size, output_size) 400 | self.softmax = nn.LogSoftmax() 401 | 402 | def forward(self, input, hidden): 403 | output = self.embedding(input).view(1, 1, -1) 404 | for i in range(self.n_layers): 405 | output = F.relu(output) 406 | output, hidden = self.gru(output, hidden) 407 | output = self.softmax(self.out(output[0])) 408 | return output, hidden 409 | 410 | def initHidden(self): 411 | result = Variable(torch.zeros(1, 1, self.hidden_size)) 412 | if use_cuda: 413 | return result.cuda() 414 | else: 415 | return result 416 | 417 | 418 | ###################################################################### 419 | # I encourage you to train and observe the results of this model, but to 420 | # save space we'll be going straight for the gold and introducing the 421 | # Attention Mechanism. 422 | # 423 | 424 | 425 | ###################################################################### 426 | # Attention Decoder 427 | # ^^^^^^^^^^^^^^^^^ 428 | # 429 | # If only the context vector is passed betweeen the encoder and decoder, 430 | # that single vector carries the burden of encoding the entire sentence. 431 | # 432 | # Attention allows the decoder network to "focus" on a different part of 433 | # the encoder's outputs for every step of the decoder's own outputs. First 434 | # we calculate a set of *attention weights*. These will be multiplied by 435 | # the encoder output vectors to create a weighted combination. The result 436 | # (called ``attn_applied`` in the code) should contain information about 437 | # that specific part of the input sequence, and thus help the decoder 438 | # choose the right output words. 439 | # 440 | # .. figure:: https://i.imgur.com/1152PYf.png 441 | # :alt: 442 | # 443 | # Calculating the attention weights is done with another feed-forward 444 | # layer ``attn``, using the decoder's input and hidden state as inputs. 445 | # Because there are sentences of all sizes in the training data, to 446 | # actually create and train this layer we have to choose a maximum 447 | # sentence length (input length, for encoder outputs) that it can apply 448 | # to. Sentences of the maximum length will use all the attention weights, 449 | # while shorter sentences will only use the first few. 450 | # 451 | # .. 
figure:: /_static/img/seq-seq-images/attention-decoder-network.png 452 | # :alt: 453 | # 454 | # 455 | 456 | class AttnDecoderRNN(nn.Module): 457 | def __init__(self, hidden_size, output_size, n_layers=1, dropout_p=0.1, max_length=MAX_LENGTH): 458 | super(AttnDecoderRNN, self).__init__() 459 | self.hidden_size = hidden_size 460 | self.output_size = output_size 461 | self.n_layers = n_layers 462 | self.dropout_p = dropout_p 463 | self.max_length = max_length 464 | 465 | self.embedding = nn.Embedding(self.output_size, self.hidden_size) 466 | self.attn = nn.Linear(self.hidden_size * 2, self.max_length) 467 | self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size) 468 | self.dropout = nn.Dropout(self.dropout_p) 469 | self.gru = nn.GRU(self.hidden_size, self.hidden_size) 470 | self.out = nn.Linear(self.hidden_size, self.output_size) 471 | 472 | def forward(self, input, hidden, encoder_output, encoder_outputs): 473 | embedded = self.embedding(input).view(1, 1, -1) 474 | embedded = self.dropout(embedded) 475 | 476 | attn_weights = F.softmax( 477 | self.attn(torch.cat((embedded[0], hidden[0]), 1))) 478 | attn_applied = torch.bmm(attn_weights.unsqueeze(0), 479 | encoder_outputs.unsqueeze(0)) 480 | 481 | output = torch.cat((embedded[0], attn_applied[0]), 1) 482 | output = self.attn_combine(output).unsqueeze(0) 483 | 484 | for i in range(self.n_layers): 485 | output = F.relu(output) 486 | output, hidden = self.gru(output, hidden) 487 | 488 | output = F.log_softmax(self.out(output[0])) 489 | return output, hidden, attn_weights 490 | 491 | def initHidden(self): 492 | result = Variable(torch.zeros(1, 1, self.hidden_size)) 493 | if use_cuda: 494 | return result.cuda() 495 | else: 496 | return result 497 | 498 | 499 | ###################################################################### 500 | # .. note:: There are other forms of attention that work around the length 501 | # limitation by using a relative position approach. Read about "local 502 | # attention" in `Effective Approaches to Attention-based Neural Machine 503 | # Translation `__. 504 | # 505 | # Training 506 | # ======== 507 | # 508 | # Preparing Training Data 509 | # ----------------------- 510 | # 511 | # To train, for each pair we will need an input tensor (indexes of the 512 | # words in the input sentence) and target tensor (indexes of the words in 513 | # the target sentence). While creating these vectors we will append the 514 | # EOS token to both sequences. 515 | # 516 | 517 | def indexesFromSentence(lang, sentence): 518 | return [lang.word2index[word] for word in sentence.split(' ')] 519 | 520 | 521 | def variableFromSentence(lang, sentence): 522 | indexes = indexesFromSentence(lang, sentence) 523 | indexes.append(EOS_token) 524 | result = Variable(torch.LongTensor(indexes).view(-1, 1)) 525 | if use_cuda: 526 | return result.cuda() 527 | else: 528 | return result 529 | 530 | 531 | def variablesFromPair(pair): 532 | input_variable = variableFromSentence(input_lang, pair[0]) 533 | target_variable = variableFromSentence(output_lang, pair[1]) 534 | return (input_variable, target_variable) 535 | 536 | 537 | ###################################################################### 538 | # Training the Model 539 | # ------------------ 540 | # 541 | # To train we run the input sentence through the encoder, and keep track 542 | # of every output and the latest hidden state. Then the decoder is given 543 | # the ```` token as its first input, and the last hidden state of the 544 | # encoder as its first hidden state. 
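# Editor's note: concretely, each training example consumed by ``train`` below
# comes from ``variablesFromPair`` and is a pair of index column vectors
# (example pair and sizes are illustrative):
#
#     inp, tgt = variablesFromPair(('je suis froid .', 'i am cold .'))
#     # inp: Variable wrapping a (5, 1) LongTensor -- 4 French word indices + EOS_token
#     # tgt: Variable wrapping a (5, 1) LongTensor -- 4 English word indices + EOS_token
#
# The encoder and decoder each read one row (one word index) per time step.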
545 | # 546 | # "Teacher forcing" is the concept of using the real target outputs as 547 | # each next input, instead of using the decoder's guess as the next input. 548 | # Using teacher forcing causes it to converge faster but `when the trained 549 | # network is exploited, it may exhibit 550 | # instability `__. 551 | # 552 | # You can observe outputs of teacher-forced networks that read with 553 | # coherent grammar but wander far from the correct translation - 554 | # intuitively it has learned to represent the output grammar and can "pick 555 | # up" the meaning once the teacher tells it the first few words, but it 556 | # has not properly learned how to create the sentence from the translation 557 | # in the first place. 558 | # 559 | # Because of the freedom PyTorch's autograd gives us, we can randomly 560 | # choose to use teacher forcing or not with a simple if statement. Turn 561 | # ``teacher_forcing_ratio`` up to use more of it. 562 | # 563 | 564 | teacher_forcing_ratio = 0.5 565 | 566 | 567 | def train(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, 568 | max_length=MAX_LENGTH): 569 | encoder_hidden = encoder.initHidden() 570 | 571 | encoder_optimizer.zero_grad() 572 | decoder_optimizer.zero_grad() 573 | 574 | input_length = input_variable.size()[0] 575 | target_length = target_variable.size()[0] 576 | 577 | encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size)) 578 | encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs 579 | 580 | loss = 0 581 | 582 | for ei in range(input_length): 583 | encoder_output, encoder_hidden = encoder( 584 | input_variable[ei], encoder_hidden) 585 | encoder_outputs[ei] = encoder_output[0][0] 586 | 587 | decoder_input = Variable(torch.LongTensor([[SOS_token]])) 588 | decoder_input = decoder_input.cuda() if use_cuda else decoder_input 589 | 590 | decoder_hidden = encoder_hidden 591 | 592 | use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False 593 | 594 | if use_teacher_forcing: 595 | # Teacher forcing: Feed the target as the next input 596 | for di in range(target_length): 597 | decoder_output, decoder_hidden, decoder_attention = decoder( 598 | decoder_input, decoder_hidden, encoder_output, encoder_outputs) 599 | loss += criterion(decoder_output[0], target_variable[di]) 600 | decoder_input = target_variable[di] # Teacher forcing 601 | 602 | else: 603 | # Without teacher forcing: use its own predictions as the next input 604 | for di in range(target_length): 605 | decoder_output, decoder_hidden, decoder_attention = decoder( 606 | decoder_input, decoder_hidden, encoder_output, encoder_outputs) 607 | topv, topi = decoder_output.data.topk(1) 608 | ni = topi[0][0] 609 | 610 | decoder_input = Variable(torch.LongTensor([[ni]])) 611 | decoder_input = decoder_input.cuda() if use_cuda else decoder_input 612 | 613 | loss += criterion(decoder_output[0], target_variable[di]) 614 | if ni == EOS_token: 615 | break 616 | 617 | loss.backward() 618 | 619 | encoder_optimizer.step() 620 | decoder_optimizer.step() 621 | 622 | return loss.data[0] / target_length 623 | 624 | 625 | ###################################################################### 626 | # This is a helper function to print time elapsed and estimated time 627 | # remaining given the current time and progress %. 
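# For example, if ``start`` was recorded 90 seconds ago and training is 30%
# complete, then (values illustrative):
#
#     timeSince(start, 0.3)   # -> '1m 30s (- 3m 30s)'
#
# i.e. the elapsed time, followed in parentheses by the estimated time still
# remaining (total estimate 90 / 0.3 = 300 seconds, minus the 90 already spent).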
628 | # 629 | 630 | import time 631 | import math 632 | 633 | 634 | def asMinutes(s): 635 | m = math.floor(s / 60) 636 | s -= m * 60 637 | return '%dm %ds' % (m, s) 638 | 639 | 640 | def timeSince(since, percent): 641 | now = time.time() 642 | s = now - since 643 | es = s / (percent) 644 | rs = es - s 645 | return '%s (- %s)' % (asMinutes(s), asMinutes(rs)) 646 | 647 | 648 | ###################################################################### 649 | # The whole training process looks like this: 650 | # 651 | # - Start a timer 652 | # - Initialize optimizers and criterion 653 | # - Create set of training pairs 654 | # - Start empty losses array for plotting 655 | # 656 | # Then we call ``train`` many times and occasionally print the progress (% 657 | # of examples, time so far, estimated time) and average loss. 658 | # 659 | 660 | def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01): 661 | start = time.time() 662 | plot_losses = [] 663 | print_loss_total = 0 # Reset every print_every 664 | plot_loss_total = 0 # Reset every plot_every 665 | 666 | encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate) 667 | decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate) 668 | training_pairs = [variablesFromPair(random.choice(pairs)) 669 | for i in range(n_iters)] 670 | criterion = nn.NLLLoss() 671 | 672 | for iter in range(1, n_iters + 1): 673 | training_pair = training_pairs[iter - 1] 674 | input_variable = training_pair[0] 675 | target_variable = training_pair[1] 676 | 677 | loss = train(input_variable, target_variable, encoder, 678 | decoder, encoder_optimizer, decoder_optimizer, criterion) 679 | print_loss_total += loss 680 | plot_loss_total += loss 681 | 682 | if iter % print_every == 0: 683 | print_loss_avg = print_loss_total / print_every 684 | print_loss_total = 0 685 | print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters), 686 | iter, iter / n_iters * 100, print_loss_avg)) 687 | 688 | if iter % plot_every == 0: 689 | plot_loss_avg = plot_loss_total / plot_every 690 | plot_losses.append(plot_loss_avg) 691 | plot_loss_total = 0 692 | 693 | showPlot(plot_losses) 694 | 695 | 696 | ###################################################################### 697 | # Plotting results 698 | # ---------------- 699 | # 700 | # Plotting is done with matplotlib, using the array of loss values 701 | # ``plot_losses`` saved while training. 702 | # 703 | 704 | import matplotlib.pyplot as plt 705 | import matplotlib.ticker as ticker 706 | import numpy as np 707 | 708 | 709 | def showPlot(points): 710 | plt.figure() 711 | fig, ax = plt.subplots() 712 | # this locator puts ticks at regular intervals 713 | loc = ticker.MultipleLocator(base=0.2) 714 | ax.yaxis.set_major_locator(loc) 715 | plt.plot(points) 716 | 717 | 718 | ###################################################################### 719 | # Evaluation 720 | # ========== 721 | # 722 | # Evaluation is mostly the same as training, but there are no targets so 723 | # we simply feed the decoder's predictions back to itself for each step. 724 | # Every time it predicts a word we add it to the output string, and if it 725 | # predicts the EOS token we stop there. We also store the decoder's 726 | # attention outputs for display later. 
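# Editor's note: decoding in ``evaluate`` below is greedy -- at each step only
# the single most probable word survives. The relevant step, mirrored here as
# a sketch, is:
#
#     topv, topi = decoder_output.data.topk(1)    # best log-probability and its index
#     ni = topi[0][0]                             # plain int word index
#     decoder_input = Variable(torch.LongTensor([[ni]]))
#
# A beam search would instead keep the k best partial translations at every
# step, which this tutorial does not do.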
727 | # 728 | 729 | def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH): 730 | input_variable = variableFromSentence(input_lang, sentence) 731 | input_length = input_variable.size()[0] 732 | encoder_hidden = encoder.initHidden() 733 | 734 | encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size)) 735 | encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs 736 | 737 | for ei in range(input_length): 738 | encoder_output, encoder_hidden = encoder(input_variable[ei], 739 | encoder_hidden) 740 | encoder_outputs[ei] = encoder_outputs[ei] + encoder_output[0][0] 741 | 742 | decoder_input = Variable(torch.LongTensor([[SOS_token]])) # SOS 743 | decoder_input = decoder_input.cuda() if use_cuda else decoder_input 744 | 745 | decoder_hidden = encoder_hidden 746 | 747 | decoded_words = [] 748 | decoder_attentions = torch.zeros(max_length, max_length) 749 | 750 | for di in range(max_length): 751 | decoder_output, decoder_hidden, decoder_attention = decoder( 752 | decoder_input, decoder_hidden, encoder_output, encoder_outputs) 753 | decoder_attentions[di] = decoder_attention.data 754 | topv, topi = decoder_output.data.topk(1) 755 | ni = topi[0][0] 756 | if ni == EOS_token: 757 | decoded_words.append('') 758 | break 759 | else: 760 | decoded_words.append(output_lang.index2word[ni]) 761 | 762 | decoder_input = Variable(torch.LongTensor([[ni]])) 763 | decoder_input = decoder_input.cuda() if use_cuda else decoder_input 764 | 765 | return decoded_words, decoder_attentions[:di + 1] 766 | 767 | 768 | ###################################################################### 769 | # We can evaluate random sentences from the training set and print out the 770 | # input, target, and output to make some subjective quality judgements: 771 | # 772 | 773 | def evaluateRandomly(encoder, decoder, n=10): 774 | for i in range(n): 775 | pair = random.choice(pairs) 776 | print('>', pair[0]) 777 | print('=', pair[1]) 778 | output_words, attentions = evaluate(encoder, decoder, pair[0]) 779 | output_sentence = ' '.join(output_words) 780 | print('<', output_sentence) 781 | print('') 782 | 783 | 784 | ###################################################################### 785 | # Training and Evaluating 786 | # ======================= 787 | # 788 | # With all these helper functions in place (it looks like extra work, but 789 | # it's easier to run multiple experiments easier) we can actually 790 | # initialize a network and start training. 791 | # 792 | # Remember that the input sentences were heavily filtered. For this small 793 | # dataset we can use relatively small networks of 256 hidden nodes and a 794 | # single GRU layer. After about 40 minutes on a MacBook CPU we'll get some 795 | # reasonable results. 796 | # 797 | # .. Note:: 798 | # If you run this notebook you can train, interrupt the kernel, 799 | # evaluate, and continue training later. Comment out the lines where the 800 | # encoder and decoder are initialized and run ``trainIters`` again. 
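# Editor's note: the workflow described above keeps the models alive in the
# notebook kernel. To also persist progress across sessions, a minimal
# checkpointing sketch (file names are placeholders, not part of the original
# tutorial) is:
#
#     torch.save(encoder1.state_dict(), 'encoder1.pt')
#     torch.save(attn_decoder1.state_dict(), 'attn_decoder1.pt')
#     # ... later, after re-creating models with the same sizes:
#     encoder1.load_state_dict(torch.load('encoder1.pt'))
#     attn_decoder1.load_state_dict(torch.load('attn_decoder1.pt'))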
801 | # 802 | 803 | hidden_size = 256 804 | encoder1 = EncoderRNN(input_lang.n_words, hidden_size) 805 | attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, 806 | 1, dropout_p=0.1) 807 | 808 | if use_cuda: 809 | encoder1 = encoder1.cuda() 810 | attn_decoder1 = attn_decoder1.cuda() 811 | 812 | trainIters(encoder1, attn_decoder1, 75000, print_every=5000) 813 | 814 | ###################################################################### 815 | # 816 | 817 | evaluateRandomly(encoder1, attn_decoder1) 818 | 819 | ###################################################################### 820 | # Visualizing Attention 821 | # --------------------- 822 | # 823 | # A useful property of the attention mechanism is its highly interpretable 824 | # outputs. Because it is used to weight specific encoder outputs of the 825 | # input sequence, we can imagine looking where the network is focused most 826 | # at each time step. 827 | # 828 | # You could simply run ``plt.matshow(attentions)`` to see attention output 829 | # displayed as a matrix, with the columns being input steps and rows being 830 | # output steps: 831 | # 832 | 833 | output_words, attentions = evaluate( 834 | encoder1, attn_decoder1, "je suis trop froid .") 835 | plt.matshow(attentions.numpy()) 836 | 837 | 838 | ###################################################################### 839 | # For a better viewing experience we will do the extra work of adding axes 840 | # and labels: 841 | # 842 | 843 | def showAttention(input_sentence, output_words, attentions): 844 | # Set up figure with colorbar 845 | fig = plt.figure() 846 | ax = fig.add_subplot(111) 847 | cax = ax.matshow(attentions.numpy(), cmap='bone') 848 | fig.colorbar(cax) 849 | 850 | # Set up axes 851 | ax.set_xticklabels([''] + input_sentence.split(' ') + 852 | [''], rotation=90) 853 | ax.set_yticklabels([''] + output_words) 854 | 855 | # Show label at every tick 856 | ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) 857 | ax.yaxis.set_major_locator(ticker.MultipleLocator(1)) 858 | 859 | plt.show() 860 | 861 | 862 | def evaluateAndShowAttention(input_sentence): 863 | output_words, attentions = evaluate( 864 | encoder1, attn_decoder1, input_sentence) 865 | print('input =', input_sentence) 866 | print('output =', ' '.join(output_words)) 867 | showAttention(input_sentence, output_words, attentions) 868 | 869 | 870 | evaluateAndShowAttention("elle a cinq ans de moins que moi .") 871 | 872 | evaluateAndShowAttention("elle est trop petit .") 873 | 874 | evaluateAndShowAttention("je ne crains pas de mourir .") 875 | 876 | evaluateAndShowAttention("c est un jeune directeur plein de talent .") 877 | 878 | 879 | ###################################################################### 880 | # Exercises 881 | # ========= 882 | # 883 | # - Try with a different dataset 884 | # 885 | # - Another language pair 886 | # - Human → Machine (e.g. IOT commands) 887 | # - Chat → Response 888 | # - Question → Answer 889 | # 890 | # - Replace the embeddings with pre-trained word embeddings such as word2vec or 891 | # GloVe 892 | # - Try with more layers, more hidden units, and more sentences. Compare 893 | # the training time and results. 894 | # - If you use a translation file where pairs have two of the same phrase 895 | # (``I am test \t I am test``), you can use this as an autoencoder. 
Try 896 | # this: 897 | # 898 | # - Train as an autoencoder 899 | # - Save only the Encoder network 900 | # - Train a new Decoder for translation from there 901 | # --------------------------------------------------------------------------------