├── references
│   ├── code
│   │   ├── __init__.py
│   │   ├── vanilla-gan.py
│   │   └── seq2seq_translation_tutorial.py
│   ├── __init__.py
│   └── papers
│       ├── 1707.07328.pdf
│       ├── WORDS OR CHARACTERS _ FINE-GRAINED GATING.pdf
│       ├── deep reinforcement learning for dialogue generation.pdf
│       ├── Semi-Supervised QA with Generative Domain-Adaptive Nets.pdf
│       └── learning cooperative visual dialog agents with deep reinforcement learning.pdf
├── .gitignore
├── src
│   ├── __init__.py
│   ├── util
│   │   ├── util.pyc
│   │   ├── __init__.py
│   │   ├── data_proc.pyc
│   │   ├── masked_cross_entropy.py
│   │   ├── util.py
│   │   ├── test.py
│   │   └── data_proc.py
│   ├── GAN_model
│   │   ├── __init__.py
│   │   ├── GAN_main.py
│   │   └── GAN_model.py
│   ├── G_c_a_sep
│   │   ├── __init__.py
│   │   ├── G_eval.py
│   │   ├── G_main.py
│   │   ├── G_c_a_sep.py
│   │   └── G_train.py
│   ├── D_baseline
│   │   ├── D_eval.pyc
│   │   ├── D_train.pyc
│   │   ├── __init__.py
│   │   ├── D_baseline_model.pyc
│   │   ├── D_model.py
│   │   ├── D_eval.py
│   │   ├── D_main.py
│   │   └── D_train.py
│   ├── G_baseline
│   │   ├── __init__.py
│   │   ├── G_main.py
│   │   ├── G_train.py
│   │   ├── G_eval.py
│   │   └── G_model.py
│   └── model_zoo.py
└── .idea
    ├── GAN-QA.iml
    └── misc.xml
/references/code/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 |
3 | .gitignore
4 | .idea
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | from __future__ import division
--------------------------------------------------------------------------------
/src/util/util.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/src/util/util.pyc
--------------------------------------------------------------------------------
/src/util/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | from __future__ import division
--------------------------------------------------------------------------------
/references/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | from __future__ import division
--------------------------------------------------------------------------------
/src/GAN_model/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | from __future__ import division
--------------------------------------------------------------------------------
/src/G_c_a_sep/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | from __future__ import division
--------------------------------------------------------------------------------
/src/util/data_proc.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/src/util/data_proc.pyc
--------------------------------------------------------------------------------
/src/D_baseline/D_eval.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/src/D_baseline/D_eval.pyc
--------------------------------------------------------------------------------
/src/D_baseline/D_train.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/src/D_baseline/D_train.pyc
--------------------------------------------------------------------------------
/src/D_baseline/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | from __future__ import division
--------------------------------------------------------------------------------
/src/G_baseline/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | from __future__ import division
--------------------------------------------------------------------------------
/references/papers/1707.07328.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/references/papers/1707.07328.pdf
--------------------------------------------------------------------------------
/src/D_baseline/D_baseline_model.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/src/D_baseline/D_baseline_model.pyc
--------------------------------------------------------------------------------
/references/papers/WORDS OR CHARACTERS _ FINE-GRAINED GATING.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/references/papers/WORDS OR CHARACTERS _ FINE-GRAINED GATING.pdf
--------------------------------------------------------------------------------
/references/papers/deep reinforcement learning for dialogue generation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/references/papers/deep reinforcement learning for dialogue generation.pdf
--------------------------------------------------------------------------------
/references/papers/Semi-Supervised QA with Generative Domain-Adaptive Nets.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/references/papers/Semi-Supervised QA with Generative Domain-Adaptive Nets.pdf
--------------------------------------------------------------------------------
/references/papers/learning cooperative visual dialog agents with deep reinforcement learning.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/references/papers/learning cooperative visual dialog agents with deep reinforcement learning.pdf
--------------------------------------------------------------------------------
/src/D_baseline/D_model.py:
--------------------------------------------------------------------------------
1 |
2 | import sys
3 | import os
4 | sys.path.append(os.path.abspath(__file__ + "/../../"))
5 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util')
6 |
7 | # FIXME: spacy has an import-order conflict with torch: spacy needs to be imported first, so import data_proc (which imports spacy) before torch.
8 | from data_proc import *
9 | from model_zoo import *
10 |
11 | import torch
12 | import torch.nn as nn
13 |
14 | use_cuda = torch.cuda.is_available()
15 |
16 | ######################################################################
17 | # The Encoder
18 | # -----------
19 | # FIXME: not sure whether an if __name__ == '__main__' guard is needed here.
20 | # if __name__ == '__main__':
21 |
22 | class D(nn.Module):
23 |
24 | def __init__(self, enc_input_size, enc_hidden_size, enc_n_layers, num_directions,
25 | mlp_hidden_size, num_attn_weights, mlp_output_size, use_attn,
26 | batch_size):
27 | # super constructor
28 | super(D, self).__init__()
29 |
30 | self.encoder = EncoderRNN(enc_input_size, enc_hidden_size, batch_size, enc_n_layers, num_directions)
31 | self.mlp = MLP(mlp_hidden_size, mlp_output_size, self.encoder, num_attn_weights, use_attn=use_attn)
32 |
33 |
34 | def forward(self, inputs, seq_lens, hidden=None):
35 | # input size = (seq len, batch size, word embedding dimension)
36 |
37 | # encoding
38 | # outputs dim (seq_len, batch size, hidden_size*num_directions)
39 | encoder_outputs, encoder_hidden = self.encoder(inputs, seq_lens)
40 |
41 | # MLP
42 | out = self.mlp(encoder_outputs)
43 |
44 | return out
45 |
46 |
47 | def backward(self, out, labels, criterion, optimizer):
48 | loss = criterion(out, labels)
49 | loss.backward()
50 | optimizer.step()
51 | return loss
52 |
53 |
54 |
55 |
--------------------------------------------------------------------------------
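For reference, a minimal smoke-test sketch of the D module defined above, using random tensors in place of real GloVe vectors. All sizes are hypothetical (they mirror the defaults in D_main.py), and whether this runs end to end depends on EncoderRNN and MLP in model_zoo.py, which are not reproduced in this listing.

import torch
from torch.autograd import Variable
from D_model import D

# hypothetical sizes, mirroring D_main.py
emb_size, batch_size, seq_len = 50, 4, 20
discriminator = D(emb_size, 256, 1, 1,      # encoder: input size, hidden size, layers, directions
                  64, 1, 1, True,           # MLP: hidden size, attention weights, output size, use_attn
                  batch_size)

inputs = Variable(torch.randn(seq_len, batch_size, emb_size))  # (seq len, batch, embedding dim)
seq_lens = [seq_len] * batch_size                              # true length of each sequence
scores = discriminator.forward(inputs, seq_lens)               # one real/fake score per example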
/src/D_baseline/D_eval.py:
--------------------------------------------------------------------------------
1 |
2 | import sys
3 | import os
4 | sys.path.append(os.path.abspath(__file__ + "/../../"))
5 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util')
6 |
7 | from data_proc import *
8 | from D_model import *
9 |
10 | import torch
11 | from torch.autograd import Variable
12 |
13 | use_cuda = torch.cuda.is_available()
14 |
15 |
16 | def evaluate(discriminator, triplets,
17 | word2index, embeddings_index, embeddings_size,
18 | eval_batch_size=10):
19 |
20 | # prepare batch
21 | training_batch, seq_lens, fake_training_batch, fake_seq_lens = get_random_batch(triplets, eval_batch_size, with_fake=True)
22 | # concat the context_ans batch with the question batch
23 | # each element in the training batch is context + question + answer
24 | training_batch, _, seq_lens = prepare_batch_var(training_batch, seq_lens, fake_training_batch, fake_seq_lens,
25 | eval_batch_size, word2index, embeddings_index, embeddings_size,
26 | mode = ['word'], concat_opt='cqa', with_fake=True)
27 |
28 | train_input = Variable(training_batch[0].cuda()) if use_cuda else Variable(
29 | training_batch[0]) # embeddings vectors, size = [seq len x batch size x embedding dim]
30 | true_labels = Variable(torch.FloatTensor(training_batch[-1]).cuda()) if use_cuda else Variable(
31 | torch.FloatTensor(training_batch[-1]))
32 |
33 | # pass through discriminator model
34 | outputs = discriminator.forward(train_input, seq_lens[0])  # D.forward takes (inputs, seq_lens); labels are only used for the accuracy check below
35 |
36 | # get label predictions from model & compare the number of correct predictions
37 | pred_labels = torch.zeros(outputs.size())
38 | num_correct_pred = 0
39 | for i in range(outputs.size(0)):
40 | pred_labels[i] = 0 if outputs.data[i][0] <= 0.5 else 1
41 | if pred_labels[i][0] == true_labels[i].data[0]:
42 | num_correct_pred += 1
43 |
44 | print('percentage of correct predictions (True/False): ' +
45 | str(float(num_correct_pred)/float(outputs.size(0))*100) + '%.\n')
46 |
47 |
48 |
49 |
50 |
--------------------------------------------------------------------------------
/src/util/masked_cross_entropy.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn import functional
3 | from torch.autograd import Variable
4 |
5 | def sequence_mask(sequence_length, max_len=None):
6 | if max_len is None:
7 | max_len = sequence_length.data.max()
8 | batch_size = sequence_length.size(0)
9 | seq_range = torch.arange(0, max_len).long()  # indices 0 .. max_len-1 (torch.range is deprecated)
10 | seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
11 | seq_range_expand = Variable(seq_range_expand)
12 | if sequence_length.is_cuda:
13 | seq_range_expand = seq_range_expand.cuda()
14 | seq_length_expand = (sequence_length.unsqueeze(1)
15 | .expand_as(seq_range_expand))
16 | return seq_range_expand < seq_length_expand
17 |
18 |
19 | def masked_cross_entropy(logits, target, length):
20 | """
21 | Args:
22 | logits: A Variable containing a FloatTensor of size
23 | (batch, max_len, num_classes) which contains the
24 | unnormalized probability for each class.
25 | target: A Variable containing a LongTensor of size
26 | (batch, max_len) which contains the index of the true
27 | class for each corresponding step.
28 | length: A list (or LongTensor) of size (batch,) which contains
29 | the length of each sequence in the batch.
30 | Returns:
31 | loss: An average loss value masked by the length.
32 | """
33 | length = Variable(torch.LongTensor(length))
34 | if logits.is_cuda:
35 | length = length.cuda()  # keep the mask lengths on the same device as the logits
36 | # logits_flat: (batch * max_len, num_classes)
37 | logits_flat = logits.view(-1, logits.size(-1))
38 | # log_probs_flat: (batch * max_len, num_classes)
39 | log_probs_flat = functional.log_softmax(logits_flat)
40 | # target_flat: (batch * max_len, 1)
41 | target_flat = target.view(-1, 1)
42 | # losses_flat: (batch * max_len, 1)
43 | losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat)
44 | # losses: (batch, max_len)
45 | losses = losses_flat.view(*target.size())
46 | # mask: (batch, max_len)
47 | mask = sequence_mask(sequence_length=length, max_len=target.size(1))
48 | losses = losses * mask.float()
49 | loss = losses.sum() / length.float().sum()
50 | return loss
--------------------------------------------------------------------------------
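A small usage sketch for masked_cross_entropy with made-up shapes, run on CPU. The batch-first layout matches the transposes done in G.backward (G_model.py / G_c_a_sep.py); everything here (sizes, lengths) is invented for illustration.

import torch
from torch.autograd import Variable
from masked_cross_entropy import masked_cross_entropy

batch, max_len, num_classes = 2, 4, 5
logits = Variable(torch.randn(batch, max_len, num_classes))                  # unnormalized scores
target = Variable(torch.LongTensor(batch, max_len).random_(0, num_classes))  # gold class indices
length = [4, 2]                                                              # true length of each sequence

loss = masked_cross_entropy(logits, target, length)  # padded steps of the shorter sequence are ignored
print(loss.data[0])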
/src/D_baseline/D_main.py:
--------------------------------------------------------------------------------
1 |
2 | from __future__ import print_function
3 | from __future__ import division
4 |
5 | import sys
6 | import os
7 | sys.path.append(os.path.abspath(__file__ + "/../../"))
8 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util')
9 | from data_proc import *
10 |
11 | from D_model import *
12 | from D_train import *
13 | from D_eval import *
14 | import numpy as np
15 |
16 | from torch import optim
17 |
18 | use_cuda = torch.cuda.is_available()
19 |
20 |
21 | ######### set paths
22 | # TODO: to run properly, change the following paths and filenames
23 | # default values for the dataset and the path to the project/dataset
24 | dataset = 'squad'
25 | f_name = 'dev-v1.1.json'
26 | path_to_dataset = os.path.abspath(__file__ + '/../../../../') + '/data/'
27 | path_to_data = path_to_dataset + dataset + '/' + f_name
28 | GLOVE_DIR = path_to_dataset + 'glove.6B/'
29 | # path for experiment outputs
30 | # exp_name = 'QG_seq2seq_baseline'
31 | path_to_exp_out = os.path.abspath(__file__ + '/../../../../') + '/exp_results_D_temp/'
32 | loss_f = 'loss_temp.txt'
33 | sample_out_f = 'sample_outputs_temp.txt'
34 | path_to_loss_f = path_to_exp_out + '/' + loss_f
35 | path_to_sample_out_f = path_to_exp_out + '/' + sample_out_f
36 |
37 |
38 | ######### first load the pretrained word embeddings
39 | path_to_glove = os.path.join(GLOVE_DIR, 'glove.6B.50d.txt')
40 | embeddings_index, embeddings_size = readGlove(path_to_glove)
41 |
42 |
43 | ######### read corpus
44 | raw_triplets = read_raw_squad(path_to_data)
45 | triplets = tokenize_squad(raw_triplets, embeddings_index)
46 |
47 | # find max length of context, question, answer, respectively
48 | max_len_c, max_len_q, max_len_a = max_length(triplets)
49 |
50 | ######### corpus preprocessing
51 | # words that do not appear in embeddings, etc
52 |
53 | ## find all unique tokens in the data (should be a subset of the number of embeddings)
54 | effective_tokens, effective_num_tokens = count_effective_num_tokens(triplets, embeddings_index)
55 | print('effective number of tokens: ' + str(effective_num_tokens))
56 | print('expected initial loss: ' + str(-np.log(1/float(effective_num_tokens))) + '\n')
57 | # build word2index dictionary and index2word dictionary
58 | word2index, index2word = generate_look_up_table(effective_tokens, effective_num_tokens)
59 |
60 |
61 | ######### set up model
62 | enc_hidden_size = 256
63 | enc_n_layers = 1
64 | num_directions = 1
65 | mlp_hidden_size = 64
66 | mlp_output_size = 1
67 | num_attn_weights = 1 # 1000
68 | use_attn = True
69 | batch_size = 100
70 | enc_lr = 0.01
71 | mlp_lr = 0.01
72 | learning_rate = 0.001
73 | discriminator = D(embeddings_size, enc_hidden_size, enc_n_layers, num_directions,
74 | mlp_hidden_size, num_attn_weights, mlp_output_size, use_attn,
75 | batch_size)
76 | if use_cuda:
77 | discriminator = discriminator.cuda()
78 |
79 | criterion = nn.BCELoss()
80 | optimizer = optim.Adam(discriminator.parameters(), lr=learning_rate)
81 |
82 |
83 | ######### start training
84 | to_file = False
85 | train(discriminator, criterion, optimizer, batch_size, embeddings_size,
86 | embeddings_index, word2index, index2word, triplets,
87 | to_file, path_to_loss_f, path_to_sample_out_f, path_to_exp_out,
88 | n_iters=3000, print_every=100, plot_every=1)
89 |
90 |
91 | # save the final model
92 | # if to_file:
93 | # torch.save(encoder, path_to_exp_out+'/encoder.pth')
94 | # torch.save(mlp, path_to_exp_out+'/mlp.pth')
95 |
96 |
97 |
98 |
99 |
100 |
--------------------------------------------------------------------------------
/src/G_c_a_sep/G_eval.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | sys.path.append(os.path.abspath(__file__ + "/../../"))
4 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util')
5 | from data_proc import *
6 |
7 | import torch
8 | from torch.autograd import Variable
9 |
10 | use_cuda = torch.cuda.is_available()
11 |
12 |
13 | # max_length constrains the maximum length of the generated question
14 | def evaluate(generator, triplets, embeddings_index, embeddings_size, word2index, index2word, max_length,
15 | to_file = False, sample_out_f = None):
16 |
17 | # prepare test input
18 | batch_size = 1
19 | training, seq_lens = get_random_batch(triplets, batch_size)
20 | context_words = training[0]
21 | answer_words = training[2]
22 | question_words = training[1]
23 | training, _, seq_lens = prepare_batch_var(training, seq_lens, batch_size,
24 | word2index, embeddings_index, embeddings_size)
25 | inputs = []
26 | for var in training:
27 | if not isinstance(var, list):
28 | inputs.append(Variable(var.cuda())) if use_cuda else inputs.append(Variable(var))
29 | # NOTE not currently appending start and end index to inputs because model does not use them
30 | # else:
31 | # inputs.append(Variable(inputs))
32 |
33 | inputs_q = None
34 |
35 | all_decoder_outputs = generator.forward(inputs, seq_lens, batch_size, max_length,
36 | embeddings_index, embeddings_size, word2index, index2word,
37 | teacher_forcing_ratio=0)
38 |
39 | decoded_sentences = []
40 | decoded_words = []
41 | for b in range(batch_size):
42 | # get the word token and add to the list of words
43 | for di in range(max_length):
44 | # top value and index of every batch
45 | topv, topi = all_decoder_outputs[di,b].data.topk(1)
46 | ni = topi[0]
47 | if (ni == word2index['EOS']) or (ni == word2index['PAD']):
48 | decoded_words.append('EOS')
49 | # decoder_attentions[di] = decoder_attention[0].data
50 | break
51 | else:
52 | decoded_words.append(index2word[ni])
53 | decoded_sentences.append(decoded_words)
54 |
55 | # print results
56 | if not to_file:
57 | print('context > ' + ' '.join(context_words[0]).encode('utf-8').strip())
58 | print('answer > ' + ' '.join(answer_words[0]).encode('utf-8').strip())
59 | print('question > ' + ' '.join(question_words[0]).encode('utf-8').strip())
60 | # true_q = []
61 | # for i in range(seq_lens[1][0]):
62 | # true_q.append(index2word[inputs_q[i][0].data[0]])
63 | # print('question with padding> ' + ' '.join(true_q))
64 | print('generated question > ' + ' '.join(decoded_words))
65 | return decoded_words
66 | else:
67 | sample_out_f.write(unicode('context > ' + ' '.join(context_words[0]) + '\n'))
68 | sample_out_f.write(unicode('answer > ' + ' '.join(answer_words[0]) + '\n'))
69 | sample_out_f.write(unicode('question > ' + ' '.join(question_words[0]) + '\n'))
70 | sample_out_f.write(unicode('generated question > ' + ' '.join(decoded_words) + '\n'))
71 |
72 | # TODO: uncomment the following return if you want to record the decoder outputs in file
73 | # (note: need to modify this function call in G_train.py)
74 | # return decoded_sentences
75 |
76 |
77 |
78 |
79 |
--------------------------------------------------------------------------------
/src/G_baseline/G_main.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | from __future__ import division
3 |
4 | import sys
5 | import os
6 | sys.path.append(os.path.abspath(__file__ + "/../../"))
7 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util')
8 |
9 | from G_train import *
10 | from G_model import *
11 | import numpy as np
12 |
13 | global use_cuda
14 | use_cuda = torch.cuda.is_available()
15 | teacher_forcing_ratio = 0.75 # default in original code is 0.5
16 |
17 |
18 | ######### set paths
19 | # TODO: to run properly, change the following paths and filenames
20 | # default values for the dataset and the path to the project/dataset
21 | dataset = 'squad'
22 | f_name = 'train-v1.1.json'
23 | path_to_dataset = '/home/jack/Documents/QA_QG/data/'
24 | path_to_data = path_to_dataset + dataset + '/' + f_name
25 | GLOVE_DIR = path_to_dataset + 'glove.6B/'
26 |
27 |
28 | ######### first load the pretrained word embeddings
29 | path_to_glove = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')
30 | embeddings_index, embeddings_size = readGlove(path_to_glove)
31 |
32 |
33 | ######### read corpus
34 | raw_triplets = read_raw_squad(path_to_data)
35 | triplets = tokenize_squad(raw_triplets, embeddings_index)
36 |
37 | # find max length of context, question, answer, respectively
38 | # max_len_c, max_len_q, max_len_a = max_length(triplets)
39 |
40 | ######### corpus preprocessing
41 | # words that do not appear in embeddings, etc
42 |
43 | ## find all unique tokens in the data (should be a subset of the number of embeddings)
44 | effective_tokens, effective_num_tokens = count_effective_num_tokens(triplets, embeddings_index)
45 | print('effective number of tokens: ' + str(effective_num_tokens))
46 | print('expected initial loss: ' + str(-np.log(1/float(effective_num_tokens))) + '\n')
47 | # build word2index dictionary and index2word dictionary
48 | word2index, index2word = generate_look_up_table(effective_tokens, effective_num_tokens)
49 |
50 |
51 | print('reading and preprocessing data complete.')
52 | print('found %s unique tokens in the intersection of corpus and word embeddings.' % effective_num_tokens)
53 | if use_cuda:
54 | print('GPU ready.')
55 | print('')
56 | print('start training...')
57 | print('')
58 |
59 |
60 | ######### set up model
61 | enc_hidden_size = 256
62 | enc_n_layers = 1
63 | enc_num_directions = 2
64 | dec_hidden_size = 256
65 | dec_n_layers = 1
66 | dec_num_directions = 2
67 | batch_size = 5
68 | learning_rate = 0.0005
69 |
70 | generator = G(embeddings_size, enc_hidden_size, enc_n_layers, enc_num_directions,
71 | embeddings_size, dec_hidden_size, effective_num_tokens, dec_n_layers, dec_num_directions,
72 | batch_size)
73 |
74 | if use_cuda:
75 | generator = generator.cuda()
76 |
77 | optimizer = optim.Adam(generator.parameters(), lr=learning_rate)
78 | criterion = nn.NLLLoss()
79 |
80 | # max_length of generated question
81 | max_length = 100
82 | to_file = False
83 |
84 | # open the files
85 | if to_file:
86 | exp_name = 'G_pretrain_exp_0827'
87 | path_to_exp_out = '/home/jack/Documents/QA_QG/exp_results_temp/'
88 | if not os.path.exists(path_to_exp_out+exp_name):
89 | os.mkdir(path_to_exp_out+exp_name)
90 | loss_f = 'loss_temp.txt'
91 | sample_out_f = 'sample_outputs_temp.txt'
92 | path_to_loss_f = path_to_exp_out + exp_name + '/' + loss_f
93 | path_to_sample_out_f = path_to_exp_out + exp_name + '/' + sample_out_f
94 | loss_f = open(path_to_loss_f,'w+')
95 | sample_out_f = open(path_to_sample_out_f, 'w+')
96 | else:
97 | loss_f = None
98 | sample_out_f = None
99 |
100 | trainIters(generator, optimizer, batch_size, embeddings_size,
101 | embeddings_index, word2index, index2word, max_length, triplets, teacher_forcing_ratio,
102 | to_file, loss_f, sample_out_f,
103 | n_iters = 1, print_every=1, plot_every=1)
104 |
105 | # save the final model
106 | if to_file:
107 | torch.save(generator, path_to_exp_out + exp_name +'/generator_temp.pth')
108 |
109 |
110 |
--------------------------------------------------------------------------------
/src/D_baseline/D_train.py:
--------------------------------------------------------------------------------
1 |
2 | from __future__ import print_function
3 | from __future__ import division
4 |
5 | import sys
6 | import os
7 | import time
8 | sys.path.append(os.path.abspath(__file__ + "/../../"))
9 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util')
10 | from data_proc import *
11 | # FIXME: had some problem importing util.py; the import succeeds but
12 | # the functions cannot be called (NameError: global name XXX is not defined).
13 | # quick fix: copied asMinutes and timeSince here.
14 | from util import *
15 |
16 | import torch
17 | from torch.autograd import Variable
18 | from D_eval import *
19 |
20 | use_cuda = torch.cuda.is_available()
21 |
22 | import time
23 | import math
24 |
25 | # FIXME: added these two functions because import util does not seem to work (see above)
26 | def asMinutes(s):
27 | m = math.floor(s / 60)
28 | s -= m * 60
29 | return '%dm %ds' % (m, s)
30 |
31 | def timeSince(since, percent):
32 | now = time.time()
33 | s = now - since
34 | es = s / (percent)
35 | rs = es - s
36 | return '%s (- %s)' % (asMinutes(s), asMinutes(rs))
37 |
38 |
39 | ######################################################################
40 | # Training the Model
41 | # context = input_variable
42 | def train(discriminator, criterion, optimizer, batch_size, embeddings_size,
43 | embeddings_index, word2index, index2word, triplets,
44 | to_file, path_to_loss_f, path_to_sample_out_f, path_to_exp_out,
45 | n_iters=10, print_every=10, plot_every=100):
46 |
47 | begin_time = time.time()
48 |
49 | # open the files
50 | if to_file:
51 | loss_f = open(path_to_loss_f,'w+')
52 | sample_out_f = open(path_to_sample_out_f, 'w+')
53 |
54 | # plot_losses = []
55 | print_loss_total = 0 # Reset every print_every
56 | plot_loss_total = 0 # Reset every plot_every
57 |
58 | print()
59 |
60 | for iter in range(1, n_iters + 1):
61 |
62 | # prepare batch
63 | training_batch, seq_lens, fake_training_batch, fake_seq_lens = get_random_batch(triplets, batch_size, with_fake=True)
64 | # concat the context_ans batch with the question batch
65 | # each element in the training batch is context + question + answer
66 | training_batch, _, seq_lens = prepare_batch_var(training_batch, seq_lens, fake_training_batch, fake_seq_lens,
67 | batch_size, word2index, embeddings_index, embeddings_size,
68 | mode = ['word'], concat_opt='cqa', with_fake=True)
69 |
70 | train_input = Variable(training_batch[0].cuda()) if use_cuda else Variable(training_batch[0]) # embeddings vectors, size = [seq len x batch size x embedding dim]
71 | # the labels are the last element of training_batch; see prepare_batch_var in data_proc.py for detail
72 | train_label = Variable(torch.FloatTensor(training_batch[-1]).cuda()) if use_cuda else Variable(torch.FloatTensor(training_batch[-1]))
73 |
74 | optimizer.zero_grad()
75 | loss = 0
76 | outputs = discriminator.forward(train_input, seq_lens[0])
77 | loss += discriminator.backward(outputs, train_label, criterion, optimizer)
78 |
79 | print_loss_total += loss.data[0]
80 | plot_loss_total += loss.data[0]
81 |
82 | # log on console
83 | if iter % print_every == 0:
84 | print_loss_avg = print_loss_total / print_every
85 | print_loss_total = 0
86 | print('%s (%d %d%%) %.4f' % (timeSince(begin_time, iter / float(n_iters)),
87 | iter, iter / n_iters * 100, print_loss_avg))
88 | evaluate(discriminator, triplets, word2index, embeddings_index, embeddings_size, eval_batch_size=100)
89 | print('-------------------------------')
90 | print('-------------------------------')
91 | print()
92 |
93 | # save error to file for plotting later
94 | if iter % plot_every == 0:
95 | plot_loss_avg = plot_loss_total / plot_every
96 | # plot_losses.append(plot_loss_avg)
97 | plot_loss_total = 0
98 | if to_file:
99 | loss_f.write(unicode(plot_loss_avg))
100 | loss_f.write(unicode('\n'))
101 |
102 | # showPlot(plot_losses)
103 | if to_file:
104 | loss_f.close()
105 |
106 |
107 |
108 |
--------------------------------------------------------------------------------
/src/util/util.py:
--------------------------------------------------------------------------------
1 | ######################################################################
2 | # Plotting results
3 | # ----------------
4 | #
5 | # Plotting is done with matplotlib, using the array of loss values
6 | # ``plot_losses`` saved while training.
7 | #
8 | import matplotlib
9 | matplotlib.use('Agg')
10 | import matplotlib.pyplot as plt
11 | import matplotlib.ticker as ticker
12 | import numpy as np
13 | import difflib
14 |
15 |
16 | def showPlot(points):
17 | plt.figure()
18 | fig, ax = plt.subplots()
19 | # this locator puts ticks at regular intervals
20 | loc = ticker.MultipleLocator(base=0.2)
21 | ax.yaxis.set_major_locator(loc)
22 | plt.plot(points)
23 |
24 |
25 |
26 | def extract(v):
27 | return v.data.storage().tolist()
28 |
29 |
30 |
31 | ######################################################################
32 | # This is a helper function to print time elapsed and estimated time
33 | # remaining given the current time and progress %.
34 | #
35 |
36 | import time
37 | import math
38 |
39 | def asMinutes(s):
40 | m = math.floor(s / 60)
41 | s -= m * 60
42 | return '%dm %ds' % (m, s)
43 |
44 | def timeSince(since, percent):
45 | now = time.time()
46 | s = now - since
47 | es = s / (percent)
48 | rs = es - s
49 | return '%s (- %s)' % (asMinutes(s), asMinutes(rs))
50 |
51 |
52 |
53 | ######################################################################
54 | # show loss function
55 | def plotLoss(loss_f, plot_every, save_path=None, from_file=True, f_name='loss.png', title='training loss'):
56 | if from_file:
57 | loss_vec = []
58 | with open(loss_f) as f:
59 | content = f.readlines()
60 | content = [x.strip() for x in content] # list of every line, each a string
61 | for line in content:
62 | try:
63 | loss_vec.append(float(line))
64 | except ValueError:
65 | pass
66 | else:
67 | loss_vec = loss_f
68 | # plot
69 | plt.figure()
70 | plt.title(title)
71 | plt.xlabel('training iterations')
72 | plt.ylabel('loss')
73 | plt.grid()
74 | plt.plot([x*plot_every for x in range(1, len(loss_vec)+1)], loss_vec)
75 | if save_path is None:
76 | plt.savefig(f_name)
77 | else:
78 | plt.savefig(save_path + '/' + f_name)
79 |
80 | # test
81 | # from util import *
82 | # plotLoss('../../../exp_results_temp/G_c_a_sep_pretrain_exp_0902/loss_temp.txt', 30)
83 |
84 |
85 | ######################################################################
86 | # check if the generated question already exist in the corpus
87 | def generated_q_novelty(triplets, generated_q):
88 | # input - tokenized triplets, each one a list of strings
89 | # input - generated question
90 | # output - a similarity score vector for each of the questions in the triplets
91 | scores = []
92 | if not (isinstance(generated_q, str) or isinstance(generated_q, unicode)):
93 | generated_q = ' '.join(generated_q)
94 | for idx in range(len(triplets)):
95 | q = ' '.join(triplets[idx][1])
96 | scores.append(difflib.SequenceMatcher(None, generated_q, q).ratio())  # similarity score in [0, 1]
97 | return np.array(scores)
98 | # test
99 |
100 |
101 | # ######################################################################
102 | # # For a better viewing experience we will do the extra work of adding axes
103 | # # and labels:
104 | # #
105 | # def showAttention(input_sentence, output_words, attentions):
106 | # # Set up figure with colorbar
107 | # fig = plt.figure()
108 | # ax = fig.add_subplot(111)
109 | # cax = ax.matshow(attentions.numpy(), cmap='bone')
110 | # fig.colorbar(cax)
111 |
112 | # # Set up axes
113 | # ax.set_xticklabels([''] + input_sentence.split(' ') +
114 | # [''], rotation=90)
115 | # ax.set_yticklabels([''] + output_words)
116 |
117 | # # Show label at every tick
118 | # ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
119 | # ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
120 |
121 | # plt.show()
122 |
123 |
124 | # def evaluateAndShowAttention(input_sentence):
125 | # output_words, attentions = evaluate(
126 | # encoder1, attn_decoder1, input_sentence)
127 | # print('input =', input_sentence)
128 | # print('output =', ' '.join(output_words))
129 | # showAttention(input_sentence, output_words, attentions)
130 |
131 |
132 |
--------------------------------------------------------------------------------
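A quick illustration of generated_q_novelty with toy, made-up triplets in the (context, question, answer) layout that the indexing above assumes; Python 2 is assumed, as elsewhere in the repo (the function checks against unicode).

from util import generated_q_novelty

# toy tokenized triplets: (context, question, answer)
triplets = [
    (['the', 'cat', 'sat'], ['where', 'did', 'the', 'cat', 'sit', '?'], ['on', 'the', 'mat']),
    (['paris', 'is', 'in', 'france'], ['where', 'is', 'paris', '?'], ['france']),
]
scores = generated_q_novelty(triplets, ['where', 'is', 'the', 'cat', '?'])
print(scores)  # similarity to each corpus question; values near 1.0 mean the question is not novel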
/src/G_c_a_sep/G_main.py:
--------------------------------------------------------------------------------
1 | # from __future__ import print_function
2 | # from __future__ import division
3 |
4 | import sys
5 | import os
6 | sys.path.append(os.path.abspath(__file__ + "/../../"))
7 |
8 | from G_train import *
9 | from G_c_a_sep import *
10 | import numpy as np  # np.log is used below for the expected-initial-loss estimate
11 | from torch import optim
12 |
13 | global use_cuda
14 | use_cuda = torch.cuda.is_available()
15 | teacher_forcing_ratio = 0.75 # default in original code is 0.5
16 |
17 |
18 | ######### set paths
19 | # TODO: to run properly, change the following paths and filenames
20 | # default values for the dataset and the path to the project/dataset
21 | dataset = 'squad'
22 | f_name = 'train-v1.1.json'
23 | path_to_dataset = '/home/jack/Documents/QA_QG/data/'
24 | path_to_data = path_to_dataset + dataset + '/' + f_name
25 | GLOVE_DIR = path_to_dataset + 'glove.6B/'
26 |
27 |
28 | ######### first load the pretrained word embeddings
29 | path_to_glove = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')
30 | embeddings_index, embeddings_size = readGlove(path_to_glove)
31 |
32 |
33 | ######### read corpus - only the sentence containing the answer as context
34 | # raw_triplets = read_raw_squad(path_to_data)
35 | # sent_c_triplets = get_ans_sentence(raw_triplets)
36 | # sent_c_triplets = tokenize_squad(sent_c_triplets, embeddings_index, opt='sent')
37 | import pickle
38 | load_path = '/home/jack/Documents/QA_QG/data/processed/'
39 | # triplets = pickle.load(open(load_path+'triplets.txt', 'rb'))
40 | # sent_c_triplets = pickle.load(open(load_path+'sent_c_triplets.txt', 'rb'))
41 | windowed_c_triplets_30_noEOS = pickle.load(open(load_path+'windowed_c_triplets_30_noEOS.txt', 'rb'))
42 | # triplets = sent_c_triplets
43 | triplets = windowed_c_triplets_30_noEOS
44 |
45 | # find max length of context, question, answer, respectively
46 | # max_len_c, max_len_q, max_len_a = max_length(triplets)
47 |
48 | ######### corpus preprocessing
49 | # words that do not appear in embeddings, etc
50 |
51 | ## find all unique tokens in the data (should be a subset of the number of embeddings)
52 | effective_tokens, effective_num_tokens = count_effective_num_tokens(triplets, embeddings_index)
53 | print('effective number of tokens: ' + str(effective_num_tokens))
54 | print('expected initial loss: ' + str(-np.log(1/float(effective_num_tokens))) + '\n')
55 | # build word2index dictionary and index2word dictionary
56 | word2index, index2word = generate_look_up_table(effective_tokens, effective_num_tokens)
57 |
58 |
59 | print('reading and preprocessing data complete.')
60 | print('found %s unique tokens in the intersection of corpus and word embeddings.' % effective_num_tokens)
61 | if use_cuda:
62 | print('GPU ready.')
63 | print('')
64 | print('start training...')
65 | print('')
66 |
67 |
68 | ######### set up model
69 | enc_hidden_size = 256
70 | enc_n_layers = 1
71 | enc_num_directions = 2
72 | dec_hidden_size = 256
73 | dec_n_layers = 1
74 | dec_num_directions = 2
75 | batch_size = 5
76 | learning_rate = 0.001
77 |
78 | generator = G(embeddings_size, enc_hidden_size, enc_n_layers, enc_num_directions,
79 | embeddings_size, dec_hidden_size, effective_num_tokens, dec_n_layers, dec_num_directions,
80 | batch_size)
81 |
82 | if use_cuda:
83 | generator = generator.cuda()
84 |
85 | optimizer = optim.Adam(generator.parameters(), lr=learning_rate)
86 | criterion = nn.NLLLoss()
87 |
88 | # max_length of generated question
89 | max_length = 100
90 | to_file = True
91 |
92 | # open the files
93 | if to_file:
94 | exp_name = 'G_c_a_sep_pretrain_exp_windowed_c_noEOS_0911'
95 | path_to_exp_out = '/home/jack/Documents/QA_QG/exp_results_temp/'
96 | if not os.path.exists(path_to_exp_out+exp_name):
97 | os.mkdir(path_to_exp_out+exp_name)
98 | loss_f = 'loss_temp.txt'
99 | sample_out_f = 'sample_outputs_temp.txt'
100 | path_to_loss_f = path_to_exp_out + exp_name + '/' + loss_f
101 | path_to_sample_out_f = path_to_exp_out + exp_name + '/' + sample_out_f
102 | loss_f = open(path_to_loss_f,'w+')
103 | sample_out_f = open(path_to_sample_out_f, 'w+')
104 | else:
105 | loss_f = None
106 | sample_out_f = None
107 | path_to_exp_out = None
108 |
109 | trainIters(generator, optimizer, batch_size, embeddings_size,
110 | embeddings_index, word2index, index2word, max_length, triplets, teacher_forcing_ratio,
111 | to_file, loss_f, sample_out_f, path_to_exp_out,
112 | n_iters=30000, print_every=300, plot_every=30, checkpoint_every=6000)
113 |
114 | # save the final model
115 | if to_file:
116 | torch.save(generator, path_to_exp_out + exp_name +'/generator_temp.pth.tar')
117 |
118 |
119 |
--------------------------------------------------------------------------------
/src/G_baseline/G_train.py:
--------------------------------------------------------------------------------
1 |
2 | from __future__ import print_function
3 | from __future__ import division
4 |
5 | import sys
6 | import os
7 | sys.path.append(os.path.abspath(__file__ + "/../../"))
8 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util')
9 | from data_proc import *
10 | # FIXME: had some problem importing util.py; the import succeeds but
11 | # the functions cannot be called (NameError: global name XXX is not defined).
12 | # quick fix: copied asMinutes and timeSince functions here ("from util import *" did not work).
13 | from G_eval import *
14 |
15 | import torch
16 | import torch.nn as nn
17 | from torch import optim
18 | from torch.autograd import Variable
19 | import torch.nn.functional as F
20 | import time
21 |
22 | use_cuda = torch.cuda.is_available()
23 |
24 |
25 | import time
26 | import math
27 |
28 | # FIXME: added these two functions because import util does not seem to work (see above)
29 | def asMinutes(s):
30 | m = math.floor(s / 60)
31 | s -= m * 60
32 | return '%dm %ds' % (m, s)
33 |
34 | def timeSince(since, percent):
35 | now = time.time()
36 | s = now - since
37 | es = s / (percent)
38 | rs = es - s
39 | return '%s (- %s)' % (asMinutes(s), asMinutes(rs))
40 |
41 |
42 |
43 | def trainIters(generator, optimizer, batch_size, embeddings_size,
44 | embeddings_index, word2index, index2word, max_length, triplets, teacher_forcing_ratio,
45 | to_file, loss_f, sample_out_f,
46 | n_iters=5, print_every=10, plot_every=100):
47 |
48 | begin_time = time.time()
49 |
50 | # plot_losses = []
51 | print_loss_total = 0 # Reset every print_every
52 | plot_loss_total = 0 # Reset every plot_every
53 |
54 | print()
55 |
56 | for iter in range(1, n_iters + 1):
57 |
58 | # prepare batch
59 | training_batch, seq_lens = get_random_batch(triplets, batch_size)
60 | training_batch, _, seq_lens = prepare_batch_var(training_batch, seq_lens, batch_size, word2index, embeddings_index, embeddings_size, use_cuda=1, mode=['word', 'index'], concat_opt='ca')
61 | inputs_ca = Variable(training_batch[0].cuda()) if use_cuda else Variable(training_batch[0]) # embeddings vectors, size = [seq len x batch size x embedding dim]
62 | inputs_q = Variable(training_batch[1].cuda()) if use_cuda else Variable(training_batch[1]) # represented as indices, size = [seq len x batch size]
63 |
64 | max_c_a_len = max(seq_lens[0]) # max seq length of context + ans combined
65 | max_q_len = max(seq_lens[1]) # max seq length of question
66 |
67 | optimizer.zero_grad()
68 | loss = 0
69 | all_decoder_outputs = generator.forward(inputs_ca, inputs_q, seq_lens[0], batch_size, max_q_len,
70 | embeddings_index, embeddings_size, word2index, index2word,
71 | teacher_forcing_ratio)
72 | loss += generator.backward(all_decoder_outputs, inputs_q, seq_lens[1], optimizer)
73 |
74 | print_loss_total += loss.data[0]
75 | plot_loss_total += loss.data[0]
76 |
77 | if iter % print_every == 0:
78 | print_loss_avg = print_loss_total / print_every
79 | print_loss_total = 0
80 | print('%s (%d %d%%) %.4f' % (timeSince(begin_time, iter / float(n_iters)),
81 | iter, iter / n_iters * 100, print_loss_avg))
82 | print('---sample generated question---')
83 | # sample a triple and print the generated question
84 | evaluate(generator, triplets, embeddings_index, embeddings_size, word2index, index2word, max_length)
85 | print('-------------------------------')
86 | print('-------------------------------')
87 | print()
88 |
89 | if iter % plot_every == 0:
90 | plot_loss_avg = plot_loss_total / plot_every
91 | # plot_losses.append(plot_loss_avg)
92 | plot_loss_total = 0
93 | if to_file:
94 | loss_f.write(unicode('%s (%d %d%%)\n' % (timeSince(begin_time, iter / float(n_iters)), iter, float(iter) / float(n_iters) * 100)))
95 | loss_f.write(unicode(plot_loss_avg))
96 | loss_f.write(unicode('\n'))
97 | sample_out_f.write(unicode('%s (%d %d%%)\n' % (timeSince(begin_time, iter / float(n_iters)), iter, float(iter) / float(n_iters) * 100)))
98 | evaluate(generator, triplets, embeddings_index, embeddings_size, word2index, index2word, max_length, to_file, sample_out_f)
99 | sample_out_f.write(unicode('\n'))
100 |
101 |
102 |
103 | # showPlot(plot_losses)
104 | if to_file:
105 | loss_f.close()
106 |
107 |
108 |
109 |
--------------------------------------------------------------------------------
/src/G_baseline/G_eval.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | sys.path.append(os.path.abspath(__file__ + "/../../"))
4 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util')
5 | from data_proc import *
6 | from util import *
7 |
8 | import torch
9 | import torch.nn as nn
10 | from torch import optim
11 | from torch.autograd import Variable
12 | import torch.nn.functional as F
13 | import time
14 |
15 | use_cuda = torch.cuda.is_available()
16 |
17 |
18 | # max_length constrains the maximum length of the generated question
19 | def evaluate(generator, triplets, embeddings_index, embeddings_size, word2index, index2word, max_length,
20 | to_file = False, sample_out_f = None):
21 |
22 | # prepare test input
23 | batch_size = 1
24 | training, seq_lens = get_random_batch(triplets, batch_size)
25 | context_words = training[0]
26 | answer_words = training[2]
27 | question_words = training[1]
28 | training, _, seq_lens = prepare_batch_var(training, seq_lens, batch_size, word2index, embeddings_index, embeddings_size, mode=['word', 'index'], concat_opt='ca')
29 | inputs_ca = Variable(training[0].cuda()) if use_cuda else Variable(training[0]) # embeddings vectors, size = [seq len x batch size x embedding dim]
30 | # inputs_q = Variable(training[1].cuda()) if use_cuda else Variable(training[1]) # represented as indices, size = [seq len x batch size]
31 | inputs_q = None
32 |
33 | all_decoder_outputs = generator.forward(inputs_ca, inputs_q, seq_lens[0], batch_size, max_length,
34 | embeddings_index, embeddings_size, word2index, index2word,
35 | teacher_forcing_ratio=0)
36 |
37 | decoded_sentences = []
38 | decoded_words = []
39 | for b in range(batch_size):
40 | # get the word token and add to the list of words
41 | for di in range(max_length):
42 | # top value and index of every batch
43 | topv, topi = all_decoder_outputs[di,b].data.topk(1)
44 | ni = topi[0]
45 | if (ni == word2index['EOS']) or (ni == word2index['PAD']):
46 | decoded_words.append('EOS')
47 | # decoder_attentions[di] = decoder_attention[0].data
48 | break
49 | else:
50 | decoded_words.append(index2word[ni])
51 | decoded_sentences.append(decoded_words)
52 |
53 | # print results
54 | if not to_file:
55 | print('context > ' + ' '.join(context_words[0]).encode('utf-8').strip())
56 | print('answer > ' + ' '.join(answer_words[0]).encode('utf-8').strip())
57 | print('question > ' + ' '.join(question_words[0]).encode('utf-8').strip())
58 | # true_q = []
59 | # for i in range(seq_lens[1][0]):
60 | # true_q.append(index2word[inputs_q[i][0].data[0]])
61 | # print('question with padding> ' + ' '.join(true_q))
62 | print('generated question > ' + ' '.join(decoded_words))
63 | else:
64 | sample_out_f.write(unicode('context > ' + ' '.join(context_words[0]) + '\n'))
65 | sample_out_f.write(unicode('answer > ' + ' '.join(answer_words[0]) + '\n'))
66 | sample_out_f.write(unicode('question > ' + ' '.join(question_words[0]) + '\n'))
67 | sample_out_f.write(unicode('generated question > ' + ' '.join(decoded_words) + '\n'))
68 |
69 | # TODO: uncomment the following return if you want to record the decoder outputs in file
70 | # (note: need to modify this function call in G_train.py)
71 | # return decoded_sentences
72 |
73 |
74 | def G_sampler(generator, ca, embeddings_index, embeddings_size, word2index, index2word, max_length):
75 | # NOTE: currently generates only one question at a time; batched generation is not yet supported
76 |
77 | var = torch.FloatTensor(len(ca), embeddings_size)
78 | for j in range(len(ca)):
79 | var[j] = embeddings_index[ca[j]]
80 | var = var.unsqueeze(1)
81 | if use_cuda:
82 | var = Variable(var.cuda())
83 | else:
84 | var = Variable(var)
85 |
86 | decoder_output = generator.forward(var, None, [len(ca)], 1, max_length,
87 | embeddings_index, embeddings_size, word2index, index2word,
88 | teacher_forcing_ratio=0).detach()
89 | decoder_output = decoder_output.squeeze(1)
90 |
91 | decoded_words = []
92 | for di in range(max_length):
93 | # top value and index of every batch
94 | topv, topi = decoder_output[di].data.topk(1)
95 | ni = topi[0]
96 | if (ni == word2index['EOS']) or (ni == word2index['PAD']):
97 | decoded_words.append('EOS')
98 | # decoder_attentions[di] = decoder_attention[0].data
99 | break
100 | else:
101 | decoded_words.append(index2word[ni])
102 |
103 | return decoded_words
104 |
105 |
--------------------------------------------------------------------------------
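G_sampler above turns a single tokenized context+answer sequence into a generated question. A hedged sketch of one possible call, assuming that generator, embeddings_index, embeddings_size, word2index and index2word have already been built as in G_main.py and that every token below (including 'EOS') has an entry in embeddings_index; the example tokens are made up.

ca_tokens = ['the', 'eiffel', 'tower', 'is', 'in', 'paris', 'EOS', 'paris', 'EOS']  # context followed by answer
question_tokens = G_sampler(generator, ca_tokens,
                            embeddings_index, embeddings_size,
                            word2index, index2word, max_length=20)
print(' '.join(question_tokens))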
/src/G_baseline/G_model.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import random
4 | sys.path.append(os.path.abspath(__file__ + "/../../"))
5 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util')
6 |
7 | from model_zoo import *
8 | from masked_cross_entropy import *
9 | import torch
10 | import torch.nn as nn
11 | from torch.autograd import Variable
12 |
13 | use_cuda = torch.cuda.is_available()
14 |
15 |
16 | class G(nn.Module):
17 | def __init__(self, enc_input_size, enc_hidden_size, enc_n_layers, enc_num_directions,
18 | dec_input_size, dec_hidden_size, output_size, dec_n_layers, dec_num_directions,
19 | batch_size, use_attn=True):
20 | super(G, self).__init__()
21 | self.encoder = EncoderRNN(enc_input_size, enc_hidden_size, batch_size, enc_n_layers, enc_num_directions)
22 | if use_attn:
23 | self.decoder = AttnDecoderRNN(dec_input_size, dec_hidden_size, output_size, self.encoder,
24 | dec_n_layers, dec_num_directions)
25 | else:
26 | # TODO: complete case when not using attention (add decoder class in model zoo)
27 | pass
28 |
29 |
30 | def forward(self, inputs_ca, inputs_q, seq_lens, batch_size, max_q_len,
31 | embeddings_index, embeddings_size, word2index, index2word, teacher_forcing_ratio):
32 | # context encoding
33 | # output size: (seq_len, batch, hidden_size)
34 | # hidden size: (num_layers, batch, hidden_size)
35 | # the collection of all hidden states per batch is of size (seq_len, batch, hidden_size * num_directions)
36 | encoder_hiddens, encoder_hidden = self.encoder(inputs_ca, seq_lens, None)
37 |
38 | # print(type(encoder_hiddens.data))  # leftover debug output
39 | # print(encoder_hiddens.size())
40 |
41 | # decoder
42 | # prepare decoder inputs as word embeddings in a batch
43 | # decoder_input size: (1, batch size, embedding size); first dim is 1 because only one time step;
44 | # need a 3D tensor as input to the nn.GRU module
45 | decoder_input = Variable(embeddings_index['SOS'].repeat(batch_size, 1).unsqueeze(0))
46 | # init all decoder outputs
47 | all_decoder_outputs = Variable(torch.zeros(max_q_len, batch_size, self.decoder.output_size))
48 | if use_cuda:
49 | decoder_input = decoder_input.cuda()
50 | all_decoder_outputs = all_decoder_outputs.cuda()
51 |
52 | use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
53 |
54 | if use_teacher_forcing:
55 | # Teacher forcing: Feed the target as the next input
56 | for di in range(max_q_len):
57 | decoder_output, decoder_hidden, decoder_attention = self.decoder(
58 | decoder_input, encoder_hiddens, embeddings_index)
59 |
60 | all_decoder_outputs[di] = decoder_output
61 |
62 | # change next time step input to current target output, in embedding format
63 | decoder_input = Variable(torch.FloatTensor(1, batch_size, embeddings_size).cuda()) if use_cuda else \
64 | Variable(torch.FloatTensor(1, batch_size, embeddings_size))
65 | for b in range(batch_size):
66 | decoder_input[0, b] = embeddings_index[index2word[inputs_q[di, b].data[0]]].cuda() if use_cuda else \
67 | embeddings_index[index2word[inputs_q[di, b].data[0]]] # Teacher forcing
68 |
69 | else:
70 | # Without teacher forcing: use its own predictions as the next input
71 | for di in range(max_q_len):
72 | decoder_output, decoder_hidden, decoder_attention = self.decoder(
73 | decoder_input, encoder_hiddens, embeddings_index)
74 |
75 | all_decoder_outputs[di] = decoder_output
76 |
77 | # top value and index of every batch
78 | # size of both topv, topi = (batch size, 1)
79 | topv, topi = decoder_output.data.topk(1)
80 |
81 | # get the output word for every batch
82 | decoder_input = Variable(torch.FloatTensor(1, batch_size, embeddings_size).cuda()) if use_cuda else \
83 | Variable(torch.FloatTensor(1, batch_size, embeddings_size))
84 | for b in range(batch_size):
85 | decoder_input[0, b] = embeddings_index[index2word[topi[b][0]]].cuda() if use_cuda else \
86 | embeddings_index[index2word[topi[b][0]]]  # use batch element b's own top prediction
87 |
88 | return all_decoder_outputs
89 |
90 |
91 | def backward(self, out, labels, true_lens, optimizer):
92 | loss = masked_cross_entropy(
93 | out.transpose(0, 1).contiguous(), # -> batch x seq
94 | labels.transpose(0, 1).contiguous(), # -> batch x seq
95 | true_lens
96 | )
97 | loss.backward()
98 | optimizer.step()
99 | return loss
100 |
--------------------------------------------------------------------------------
/src/G_c_a_sep/G_c_a_sep.py:
--------------------------------------------------------------------------------
1 | # the encoder in the generator process the context and answer separately.
2 |
3 | import sys
4 | import os
5 | import random
6 | sys.path.append(os.path.abspath(__file__ + "/../../"))
7 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util')
8 |
9 | from model_zoo import *
10 | from masked_cross_entropy import *
11 | import torch
12 | import torch.nn as nn
13 | from torch.autograd import Variable
14 |
15 | use_cuda = torch.cuda.is_available()
16 |
17 |
18 | class G(nn.Module):
19 | def __init__(self, enc_input_size, enc_hidden_size, enc_n_layers, enc_num_directions,
20 | dec_input_size, dec_hidden_size, output_size, dec_n_layers, dec_num_directions,
21 | batch_size, use_attn=True):
22 | super(G, self).__init__()
23 | self.c_encoder = EncoderRNN(enc_input_size, enc_hidden_size, batch_size, enc_n_layers, enc_num_directions)
24 | self.a_encoder = EncoderRNN(enc_input_size, enc_hidden_size, batch_size, enc_n_layers, enc_num_directions)
25 | if use_attn:
26 | self.decoder = AttnDecoderRNN(dec_input_size, dec_hidden_size, output_size, self.a_encoder,
27 | dec_n_layers, dec_num_directions)
28 | else:
29 | # TODO: complete case when not using attention (add decoder class in model zoo)
30 | pass
31 |
32 |
33 | def forward(self, inputs, seq_lens, batch_size, max_q_len,
34 | embeddings_index, embeddings_size, word2index, index2word, teacher_forcing_ratio):
35 | # inputs is a collection of c, a, q. index by 0,2,1
36 | # output size: (seq_len, batch, hidden_size)
37 | # hidden size: (num_layers, batch, hidden_size)
38 | # the collection of all hidden states per batch is of size (seq_len, batch, hidden_size * num_directions)
39 | c_encoder_hiddens, c_encoder_hidden = self.c_encoder(inputs[0], seq_lens[0])
40 | a_encoder_hiddens, a_encoder_hidden = self.a_encoder(inputs[2], seq_lens[2])
41 |
42 | # TODO: the below code of how to use/combine hidden states from context/answer can be changed
43 | encoder_hiddens = torch.cat((c_encoder_hiddens, a_encoder_hiddens), 0) # concat along the first dimension (seq len)
44 |
45 | # decoder
46 | # prepare decoder inputs as word embeddings in a batch
47 | # decoder_input size: (1, batch size, embedding size); first dim is 1 because only one time step;
48 | # need a 3D tensor as input to the nn.GRU module
49 | decoder_input = Variable(embeddings_index['SOS'].repeat(batch_size, 1).unsqueeze(0))
50 | # init all decoder outputs
51 | all_decoder_outputs = Variable(torch.zeros(max_q_len, batch_size, self.decoder.output_size))
52 | if use_cuda:
53 | decoder_input = decoder_input.cuda()
54 | all_decoder_outputs = all_decoder_outputs.cuda()
55 |
56 | use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
57 |
58 | if use_teacher_forcing:
59 | # Teacher forcing: Feed the target as the next input
60 | for di in range(max_q_len):
61 | decoder_output, decoder_hidden, decoder_attention = self.decoder(
62 | decoder_input, encoder_hiddens, embeddings_index)  # attend over the concatenated context+answer states, matching the no-teacher-forcing branch
63 |
64 | all_decoder_outputs[di] = decoder_output
65 |
66 | # change next time step input to current target output, in embedding format
67 | decoder_input = Variable(torch.FloatTensor(1, batch_size, embeddings_size).cuda()) if use_cuda else \
68 | Variable(torch.FloatTensor(1, batch_size, embeddings_size))
69 | for b in range(batch_size):
70 | decoder_input[0, b] = embeddings_index[index2word[inputs[1][di, b].data[0]]].cuda() if use_cuda else \
71 | embeddings_index[index2word[inputs[1][di, b].data[0]]] # Teacher forcing
72 |
73 | else:
74 | # Without teacher forcing: use its own predictions as the next input
75 | for di in range(max_q_len):
76 | decoder_output, decoder_hidden, decoder_attention = self.decoder(
77 | decoder_input, encoder_hiddens, embeddings_index)
78 |
79 | all_decoder_outputs[di] = decoder_output
80 |
81 | # top value and index of every batch
82 | # size of both topv, topi = (batch size, 1)
83 | topv, topi = decoder_output.data.topk(1)
84 |
85 | # get the output word for every batch
86 | decoder_input = Variable(torch.FloatTensor(1, batch_size, embeddings_size).cuda()) if use_cuda else \
87 | Variable(torch.FloatTensor(1, batch_size, embeddings_size))
88 | for b in range(batch_size):
89 | decoder_input[0, b] = embeddings_index[index2word[topi[b][0]]].cuda() if use_cuda else \
90 | embeddings_index[index2word[topi[b][0]]]  # use batch element b's own top prediction
91 |
92 | return all_decoder_outputs
93 |
94 |
95 | def backward(self, out, labels, true_lens, optimizer):
96 | loss = masked_cross_entropy(
97 | out.transpose(0, 1).contiguous(), # -> batch x seq
98 | labels.transpose(0, 1).contiguous(), # -> batch x seq
99 | true_lens
100 | )
101 | loss.backward()
102 | optimizer.step()
103 | return loss
104 |
--------------------------------------------------------------------------------
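The structural difference from G_baseline/G_model.py is the pair of encoders and the concatenation of their hidden states along the time axis, which enlarges the memory the attention decoder can attend over. A shape-only illustration with made-up sizes (256 hidden units times 2 directions, as in G_main.py):

import torch

c_encoder_hiddens = torch.zeros(30, 5, 512)  # (context seq len, batch, hidden_size * num_directions)
a_encoder_hiddens = torch.zeros(4, 5, 512)   # (answer seq len, batch, hidden_size * num_directions)

# concatenate along dim 0 (time), exactly as in G.forward above
encoder_hiddens = torch.cat((c_encoder_hiddens, a_encoder_hiddens), 0)
print(encoder_hiddens.size())                # torch.Size([34, 5, 512]): attention can see both sequences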
/references/code/vanilla-gan.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # code from https://github.com/devnag/pytorch-generative-adversarial-networks/blob/master/gan_pytorch.py
4 |
5 | # Generative Adversarial Networks (GAN) example in PyTorch.
6 | # See related blog post at https://medium.com/@devnag/generative-adversarial-networks-gans-in-50-lines-of-code-pytorch-e81b79659e3f#.sch4xgsa9
7 | import numpy as np
8 | import torch
9 | import torch.nn as nn
10 | import torch.nn.functional as F
11 | import torch.optim as optim
12 | from torch.autograd import Variable
13 |
14 | # Data params
15 | data_mean = 4
16 | data_stddev = 1.25
17 |
18 | # Model params
19 | g_input_size = 1 # Random noise dimension coming into generator, per output vector
20 | g_hidden_size = 50 # Generator complexity
21 | g_output_size = 1 # size of generated output vector
22 | d_input_size = 100 # Minibatch size - cardinality of distributions
23 | d_hidden_size = 50 # Discriminator complexity
24 | d_output_size = 1 # Single dimension for 'real' vs. 'fake'
25 | minibatch_size = d_input_size
26 |
27 | d_learning_rate = 2e-4 # 2e-4
28 | g_learning_rate = 2e-4
29 | optim_betas = (0.9, 0.999)
30 | num_epochs = 30000
31 | print_interval = 200
32 | d_steps = 1 # 'k' steps in the original GAN paper. Can put the discriminator on higher training freq than generator
33 | g_steps = 1
34 |
35 | # ### Uncomment only one of these
36 | #(name, preprocess, d_input_func) = ("Raw data", lambda data: data, lambda x: x)
37 | (name, preprocess, d_input_func) = ("Data and variances", lambda data: decorate_with_diffs(data, 2.0), lambda x: x * 2)
38 |
39 | print("Using data [%s]" % (name))
40 |
41 | # ##### DATA: Target data and generator input data
42 |
43 | def get_distribution_sampler(mu, sigma):
44 | return lambda n: torch.Tensor(np.random.normal(mu, sigma, (1, n))) # Gaussian
45 |
46 | def get_generator_input_sampler():
47 | return lambda m, n: torch.rand(m, n) # Uniform-dist data into generator, _NOT_ Gaussian
48 |
49 | # ##### MODELS: Generator model and discriminator model
50 |
51 | class Generator(nn.Module):
52 | def __init__(self, input_size, hidden_size, output_size):
53 | super(Generator, self).__init__()
54 | self.map1 = nn.Linear(input_size, hidden_size)
55 | self.map2 = nn.Linear(hidden_size, hidden_size)
56 | self.map3 = nn.Linear(hidden_size, output_size)
57 |
58 | def forward(self, x):
59 | x = F.elu(self.map1(x))
60 | x = F.sigmoid(self.map2(x))
61 | return self.map3(x)
62 |
63 | class Discriminator(nn.Module):
64 | def __init__(self, input_size, hidden_size, output_size):
65 | super(Discriminator, self).__init__()
66 | self.map1 = nn.Linear(input_size, hidden_size)
67 | self.map2 = nn.Linear(hidden_size, hidden_size)
68 | self.map3 = nn.Linear(hidden_size, output_size)
69 |
70 | def forward(self, x):
71 | x = F.elu(self.map1(x))
72 | x = F.elu(self.map2(x))
73 | return F.sigmoid(self.map3(x))
74 |
75 | def extract(v):
76 | return v.data.storage().tolist()
77 |
78 | def stats(d):
79 | return [np.mean(d), np.std(d)]
80 |
81 | def decorate_with_diffs(data, exponent):
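  |     # Appends element-wise (x - mean)^exponent features to each sample, so the
  |     # discriminator sees the spread of the minibatch as well as the raw values.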
82 | mean = torch.mean(data.data, 1)
83 | mean_broadcast = torch.mul(torch.ones(data.size()), mean.tolist()[0][0])
84 | diffs = torch.pow(data - Variable(mean_broadcast), exponent)
85 | return torch.cat([data, diffs], 1)
86 |
87 | d_sampler = get_distribution_sampler(data_mean, data_stddev)
88 | gi_sampler = get_generator_input_sampler()
89 | G = Generator(input_size=g_input_size, hidden_size=g_hidden_size, output_size=g_output_size)
90 | D = Discriminator(input_size=d_input_func(d_input_size), hidden_size=d_hidden_size, output_size=d_output_size)
91 | criterion = nn.BCELoss() # Binary cross entropy: http://pytorch.org/docs/nn.html#bceloss
92 | d_optimizer = optim.Adam(D.parameters(), lr=d_learning_rate, betas=optim_betas)
93 | g_optimizer = optim.Adam(G.parameters(), lr=g_learning_rate, betas=optim_betas)
94 |
95 | for epoch in range(num_epochs):
96 | for d_index in range(d_steps):
97 | # 1. Train D on real+fake
98 | D.zero_grad()
99 |
100 | # 1A: Train D on real
101 | d_real_data = Variable(d_sampler(d_input_size))
102 | d_real_decision = D(preprocess(d_real_data))
103 | d_real_error = criterion(d_real_decision, Variable(torch.ones(1))) # ones = true
104 | d_real_error.backward() # compute/store gradients, but don't change params
105 |
106 | # 1B: Train D on fake
107 | d_gen_input = Variable(gi_sampler(minibatch_size, g_input_size))
108 | d_fake_data = G(d_gen_input).detach() # detach to avoid training G on these labels
109 | d_fake_decision = D(preprocess(d_fake_data.t()))
110 | d_fake_error = criterion(d_fake_decision, Variable(torch.zeros(1))) # zeros = fake
111 | d_fake_error.backward()
112 | d_optimizer.step() # Only optimizes D's parameters; changes based on stored gradients from backward()
113 |
114 | for g_index in range(g_steps):
115 | # 2. Train G on D's response (but DO NOT train D on these labels)
116 | G.zero_grad()
117 |
118 | gen_input = Variable(gi_sampler(minibatch_size, g_input_size))
119 | g_fake_data = G(gen_input)
120 | dg_fake_decision = D(preprocess(g_fake_data.t()))
121 | g_error = criterion(dg_fake_decision, Variable(torch.ones(1))) # we want to fool, so pretend it's all genuine
122 |
123 | g_error.backward()
124 | g_optimizer.step() # Only optimizes G's parameters
125 |
126 | if epoch % print_interval == 0:
127 | print("%s: D: %s/%s G: %s (Real: %s, Fake: %s) " % (epoch,
128 | extract(d_real_error)[0],
129 | extract(d_fake_error)[0],
130 | extract(g_error)[0],
131 | stats(extract(d_real_data)),
132 | stats(extract(d_fake_data))))
--------------------------------------------------------------------------------
/src/GAN_model/GAN_main.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | from __future__ import division
3 |
4 | import sys, os
5 | # sys.path.append(os.path.abspath(__file__ + "/../../"))
6 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util')
7 | # sys.path.append(os.path.abspath(__file__ + "/../../") + '/G_c_a_sep')
8 | # sys.path.append(os.path.abspath(__file__ + "/../../") + '/D_baseline')
9 |
10 | from data_proc import *
11 | # from G_train import *
12 | # from G_c_a_sep import *
13 | from GAN_model import *
14 | import numpy as np
15 |
16 | from torch import optim
17 |
18 | global use_cuda
19 | use_cuda = torch.cuda.is_available()
20 | teacher_forcing_ratio = 0.5 # default in original code is 0.5
21 |
22 |
23 | ######### set paths
24 | # TODO: to run properly, change the following paths and filenames
25 | # path variables
26 | path_to_dataset = '/home/jack/Documents/QA_QG/data/' # path to original dataset
27 | load_path = '/home/jack/Documents/QA_QG/data/processed/' # path to processed dataset
28 | G_path = '/home/jack/Documents/QA_QG/exp_results_temp/G_c_a_sep_pretrain_exp_0902(2)/generator_temp.pth' # path to saved generator model
29 | path_to_exp = '/home/jack/Documents/QA_QG/exp_results_temp/' # path to experiment folder
30 |
31 |
32 | # default values for the dataset and the path to the project/dataset
33 | dataset = 'squad'
34 | f_name = 'train-v1.1.json'
35 | path_to_data = path_to_dataset + dataset + '/' + f_name
36 | GLOVE_DIR = path_to_dataset + 'glove.6B/'
37 |
38 | ######### first load the pretrained word embeddings
39 | path_to_glove = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')
40 | embeddings_index, embeddings_size = readGlove(path_to_glove)
41 |
42 |
43 | import pickle
44 | # triplets = pickle.load(open(load_path+'triplets.txt', 'rb'))
45 | sent_c_triplets = pickle.load(open(load_path+'sent_c_triplets.txt', 'rb'))
46 | # windowed_c_triplets_10 = pickle.load(open(load_path+'windowed_c_triplets_10.txt', 'rb'))
47 | triplets = sent_c_triplets
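  | # train on the sentence-level triplets, i.e. the context reduced to the
  | # sentence containing the answer (built by get_ans_sentence in data_proc.py)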
48 | # ######### read corpus
49 | # raw_triplets = read_raw_squad(path_to_data)
50 | # triplets = tokenize_squad(raw_triplets, embeddings_index)
51 |
52 | # # find max length of context, question, answer, respectively
53 | # max_len_c, max_len_q, max_len_a = max_length(triplets)
54 |
55 | ######### corpus preprocessing
56 | # words that do not appear in embeddings, etc
57 |
58 | ## find all unique tokens in the data (should be a subset of the number of embeddings)
59 | effective_tokens, effective_num_tokens = count_effective_num_tokens(triplets, embeddings_index)
60 | print('effective number of tokens: ' + str(effective_num_tokens))
61 | print('expected initial loss: ' + str(-np.log(1/float(effective_num_tokens))) + '\n')
62 | # build word2index dictionary and index2word dictionary
63 | word2index, index2word = generate_look_up_table(effective_tokens, effective_num_tokens)
64 |
65 |
66 | print('reading and preprocessing data complete.')
67 | print('found %s unique tokens in the intersection of corpus and word embeddings.' % effective_num_tokens)
68 | if use_cuda:
69 | print('GPU ready.')
70 | print('')
71 | print('start training...')
72 | print('')
73 |
74 |
75 | ######### set up model
76 | G_enc_input_size = embeddings_size
77 | G_enc_hidden_size = 256
78 | G_enc_n_layers = 1
79 | G_enc_num_directions = 1
80 | G_dec_input_size = embeddings_size
81 | G_dec_hidden_size = 256
82 | G_output_size = effective_num_tokens
83 | G_dec_n_layers = 1
84 | G_dec_num_directions = 1
85 | D_enc_input_size = embeddings_size
86 | D_enc_hidden_size = 256
87 | D_enc_n_layers = 1
88 | D_num_directions = 1
89 | D_mlp_hidden_size = 64
90 | D_num_attn_weights = 1
91 | D_mlp_output_size = 1
92 | use_attn = True
93 | batch_size = 5
94 |
95 |
96 | vanilla_gan = GAN_model(G_enc_input_size, G_enc_hidden_size, G_enc_n_layers, G_enc_num_directions,
97 | G_dec_input_size, G_dec_hidden_size, G_output_size, G_dec_n_layers, G_dec_num_directions,
98 | D_enc_input_size, D_enc_hidden_size, D_enc_n_layers, D_num_directions,
99 | D_mlp_hidden_size, D_num_attn_weights, D_mlp_output_size,
100 | use_attn, batch_size, G_path=G_path, pretrain=True)
101 | if use_cuda:
102 | vanilla_gan = vanilla_gan.cuda()
103 |
104 | learning_rate = 1e-3
105 | d_optimizer = optim.Adam(vanilla_gan.D.parameters(), lr=learning_rate)
106 | g_optimizer = optim.Adam(vanilla_gan.G.parameters(), lr=learning_rate)
107 | criterion = nn.BCELoss()
108 |
109 | # max_length of generated question
110 | max_len = 100
111 | to_file = True
112 | print_every = 500
113 | plot_every = 50
114 | checkpoint_every = 2000
115 | n_iters = 10000
116 | d_steps = 1
117 | g_steps = 5
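  | # per GAN iteration: 1 discriminator update followed by 5 generator updates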
118 |
119 | # open the files
120 | exp_name = 'GAN_0911'
121 | path_to_exp_out = path_to_exp + exp_name
122 | if to_file:
123 | if not os.path.exists(path_to_exp_out):
124 | os.mkdir(path_to_exp_out)
125 | loss_f = 'loss_temp.txt'
126 | sample_out_f = 'sample_outputs_temp.txt'
127 | path_to_loss_f = path_to_exp_out + '/' + loss_f
128 | path_to_sample_out_f = path_to_exp_out + '/' + sample_out_f
129 | loss_f = open(path_to_loss_f,'w+')
130 | sample_out_f = open(path_to_sample_out_f, 'w+')
131 | # else:
132 | # loss_f = None
133 | # sample_out_f = None
134 | # path_to_exp_out = None
135 |
136 | # # load a pre-trained model
137 | # model_fname = 'checkpoint_iter_1.pth.tar'
138 | # path_to_model = path_to_exp_out + '/' + model_fname
139 | # checkpoint = torch.load(path_to_model)
140 | # vanilla_gan.D.load_state_dict(checkpoint['d_state_dict'])
141 | # vanilla_gan.G.load_state_dict(checkpoint['g_state_dict'])
142 | # d_optimizer.load_state_dict(checkpoint['d_optimizer'])
143 | # g_optimizer.load_state_dict(checkpoint['g_optimizer'])
144 |
145 | # train
146 | vanilla_gan.train(triplets, n_iters, d_steps, d_optimizer, g_steps, g_optimizer, batch_size, max_len,
147 | criterion, word2index, index2word, embeddings_index, embeddings_size, print_every, plot_every, checkpoint_every,
148 | to_file=to_file, loss_f=loss_f, sample_out_f=sample_out_f, path_to_exp_out=path_to_exp_out)
149 |
150 | if to_file:
151 | loss_f.close()
152 | sample_out_f.close()
153 | torch.save(vanilla_gan, path_to_exp_out + '/GAN_model.pth.tar')
--------------------------------------------------------------------------------
/src/G_c_a_sep/G_train.py:
--------------------------------------------------------------------------------
1 |
2 | import sys
3 | import os
4 | sys.path.append(os.path.abspath(__file__ + "/../../"))
5 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util')
6 | # from util import timeSince, asMinutes, plotLoss
7 | from data_proc import *
8 | # FIXME: had some problem importing util.py; importing successful but
9 | # functions cannot be called (NameError: global name XXX is not defined)
10 | # fast solution: copied the asMinutes and timeSince functions here instead of using `from util import *`
11 | from G_eval import *
12 |
13 | import torch
14 | from torch.autograd import Variable
15 |
16 | use_cuda = torch.cuda.is_available()
17 |
18 |
19 | ########################################################################################################################
20 | import matplotlib
21 | matplotlib.use('Agg')
22 | import matplotlib.pyplot as plt
23 | import time
24 | import math
25 |
26 | def asMinutes(s):
27 | m = math.floor(s / 60)
28 | s -= m * 60
29 | return '%dm %ds' % (m, s)
30 |
31 | def timeSince(since, percent):
32 | now = time.time()
33 | s = now - since
34 | es = s / (percent)
35 | rs = es - s
36 | return '%s (- %s)' % (asMinutes(s), asMinutes(rs))
37 |
38 |
39 |
40 | ######################################################################
41 | # show loss function
42 | def plotLoss(loss_f, plot_every, save_path=None, from_file=True, f_name='loss.png', title='training loss'):
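  |     # Plot the training-loss curve. If from_file is True, loss_f is a path to a
  |     # loss log file and the numeric lines are parsed out; otherwise loss_f is
  |     # already a list of (averaged) loss values. The x-axis is scaled by plot_every.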
43 | if from_file:
44 | loss_vec = []
45 | with open(loss_f) as f:
46 | content = f.readlines()
47 | content = [x.strip() for x in content] # list of every line, each a string
48 | for line in content:
49 | try:
50 | loss_vec.append(float(line))
51 | except ValueError:
52 | pass
53 | else:
54 | loss_vec = loss_f
55 | # plot
56 | plt.figure()
57 | plt.title(title)
58 | plt.xlabel('training iterations')
59 | plt.ylabel('loss')
60 | plt.grid()
61 | plt.plot([x*plot_every for x in range(1, len(loss_vec)+1)], loss_vec)
62 | if save_path == None:
63 | plt.savefig(f_name)
64 | else:
65 | plt.savefig(save_path + '/' + f_name)
66 | ########################################################################################################################
67 |
68 |
69 | def trainIters(generator, optimizer, batch_size, embeddings_size,
70 | embeddings_index, word2index, index2word, max_length, triplets, teacher_forcing_ratio,
71 | to_file, loss_f, sample_out_f, path_to_exp_out,
72 | n_iters=1, print_every=1, plot_every=1, checkpoint_every=1):
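  |     # Generator pre-training loop: each iteration samples a random mini-batch,
  |     # runs the generator with the given teacher-forcing ratio, backpropagates
  |     # the masked cross-entropy loss, and periodically prints sample questions,
  |     # logs the averaged loss, and saves a checkpoint.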
73 |
74 | begin_time = time.time()
75 |
76 | # plot_losses = []
77 | print_loss_total = 0 # Reset every print_every
78 | plot_loss_total = 0 # Reset every plot_every
79 | plot_loss_avgs = []
80 |
81 | print()
82 |
83 | for iter in range(1, n_iters + 1):
84 |
85 | # prepare batch
86 | training_batch, seq_lens = get_random_batch(triplets, batch_size)
87 | training_batch, _, seq_lens = prepare_batch_var(
88 | training_batch, seq_lens, batch_size, word2index, embeddings_index, embeddings_size)
89 |
90 | # print(type(training_batch))
91 | # print(type(training_batch[0]))
92 |
93 | # prepare inputs (load to cuda)
94 | inputs = []
95 | for var in training_batch:
96 | if not isinstance(var, list):
97 | inputs.append(Variable(var.cuda())) if use_cuda else inputs.append(Variable(var))
98 | # NOTE not currently appending start and end index to inputs because model does not use them.
99 |             # NOTE if you want to append them, make sure they are changed from a list to a LongTensor
100 | # else:
101 | # inputs.append(Variable(var))
102 |
103 | max_c_a_len = max(seq_lens[0]) # max seq length of context + ans combined
104 | max_q_len = max(seq_lens[1]) # max seq length of question
105 |
106 | optimizer.zero_grad()
107 | loss = 0
108 | all_decoder_outputs = generator.forward(inputs, seq_lens, batch_size, max_q_len,
109 | embeddings_index, embeddings_size, word2index, index2word,
110 | teacher_forcing_ratio)
111 | loss += generator.backward(all_decoder_outputs, inputs[1], seq_lens[1], optimizer)
112 |
113 | print_loss_total += loss.data[0]
114 | plot_loss_total += loss.data[0]
115 |
116 | if iter % print_every == 0:
117 | print_loss_avg = print_loss_total / print_every
118 | print_loss_total = 0
119 | print('%s (%d %d%%) %.4f' % (timeSince(begin_time, iter / float(n_iters)),
120 |                                          iter, float(iter) / float(n_iters) * 100, print_loss_avg))
121 | print('---sample generated question---')
122 | # sample a triple and print the generated question
123 | evaluate(generator, triplets, embeddings_index, embeddings_size, word2index, index2word, max_length)
124 | print('-------------------------------')
125 | print('-------------------------------')
126 | print()
127 | if to_file:
128 | sample_out_f.write(unicode('%s (%d %d%%)\n' % (timeSince(begin_time, iter / float(n_iters)), iter, float(iter) / float(n_iters) * 100)))
129 | evaluate(generator, triplets, embeddings_index, embeddings_size, word2index, index2word, max_length, to_file, sample_out_f)
130 | sample_out_f.write(unicode('\n'))
131 | if iter % plot_every == 0:
132 | plot_loss_avg = plot_loss_total / plot_every
133 | plot_loss_avgs.append(plot_loss_avg)
134 | # plot_losses.append(plot_loss_avg)
135 | plot_loss_total = 0
136 | if to_file:
137 | loss_f.write(unicode('%s (%d %d%%)\n' % (timeSince(begin_time, iter / float(n_iters)), iter, float(iter) / float(n_iters) * 100)))
138 | loss_f.write(unicode(plot_loss_avg))
139 | loss_f.write(unicode('\n'))
140 | if to_file and ((iter % checkpoint_every == 0) or (iter == n_iters)):
141 | checkpoint_fname = 'checkpoint_iter_' + str(iter) + '.pth.tar'
142 | state = {
143 | 'iteration': iter + 1,
144 | 'g_state_dict': generator.state_dict(),
145 | 'g_optimizer' : optimizer.state_dict(),
146 | }
147 | torch.save(state, path_to_exp_out+'/'+checkpoint_fname)
148 |             plotLoss(plot_loss_avgs, plot_every, save_path=path_to_exp_out, f_name='g_loss_itr_'+str(iter)+'.png',
149 | title='training loss', from_file=False)
150 |
151 | # showPlot(plot_losses)
152 | if to_file:
153 | loss_f.close()
154 |
155 |
156 |
157 |
--------------------------------------------------------------------------------
/src/util/test.py:
--------------------------------------------------------------------------------
1 | # various test cases
2 |
3 | # load model
4 | import sys, os
5 | __file__ = '/home/jack/Documents/QA_QG/GAN-QA/src/util/'
6 | sys.path.append(os.path.abspath(__file__))
7 | import data_proc
8 | reload(data_proc)
9 | from data_proc import *
10 | import util
11 | reload(util)
12 | from util import *
13 | sys.path.append(os.path.abspath(__file__ + "/../../"))
14 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/D_baseline')
15 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/G_baseline')
16 | from G_model import *
17 | from model_zoo import *
18 | from G_eval import *
19 | import torch
20 | import numpy as np
21 |
22 | global use_cuda
23 | use_cuda = torch.cuda.is_available()
24 |
25 | ######################################################################
26 | ######################################################################
27 | # test for various util functions
28 | # uncomment this for much of the later unit tests in this file
29 | ######### set paths
30 | # TODO: to run properly, change the following paths and filenames
31 | # default values for the dataset and the path to the project/dataset
32 | dataset = 'squad'
33 | f_name = 'train-v1.1.json'
34 | path_to_dataset = '/home/jack/Documents/QA_QG/data/'
35 | path_to_data = path_to_dataset + dataset + '/' + f_name
36 | GLOVE_DIR = path_to_dataset + 'glove.6B/'
37 | # path for experiment outputs
38 | # exp_name = 'QG_seq2seq_baseline'
39 | path_to_exp_out = '/home/jack/Documents/QA_QG/exp_results_temp/'
40 | loss_f = 'loss_temp.txt'
41 | sample_out_f = 'sample_outputs_temp.txt'
42 | path_to_loss_f = path_to_exp_out + '/' + loss_f
43 | path_to_sample_out_f = path_to_exp_out + '/' + sample_out_f
44 |
45 | ######### first load the pretrained word embeddings
46 | path_to_glove = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')
47 | embeddings_index, embeddings_size = readGlove(path_to_glove)
48 |
49 | ######### read corpus
50 | raw_triplets = read_raw_squad(path_to_data)
51 |
52 | # # test of windowed triplets
53 | # window_size = 10
54 | # test_idx = 250
55 | # windowed_c_triplets_10 = get_windowed_ans(raw_triplets, window_size)
56 | # print(raw_triplets[test_idx][0])
57 | # print(raw_triplets[test_idx][2])
58 | # print(windowed_c_triplets[0][0])
59 |
60 | # test of selecting the sentence containing answer from context
61 | # test_idx = 0
62 | sent_window = 1
63 | sent_c_triplets, unmatch = get_ans_sentence(raw_triplets)
64 | # print(raw_triplets[test_idx][0])
65 | # print(raw_triplets[test_idx][2])
66 | # print('ans start idx: %d' % raw_triplets[test_idx][3])
67 | # print('ans end idx: %d' % raw_triplets[test_idx][4])
68 | # print(sent_c_triplets[0][0])
69 | # windowed_c_triplets_10_noEOS = tokenize_squad(windowed_c_triplets_10, embeddings_index, opt='window', a_EOS=False, c_EOS=False)
70 | # triplets = windowed_c_triplets_30_noEOS
71 | # windowed_c_triplets_10_noEOS = tokenize_squad(windowed_c_triplets_10_noEOS, embeddings_index, opt='window')
72 | # sent_c_triplets = tokenize_squad(sent_c_triplets, embeddings_index, opt='sent')
73 | # triplets = tokenize_squad(raw_triplets, embeddings_index)
74 |
75 | # print(raw_triplets[test_idx][0])
76 | # print(' '.join(triplets[test_idx][0]))
77 | # print(raw_triplets[test_idx][1])
78 | # print(' '.join(triplets[test_idx][1]))
79 | # print(raw_triplets[test_idx][2])
80 | # print(' '.join(triplets[test_idx][2]))
81 |
82 | # # save to files
83 | # import pickle
84 | # save_path = '/home/jack/Documents/QA_QG/data/processed/'
85 | # if not os.path.exists(save_path):
86 | # os.mkdir(save_path)
87 | # with open(save_path+'windowed_c_triplets_10_noEOS.txt', 'wb') as fp:
88 | # pickle.dump(windowed_c_triplets_10_noEOS, fp)
89 | # with open(save_path+'sent_c_triplets.txt', 'wb') as fp:
90 | # pickle.dump(sent_c_triplets, fp)
91 | # with open(save_path+'triplets.txt', 'wb') as fp:
92 | # pickle.dump(triplets, fp)
93 |
94 | # # test pickle load
95 | import pickle
96 | load_path = '/home/jack/Documents/QA_QG/data/processed/'
97 | # triplets = pickle.load(open(load_path+'triplets.txt', 'rb'))
98 | sent_c_triplets = pickle.load(open(load_path+'sent_c_triplets.txt', 'rb'))
99 | # windowed_c_triplets_10 = pickle.load(open(load_path+'windowed_c_triplets_10.txt', 'rb'))
100 |
101 | # # find max length of context, question, answer, respectively
102 | # # max_len_c, max_len_q, max_len_a = max_length(triplets)
103 | #
104 | # effective_tokens, effective_num_tokens = count_effective_num_tokens(triplets, embeddings_index)
105 | # # print('effective number of tokens: ' + str(effective_num_tokens))
106 | # # print('expected initial loss: ' + str(-np.log(1/float(effective_num_tokens))) + '\n')
107 | # # # build word2index dictionary and index2word dictionary
108 | # word2index, index2word = generate_look_up_table(effective_tokens, effective_num_tokens)
109 |
110 | # test similarity test
111 | q = 'what is the language spoken in germany ? EOS'
112 | scores = generated_q_novelty(sent_c_triplets, q)
113 | idx = np.argpartition(scores, -10)[-10:]
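  | # indices of the 10 highest-scoring corpus questions (assuming a higher
  | # generated_q_novelty score means greater similarity to the query q)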
114 | scores[idx]
115 | for i in idx:
116 | print(sent_c_triplets[i][1])
117 |
118 | ######################################################################
119 | ######################################################################
120 | # test case of get_random_batch and prepare_batch_var functions in data_proc.py
121 | # (uncomment code below to test)
122 | # test and time
123 | # to run this test, you need to have these things ready:
124 | # 1) triplet processed by tokenize_squad,
125 | # 2) embeddings_index
126 | # 3) a mini batch processed by get_random_batch
127 | # batch_size = 500
128 | # start = time.time()
129 | # batch, seq_lens, fake_batch, fake_seq_lens = get_random_batch(triplets, batch_size, with_fake=True)
130 | # batch, seq_lens = get_random_batch(triplets, batch_size)
131 | #
132 | # # temp, temp_orig, seq_lens_cqa = prepare_batch_var(batch, seq_lens, fake_batch, fake_seq_lens, batch_size, word2index, embeddings_index, embeddings_size,
133 | # # mode = ['word', 'index'], concat_opt='cqa', with_fake=True)
134 | # batch_vars, batch_paddings, seq_lens = prepare_batch_var(batch, seq_lens, batch_size, word2index, embeddings_index, embeddings_size)
135 |
136 | # end = time.time()
137 | # print('time elapsed: ' + str(end-start))
138 | # # the following check if the batched data matches with the original data
139 | # batch_idx = random.choice(range(batch_size))
140 | # print(batch_idx)
141 | #
142 | # print('context > ', ' '.join(temp_orig[0][batch_idx]))
143 | # print('question > ', ' '.join(temp_orig[1][batch_idx]))
144 | # print('answer > ', ' '.join(temp_orig[2][batch_idx]))
145 | #
146 | # idx = batch[0].index(temp_orig[0][batch_idx])
147 | # print('context > ', ' '.join(batch[0][idx]))
148 | # print('question > ', ' '.join(batch[1][idx]))
149 | # print('answer > ', ' '.join(batch[2][idx]))
150 |
151 | # seq_idx = random.choice(range(min(seq_lens[0])))
152 | # print(seq_idx)
153 | # word1 = embeddings_index[batch[0][seq_lens[0].index(heapq.nlargest(batch_idx, seq_lens[0])[-1])][seq_idx]]
154 | # word2 = temp[0][seq_idx, batch_idx,]
155 | # set(word1) == set(word2.data.cpu())
156 |
157 |
158 | ######################################################################
159 | ######################################################################
160 | # # test case to load the G model and sample from G
161 | # teacher_forcing_ratio = 0.5 # default in original code is 0.5
162 |
163 | # # param for G
164 | # enc_hidden_size = 256
165 | # enc_n_layers = 1
166 | # enc_num_directions = 1
167 | # dec_hidden_size = 256
168 | # dec_n_layers = 1
169 | # dec_num_directions = 1
170 | # batch_size = 5
171 | # learning_rate = 0.0005
172 |
173 | # generator = G(embeddings_size, enc_hidden_size, enc_n_layers, enc_num_directions,
174 | # embeddings_size, dec_hidden_size, effective_num_tokens, dec_n_layers, dec_num_directions,
175 | # batch_size)
176 | # if use_cuda:
177 | # generator = generator.cuda()
178 |
179 | # # prepare G input
180 | # training_batch, seq_lens = get_random_batch(triplets, batch_size)
181 | # ca = training_batch[0][0] + training_batch[2][0]
182 | # # sample from G
183 | # max_len = 100
184 | # sample_q = G_sampler(generator, ca, embeddings_index, embeddings_size, word2index, index2word, max_len)
185 | # print(' '.join(sample_q))
186 |
--------------------------------------------------------------------------------
/src/model_zoo.py:
--------------------------------------------------------------------------------
1 |
2 | import sys
3 | import os
4 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util')
5 | from data_proc import *
6 |
7 | import torch
8 | import torch.nn as nn
9 | from torch.autograd import Variable
10 | import torch.nn.functional as F
11 |
12 | use_cuda = torch.cuda.is_available()
13 |
14 | ######################################################################
15 | # The Encoder
16 | # -----------
17 | class EncoderRNN(nn.Module):
18 |     # input dimension equals the word embedding size (defined by the external word embedding model)
19 | def __init__(self, input_size, hidden_size, batch_size, n_layers=1, num_directions=1):
20 | super(EncoderRNN, self).__init__()
21 | self.n_layers = n_layers
22 | self.hidden_size = hidden_size
23 | self.input_size = input_size
24 | self.num_directions = num_directions
25 | self.batch_size = batch_size
26 | # print('batch size is: %d' % batch_size)
27 |
28 | if self.num_directions == 1:
29 | self.gru = nn.GRU(input_size, hidden_size, n_layers, bidirectional=False)
30 | elif self.num_directions == 2:
31 | self.gru = nn.GRU(input_size, hidden_size, n_layers, bidirectional=True)
32 | else:
33 | raise Exception('input num_directions is wrong - need to be either 1 or 2')
34 |
35 | def forward(self, input, seq_lens, hidden=None):
36 |
37 | # # prepare encoder input
38 | # if self.batch_size > 1:
39 | # # see how pack_padded_sequence works, take a look here (this is a wrong example): https://goo.gl/oN9uc9
40 | # input = nn.utils.rnn.pack_padded_sequence(input, seq_lens)
41 | # # input = pack_sequence(input, seq_lens)
42 |
43 | # input is matrix of size [max seq len x batch size x embedding dimension]
44 | encoder_outputs, hidden = self.gru(input, hidden)
45 |
46 | # # unpack the sequence
47 | # # size of unpacked sequence: (seq_len, batch size, hidden_size*num_directions)
48 | # if self.batch_size > 1:
49 | # encoder_outputs, output_lens = torch.nn.utils.rnn.pad_packed_sequence(encoder_outputs)
50 |
51 |         # FIXME: do I need to sum the encoder_outputs when the network is bidirectional?
52 | # e.g. outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]
53 |
54 | return encoder_outputs, hidden
55 |
56 |
57 | ######################################################################
58 | # Vanilla Decoder
59 | # ^^^^^^^^^^^^^^^^^
60 | # TODO: take another look at the attn implementation; there might be some errors
61 | class DecoderRNN(nn.Module):
62 | def __init__(self, input_size, hidden_size, output_size, n_layers=1, num_directions=1, dropout_p=0.1):
63 |         super(DecoderRNN, self).__init__()
64 | self.input_size = input_size
65 | self.hidden_size = hidden_size
66 | self.output_size = output_size
67 | self.n_layers = n_layers
68 | self.dropout_p = dropout_p
69 | self.bidi = True if num_directions==2 else False
70 |
71 | # recurrent model
72 | self.dropout = nn.Dropout(self.dropout_p)
73 | self.gru = nn.GRU(self.input_size, self.hidden_size, num_layers=self.n_layers, bidirectional=self.bidi)
74 | self.out = nn.Linear(self.hidden_size, self.output_size)
75 |
76 | # forward for each time step.
77 | # need to do this because of teacher forcing at each time step
78 | def forward(self, input, encoder_hidden, embeddings_index, hidden=None):
79 |
80 | # get the output
81 | # hidden: (num_layers * num_directions, batch, hidden_size)
82 | # note: for each time step, output and hidden are the same
83 | output, hidden = self.gru(input, hidden)
84 |
85 | # if bidirectional, sum decoder hidden states of both directions
86 | if self.bidi:
87 |             hidden = hidden[2*self.n_layers - 2] + hidden[2*self.n_layers - 1]
88 | hidden = hidden.unsqueeze(0)
89 |
90 | # output size: (batch size, vocab size)
91 | output = F.log_softmax(self.out(output))
92 |
93 | return output, hidden
94 |
95 |
96 | ######################################################################
97 | # Attention Decoder
98 | # ^^^^^^^^^^^^^^^^^
99 | # TODO: take another look at the attn implementation; there might be some errors
100 | class AttnDecoderRNN(nn.Module):
101 | def __init__(self, input_size, hidden_size, output_size, encoder, n_layers=1, num_directions=1, dropout_p=0.1):
102 | super(AttnDecoderRNN, self).__init__()
103 | self.input_size = input_size
104 | self.hidden_size = hidden_size
105 | self.output_size = output_size
106 | self.n_layers = n_layers
107 | self.dropout_p = dropout_p
108 | self.num_directions = num_directions
109 |
110 | # recurrent model
111 | self.dropout = nn.Dropout(self.dropout_p)
112 | self.gru = nn.GRU(self.input_size, self.hidden_size)
113 | self.out = nn.Linear(self.hidden_size + encoder.num_directions * encoder.hidden_size, self.output_size)
114 |
115 | # attention mechanism
116 | self.attn = nn.Linear(self.hidden_size + encoder.num_directions * encoder.hidden_size, self.hidden_size)
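  |         # attention score for encoder position i (computed in forward):
  |         #   score_i = h_dec . attn([h_dec ; h_enc_i])
  |         # a softmax over positions gives the weights used to form the context vector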
117 |
118 | # forward for each time step.
119 | # need to do this because of teacher forcing at each time step
120 | def forward(self, input, encoder_outputs, embeddings_index, hidden=None):
121 |
122 | # get the output
123 | # hidden: (num_layers * num_directions, batch, hidden_size)
124 | # note: for each time step, output and hidden are the same
125 | output, hidden = self.gru(input, hidden)
126 |
127 | # # unpack the sequence
128 | # # decoder_outputs size (seq len, batch, hidden_size * num_directions)
129 | # # --> collection of hidden states at every time step
130 | # TODO: should figure out how to do this in a batch
131 | # current implementation is one token at a time using a forloop
132 | # decoder_outputs, output_lens = torch.nn.utils.rnn.pad_packed_sequence(decoder_outputs)
133 |
134 | # init attention weights
135 | # length = batch_size x encoder output lens
136 | attn_weights = Variable(torch.zeros(encoder_outputs.size(1), encoder_outputs.size(0)))
137 | if use_cuda:
138 | attn_weights = attn_weights.cuda()
139 |
140 | for b in range(encoder_outputs.size(1)):
141 | # copy the decoder output at the present time step to N rows, where N = num encoder outputs
142 | # first dimension of append = first dimension of encoder_outputs[:,b] = seq_len of encoder
143 | # the scores for calculating attention weights of all encoder outputs for one time step of decoder output
144 | for i in range(encoder_outputs.size(0)):
145 | attn_weights[b,i] = hidden[:,b].squeeze(0).dot(self.attn(torch.cat((hidden[:,b], encoder_outputs[i,b].unsqueeze(0)), 1)).t())
146 | # Below is an alternative implementation using matrices instead of for loop
147 | # not sure which one is more space efficient
148 | # (the out of memory error most likely comes from here)
149 | # attn_weights[i,b] = torch.mm(hidden[:, b],
150 | # self.attn(torch.cat((append, encoder_outputs[:, b]), 1)).t())
151 |
152 | attn_weights = F.softmax(attn_weights)
153 |
154 | # input to bmm:
155 | # weights size: (batch size, 1, seq_len)
156 | # hidden states size: (seq_len, batch, hidden_size * num_directions)
157 | # transpose hidden state size: (batch, seq len, hidden_size * num_directions)
158 | # output size: (batch size, 1, hidden_size * num_directions)
159 | context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs.transpose(0, 1))
160 |
161 | # calculate
162 | decoder_output = torch.cat((hidden.squeeze(0), context.squeeze(1)), 1)
163 |
164 | # output size: (batch size, vocab size)
165 | decoder_output = F.log_softmax(self.out(decoder_output))
166 |
167 | return decoder_output, hidden, attn_weights
168 |
169 |
170 | ######################################################################
171 | # multi-layer perceptron
172 | # ^^^^^^^^^^^^^^^^^^^^^^
173 | # code adapted from pytorch tutorial
174 | class MLP(nn.Module):
175 | # FIXME: the number of attention weights here is hard coded for tensor multiplication instead of using for loops
176 | def __init__(self, hidden_size, output_size, encoder, num_attn_weights, use_attn = True):
177 | # maximum input length it can take (for attention mechanism)
178 | super(MLP, self).__init__()
179 | self.hidden_size = hidden_size
180 | self.use_attn = use_attn
181 | self.num_attn_weights = num_attn_weights
182 | self.output_size = output_size
183 |
184 | # fully connected layers (2) and non-linearity
185 | self.layer1 = nn.Linear(encoder.num_directions * encoder.hidden_size, self.hidden_size)
186 | self.relu = nn.ReLU()
187 | self.layer2 = nn.Linear(self.hidden_size, self.output_size)
188 | self.sigmoid = nn.Sigmoid()
189 |
190 | # attention
191 | if self.use_attn:
192 | self.tanh = nn.Tanh()
193 | self.attn = nn.Linear(encoder.hidden_size*encoder.num_directions, self.num_attn_weights)
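  |             # attention pooling: scores each encoder time step, softmaxes over the
  |             # sequence, and takes the weighted sum as a fixed-size summary vector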
194 |
195 | def forward(self, inputs):
196 | # inputs size (seq len, batch size, hidden size * num directions)
197 | # if use attention, the output vector is a weighted combination of input hidden states
198 | # if not use attention, the output vector is simply a feedforward network operated on input's last hidden state
199 | # TODO: write the attn function into another module???
200 | if self.use_attn:
201 |
202 | # reshape input to be 2D tensor instead of 3D
203 | seq_len = inputs.size(0)
204 | batch_size = inputs.size(1)
205 | inputs_for_attn_calc = inputs.view(-1, inputs.size(-1))
206 |
207 | attn_weights = Variable(torch.zeros(inputs.size(1), inputs.size(0)))
208 | if use_cuda:
209 | attn_weights = attn_weights.cuda()
210 |
211 | # calculate attention weight for each output time step
212 | # remember encoder_outputs size: (seq_len, batch, hidden_size * num_directions)
213 | # for each token in the decoder output sequences:
214 | for b in range(inputs.size(1)):
215 | # the scores for calculating attention weights of all encoder outputs for one time step of decoder output
216 | attn_weights[b] = self.attn(inputs[:, b]).t()
217 |
218 | attn_weights = F.softmax(attn_weights)
219 |
220 | # input to bmm:
221 | # weights size: (batch size, 1, seq_len)
222 | # hidden states size: (seq_len, batch, hidden_size * num_directions)
223 | # transpose hidden state size: (batch, seq len, hidden_size * num_directions)
224 | # output size: (batch size, 1, hidden_size * num_directions)
225 | context = torch.bmm(attn_weights.unsqueeze(1), inputs.transpose(0, 1)).squeeze(1)
226 | else:
227 | context = torch.sum( inputs.transpose(0,1), 1 ).squeeze(1)
228 |
229 | # feedforward
230 | out = self.layer1(context)
231 | out = self.relu(out)
232 | out = self.layer2(out)
233 | out = self.sigmoid(out)
234 |
235 | return out
236 |
--------------------------------------------------------------------------------
/src/GAN_model/GAN_model.py:
--------------------------------------------------------------------------------
1 | import sys, os, time
2 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util')
3 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/G_c_a_sep')
4 | # sys.path.append(os.path.abspath(__file__ + "/../../") + '/G_baseline/')
5 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/D_baseline')
6 | from util import asMinutes, timeSince, plotLoss
7 | from data_proc import *
8 | from G_c_a_sep import G
9 | from G_eval import *
10 | # from G_model import G
11 | from D_model import *
12 |
13 | import torch
14 | import torch.nn as nn
15 | import numpy as np
16 | import torch.autograd as autograd
17 | from torch.autograd import Variable
18 |
19 | ##################################################################
20 |
21 | use_cuda = torch.cuda.is_available()
22 | if use_cuda:
23 | gpu = 0
24 |
25 | def to_var(x):
26 | if use_cuda:
27 | x = x.cuda()
28 | return Variable(x)
29 |
30 |
31 | class GAN_model(nn.Module):
32 | def __init__(self, G_enc_input_size, G_enc_hidden_size, G_enc_n_layers, G_enc_num_directions,
33 | G_dec_input_size, G_dec_hidden_size, G_output_size, G_dec_n_layers, G_dec_num_directions,
34 | D_enc_input_size, D_enc_hidden_size, D_enc_n_layers, D_num_directions,
35 | D_mlp_hidden_size, D_num_attn_weights, D_mlp_output_size,
36 | use_attn, batch_size, pretrain=False, G_path=None
37 | ):
38 |
39 | super(GAN_model, self).__init__()
40 |
41 | self.G = G(G_enc_input_size, G_enc_hidden_size, G_enc_n_layers, G_enc_num_directions, G_dec_input_size,
42 | G_dec_hidden_size, G_output_size, G_dec_n_layers, G_dec_num_directions, batch_size)
43 | if pretrain:
44 | # load the G model from G_path
45 | self.G = torch.load(G_path)
46 |
47 | self.D = D(D_enc_input_size, D_enc_hidden_size, D_enc_n_layers, D_num_directions, D_mlp_hidden_size,
48 | D_num_attn_weights, D_mlp_output_size, use_attn, batch_size)
49 |
50 | def train(self, triplets, n_iters, d_steps, d_optimizer, g_steps, g_optimizer, batch_size, max_len,
51 | criterion, word2index, index2word, embeddings_index, embeddings_size, print_every, plot_every, checkpoint_every,
52 | to_file=False, loss_f=None, sample_out_f=None, path_to_exp_out=None):
53 | # criterion is for both G and D
54 |
55 | # record start time for logging
56 | begin_time = time.time()
57 | print_d_loss_total = 0 # Reset every print_every
58 | plot_d_loss_total = 0 # Reset every plot_every
59 | print_g_loss_total = 0 # Reset every print_every
60 | plot_g_loss_total = 0 # Reset every plot_every
61 | plot_d_loss_avgs = []
62 | plot_g_loss_avgs = []
63 |
64 | for iter in range(1, n_iters + 1):
65 |
66 | # train D
67 | for d_train_idx in range(d_steps):
68 | # 1. Train D on real+fake
69 | self.D.zero_grad()
70 |
71 | # 1A: Train D on real
72 | # get data
73 | # prepare batch
74 | training_batch, seq_lens = get_random_batch(triplets, batch_size)
75 | # concat the context_ans batch with the question batch
76 | # each element in the training batch is context + question + answer
77 | cqa_batch, _, cqa_lens = prepare_batch_var(training_batch, seq_lens,
78 | batch_size, word2index, embeddings_index,
79 | embeddings_size, mode=['word'], concat_opt='cqa')
80 |
81 | train_input = Variable(cqa_batch[0].cuda()) if use_cuda else Variable(
82 | cqa_batch[0]) # embeddings vectors, size = [seq len x batch size x embedding dim]
83 |
84 | d_real_decision = self.D.forward(train_input, cqa_lens[0])
85 | real_target = Variable(torch.FloatTensor([1]*batch_size)).cuda() if use_cuda else \
86 | Variable(torch.FloatTensor([1]*batch_size))
87 | d_real_error = criterion(d_real_decision, real_target) # ones = true
88 | d_real_error.backward() # compute/store gradients, but don't change params
89 |
90 | # 1B: Train D on fake
91 | fake_cqa_batch, fake_cqa_lens = prepare_fake_batch_var(self.G, training_batch, max_len, batch_size,
92 | word2index, index2word, embeddings_index,
93 | embeddings_size, mode = ('word'))
94 |
95 |                 # # sanity check: the fake batch and the real batch are prepared in the same order
96 | # print(fake_cqa_batch[0][12] == cqa_batch[0][12])
97 |
98 | d_fake_data = Variable(fake_cqa_batch[0].cuda()) if use_cuda else Variable(fake_cqa_batch[0])
99 | d_fake_decision = self.D.forward(d_fake_data, fake_cqa_lens[0])
100 | fake_target = Variable(torch.FloatTensor([0]*batch_size)).cuda() if use_cuda else \
101 | Variable(torch.FloatTensor([0]*batch_size))
102 | # d_fake_error = criterion(d_fake_decision, fake_target) # zeros = fake
103 | # d_fake_error.backward()
104 | # d_optimizer.step()
105 |
106 | # accumulate loss
107 |                 # FIXME: I don't think the implementation below works for the batch version
108 | d_error = torch.mean(d_fake_decision) - torch.mean(d_real_decision) # W_GAN loss
109 | # d_error = -torch.mean(self.log(1 - d_fake_decision)) - torch.mean(self.log(d_real_decision)) # GAN loss
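  |                 # note: d_real_error.backward() above has already accumulated BCE gradients
  |                 # on the real batch, so those are combined with this WGAN critic loss
  |                 # when d_optimizer.step() is called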
110 | d_error.backward()
111 | d_optimizer.step()
112 |
113 | # d_error = d_real_error + d_fake_error
114 |
115 | # train G
116 | for g_train_idx in range(g_steps):
117 | self.G.zero_grad()
118 |
119 | # conditional data for generator
120 | training_batch, seq_lens = get_random_batch(triplets, batch_size)
121 | fake_cqa_batch, fake_cqa_lens = prepare_fake_batch_var(self.G, training_batch, max_len, batch_size,
122 | word2index, index2word, embeddings_index,
123 | embeddings_size, mode=('word'), detach=False)
124 | g_fake_data = Variable(fake_cqa_batch[0].cuda()) if use_cuda else Variable(fake_cqa_batch[0])
125 | dg_fake_decision = self.D.forward(g_fake_data, fake_cqa_lens[0])
126 | target = Variable(torch.FloatTensor([1]*batch_size).cuda()) if use_cuda else \
127 | Variable(torch.FloatTensor([1]*batch_size))
128 | # g_error = criterion(dg_fake_decision, target)
129 | g_error = -torch.mean(dg_fake_decision) # wgan loss
130 | # G_error = -torch.mean(self.log(dg_fake_decision)) # gan loss
131 | g_error.backward()
132 | g_optimizer.step() # Only optimizes G's parameters
133 |
134 | # log error
135 | print_d_loss_total += d_error.data[0]
136 | print_g_loss_total += g_error.data[0]
137 | plot_d_loss_total += d_error.data[0]
138 | plot_g_loss_total += g_error.data[0]
139 | if iter % print_every == 0:
140 | print_d_loss_avg = print_d_loss_total / print_every
141 | print_g_loss_avg = print_g_loss_total / print_every
142 | print_d_loss_total = 0
143 | print_g_loss_total = 0
144 |
145 | if not to_file:
146 |                     print('%s (%d %d%%)' % (timeSince(begin_time, iter / float(n_iters)), iter, float(iter) / float(n_iters) * 100))
147 | # print("errors: D: real-%s/fake-%s G: %s " % ( d_real_error.data[0], d_fake_error.data[0], g_error.data[0]) )
148 | print("errors: D: %s G: %s " % (print_d_loss_avg, print_g_loss_avg))
149 | print('---sample generated question---')
150 | # sample a triple and print the generated question
151 | evaluate(self.G, triplets, embeddings_index, embeddings_size, word2index, index2word, max_len)
152 | else:
153 | sample_out_f.write(unicode('%s (%d %d%%)\n' % (timeSince(begin_time, iter / float(n_iters)), iter, float(iter) / float(n_iters) * 100)))
154 | evaluate(self.G, triplets, embeddings_index, embeddings_size, word2index, index2word, max_len,
155 | to_file, sample_out_f)
156 | sample_out_f.write(unicode('\n'))
157 |
158 | if iter % plot_every == 0:
159 | plot_d_loss_avg = plot_d_loss_total / plot_every
160 | plot_d_loss_avgs.append(plot_d_loss_avg)
161 | plot_g_loss_avg = plot_g_loss_total / plot_every
162 | plot_g_loss_avgs.append(plot_g_loss_avg)
163 | plot_d_loss_total = 0
164 | plot_g_loss_total = 0
165 |
166 | if to_file:
167 | loss_f.write(unicode('%s (%d %d%%)\n' % (timeSince(begin_time, iter / float(n_iters)), iter, float(iter) / float(n_iters) * 100)))
168 | loss_f.write(unicode("errors: D: %s G: %s " % (print_d_loss_avg, print_g_loss_avg)))
169 | loss_f.write(unicode('\n'))
170 |
171 | if (iter % checkpoint_every == 0) or (iter == n_iters):
172 | checkpoint_fname = 'checkpoint_iter_' + str(iter) + '.pth.tar'
173 | state = {
174 | 'iteration': iter + 1,
175 | 'd_state_dict': self.D.state_dict(),
176 | 'g_state_dict': self.G.state_dict(),
177 | 'd_optimizer' : d_optimizer.state_dict(),
178 | 'g_optimizer' : g_optimizer.state_dict(),
179 | }
180 | torch.save(state, path_to_exp_out+'/'+checkpoint_fname)
181 | plotLoss(plot_d_loss_avgs, plot_every, save_path=path_to_exp_out, f_name='d_loss_itr_'+str(iter)+'.png',
182 | title='training loss D (monitoring purpose)', from_file=False)
183 | plotLoss(plot_g_loss_avgs, plot_every, save_path=path_to_exp_out, f_name='g_loss_itr_'+str(iter)+'.png',
184 | title='training loss G (monitoring purpose)', from_file=False)
185 |
186 | # def train(self, **kwargs):
187 | # pass
188 | #
189 | # def test(self):
190 | # pass
191 |
192 | # L2 loss instead of Binary cross entropy loss (this is optional for stable training)
193 | # FIXME: is L2 loss the same as MSELoss in torch loss module?
194 | # FIXME: these losses don't work with minibatch yet?
195 | def loss(self, D_real, D_fake, gen_params, disc_params, cond_real_data, cond_fake_data, mode, lr=None):
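  |         # Returns (G_loss, D_loss, divergence metric, G optimizer, D optimizer)
  |         # for the selected GAN variant: 'gan', 'lsgan-1', 'lsgan-2', 'wgan', or 'bgan'.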
196 | mode = mode.lower()
197 | if mode == 'gan':
198 | G_loss = -torch.mean(self.log(D_fake))
199 | # FIXME G_loss.backward()
200 | D_loss = -torch.mean(self.log(1 - D_fake)) - torch.mean(self.log(D_real))
201 | # FIXME D_loss.backward()
202 | metric = -D_loss / 2 + np.log(2) # JS divergence
203 |
204 | G_solver = torch.optim.Adam(gen_params, lr=lr if lr else 1e-3)
205 | D_solver = torch.optim.Adam(disc_params, lr=lr if lr else 1e-3)
206 |
207 | elif mode == 'lsgan-1':
208 | G_loss = torch.mean(D_fake ** 2)
209 | D_loss = torch.mean((D_real - 1) ** 2)
210 | metric = 0 # TBD
211 |
212 | G_solver = torch.optim.Adam(gen_params, lr=lr if lr else 1e-3)
213 | D_solver = torch.optim.Adam(disc_params, lr=lr if lr else 1e-3)
214 |
215 | elif mode == 'lsgan-2':
216 | G_loss = torch.mean((D_fake - 1) ** 2)
217 | D_loss = torch.mean((D_real - 1) ** 2) + torch.mean(D_fake ** 2)
218 | metric = D_loss / 2 # Pearson Chi-Square divergence
219 |
220 | G_solver = torch.optim.Adam(gen_params, lr=lr if lr else 1e-3)
221 | D_solver = torch.optim.Adam(disc_params, lr=lr if lr else 1e-3)
222 |
223 | elif mode == 'wgan':
224 | G_loss = -torch.mean(D_fake)
225 | D_loss = torch.mean(D_fake) - torch.mean(D_real)
226 | metric = -D_loss # Earth-mover distance
227 |
228 | grad_penalty = self.cal_grad_penalty(cond_real_data, cond_fake_data)
229 | D_loss += self.lmd * grad_penalty
230 |
231 | G_solver = torch.optim.Adam(gen_params, lr=lr if lr else 1e-3)
232 | D_solver = torch.optim.Adam(disc_params, lr=lr if lr else 1e-3)
233 |
234 | elif mode == 'bgan':
235 | G_loss = 0.5 * torch.mean((self.log(D_fake) - self.log(1 - D_fake)) ** 2)
236 | D_loss = -torch.mean(self.log(D_real) + self.log(1 - D_fake))
237 | metric = 0 # TBD
238 | G_solver = torch.optim.Adam(gen_params, lr=lr if lr else 1e-3)
239 | D_solver = torch.optim.Adam(disc_params, lr=lr if lr else 1e-3)
240 |
241 | else:
242 | raise ValueError('Unknown mode: {}'.format(mode))
243 |
244 | return G_loss, D_loss, metric, G_solver, D_solver
245 |
246 | def cal_grad_penalty(self, cond_real_data, cond_fake_data):
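  |         # WGAN-GP style gradient penalty: sample points on straight lines between
  |         # real and fake data and penalise (||grad D(x_interp)||_2 - 1)^2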
247 | epsilon = to_var(torch.rand(self.batch_size, 1))
248 | epsilon = epsilon.expand(cond_real_data.size())
249 |
250 | data_diff = cond_fake_data - cond_real_data
251 | cond_interp_data = cond_real_data + epsilon * data_diff
252 | disc_interp = self.D(self.d_net, cond_interp_data, reuse=True) # TODO: change the arguments
253 |
254 | grad_interp = autograd.grad(outputs=disc_interp, inputs=cond_interp_data,
255 | grad_outputs=torch.ones(disc_interp.size()).cuda(
256 | gpu) if use_cuda else torch.ones(
257 | disc_interp.size()),
258 | create_graph=True, retain_graph=True, only_inputs=True)[0]
259 |
260 | grad_interp_flat = grad_interp.view([self.batch_size, -1])
261 | slope = grad_interp_flat.norm(p=2, dim=1)
262 |
263 | grad_penalty = torch.mean((slope - 1.) ** 2)
264 | return grad_penalty
265 |
266 |
267 |
268 | # same context and answer as in the real batch, but generated question
269 | def prepare_fake_batch_var(generator, batch, max_len, batch_size, word2index, index2word,
270 | embeddings_index, embeddings_size, sort=False, mode = ('word'), detach=True, concat=None):
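  |     # Builds a "fake" discriminator batch: the real context and answer are kept,
  |     # but the question is replaced by a sample from the generator; all labels are 0.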
271 |
272 | batch_vars = []
273 | batch_var_orig = []
274 |
275 | cqa = []
276 | cqa_len = []
277 | labels = torch.LongTensor([0] * batch_size) # all fake labels, thus all 0's
278 | for b in range(batch_size):
279 | if concat=='ca':
280 | ca = batch[0][b] + batch[2][b]
281 | fake_q_sample = G_sampler(generator, ca, embeddings_index, embeddings_size, word2index, index2word, max_len, detach=detach, concat=concat)
282 | elif concat==None:
283 | inputs = [batch[0][b], batch[1][b], batch[2][b]]
284 | fake_q_sample = G_sampler(generator, inputs, embeddings_index, embeddings_size, word2index, index2word, max_len, detach=detach)
285 | cqa.append(batch[0][b] + fake_q_sample + batch[2][b])
286 | cqa_len.append(len(batch[0][b] + fake_q_sample + batch[2][b]))
287 |
288 | batch = [cqa, batch[3], batch[4], labels]
289 | seq_lens = [cqa_len]
290 |
291 | # sort this batch_var in descending order according to the values of the lengths of the first element in batch
292 | num_batch = len(batch)
293 |
294 | if sort:
295 | all = batch + seq_lens
296 | all = sorted(zip(*all), key=lambda p: len(p[0]), reverse=True)
297 | all = zip(*all)
298 | batch = all[0:num_batch]
299 | seq_lens = all[num_batch:]
300 | batch_orig = batch
301 |
302 | for b in range(num_batch):
303 |
304 | batch_var = batch[b]
305 |
306 | # if element in batch is float, i.e. indices, then do nothing
307 | if isinstance(batch_var[0], int):
308 | batch_var = list(batch_var)
309 | pass
310 | else:
311 | # pad each context, question, answer to their respective max length
312 | if mode[b] == 'index':
313 | batch_padded = [pad_sequence(s, max(seq_lens[b]), word2index, mode='index') for s in batch_var]
314 | else:
315 | batch_padded = [pad_sequence(s, max(seq_lens[b]), word2index) for s in batch_var]
316 |
317 | # init variable matrices
318 | if mode[b] == 'index':
319 | batch_var = torch.LongTensor(max(seq_lens[b]), batch_size) # long tensor for module loss criterion
320 | else:
321 | batch_var = torch.FloatTensor(max(seq_lens[b]), batch_size, embeddings_size)
322 |
323 | # FIXME: very stupid embedded for loop implementation
324 | for i in range(batch_size):
325 | for j in range(max(seq_lens[b])):
326 | if mode[b] == 'index':
327 | batch_var[j, i] = batch_padded[i][j]
328 | else:
329 | batch_var[j, i,] = embeddings_index[batch_padded[i][j]]
330 |
331 | batch_vars.append(batch_var)
332 |
333 | # the second output is for debugging purpose
334 | if sort:
335 | return batch_vars, batch_orig, seq_lens
336 | else:
337 | return batch_vars, seq_lens
338 |
339 |
340 | # function to sample generator output
341 | def G_sampler(generator, input, embeddings_index, embeddings_size, word2index, index2word, max_length, concat=None, detach=True):
342 | # NOTE currently only generate one question at a time. multiple questions not yet supported
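  |     # Greedy decoding: embed the conditioning input, run the generator with
  |     # teacher_forcing_ratio=0, and take the argmax token at every step until
  |     # EOS/PAD or max_length is reached.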
343 |
344 | if concat == 'ca':
345 | var = torch.FloatTensor(len(input), embeddings_size)
346 | for j in range(len(input)):
347 | var[j] = embeddings_index[input[j]]
348 |         var = var.unsqueeze(1)
349 | if use_cuda:
350 | var = Variable(var.cuda())
351 | else:
352 | var = Variable(var)
353 |
354 | decoder_output = generator.forward(var, None, [len(input)], 1, max_length,
355 | embeddings_index, embeddings_size, word2index, index2word,
356 | teacher_forcing_ratio=0).detach()
357 | decoder_output = decoder_output.squeeze(1)
358 | elif concat == None:
359 | # NOTE: hardcode indices of c, q, a, in the line - for i in range(0,3)
360 | inputs = []
361 | for i in range(0,3):
362 | # print(input[i])
363 | var = torch.FloatTensor(len(input[i]), embeddings_size)
364 | for j in range(len(input[i])):
365 | var[j] = embeddings_index[input[i][j]]
366 | var = var.unsqueeze(1)
367 | if use_cuda:
368 | var = Variable(var.cuda())
369 | else:
370 | var = Variable(var)
371 | inputs.append(var)
372 |
373 | decoder_output = generator.forward(inputs, [len(x) for x in input], 1, max_length,
374 | embeddings_index, embeddings_size, word2index, index2word,
375 | teacher_forcing_ratio=0)
376 | if detach:
377 | decoder_output = decoder_output.detach()
378 | decoder_output = decoder_output.squeeze(1)
379 |
380 |
381 |
382 | decoded_words = []
383 | for di in range(max_length):
384 | # top value and index of every batch
385 | topv, topi = decoder_output[di].data.topk(1)
386 | ni = topi[0]
387 | if (ni == word2index['EOS']) or (ni == word2index['PAD']):
388 | decoded_words.append('EOS')
389 | # decoder_attentions[di] = decoder_attention[0].data
390 | break
391 | else:
392 | decoded_words.append(index2word[ni])
393 |
394 | return decoded_words
395 |
--------------------------------------------------------------------------------
/src/util/data_proc.py:
--------------------------------------------------------------------------------
1 | #-----------------------------------------------------------------------------------------------#
2 | #-----------------------------------------------------------------------------------------------#
3 | # data loading helper functions
4 | #-----------------------------------------------------------------------------------------------#
5 | #-----------------------------------------------------------------------------------------------#
6 | from __future__ import unicode_literals, print_function, division
7 | from io import open
8 | import unicodedata
9 | import random
10 |
11 | # import spacy
12 | from spacy.en import English
13 | spacynlp = English()
14 |
15 | import torch
16 | from torch.autograd import Variable
17 |
18 | # FIXME: import spacy again below to avoid an error encountered when importing torch and spacy
19 | # it seems that spacy needs to be imported before torch. However, on Baylor cluster,
20 | # you need to import spacy again here for it to actually be imported without error.
21 | from spacy.en import English
22 | spacynlp = English()
23 |
24 | import json
25 | import numpy as np
26 |
27 | # import sys, os
28 | # sys.path.append(os.path.abspath(__file__ + "/../../") + '/G_baseline')
29 | # from G_eval import *
30 |
31 |
32 | ######################################################################
33 | # The files are all in Unicode, to simplify we will turn Unicode
34 | # characters to ASCII, make everything lowercase
35 | #
36 |
37 | # Turn a Unicode string to plain ASCII, thanks to
38 | # http://stackoverflow.com/a/518232/2809427
39 | def unicodeToAscii(s):
40 | return ''.join(
41 | c for c in unicodedata.normalize('NFD', s)
42 | if unicodedata.category(c) != 'Mn'
43 | )
44 |
45 | # Lowercase, trim, and remove non-letter characters
46 | def normalizeString(s):
47 | s = unicodeToAscii(s.lower().strip())
48 | # s = re.sub(r"([.!?])", r" \1", s)
49 | # s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
50 | return s
51 |
52 |
53 |
54 | ######################################################################
55 | # read GLOVE word embeddings
56 | def readGlove(path_to_data):
57 | embeddings_index = {}
58 | f = open(path_to_data)
59 | for line in f:
60 | values = line.split()
61 | word = values[0]
62 | coefs = np.asarray(values[1:], dtype='float32')
63 | coefs = torch.from_numpy(coefs)
64 | embeddings_index[word] = coefs
65 | f.close()
66 |
67 | print('Found %s word vectors.' % len(embeddings_index))
68 |
69 | # get dimension from a random sample in the dict
70 | embeddings_size = random.sample( embeddings_index.items(), 1 )[0][1].size(-1)
71 | print('dimension of word embeddings: ' + str(embeddings_size))
72 |
73 |     SOS_token = -torch.ones(embeddings_size) # start of sentence token, all -1's
74 | EOS_token = torch.ones(embeddings_size) # end of sentence token, all ones
75 |     UNK_token = torch.ones(embeddings_size) + torch.ones(embeddings_size) # unknown-word token, all 2's (a fairly arbitrary choice)
76 | PAD_token = torch.zeros(embeddings_size)
77 |
78 | # add special tokens to the embeddings
79 | embeddings_index['SOS'] = SOS_token
80 | embeddings_index['EOS'] = EOS_token
81 | embeddings_index['UNK'] = UNK_token
82 | embeddings_index['PAD'] = PAD_token
83 |
84 | return embeddings_index, embeddings_size
85 |
86 |
87 | ######################################################################
88 | # read data specific for SQUAD dataset
89 |
90 | def read_raw_squad(path_to_data, normalize=True):
91 |     # output a list of (context, question, answer, ans_start_idx, ans_end_idx) tuples
92 | print("Reading dataset...")
93 | triplets = []
94 | with open(path_to_data) as f:
95 | train = json.load(f)
96 | train = train['data']
97 | for s in range(0, len(train)):
98 | samples = train[s]['paragraphs']
99 | for p in range(0, len(samples)):
100 | context = samples[p]['context']
101 | qas = samples[p]['qas']
102 | for i in range(0, len(qas)):
103 | # print('current s,p,i are: ' + str(s)+str(p)+str(i))
104 | answers = qas[i]['answers']
105 | question = qas[i]['question']
106 | for a in range(0, len(answers)):
107 | ans_text = answers[a]['text']
108 | ans_start_idx = answers[a]['answer_start']
109 | ans_end_idx = ans_start_idx + len(ans_text)
110 |
111 | if normalize:
112 | # turn from unicode to ascii (note: unicodeToAscii does not lowercase; normalizeString also lowercases)
113 | context = unicodeToAscii(context)
114 | question = unicodeToAscii(question)
115 | ans_text = unicodeToAscii(ans_text)
116 |
117 | triplets.append((context, question, ans_text, ans_start_idx, ans_end_idx))
118 | return triplets
119 |
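# Shape of one returned element (the values below are invented for illustration):
#   ('Some context paragraph ...', 'A question about it?', 'an answer span', 17, 31)
# where 17 and 31 are character offsets of the answer inside the context string.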
120 |
121 | # helper function to tokenize the raw squad data
122 | # e.g. the context is read as a single string; this function turns it into a list of word tokens
123 | # and returns the processed tuple (context, question, ans_text, ans_start_idx, ans_end_idx),
124 | # where the first three elements are token lists and the last two are the answer character offsets
125 | def tokenize_squad(squad, embeddings_index, opt='raw', c_EOS=True, a_EOS=True):
126 | tokenized_triplets = []
127 | if opt == 'raw':
128 | for triple in squad:
129 | tokenized_triplets.append( ( tokenize_sentence(triple[0], embeddings_index, EOS=c_EOS),
130 | tokenize_sentence(triple[1], embeddings_index),
131 | tokenize_sentence(triple[2], embeddings_index, EOS=a_EOS),
132 | triple[3],
133 | triple[4] ) )
134 | elif opt == 'window':
135 | for triple in squad:
136 | tokenized_triplets.append( ( tokenize_sentence(triple[0], embeddings_index, spacy=False, EOS=c_EOS),
137 | tokenize_sentence(triple[1], embeddings_index),
138 | tokenize_sentence(triple[2], embeddings_index, spacy=False, EOS=a_EOS),
139 | triple[3],
140 | triple[4] ) )
141 | elif opt == 'sent':
142 | for triple in squad:
143 | tokenized_triplets.append( ( tokenize_sentence(triple[0], embeddings_index, spacy=False, EOS=c_EOS),
144 | tokenize_sentence(triple[1], embeddings_index),
145 | tokenize_sentence(triple[2], embeddings_index, EOS=a_EOS),
146 | triple[3],
147 | triple[4] ) )
148 | else:
149 | raise Exception('unknown option. should be one of "raw", "window", or "sent".')
150 | return tokenized_triplets
151 |
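# Typical call chain (sketch; the SQuAD and GloVe paths are placeholders):
#   raw_squad = read_raw_squad('data/train-v1.1.json')
#   embeddings_index, embeddings_size = readGlove('data/glove.6B.100d.txt')
#   triplets = tokenize_squad(raw_squad, embeddings_index, opt='raw')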
152 |
153 | # helper function to get the sentence in which the answer appears in the context
154 | # operates on the raw (untokenized) squad triplets from read_raw_squad
155 | # outputs triplets whose context is only the answer sentence (same format as elements of the tokenize_squad output)
156 | def get_ans_sentence(raw_squad, sent_window=0):
157 |
158 | sent_c_triplets = [] # each context is reduced to the sentence containing its answer
159 | unmatch = [] # for debug
160 | for t in range(len(raw_squad)):
161 | sent = None
162 | c = raw_squad[t][0]
163 | a = raw_squad[t][2]
164 | sent_c = list(spacynlp(c).sents)
165 | tokenized_a = spacynlp.tokenizer(a)
166 | # sanity check
167 | # if len(sent_c) == 1:
168 | # print('WARNING: sentence segmentation may not work in this triple')
169 | # print(sent_c)
170 | # print(tokenized_c)
171 | ans_start_idx = raw_squad[t][3]
172 | ans_end_idx = raw_squad[t][4]
173 |
174 | # print(ans_start_idx)
175 | # print(ans_end_idx)
176 |
177 | idx = 0
178 | for s in sent_c:
179 | # print(idx)
180 | # print('current index: %d' % idx)
181 | if idx <= ans_start_idx and idx+len(s.string)>=ans_end_idx:
182 | # print('enter if statement')
183 | # print(s)
184 | sent = s
185 | # print(sent_c.index(sent))
186 | # if isinstance(sent, unicode):
187 | # raise Exception('unicode detected, where expecting spacy span object.')
188 | if tokenized_a[0].string not in sent.string:
189 | # print('c')
190 | # print(idx)
191 | # print(idx+len(s.string))
192 | # print(ans_start_idx)
193 | # print(ans_end_idx)
194 | print(type(tokenized_a[0]))
195 | print(type(sent))
196 | unmatch.append(t)
197 | # raise Exception('answer token not in current sentence')
198 | break
199 | else:
200 | idx += len(s.string)
201 |
202 | try:
203 | idx_temp = sent_c.index(sent)
204 | except:
205 |
206 | print(sent_c)
207 | print(sent)
208 | print(tokenized_a)
209 | print('\n')
210 | unmatch.append(t)
211 |
212 | #TODO: multiple sentences as context
213 | if sent_window > 0:
214 | ans_sent_idx = sent_c.index(sent)
215 | # print(ans_sent_idx)
216 | for i in range(1, sent_window+1):
217 | if ans_sent_idx-i > 0 and ans_sent_idx+i < len(sent_c):
218 | sent = [sent_c[ans_sent_idx-i], sent, sent_c[ans_sent_idx+i]]
219 | elif ans_sent_idx-i <= 0 and ans_sent_idx+i < len(sent_c):
220 | sent = [sent, sent_c[ans_sent_idx+i]]
221 | elif ans_sent_idx-i > 0 and ans_sent_idx+i >= len(sent_c):
222 | sent = [sent_c[ans_sent_idx-i], sent]
223 | sent_c_triplets.append( ( sent, raw_squad[t][1], raw_squad[t][2], raw_squad[t][3], raw_squad[t][4] ) )
224 |
225 | return sent_c_triplets, set(unmatch)
226 |
227 |
228 | # helper function to get a window of tokens around the answer
229 | # similar to get_ans_sentence; the only difference is the span of tokens returned
230 | # NOTE: the window size is counted in crude (spacy) tokens, where e.g. "there's" is one token.
231 | # after post_proc_tokenize_sentence, "there's" becomes 3 tokens, so the actual
232 | # number of tokens before and after the answer may exceed the set window size
233 | def get_windowed_ans(raw_squad, window_size):
234 |
235 | windowed_c_triplets = []
236 |
237 | for triple in raw_squad:
238 | c = triple[0]
239 | a = triple[2]
240 | tokenized_c = spacynlp.tokenizer(c)
241 | # sanity check
242 | # print(tokenized_c)
243 | tokenized_a = spacynlp.tokenizer(a)
244 | ans_start_idx = triple[3]
245 | ans_end_idx = triple[4]
246 | c_sub = c[:ans_start_idx]
247 | # print('first token in answer = %s' % tokenized_a[0])
248 |
249 | # find the start token of the answer in context
250 | idx = 0
251 | t = 0
252 | for token in tokenized_c:
253 | if idx+c_sub.count(' ') == ans_start_idx and unicode(token) == unicode(tokenized_a[0]):
254 | break
255 | else:
256 | idx += len(token)
257 | t += 1
258 | if t < window_size:
259 | left_window = 0
260 | else:
261 | left_window = t - window_size
262 | if t + window_size + len(tokenized_a) > len(tokenized_c):
263 | right_window = len(tokenized_c)
264 | else:
265 | right_window = t + window_size + len(tokenized_a)
266 |
267 | windowed_c = tokenized_c[left_window:right_window]
268 | # # sanity check
269 | # if tokenized_a[0] not in windowed_c:
270 | # print('ERROR: windowed context does not contain answer token')
271 |
272 | windowed_c_triplets.append( ( windowed_c , triple[1], tokenized_a, triple[3], triple[4] ) )
273 |
274 | return windowed_c_triplets
275 |
276 |
277 | def annotate_context_w_ans(raw_squad):
278 | pass
279 |
280 |
281 |
282 |
283 | # turns a sentence into individual tokens
284 | # word tokens that do not appear in the pre-trained embeddings
285 | # are replaced with 'UNK'
286 | def tokenize_sentence(sentence, data_tokens, spacy=True, EOS=True):
287 | if spacy:
288 | tokenized_sentence = spacynlp.tokenizer(sentence)
289 | else:
290 | tokenized_sentence = sentence
291 | # an additional preprocessing step to separate words and non-words when they appear together
292 | proc_tokenized_sentence = post_proc_tokenize_sentence(tokenized_sentence)
293 |
294 | token_num = len(proc_tokenized_sentence)
295 |
296 | var = []
297 |
298 | for t in range(0, token_num):
299 | # the 'UNK' replacement below is mainly for experimental use, to avoid a large vocab size
300 | if proc_tokenized_sentence[t] not in data_tokens:
301 | var.append('UNK')
302 | else:
303 | var.append(proc_tokenized_sentence[t])
304 |
305 | if EOS:
306 | var.append('EOS')
307 | return var
308 |
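# Illustrative output (assuming every resulting token exists in the embeddings;
# any missing token would show up as 'UNK' instead):
#   tokenize_sentence(u"He's 42.", embeddings_index)
#   -> ['he', "'", 's', '42', '.', 'EOS']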
309 |
310 | # helper function for post processing tokenizer
311 | # separates all punctuation into single tokens
312 | # e.g. "(they're)" --> "(", "they", "'", "re", ")"
313 | # outputs a list of strings
314 | def post_proc_tokenize_sentence(tokenized_sentence):
315 | proc_tokenized_sentence = []
316 | for t in range(0, len(tokenized_sentence)):
317 | # try:
318 | # token = tokenized_sentence[t].string.lower().strip()
319 | # except:
320 | # print(tokenized_sentence)
321 | token = tokenized_sentence[t].string.lower().strip()
322 | # first check if the string is number or alphabet only
323 | if token.isdigit() or token.isalpha():
324 | proc_tokenized_sentence.append(token)
325 | # separate this token into substrings of only words, numbers, or individual symbols
326 | else:
327 | index = -1
328 | for s in range(0, len(token)):
329 | if s > index:
330 | if token[s].isdigit():
331 | # print('find digit')
332 | for i in range(s,len(token)):
333 | if (not token[i].isdigit()):
334 | proc_tokenized_sentence.append(token[s:i])
335 | index = i-1
336 | break
337 | elif (token[i].isdigit()) and (i == len(token)-1):
338 | proc_tokenized_sentence.append(token[s:i+1])
339 | index = i
340 | break
341 | elif token[s].isalpha():
342 | # print('find alphabet')
343 | for i in range(s,len(token)):
344 | if (not token[i].isalpha()):
345 | proc_tokenized_sentence.append(token[s:i])
346 | index = i-1
347 | break
348 | elif (token[i].isalpha()) and (i == len(token)-1):
349 | proc_tokenized_sentence.append(token[s:i+1])
350 | index = i
351 | break
352 | else:
353 | # print('find symbol')
354 | proc_tokenized_sentence.append(token[s])
355 | index += 1
356 | # print(index)
357 | return proc_tokenized_sentence
358 | # test
359 | # x = post_proc_tokenize_sentence(spacynlp.tokenizer(u'mid-1960s'))
360 |
361 |
362 | # # find the max length of context, answer, and question
363 | # def max_length(triplets):
364 |
365 | # max_len_c = 0
366 | # max_len_q = 0
367 | # max_len_a = 0
368 |
369 | # for triple in triplets:
370 | # len_c = len(triple[0])
371 | # len_q = len(triple[1])
372 | # len_a = len(triple[2])
373 | # if len_c > max_len_c:
374 | # max_len_c = len_c
375 | # if len_q > max_len_q:
376 | # max_len_q = len_q
377 | # if len_a > max_len_a:
378 | # max_len_a = len_a
379 |
380 | # return max_len_c, max_len_q, max_len_a
381 |
382 |
383 | ######################################################################
384 | # count the number of tokens in both the word embeddings and the corpus
385 | def count_effective_num_tokens(triplets, embeddings_index, sos_eos = True):
386 | ## find all unique tokens in the data (should be a subset of the number of embeddings)
387 | data_tokens = []
388 | for triple in triplets:
389 | data_tokens += triple[0] + triple[1] + triple[2]
390 | data_tokens = list(set(data_tokens)) # find unique
391 | if sos_eos:
392 | data_tokens = ['SOS', 'EOS', 'UNK', 'PAD'] + data_tokens
393 | else:
394 | data_tokens = ['UNK', 'PAD'] + data_tokens
395 |
396 | effective_tokens = list(set(data_tokens).intersection(embeddings_index.keys()))
397 | effective_num_tokens = len(effective_tokens)
398 |
399 | return effective_tokens, effective_num_tokens
400 |
401 |
402 | ######################################################################
403 | # generate word index and index word look up tables
404 | def generate_look_up_table(effective_tokens, effective_num_tokens, use_cuda = True):
405 | word2index = {}
406 | index2word = {}
407 | for i in range(effective_num_tokens):
408 | index2word[i] = effective_tokens[i]
409 | word2index[effective_tokens[i]] = i
410 | return word2index, index2word
411 |
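# Vocabulary setup sketch combining the two helpers above:
#   effective_tokens, effective_num_tokens = count_effective_num_tokens(triplets, embeddings_index)
#   word2index, index2word = generate_look_up_table(effective_tokens, effective_num_tokens)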
412 |
413 | ######################################################################
414 | # prepare minibatch of data
415 | # output is (contexts, questions, answers, answer_start_idxs, answer_end_idxs)
416 | # each is a list of length batch_size (sequences are not yet padded to a common length)
417 | def get_random_batch(triplets, batch_size, with_fake = False):
418 |
419 | # init values
420 | contexts = []
421 | questions = []
422 | answers = []
423 | ans_start_idxs = []
424 | ans_end_idxs = []
425 |
426 | # randomly sample batch_size triplets from the dataset; conversion to indices/embeddings happens later in prepare_batch_var
427 | for i in range(batch_size):
428 | triple = random.choice(triplets)
429 | contexts.append(triple[0])
430 | questions.append( triple[1] )
431 | answers.append(triple[2])
432 | ans_start_idxs.append( triple[3] )
433 | ans_end_idxs.append( triple[4] )
434 |
435 | # get lengths of each context, question, answer in their respective arrays
436 | context_lens = [len(s) for s in contexts]
437 | question_lens = [len(s) for s in questions]
438 | answer_lens = [len(s) for s in answers]
439 |
440 | if with_fake:
441 | idx = int(batch_size/2)
442 | return [contexts[:idx], questions[:idx], answers[:idx], ans_start_idxs[:idx], ans_end_idxs[:idx]], \
443 | [context_lens[:idx], question_lens[:idx], answer_lens[:idx]],\
444 | [contexts[idx:], questions[idx:], answers[idx:], ans_start_idxs[idx:], ans_end_idxs[idx:]], \
445 | [context_lens[idx:], question_lens[idx:], answer_lens[idx:]]
446 | else:
447 | return [contexts, questions, answers, ans_start_idxs, ans_end_idxs], \
448 | [context_lens, question_lens, answer_lens]
449 |
450 |
451 | # - prepare batch training data
452 | # - training_batch contains five pieces of data: the first three of size [batch size x max seq len],
453 | #   the last two of size [batch size].
454 | # - seq_lens contains the lengths of the first three sequences, each of size [batch size]
455 | # - the output consists of matrices of size [max seq len x batch size x embedding size]
456 | # - if the question is represented as indices, its size is [max seq len x batch size]; this is the transpose of the
457 | #   get_random_batch output, which makes it easier to feed the NLLLoss criterion, since indexing a single time step
458 | #   selects the whole batch, e.g. question[i] gives the i-th token of every sequence in the batch
459 | def prepare_batch_var(batch, seq_lens, batch_size, word2index, embeddings_index, embeddings_size,
460 | use_cuda=1, sort=False, mode=('word', 'index', 'word'), concat_opt=None,
461 | with_fake=False, fake_batch=None, fake_seq_lens=None):
462 |
463 | batch_vars = []
464 | batch_var_orig = []
465 | batch_paddings = []
466 |
467 | if with_fake:
468 | batch_size = int(batch_size/2)
469 | fake_q = fake_batch[1]
470 | fake_q_lens = fake_seq_lens[1]
471 |
472 | #TODO (for different applications): change the below code (before for loop) to concat different portions of the batch_triplets
473 | if concat_opt == None:
474 | pass
475 |
476 | elif concat_opt == 'ca':
477 | ca = []
478 | ca_len = []
479 | for b in range(batch_size):
480 | ca.append(batch[0][b] + batch[2][b])
481 | ca_len.append(len(batch[0][b] + batch[2][b]))
482 | batch = [ca, batch[1], batch[3], batch[4]]
483 | seq_lens = [ca_len] + seq_lens
484 |
485 | elif concat_opt == 'qa':
486 | pass
487 |
488 | # FIXME: only the following elif implements fake questions
489 | elif concat_opt == 'cqa':
490 | cqa = []
491 | cqa_len = []
492 | labels = []
493 | for b in range(batch_size):
494 | cqa.append(batch[0][b] + batch[1][b] + batch[2][b]) # append real
495 | cqa_len.append(len(batch[0][b] + batch[1][b] + batch[2][b])) # append real
496 | labels.append(1)
497 | if with_fake: # append fake
498 | fake_q_sample = random.sample(fake_q,1)[0]
499 | cqa.append(batch[0][b] + fake_q_sample + batch[2][b])
500 | cqa_len.append(len(batch[0][b] + fake_q_sample + batch[2][b]))
501 | labels.append(0)
502 | if with_fake:
503 | batch = [cqa, batch[3]+fake_batch[3], batch[4]+fake_batch[4], labels]
504 | else:
505 | batch = [cqa, batch[3], batch[4]]
506 | seq_lens = [cqa_len]
507 | elif concat_opt == 'qca':
508 | pass
509 |
510 | else:
511 | raise ValueError('not a valid concat option.')
512 |
513 | num_batch = len(batch)
514 | # sort the batch in descending order by the lengths of the sequences in its first element
515 | if sort:
516 | all = batch + seq_lens
517 | all = sorted(zip(*all), key=lambda p: len(p[0]), reverse=True)
518 | all = zip(*all)
519 | batch = all[0:num_batch]
520 | seq_lens = all[num_batch:]
521 | batch_orig = batch
522 |
523 | # restore batch size to 2x if training with fake samples
524 | if with_fake:
525 | batch_size = batch_size * 2
526 |
527 | for b in range(num_batch):
528 |
529 | batch_var = batch[b]
530 |
531 | # if the elements in this part of the batch are ints (i.e. indices or labels), leave them as they are
532 | if isinstance(batch_var[0], int):
533 | batch_var = list(batch_var)
535 | else:
536 | # pad each context, question, answer to their respective max length
537 | if mode[b] == 'index':
538 | batch_padded = [pad_sequence(s, max(seq_lens[b]), word2index, mode='index') for s in batch_var]
539 | else:
540 | batch_padded = [pad_sequence(s, max(seq_lens[b]), word2index) for s in batch_var]
541 |
542 | # init variable matrices
543 | if mode[b] == 'index':
544 | batch_var = torch.LongTensor(max(seq_lens[b]), batch_size) # long tensor for module loss criterion
545 | else:
546 | batch_var = torch.FloatTensor(max(seq_lens[b]), batch_size, embeddings_size)
547 |
548 | # FIXME: naive nested for-loop implementation; could be vectorized
549 | for i in range(batch_size):
550 | for j in range(max(seq_lens[b])):
551 | if mode[b] == 'index':
552 | batch_var[j, i] = batch_padded[i][j]
553 | else:
554 | batch_var[j, i] = embeddings_index[batch_padded[i][j]]
555 |
556 | batch_vars.append(batch_var)
557 | batch_paddings.append(batch_padded)
558 |
559 | # the second output is for debugging purposes
560 | return batch_vars, batch_paddings, seq_lens
561 |
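# Sketch of the usual pairing with get_random_batch (argument names as defined above):
#   batch, seq_lens = get_random_batch(triplets, batch_size)
#   batch_vars, batch_padded, seq_lens = prepare_batch_var(
#       batch, seq_lens, batch_size, word2index, embeddings_index, embeddings_size,
#       mode=('word', 'index', 'word'), sort=True)
# with the default mode, batch_vars[0] is a FloatTensor of context embeddings,
# [max context len x batch size x embeddings_size], and batch_vars[1] is a
# LongTensor of question word indices, [max question len x batch size].
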
562 | # helper function to zero pad context, question, answer to their respective maximum length
563 | def pad_sequence(s, max_len, word2index, mode = 'word'):
564 | if mode == 'word':
565 | return s + ['PAD' for i in range(max_len - len(s))]
566 | elif mode == 'index':
567 | return [word2index[i] for i in s] + [word2index['PAD'] for i in range(max_len - len(s))]
568 |
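# e.g. pad_sequence(['hello', 'world', 'EOS'], 5, word2index)
#      -> ['hello', 'world', 'EOS', 'PAD', 'PAD']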
569 |
570 | ######################################################################
571 | # TODO: need a function to sample some (c, q, a) triplets from the generator
572 | def sample_generated_triples(triplets, G, batch_size):
573 |
574 | # should return the same thing as get_random_batch with with_fake = False
575 | return None
576 |
577 |
578 | ######################################################################
579 | # test function for examining the output of the batch
580 | # primarily to see whether the context, question, answer triplets make sense
581 | def print_batch(batch, batch_size, index2word):
582 | idx = random.choice(range(batch_size))
583 | context = [ index2word[i] for i in batch[0][idx] ]
584 | question = [ index2word[i] for i in batch[1][idx] ]
585 | answer = [ index2word[i] for i in batch[2][idx] ]
586 | return (' '.join(context), ' '.join(question), ' '.join(answer))
587 |
588 |
589 |
590 |
--------------------------------------------------------------------------------
/references/code/seq2seq_translation_tutorial.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Translation with a Sequence to Sequence Network and Attention
4 | *************************************************************
5 | **Author**: `Sean Robertson `_
6 |
7 | In this project we will be teaching a neural network to translate from
8 | French to English.
9 |
10 | ::
11 |
12 | [KEY: > input, = target, < output]
13 |
14 | > il est en train de peindre un tableau .
15 | = he is painting a picture .
16 | < he is painting a picture .
17 |
18 | > pourquoi ne pas essayer ce vin delicieux ?
19 | = why not try that delicious wine ?
20 | < why not try that delicious wine ?
21 |
22 | > elle n est pas poete mais romanciere .
23 | = she is not a poet but a novelist .
24 | < she not not a poet but a novelist .
25 |
26 | > vous etes trop maigre .
27 | = you re too skinny .
28 | < you re all alone .
29 |
30 | ... to varying degrees of success.
31 |
32 | This is made possible by the simple but powerful idea of the `sequence
33 | to sequence network `__, in which two
34 | recurrent neural networks work together to transform one sequence to
35 | another. An encoder network condenses an input sequence into a vector,
36 | and a decoder network unfolds that vector into a new sequence.
37 |
38 | .. figure:: /_static/img/seq-seq-images/seq2seq.png
39 | :alt:
40 |
41 | To improve upon this model we'll use an `attention
42 | mechanism `__, which lets the decoder
43 | learn to focus over a specific range of the input sequence.
44 |
45 | **Recommended Reading:**
46 |
47 | I assume you have at least installed PyTorch, know Python, and
48 | understand Tensors:
49 |
50 | - http://pytorch.org/ For installation instructions
51 | - :doc:`/beginner/deep_learning_60min_blitz` to get started with PyTorch in general
52 | - :doc:`/beginner/pytorch_with_examples` for a wide and deep overview
53 | - :doc:`/beginner/former_torchies_tutorial` if you are a former Lua Torch user
54 |
55 |
56 | It would also be useful to know about Sequence to Sequence networks and
57 | how they work:
58 |
59 | - `Learning Phrase Representations using RNN Encoder-Decoder for
60 | Statistical Machine Translation `__
61 | - `Sequence to Sequence Learning with Neural
62 | Networks `__
63 | - `Neural Machine Translation by Jointly Learning to Align and
64 | Translate `__
65 | - `A Neural Conversational Model `__
66 |
67 | You will also find the previous tutorials on
68 | :doc:`/intermediate/char_rnn_classification_tutorial`
69 | and :doc:`/intermediate/char_rnn_generation_tutorial`
70 | helpful as those concepts are very similar to the Encoder and Decoder
71 | models, respectively.
72 |
73 | And for more, read the papers that introduced these topics:
74 |
75 | - `Learning Phrase Representations using RNN Encoder-Decoder for
76 | Statistical Machine Translation `__
77 | - `Sequence to Sequence Learning with Neural
78 | Networks `__
79 | - `Neural Machine Translation by Jointly Learning to Align and
80 | Translate `__
81 | - `A Neural Conversational Model `__
82 |
83 |
84 | **Requirements**
85 | """
86 | from __future__ import unicode_literals, print_function, division
87 | from io import open
88 | import unicodedata
89 | import string
90 | import re
91 | import random
92 |
93 | import torch
94 | import torch.nn as nn
95 | from torch.autograd import Variable
96 | from torch import optim
97 | import torch.nn.functional as F
98 |
99 | use_cuda = torch.cuda.is_available()
100 |
101 | ######################################################################
102 | # Loading data files
103 | # ==================
104 | #
105 | # The data for this project is a set of many thousands of English to
106 | # French translation pairs.
107 | #
108 | # `This question on Open Data Stack
109 | # Exchange `__
110 | # pointed me to the open translation site http://tatoeba.org/ which has
111 | # downloads available at http://tatoeba.org/eng/downloads - and better
112 | # yet, someone did the extra work of splitting language pairs into
113 | # individual text files here: http://www.manythings.org/anki/
114 | #
115 | # The English to French pairs are too big to include in the repo, so
116 | # download to ``data/eng-fra.txt`` before continuing. The file is a tab
117 | # separated list of translation pairs:
118 | #
119 | # ::
120 | #
121 | # I am cold. Je suis froid.
122 | #
123 | # .. Note::
124 | # Download the data from
125 | # `here `_
126 | # and extract it to the current directory.
127 |
128 | ######################################################################
129 | # Similar to the character encoding used in the character-level RNN
130 | # tutorials, we will be representing each word in a language as a one-hot
131 | # vector, or giant vector of zeros except for a single one (at the index
132 | # of the word). Compared to the dozens of characters that might exist in a
133 | # language, there are many many more words, so the encoding vector is much
134 | # larger. We will however cheat a bit and trim the data to only use a few
135 | # thousand words per language.
136 | #
137 | # .. figure:: /_static/img/seq-seq-images/word-encoding.png
138 | # :alt:
139 | #
140 | #
141 |
142 |
143 | ######################################################################
144 | # We'll need a unique index per word to use as the inputs and targets of
145 | # the networks later. To keep track of all this we will use a helper class
146 | # called ``Lang`` which has word → index (``word2index``) and index → word
147 | # (``index2word``) dictionaries, as well as a count of each word
148 | # ``word2count`` to use to later replace rare words.
149 | #
150 |
151 | SOS_token = 0
152 | EOS_token = 1
153 |
154 |
155 | class Lang:
156 | def __init__(self, name):
157 | self.name = name
158 | self.word2index = {}
159 | self.word2count = {}
160 | self.index2word = {0: "SOS", 1: "EOS"}
161 | self.n_words = 2 # Count SOS and EOS
162 |
163 | def addSentence(self, sentence):
164 | for word in sentence.split(' '):
165 | self.addWord(word)
166 |
167 | def addWord(self, word):
168 | if word not in self.word2index:
169 | self.word2index[word] = self.n_words
170 | self.word2count[word] = 1
171 | self.index2word[self.n_words] = word
172 | self.n_words += 1
173 | else:
174 | self.word2count[word] += 1
175 |
176 |
177 | ######################################################################
178 | # The files are all in Unicode; to simplify, we will turn Unicode
179 | # characters into ASCII, make everything lowercase, and trim most
180 | # punctuation.
181 | #
182 |
183 | # Turn a Unicode string to plain ASCII, thanks to
184 | # http://stackoverflow.com/a/518232/2809427
185 | def unicodeToAscii(s):
186 | return ''.join(
187 | c for c in unicodedata.normalize('NFD', s)
188 | if unicodedata.category(c) != 'Mn'
189 | )
190 |
191 |
192 | # Lowercase, trim, and remove non-letter characters
193 |
194 |
195 | def normalizeString(s):
196 | s = unicodeToAscii(s.lower().strip())
197 | s = re.sub(r"([.!?])", r" \1", s)
198 | s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
199 | return s
200 |
201 |
202 | ######################################################################
203 | # To read the data file we will split the file into lines, and then split
204 | # lines into pairs. The files are all English → Other Language, so if we
205 | # want to translate from Other Language → English I added the ``reverse``
206 | # flag to reverse the pairs.
207 | #
208 |
209 | def readLangs(lang1, lang2, reverse=False):
210 | print("Reading lines...")
211 |
212 | # Read the file and split into lines
213 | lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8'). \
214 | read().strip().split('\n')
215 |
216 | # Split every line into pairs and normalize
217 | pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
218 |
219 | # Reverse pairs, make Lang instances
220 | if reverse:
221 | pairs = [list(reversed(p)) for p in pairs]
222 | input_lang = Lang(lang2)
223 | output_lang = Lang(lang1)
224 | else:
225 | input_lang = Lang(lang1)
226 | output_lang = Lang(lang2)
227 |
228 | return input_lang, output_lang, pairs
229 |
230 |
231 | ######################################################################
232 | # Since there are a *lot* of example sentences and we want to train
233 | # something quickly, we'll trim the data set to only relatively short and
234 | # simple sentences. Here the maximum length is 10 words (that includes
235 | # ending punctuation) and we're filtering to sentences that translate to
236 | # the form "I am" or "He is" etc. (accounting for apostrophes replaced
237 | # earlier).
238 | #
239 |
240 | MAX_LENGTH = 10
241 |
242 | eng_prefixes = (
243 | "i am ", "i m ",
244 | "he is", "he s ",
245 | "she is", "she s",
246 | "you are", "you re ",
247 | "we are", "we re ",
248 | "they are", "they re "
249 | )
250 |
251 |
252 | def filterPair(p):
253 | return len(p[0].split(' ')) < MAX_LENGTH and \
254 | len(p[1].split(' ')) < MAX_LENGTH and \
255 | p[1].startswith(eng_prefixes)
256 |
257 |
258 | def filterPairs(pairs):
259 | return [pair for pair in pairs if filterPair(pair)]
260 |
261 |
262 | ######################################################################
263 | # The full process for preparing the data is:
264 | #
265 | # - Read text file and split into lines, split lines into pairs
266 | # - Normalize text, filter by length and content
267 | # - Make word lists from sentences in pairs
268 | #
269 |
270 | def prepareData(lang1, lang2, reverse=False):
271 | input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
272 | print("Read %s sentence pairs" % len(pairs))
273 | pairs = filterPairs(pairs)
274 | print("Trimmed to %s sentence pairs" % len(pairs))
275 | print("Counting words...")
276 | for pair in pairs:
277 | input_lang.addSentence(pair[0])
278 | output_lang.addSentence(pair[1])
279 | print("Counted words:")
280 | print(input_lang.name, input_lang.n_words)
281 | print(output_lang.name, output_lang.n_words)
282 | return input_lang, output_lang, pairs
283 |
284 |
285 | input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
286 | print(random.choice(pairs))
287 |
288 |
289 | ######################################################################
290 | # The Seq2Seq Model
291 | # =================
292 | #
293 | # A Recurrent Neural Network, or RNN, is a network that operates on a
294 | # sequence and uses its own output as input for subsequent steps.
295 | #
296 | # A `Sequence to Sequence network `__, or
297 | # seq2seq network, or `Encoder Decoder
298 | # network `__, is a model
299 | # consisting of two RNNs called the encoder and decoder. The encoder reads
300 | # an input sequence and outputs a single vector, and the decoder reads
301 | # that vector to produce an output sequence.
302 | #
303 | # .. figure:: /_static/img/seq-seq-images/seq2seq.png
304 | # :alt:
305 | #
306 | # Unlike sequence prediction with a single RNN, where every input
307 | # corresponds to an output, the seq2seq model frees us from sequence
308 | # length and order, which makes it ideal for translation between two
309 | # languages.
310 | #
311 | # Consider the sentence "Je ne suis pas le chat noir" → "I am not the
312 | # black cat". Most of the words in the input sentence have a direct
313 | # translation in the output sentence, but are in slightly different
314 | # orders, e.g. "chat noir" and "black cat". Because of the "ne/pas"
315 | # construction there is also one more word in the input sentence. It would
316 | # be difficult to produce a correct translation directly from the sequence
317 | # of input words.
318 | #
319 | # With a seq2seq model the encoder creates a single vector which, in the
320 | # ideal case, encodes the "meaning" of the input sequence into a single
321 | # vector — a single point in some N dimensional space of sentences.
322 | #
323 |
324 |
325 | ######################################################################
326 | # The Encoder
327 | # -----------
328 | #
329 | # The encoder of a seq2seq network is a RNN that outputs some value for
330 | # every word from the input sentence. For every input word the encoder
331 | # outputs a vector and a hidden state, and uses the hidden state for the
332 | # next input word.
333 | #
334 | # .. figure:: /_static/img/seq-seq-images/encoder-network.png
335 | # :alt:
336 | #
337 | #
338 |
339 | class EncoderRNN(nn.Module):
340 | def __init__(self, input_size, hidden_size, n_layers=1):
341 | super(EncoderRNN, self).__init__()
342 | self.n_layers = n_layers
343 | self.hidden_size = hidden_size
344 |
345 | self.embedding = nn.Embedding(input_size, hidden_size)
346 | self.gru = nn.GRU(hidden_size, hidden_size)
347 |
348 | def forward(self, input, hidden):
349 | embedded = self.embedding(input).view(1, 1, -1)
350 | output = embedded
351 | for i in range(self.n_layers):
352 | output, hidden = self.gru(output, hidden)
353 | return output, hidden
354 |
355 | def initHidden(self):
356 | result = Variable(torch.zeros(1, 1, self.hidden_size))
357 | if use_cuda:
358 | return result.cuda()
359 | else:
360 | return result
361 |
362 |
363 | ######################################################################
364 | # The Decoder
365 | # -----------
366 | #
367 | # The decoder is another RNN that takes the encoder output vector(s) and
368 | # outputs a sequence of words to create the translation.
369 | #
370 |
371 |
372 | ######################################################################
373 | # Simple Decoder
374 | # ^^^^^^^^^^^^^^
375 | #
376 | # In the simplest seq2seq decoder we use only last output of the encoder.
377 | # This last output is sometimes called the *context vector* as it encodes
378 | # context from the entire sequence. This context vector is used as the
379 | # initial hidden state of the decoder.
380 | #
381 | # At every step of decoding, the decoder is given an input token and
382 | # hidden state. The initial input token is the start-of-string ``<SOS>``
383 | # token, and the first hidden state is the context vector (the encoder's
384 | # last hidden state).
385 | #
386 | # .. figure:: /_static/img/seq-seq-images/decoder-network.png
387 | # :alt:
388 | #
389 | #
390 |
391 | class DecoderRNN(nn.Module):
392 | def __init__(self, hidden_size, output_size, n_layers=1):
393 | super(DecoderRNN, self).__init__()
394 | self.n_layers = n_layers
395 | self.hidden_size = hidden_size
396 |
397 | self.embedding = nn.Embedding(output_size, hidden_size)
398 | self.gru = nn.GRU(hidden_size, hidden_size)
399 | self.out = nn.Linear(hidden_size, output_size)
400 | self.softmax = nn.LogSoftmax()
401 |
402 | def forward(self, input, hidden):
403 | output = self.embedding(input).view(1, 1, -1)
404 | for i in range(self.n_layers):
405 | output = F.relu(output)
406 | output, hidden = self.gru(output, hidden)
407 | output = self.softmax(self.out(output[0]))
408 | return output, hidden
409 |
410 | def initHidden(self):
411 | result = Variable(torch.zeros(1, 1, self.hidden_size))
412 | if use_cuda:
413 | return result.cuda()
414 | else:
415 | return result
416 |
417 |
418 | ######################################################################
419 | # I encourage you to train and observe the results of this model, but to
420 | # save space we'll be going straight for the gold and introducing the
421 | # Attention Mechanism.
422 | #
423 |
424 |
425 | ######################################################################
426 | # Attention Decoder
427 | # ^^^^^^^^^^^^^^^^^
428 | #
429 | # If only the context vector is passed between the encoder and decoder,
430 | # that single vector carries the burden of encoding the entire sentence.
431 | #
432 | # Attention allows the decoder network to "focus" on a different part of
433 | # the encoder's outputs for every step of the decoder's own outputs. First
434 | # we calculate a set of *attention weights*. These will be multiplied by
435 | # the encoder output vectors to create a weighted combination. The result
436 | # (called ``attn_applied`` in the code) should contain information about
437 | # that specific part of the input sequence, and thus help the decoder
438 | # choose the right output words.
439 | #
440 | # .. figure:: https://i.imgur.com/1152PYf.png
441 | # :alt:
442 | #
443 | # Calculating the attention weights is done with another feed-forward
444 | # layer ``attn``, using the decoder's input and hidden state as inputs.
445 | # Because there are sentences of all sizes in the training data, to
446 | # actually create and train this layer we have to choose a maximum
447 | # sentence length (input length, for encoder outputs) that it can apply
448 | # to. Sentences of the maximum length will use all the attention weights,
449 | # while shorter sentences will only use the first few.
450 | #
451 | # .. figure:: /_static/img/seq-seq-images/attention-decoder-network.png
452 | # :alt:
453 | #
454 | #
455 |
456 | class AttnDecoderRNN(nn.Module):
457 | def __init__(self, hidden_size, output_size, n_layers=1, dropout_p=0.1, max_length=MAX_LENGTH):
458 | super(AttnDecoderRNN, self).__init__()
459 | self.hidden_size = hidden_size
460 | self.output_size = output_size
461 | self.n_layers = n_layers
462 | self.dropout_p = dropout_p
463 | self.max_length = max_length
464 |
465 | self.embedding = nn.Embedding(self.output_size, self.hidden_size)
466 | self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
467 | self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
468 | self.dropout = nn.Dropout(self.dropout_p)
469 | self.gru = nn.GRU(self.hidden_size, self.hidden_size)
470 | self.out = nn.Linear(self.hidden_size, self.output_size)
471 |
472 | def forward(self, input, hidden, encoder_output, encoder_outputs):
473 | embedded = self.embedding(input).view(1, 1, -1)
474 | embedded = self.dropout(embedded)
475 |
476 | attn_weights = F.softmax(
477 | self.attn(torch.cat((embedded[0], hidden[0]), 1)))
478 | attn_applied = torch.bmm(attn_weights.unsqueeze(0),
479 | encoder_outputs.unsqueeze(0))
480 |
481 | output = torch.cat((embedded[0], attn_applied[0]), 1)
482 | output = self.attn_combine(output).unsqueeze(0)
483 |
484 | for i in range(self.n_layers):
485 | output = F.relu(output)
486 | output, hidden = self.gru(output, hidden)
487 |
488 | output = F.log_softmax(self.out(output[0]))
489 | return output, hidden, attn_weights
490 |
491 | def initHidden(self):
492 | result = Variable(torch.zeros(1, 1, self.hidden_size))
493 | if use_cuda:
494 | return result.cuda()
495 | else:
496 | return result
497 |
498 |
499 | ######################################################################
500 | # .. note:: There are other forms of attention that work around the length
501 | # limitation by using a relative position approach. Read about "local
502 | # attention" in `Effective Approaches to Attention-based Neural Machine
503 | # Translation `__.
504 | #
505 | # Training
506 | # ========
507 | #
508 | # Preparing Training Data
509 | # -----------------------
510 | #
511 | # To train, for each pair we will need an input tensor (indexes of the
512 | # words in the input sentence) and target tensor (indexes of the words in
513 | # the target sentence). While creating these vectors we will append the
514 | # EOS token to both sequences.
515 | #
516 |
517 | def indexesFromSentence(lang, sentence):
518 | return [lang.word2index[word] for word in sentence.split(' ')]
519 |
520 |
521 | def variableFromSentence(lang, sentence):
522 | indexes = indexesFromSentence(lang, sentence)
523 | indexes.append(EOS_token)
524 | result = Variable(torch.LongTensor(indexes).view(-1, 1))
525 | if use_cuda:
526 | return result.cuda()
527 | else:
528 | return result
529 |
530 |
531 | def variablesFromPair(pair):
532 | input_variable = variableFromSentence(input_lang, pair[0])
533 | target_variable = variableFromSentence(output_lang, pair[1])
534 | return (input_variable, target_variable)
535 |
536 |
537 | ######################################################################
538 | # Training the Model
539 | # ------------------
540 | #
541 | # To train we run the input sentence through the encoder, and keep track
542 | # of every output and the latest hidden state. Then the decoder is given
543 | # the ``<SOS>`` token as its first input, and the last hidden state of the
544 | # encoder as its first hidden state.
545 | #
546 | # "Teacher forcing" is the concept of using the real target outputs as
547 | # each next input, instead of using the decoder's guess as the next input.
548 | # Using teacher forcing causes it to converge faster but `when the trained
549 | # network is exploited, it may exhibit
550 | # instability `__.
551 | #
552 | # You can observe outputs of teacher-forced networks that read with
553 | # coherent grammar but wander far from the correct translation -
554 | # intuitively it has learned to represent the output grammar and can "pick
555 | # up" the meaning once the teacher tells it the first few words, but it
556 | # has not properly learned how to create the sentence from the translation
557 | # in the first place.
558 | #
559 | # Because of the freedom PyTorch's autograd gives us, we can randomly
560 | # choose to use teacher forcing or not with a simple if statement. Turn
561 | # ``teacher_forcing_ratio`` up to use more of it.
562 | #
563 |
564 | teacher_forcing_ratio = 0.5
565 |
566 |
567 | def train(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion,
568 | max_length=MAX_LENGTH):
569 | encoder_hidden = encoder.initHidden()
570 |
571 | encoder_optimizer.zero_grad()
572 | decoder_optimizer.zero_grad()
573 |
574 | input_length = input_variable.size()[0]
575 | target_length = target_variable.size()[0]
576 |
577 | encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size))
578 | encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs
579 |
580 | loss = 0
581 |
582 | for ei in range(input_length):
583 | encoder_output, encoder_hidden = encoder(
584 | input_variable[ei], encoder_hidden)
585 | encoder_outputs[ei] = encoder_output[0][0]
586 |
587 | decoder_input = Variable(torch.LongTensor([[SOS_token]]))
588 | decoder_input = decoder_input.cuda() if use_cuda else decoder_input
589 |
590 | decoder_hidden = encoder_hidden
591 |
592 | use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
593 |
594 | if use_teacher_forcing:
595 | # Teacher forcing: Feed the target as the next input
596 | for di in range(target_length):
597 | decoder_output, decoder_hidden, decoder_attention = decoder(
598 | decoder_input, decoder_hidden, encoder_output, encoder_outputs)
599 | loss += criterion(decoder_output[0], target_variable[di])
600 | decoder_input = target_variable[di] # Teacher forcing
601 |
602 | else:
603 | # Without teacher forcing: use its own predictions as the next input
604 | for di in range(target_length):
605 | decoder_output, decoder_hidden, decoder_attention = decoder(
606 | decoder_input, decoder_hidden, encoder_output, encoder_outputs)
607 | topv, topi = decoder_output.data.topk(1)
608 | ni = topi[0][0]
609 |
610 | decoder_input = Variable(torch.LongTensor([[ni]]))
611 | decoder_input = decoder_input.cuda() if use_cuda else decoder_input
612 |
613 | loss += criterion(decoder_output[0], target_variable[di])
614 | if ni == EOS_token:
615 | break
616 |
617 | loss.backward()
618 |
619 | encoder_optimizer.step()
620 | decoder_optimizer.step()
621 |
622 | return loss.data[0] / target_length
623 |
624 |
625 | ######################################################################
626 | # This is a helper function to print time elapsed and estimated time
627 | # remaining given the current time and progress %.
628 | #
629 |
630 | import time
631 | import math
632 |
633 |
634 | def asMinutes(s):
635 | m = math.floor(s / 60)
636 | s -= m * 60
637 | return '%dm %ds' % (m, s)
638 |
639 |
640 | def timeSince(since, percent):
641 | now = time.time()
642 | s = now - since
643 | es = s / (percent)
644 | rs = es - s
645 | return '%s (- %s)' % (asMinutes(s), asMinutes(rs))
646 |
647 |
648 | ######################################################################
649 | # The whole training process looks like this:
650 | #
651 | # - Start a timer
652 | # - Initialize optimizers and criterion
653 | # - Create set of training pairs
654 | # - Start empty losses array for plotting
655 | #
656 | # Then we call ``train`` many times and occasionally print the progress (%
657 | # of examples, time so far, estimated time) and average loss.
658 | #
659 |
660 | def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
661 | start = time.time()
662 | plot_losses = []
663 | print_loss_total = 0 # Reset every print_every
664 | plot_loss_total = 0 # Reset every plot_every
665 |
666 | encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
667 | decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
668 | training_pairs = [variablesFromPair(random.choice(pairs))
669 | for i in range(n_iters)]
670 | criterion = nn.NLLLoss()
671 |
672 | for iter in range(1, n_iters + 1):
673 | training_pair = training_pairs[iter - 1]
674 | input_variable = training_pair[0]
675 | target_variable = training_pair[1]
676 |
677 | loss = train(input_variable, target_variable, encoder,
678 | decoder, encoder_optimizer, decoder_optimizer, criterion)
679 | print_loss_total += loss
680 | plot_loss_total += loss
681 |
682 | if iter % print_every == 0:
683 | print_loss_avg = print_loss_total / print_every
684 | print_loss_total = 0
685 | print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
686 | iter, iter / n_iters * 100, print_loss_avg))
687 |
688 | if iter % plot_every == 0:
689 | plot_loss_avg = plot_loss_total / plot_every
690 | plot_losses.append(plot_loss_avg)
691 | plot_loss_total = 0
692 |
693 | showPlot(plot_losses)
694 |
695 |
696 | ######################################################################
697 | # Plotting results
698 | # ----------------
699 | #
700 | # Plotting is done with matplotlib, using the array of loss values
701 | # ``plot_losses`` saved while training.
702 | #
703 |
704 | import matplotlib.pyplot as plt
705 | import matplotlib.ticker as ticker
706 | import numpy as np
707 |
708 |
709 | def showPlot(points):
710 | plt.figure()
711 | fig, ax = plt.subplots()
712 | # this locator puts ticks at regular intervals
713 | loc = ticker.MultipleLocator(base=0.2)
714 | ax.yaxis.set_major_locator(loc)
715 | plt.plot(points)
716 |
717 |
718 | ######################################################################
719 | # Evaluation
720 | # ==========
721 | #
722 | # Evaluation is mostly the same as training, but there are no targets so
723 | # we simply feed the decoder's predictions back to itself for each step.
724 | # Every time it predicts a word we add it to the output string, and if it
725 | # predicts the EOS token we stop there. We also store the decoder's
726 | # attention outputs for display later.
727 | #
728 |
729 | def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
730 | input_variable = variableFromSentence(input_lang, sentence)
731 | input_length = input_variable.size()[0]
732 | encoder_hidden = encoder.initHidden()
733 |
734 | encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size))
735 | encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs
736 |
737 | for ei in range(input_length):
738 | encoder_output, encoder_hidden = encoder(input_variable[ei],
739 | encoder_hidden)
740 | encoder_outputs[ei] = encoder_outputs[ei] + encoder_output[0][0]
741 |
742 | decoder_input = Variable(torch.LongTensor([[SOS_token]])) # SOS
743 | decoder_input = decoder_input.cuda() if use_cuda else decoder_input
744 |
745 | decoder_hidden = encoder_hidden
746 |
747 | decoded_words = []
748 | decoder_attentions = torch.zeros(max_length, max_length)
749 |
750 | for di in range(max_length):
751 | decoder_output, decoder_hidden, decoder_attention = decoder(
752 | decoder_input, decoder_hidden, encoder_output, encoder_outputs)
753 | decoder_attentions[di] = decoder_attention.data
754 | topv, topi = decoder_output.data.topk(1)
755 | ni = topi[0][0]
756 | if ni == EOS_token:
757 | decoded_words.append('<EOS>')
758 | break
759 | else:
760 | decoded_words.append(output_lang.index2word[ni])
761 |
762 | decoder_input = Variable(torch.LongTensor([[ni]]))
763 | decoder_input = decoder_input.cuda() if use_cuda else decoder_input
764 |
765 | return decoded_words, decoder_attentions[:di + 1]
766 |
767 |
768 | ######################################################################
769 | # We can evaluate random sentences from the training set and print out the
770 | # input, target, and output to make some subjective quality judgements:
771 | #
772 |
773 | def evaluateRandomly(encoder, decoder, n=10):
774 | for i in range(n):
775 | pair = random.choice(pairs)
776 | print('>', pair[0])
777 | print('=', pair[1])
778 | output_words, attentions = evaluate(encoder, decoder, pair[0])
779 | output_sentence = ' '.join(output_words)
780 | print('<', output_sentence)
781 | print('')
782 |
783 |
784 | ######################################################################
785 | # Training and Evaluating
786 | # =======================
787 | #
788 | # With all these helper functions in place (it looks like extra work, but
789 | # it makes it easier to run multiple experiments) we can actually
790 | # initialize a network and start training.
791 | #
792 | # Remember that the input sentences were heavily filtered. For this small
793 | # dataset we can use relatively small networks of 256 hidden nodes and a
794 | # single GRU layer. After about 40 minutes on a MacBook CPU we'll get some
795 | # reasonable results.
796 | #
797 | # .. Note::
798 | # If you run this notebook you can train, interrupt the kernel,
799 | # evaluate, and continue training later. Comment out the lines where the
800 | # encoder and decoder are initialized and run ``trainIters`` again.
801 | #
802 |
803 | hidden_size = 256
804 | encoder1 = EncoderRNN(input_lang.n_words, hidden_size)
805 | attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words,
806 | 1, dropout_p=0.1)
807 |
808 | if use_cuda:
809 | encoder1 = encoder1.cuda()
810 | attn_decoder1 = attn_decoder1.cuda()
811 |
812 | trainIters(encoder1, attn_decoder1, 75000, print_every=5000)
813 |
814 | ######################################################################
815 | #
816 |
817 | evaluateRandomly(encoder1, attn_decoder1)
818 |
819 | ######################################################################
820 | # Visualizing Attention
821 | # ---------------------
822 | #
823 | # A useful property of the attention mechanism is its highly interpretable
824 | # outputs. Because it is used to weight specific encoder outputs of the
825 | # input sequence, we can imagine looking where the network is focused most
826 | # at each time step.
827 | #
828 | # You could simply run ``plt.matshow(attentions)`` to see attention output
829 | # displayed as a matrix, with the columns being input steps and rows being
830 | # output steps:
831 | #
832 |
833 | output_words, attentions = evaluate(
834 | encoder1, attn_decoder1, "je suis trop froid .")
835 | plt.matshow(attentions.numpy())
836 |
837 |
838 | ######################################################################
839 | # For a better viewing experience we will do the extra work of adding axes
840 | # and labels:
841 | #
842 |
843 | def showAttention(input_sentence, output_words, attentions):
844 | # Set up figure with colorbar
845 | fig = plt.figure()
846 | ax = fig.add_subplot(111)
847 | cax = ax.matshow(attentions.numpy(), cmap='bone')
848 | fig.colorbar(cax)
849 |
850 | # Set up axes
851 | ax.set_xticklabels([''] + input_sentence.split(' ') +
852 | ['<EOS>'], rotation=90)
853 | ax.set_yticklabels([''] + output_words)
854 |
855 | # Show label at every tick
856 | ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
857 | ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
858 |
859 | plt.show()
860 |
861 |
862 | def evaluateAndShowAttention(input_sentence):
863 | output_words, attentions = evaluate(
864 | encoder1, attn_decoder1, input_sentence)
865 | print('input =', input_sentence)
866 | print('output =', ' '.join(output_words))
867 | showAttention(input_sentence, output_words, attentions)
868 |
869 |
870 | evaluateAndShowAttention("elle a cinq ans de moins que moi .")
871 |
872 | evaluateAndShowAttention("elle est trop petit .")
873 |
874 | evaluateAndShowAttention("je ne crains pas de mourir .")
875 |
876 | evaluateAndShowAttention("c est un jeune directeur plein de talent .")
877 |
878 |
879 | ######################################################################
880 | # Exercises
881 | # =========
882 | #
883 | # - Try with a different dataset
884 | #
885 | # - Another language pair
886 | # - Human → Machine (e.g. IOT commands)
887 | # - Chat → Response
888 | # - Question → Answer
889 | #
890 | # - Replace the embeddings with pre-trained word embeddings such as word2vec or
891 | # GloVe
892 | # - Try with more layers, more hidden units, and more sentences. Compare
893 | # the training time and results.
894 | # - If you use a translation file where pairs have two of the same phrase
895 | # (``I am test \t I am test``), you can use this as an autoencoder. Try
896 | # this:
897 | #
898 | # - Train as an autoencoder
899 | # - Save only the Encoder network
900 | # - Train a new Decoder for translation from there
901 | #
--------------------------------------------------------------------------------