├── references ├── code │ ├── __init__.py │ ├── vanilla-gan.py │ └── seq2seq_translation_tutorial.py ├── __init__.py └── papers │ ├── 1707.07328.pdf │ ├── WORDS OR CHARACTERS _ FINE-GRAINED GATING.pdf │ ├── deep reinforcement learning for dialogue generation.pdf │ ├── Semi-Supervised QA with Generative Domain-Adaptive Nets.pdf │ └── learning cooperative visual dialog agents with deep reinforcement learning.pdf ├── .gitignore ├── src ├── __init__.py ├── util │ ├── util.pyc │ ├── __init__.py │ ├── data_proc.pyc │ ├── masked_cross_entropy.py │ ├── util.py │ ├── test.py │ └── data_proc.py ├── GAN_model │ ├── __init__.py │ ├── GAN_main.py │ └── GAN_model.py ├── G_c_a_sep │ ├── __init__.py │ ├── G_eval.py │ ├── G_main.py │ ├── G_c_a_sep.py │ └── G_train.py ├── D_baseline │ ├── D_eval.pyc │ ├── D_train.pyc │ ├── __init__.py │ ├── D_baseline_model.pyc │ ├── D_model.py │ ├── D_eval.py │ ├── D_main.py │ └── D_train.py ├── G_baseline │ ├── __init__.py │ ├── G_main.py │ ├── G_train.py │ ├── G_eval.py │ └── G_model.py └── model_zoo.py └── .idea ├── GAN-QA.iml └── misc.xml /references/code/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | 3 | .gitignore 4 | .idea -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division -------------------------------------------------------------------------------- /src/util/util.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/src/util/util.pyc -------------------------------------------------------------------------------- /src/util/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division -------------------------------------------------------------------------------- /references/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division -------------------------------------------------------------------------------- /src/GAN_model/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division -------------------------------------------------------------------------------- /src/G_c_a_sep/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division -------------------------------------------------------------------------------- /src/util/data_proc.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/src/util/data_proc.pyc -------------------------------------------------------------------------------- /src/D_baseline/D_eval.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/src/D_baseline/D_eval.pyc -------------------------------------------------------------------------------- /src/D_baseline/D_train.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/src/D_baseline/D_train.pyc -------------------------------------------------------------------------------- /src/D_baseline/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division -------------------------------------------------------------------------------- /src/G_baseline/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division -------------------------------------------------------------------------------- /references/papers/1707.07328.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/references/papers/1707.07328.pdf -------------------------------------------------------------------------------- /src/D_baseline/D_baseline_model.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/src/D_baseline/D_baseline_model.pyc -------------------------------------------------------------------------------- /references/papers/WORDS OR CHARACTERS _ FINE-GRAINED GATING.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/references/papers/WORDS OR CHARACTERS _ FINE-GRAINED GATING.pdf -------------------------------------------------------------------------------- /references/papers/deep reinforcement learning for dialogue generation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/references/papers/deep reinforcement learning for dialogue generation.pdf -------------------------------------------------------------------------------- /references/papers/Semi-Supervised QA with Generative Domain-Adaptive Nets.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/references/papers/Semi-Supervised QA with Generative Domain-Adaptive Nets.pdf -------------------------------------------------------------------------------- /references/papers/learning cooperative visual dialog agents with deep reinforcement learning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/weilinie/GAN-QA/HEAD/references/papers/learning cooperative visual dialog agents with deep reinforcement learning.pdf -------------------------------------------------------------------------------- /.idea/GAN-QA.iml: -------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- --------------------------------------------------------------------------------
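The tree above splits the project into a generator (G_baseline, G_c_a_sep), a discriminator (D_baseline), a combined GAN (GAN_model), and shared utilities (util). As a rough orientation before the sources themselves, the sketch below shows how the two models are typically constructed; the constructor signatures and hyper-parameters are copied from G_baseline/G_main.py and D_baseline/D_main.py further down, and the data_proc helpers are assumed to behave exactly as they are used there. This is an illustrative sketch, not a file from the repository.

# orientation sketch (illustrative): wiring the generator and discriminator as the *_main.py scripts do
from data_proc import readGlove, read_raw_squad, tokenize_squad, count_effective_num_tokens, generate_look_up_table
from G_model import G   # src/G_baseline/G_model.py
from D_model import D   # src/D_baseline/D_model.py

# GloVe vectors and tokenized SQuAD (context, question, answer) triples
embeddings_index, embeddings_size = readGlove('glove.6B/glove.6B.100d.txt')
triplets = tokenize_squad(read_raw_squad('squad/train-v1.1.json'), embeddings_index)
effective_tokens, effective_num_tokens = count_effective_num_tokens(triplets, embeddings_index)
word2index, index2word = generate_look_up_table(effective_tokens, effective_num_tokens)

# generator: RNN encoder over (context + answer) embeddings, attention decoder over the vocabulary
generator = G(embeddings_size, 256, 1, 2,                        # encoder: input size, hidden size, layers, directions
              embeddings_size, 256, effective_num_tokens, 1, 2,  # decoder: input size, hidden size, vocab size, layers, directions
              batch_size=5)
# discriminator: RNN encoder over a concatenated (context + question + answer) sequence, MLP -> P(real)
discriminator = D(embeddings_size, 256, 1, 1,                    # encoder: input size, hidden size, layers, directions
                  64, 1, 1, True,                                # MLP: hidden size, attention weights, output size, use_attn
                  batch_size=100)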
/src/D_baseline/D_model.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | import os 4 | sys.path.append(os.path.abspath(__file__ + "/../../")) 5 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 6 | 7 | # FIXME: spacy has some problem with torch. need to import spacy first. therefore import data_proc first. 8 | from data_proc import * 9 | from model_zoo import * 10 | 11 | import torch 12 | import torch.nn as nn 13 | 14 | use_cuda = torch.cuda.is_available() 15 | 16 | ###################################################################### 17 | # The Encoder 18 | # ----------- 19 | # FIXME: not sure if __name__ is to be used. 20 | # if __name__ == '__main__': 21 | 22 | class D(nn.Module): 23 | 24 | def __init__(self, enc_input_size, enc_hidden_size, enc_n_layers, num_directions, 25 | mlp_hidden_size, num_attn_weights, mlp_output_size, use_attn, 26 | batch_size): 27 | # super constructor 28 | super(D, self).__init__() 29 | 30 | self.encoder = EncoderRNN(enc_input_size, enc_hidden_size, batch_size, enc_n_layers, num_directions) 31 | self.mlp = MLP(mlp_hidden_size, mlp_output_size, self.encoder, num_attn_weights, use_attn = True) 32 | 33 | 34 | def forward(self, inputs, seq_lens, hidden=None): 35 | # input size = (seq len, batch size, word embedding dimension) 36 | 37 | # encoding 38 | # outputs dim (seq_len, batch size, hidden_size*num_directions) 39 | encoder_outputs, encoder_hidden = self.encoder(inputs, seq_lens) 40 | 41 | # MLP 42 | out = self.mlp(encoder_outputs) 43 | 44 | return out 45 | 46 | 47 | def backward(self, out, labels, criterion, optimizer): 48 | loss = criterion(out, labels) 49 | loss.backward() 50 | optimizer.step() 51 | return loss 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /src/D_baseline/D_eval.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | import os 4 | sys.path.append(os.path.abspath(__file__ + "/../../")) 5 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 6 | 7 | from data_proc import * 8 | from D_model import * 9 | 10 | import torch 11 | from torch.autograd import Variable 12 | 13 | use_cuda = torch.cuda.is_available() 14 | 15 | 16 | def evaluate(discriminator, triplets, 17 | word2index, embeddings_index, embeddings_size, 18 | eval_batch_size=10): 19 | 20 | # prepare batch 21 | training_batch, seq_lens, fake_training_batch, fake_seq_lens = get_random_batch(triplets, eval_batch_size, with_fake=True) 22 | # concat the context_ans batch with the question batch 23 | # each element in the training batch is context + question + answer 24 | training_batch, _, seq_lens = prepare_batch_var(training_batch, seq_lens, fake_training_batch, fake_seq_lens, 25 | eval_batch_size, word2index, embeddings_index, embeddings_size, 26 | mode = ['word'], concat_opt='cqa', with_fake=True) 27 | 28 | train_input = Variable(training_batch[0].cuda()) if use_cuda else Variable( 29 | training_batch[0]) # embeddings vectors, size = [seq len x batch size x embedding dim] 30 | true_labels = Variable(torch.FloatTensor(training_batch[-1]).cuda()) if use_cuda else Variable( 31 | torch.FloatTensor(training_batch[-1])) 32 | 33 | # pass through discriminator model 34 | outputs = discriminator.forward(train_input, true_labels, seq_lens[0]) 35 | 36 | # get label predictions from model & compare the number of correct predictions 37 | pred_labels = torch.zeros(outputs.size()) 38 | num_correct_pred = 0 39 
| for i in range(outputs.size(0)): 40 | pred_labels[i] = 0 if outputs.data[i][0] <= 0.5 else 1 41 | if pred_labels[i][0] == true_labels[i].data[0]: 42 | num_correct_pred += 1 43 | 44 | print('percentage of correct predictions (True/False): ' + 45 | str(float(num_correct_pred)/float(outputs.size(0))*100) + '%.\n') 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /src/util/masked_cross_entropy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import functional 3 | from torch.autograd import Variable 4 | 5 | def sequence_mask(sequence_length, max_len=None): 6 | if max_len is None: 7 | max_len = sequence_length.data.max() 8 | batch_size = sequence_length.size(0) 9 | seq_range = torch.range(0, max_len - 1).long() 10 | seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) 11 | seq_range_expand = Variable(seq_range_expand) 12 | if sequence_length.is_cuda: 13 | seq_range_expand = seq_range_expand.cuda() 14 | seq_length_expand = (sequence_length.unsqueeze(1) 15 | .expand_as(seq_range_expand)) 16 | return seq_range_expand < seq_length_expand 17 | 18 | 19 | def masked_cross_entropy(logits, target, length): 20 | length = Variable(torch.LongTensor(length)).cuda() 21 | 22 | """ 23 | Args: 24 | logits: A Variable containing a FloatTensor of size 25 | (batch, max_len, num_classes) which contains the 26 | unnormalized probability for each class. 27 | target: A Variable containing a LongTensor of size 28 | (batch, max_len) which contains the index of the true 29 | class for each corresponding step. 30 | length: A Variable containing a LongTensor of size (batch,) 31 | which contains the length of each data in a batch. 32 | Returns: 33 | loss: An average loss value masked by the length. 
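        Example (illustrative; shapes follow the argument description above, and
        the tensors are placed on the GPU because `length` is moved to CUDA
        unconditionally below):
            logits = Variable(torch.randn(2, 5, 10).cuda())                  # batch=2, max_len=5, num_classes=10
            target = Variable(torch.LongTensor(2, 5).random_(0, 10).cuda())  # gold class index per step
            loss = masked_cross_entropy(logits, target, [5, 3])              # second sequence scored on its first 3 steps only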
34 | """ 35 | 36 | # logits_flat: (batch * max_len, num_classes) 37 | logits_flat = logits.view(-1, logits.size(-1)) 38 | # log_probs_flat: (batch * max_len, num_classes) 39 | log_probs_flat = functional.log_softmax(logits_flat) 40 | # target_flat: (batch * max_len, 1) 41 | target_flat = target.view(-1, 1) 42 | # losses_flat: (batch * max_len, 1) 43 | losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat) 44 | # losses: (batch, max_len) 45 | losses = losses_flat.view(*target.size()) 46 | # mask: (batch, max_len) 47 | mask = sequence_mask(sequence_length=length, max_len=target.size(1)) 48 | losses = losses * mask.float() 49 | loss = losses.sum() / length.float().sum() 50 | return loss -------------------------------------------------------------------------------- /src/D_baseline/D_main.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function 3 | from __future__ import division 4 | 5 | import sys 6 | import os 7 | sys.path.append(os.path.abspath(__file__ + "/../../")) 8 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 9 | from data_proc import * 10 | 11 | from D_model import * 12 | from D_train import * 13 | from D_eval import * 14 | import numpy as np 15 | 16 | from torch import optim 17 | 18 | use_cuda = torch.cuda.is_available() 19 | 20 | 21 | ######### set paths 22 | # TODO: to run properly, change the following paths and filenames 23 | # default values for the dataset and the path to the project/dataset 24 | dataset = 'squad' 25 | f_name = 'dev-v1.1.json' 26 | path_to_dataset = os.path.abspath(__file__ + '/../../../../') + '/data/' 27 | path_to_data = path_to_dataset + dataset + '/' + f_name 28 | GLOVE_DIR = path_to_dataset + 'glove.6B/' 29 | # path for experiment outputs 30 | # exp_name = 'QG_seq2seq_baseline' 31 | path_to_exp_out = os.path.abspath(__file__ + '/../../../../') + '/exp_results_D_temp/' 32 | loss_f = 'loss_temp.txt' 33 | sample_out_f = 'sample_outputs_temp.txt' 34 | path_to_loss_f = path_to_exp_out + '/' + loss_f 35 | path_to_sample_out_f = path_to_exp_out + '/' + sample_out_f 36 | 37 | 38 | ######### first load the pretrained word embeddings 39 | path_to_glove = os.path.join(GLOVE_DIR, 'glove.6B.50d.txt') 40 | embeddings_index, embeddings_size = readGlove(path_to_glove) 41 | 42 | 43 | ######### read corpus 44 | raw_triplets = read_raw_squad(path_to_data) 45 | triplets = tokenize_squad(raw_triplets, embeddings_index) 46 | 47 | # find max length of context, question, answer, respectively 48 | max_len_c, max_len_q, max_len_a = max_length(triplets) 49 | 50 | ######### corpus preprocessing 51 | # words that do not appear in embeddings, etc 52 | 53 | ## find all unique tokens in the data (should be a subset of the number of embeddings) 54 | effective_tokens, effective_num_tokens = count_effective_num_tokens(triplets, embeddings_index) 55 | print('effective number of tokens: ' + str(effective_num_tokens)) 56 | print('expected initial loss: ' + str(-np.log(1/float(effective_num_tokens))) + '\n') 57 | # build word2index dictionary and index2word dictionary 58 | word2index, index2word = generate_look_up_table(effective_tokens, effective_num_tokens) 59 | 60 | 61 | ######### set up model 62 | enc_hidden_size = 256 63 | enc_n_layers = 1 64 | num_directions = 1 65 | mlp_hidden_size = 64 66 | mlp_output_size = 1 67 | num_attn_weights = 1 # 1000 68 | use_attn = True 69 | batch_size = 100 70 | enc_lr = 0.01 71 | mlp_lr = 0.01 72 | learning_rate = 0.001 73 | discriminator = 
D(embeddings_size, enc_hidden_size, enc_n_layers, num_directions, 74 | mlp_hidden_size, num_attn_weights, mlp_output_size, use_attn, 75 | batch_size) 76 | if use_cuda: 77 | discriminator = discriminator.cuda() 78 | 79 | criterion = nn.BCELoss() 80 | optimizer = optim.Adam(discriminator.parameters(), lr=learning_rate) 81 | 82 | 83 | ######### start training 84 | to_file = False 85 | train(discriminator, criterion, optimizer, batch_size, embeddings_size, 86 | embeddings_index, word2index, index2word, triplets, 87 | to_file, path_to_loss_f, path_to_sample_out_f, path_to_exp_out, 88 | n_iters=3000, print_every=100, plot_every=1) 89 | 90 | 91 | # save the final model 92 | # if to_file: 93 | # torch.save(encoder, path_to_exp_out+'/encoder.pth') 94 | # torch.save(mlp, path_to_exp_out+'/mlp.pth') 95 | 96 | 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /src/G_c_a_sep/G_eval.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.abspath(__file__ + "/../../")) 4 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 5 | from data_proc import * 6 | 7 | import torch 8 | from torch.autograd import Variable 9 | 10 | use_cuda = torch.cuda.is_available() 11 | 12 | 13 | # max_length constrains the maximum length of the generated question 14 | def evaluate(generator, triplets, embeddings_index, embeddings_size, word2index, index2word, max_length, 15 | to_file = False, sample_out_f = None): 16 | 17 | # prepare test input 18 | batch_size = 1 19 | training, seq_lens = get_random_batch(triplets, batch_size) 20 | context_words = training[0] 21 | answer_words = training[2] 22 | question_words = training[1] 23 | training, _, seq_lens = prepare_batch_var(training, seq_lens, batch_size, 24 | word2index, embeddings_index, embeddings_size) 25 | inputs = [] 26 | for var in training: 27 | if not isinstance(var, list): 28 | inputs.append(Variable(var.cuda())) if use_cuda else inputs.append(Variable(var)) 29 | # NOTE not currently appending start and end index to inputs because model does not use them 30 | # else: 31 | # inputs.append(Variable(inputs)) 32 | 33 | inputs_q = None 34 | 35 | all_decoder_outputs = generator.forward(inputs, seq_lens, batch_size, max_length, 36 | embeddings_index, embeddings_size, word2index, index2word, 37 | teacher_forcing_ratio=0) 38 | 39 | decoded_sentences = [] 40 | decoded_words = [] 41 | for b in range(batch_size): 42 | # get the word token and add to the list of words 43 | for di in range(max_length): 44 | # top value and index of every batch 45 | topv, topi = all_decoder_outputs[di,b].data.topk(1) 46 | ni = topi[0] 47 | if (ni == word2index['EOS']) or (ni == word2index['PAD']): 48 | decoded_words.append('EOS') 49 | # decoder_attentions[di] = decoder_attention[0].data 50 | break 51 | else: 52 | decoded_words.append(index2word[ni]) 53 | decoded_sentences.append(decoded_words) 54 | 55 | # print results 56 | if not to_file: 57 | print('context > ' + ' '.join(context_words[0]).encode('utf-8').strip()) 58 | print('answer > ' + ' '.join(answer_words[0]).encode('utf-8').strip()) 59 | print('question > ' + ' '.join(question_words[0]).encode('utf-8').strip()) 60 | # true_q = [] 61 | # for i in range(seq_lens[1][0]): 62 | # true_q.append(index2word[inputs_q[i][0].data[0]]) 63 | # print('question with padding> ' + ' '.join(true_q)) 64 | print('generated question > ' + ' '.join(decoded_words)) 65 | return decoded_words 66 | else: 67 | 
sample_out_f.write(unicode('context > ' + ' '.join(context_words[0]) + '\n')) 68 | sample_out_f.write(unicode('answer > ' + ' '.join(answer_words[0]) + '\n')) 69 | sample_out_f.write(unicode('question > ' + ' '.join(question_words[0]) + '\n')) 70 | sample_out_f.write(unicode('generated question > ' + ' '.join(decoded_words) + '\n')) 71 | 72 | # TODO: uncomment the following return if you want to record the decoder outputs in file 73 | # (note: need to modify this function call in G_train.py) 74 | # return decoded_sentences 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /src/G_baseline/G_main.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division 3 | 4 | import sys 5 | import os 6 | sys.path.append(os.path.abspath(__file__ + "/../../")) 7 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 8 | 9 | from G_train import * 10 | from G_model import * 11 | import numpy as np 12 | 13 | global use_cuda 14 | use_cuda = torch.cuda.is_available() 15 | teacher_forcing_ratio = 0.75 # default in original code is 0.5 16 | 17 | 18 | ######### set paths 19 | # TODO: to run properly, change the following paths and filenames 20 | # default values for the dataset and the path to the project/dataset 21 | dataset = 'squad' 22 | f_name = 'train-v1.1.json' 23 | path_to_dataset = '/home/jack/Documents/QA_QG/data/' 24 | path_to_data = path_to_dataset + dataset + '/' + f_name 25 | GLOVE_DIR = path_to_dataset + 'glove.6B/' 26 | 27 | 28 | ######### first load the pretrained word embeddings 29 | path_to_glove = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt') 30 | embeddings_index, embeddings_size = readGlove(path_to_glove) 31 | 32 | 33 | ######### read corpus 34 | raw_triplets = read_raw_squad(path_to_data) 35 | triplets = tokenize_squad(raw_triplets, embeddings_index) 36 | 37 | # find max length of context, question, answer, respectively 38 | # max_len_c, max_len_q, max_len_a = max_length(triplets) 39 | 40 | ######### corpus preprocessing 41 | # words that do not appear in embeddings, etc 42 | 43 | ## find all unique tokens in the data (should be a subset of the number of embeddings) 44 | effective_tokens, effective_num_tokens = count_effective_num_tokens(triplets, embeddings_index) 45 | print('effective number of tokens: ' + str(effective_num_tokens)) 46 | print('expected initial loss: ' + str(-np.log(1/float(effective_num_tokens))) + '\n') 47 | # build word2index dictionary and index2word dictionary 48 | word2index, index2word = generate_look_up_table(effective_tokens, effective_num_tokens) 49 | 50 | 51 | print('reading and preprocessing data complete.') 52 | print('found %s unique tokens in the intersection of corpus and word embeddings.' 
% effective_num_tokens) 53 | if use_cuda: 54 | print('GPU ready.') 55 | print('') 56 | print('start training...') 57 | print('') 58 | 59 | 60 | ######### set up model 61 | enc_hidden_size = 256 62 | enc_n_layers = 1 63 | enc_num_directions = 2 64 | dec_hidden_size = 256 65 | dec_n_layers = 1 66 | dec_num_directions = 2 67 | batch_size = 5 68 | learning_rate = 0.0005 69 | 70 | generator = G(embeddings_size, enc_hidden_size, enc_n_layers, enc_num_directions, 71 | embeddings_size, dec_hidden_size, effective_num_tokens, dec_n_layers, dec_num_directions, 72 | batch_size) 73 | 74 | if use_cuda: 75 | generator = generator.cuda() 76 | 77 | optimizer = optim.Adam(generator.parameters(), lr=learning_rate) 78 | criterion = nn.NLLLoss() 79 | 80 | # max_length of generated question 81 | max_length = 100 82 | to_file = False 83 | 84 | # open the files 85 | if to_file: 86 | exp_name = 'G_pretrain_exp_0827' 87 | path_to_exp_out = '/home/jack/Documents/QA_QG/exp_results_temp/' 88 | if not os.path.exists(path_to_exp_out+exp_name): 89 | os.mkdir(path_to_exp_out+exp_name) 90 | loss_f = 'loss_temp.txt' 91 | sample_out_f = 'sample_outputs_temp.txt' 92 | path_to_loss_f = path_to_exp_out + exp_name + '/' + loss_f 93 | path_to_sample_out_f = path_to_exp_out + exp_name + '/' + sample_out_f 94 | loss_f = open(path_to_loss_f,'w+') 95 | sample_out_f = open(path_to_sample_out_f, 'w+') 96 | else: 97 | loss_f = None 98 | sample_out_f = None 99 | 100 | trainIters(generator, optimizer, batch_size, embeddings_size, 101 | embeddings_index, word2index, index2word, max_length, triplets, teacher_forcing_ratio, 102 | to_file, loss_f, sample_out_f, 103 | n_iters = 1, print_every=1, plot_every=1) 104 | 105 | # save the final model 106 | if to_file: 107 | torch.save(generator, path_to_exp_out + exp_name +'/generator_temp.pth') 108 | 109 | 110 | -------------------------------------------------------------------------------- /src/D_baseline/D_train.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function 3 | from __future__ import division 4 | 5 | import sys 6 | import os 7 | import time 8 | sys.path.append(os.path.abspath(__file__ + "/../../")) 9 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 10 | from data_proc import * 11 | # FIXME: had some problem importing util.py; importing successful but 12 | # functions cannot be called (NameError: global name XXX is not defined) 13 | # fast solution: copied asMinutes and timeSince functions here 14 | from util import * 15 | 16 | import torch 17 | from torch.autograd import Variable 18 | from D_eval import * 19 | 20 | use_cuda = torch.cuda.is_available() 21 | 22 | import time 23 | import math 24 | 25 | # FIXME: added these two functions because import util does not seem to work (see above) 26 | def asMinutes(s): 27 | m = math.floor(s / 60) 28 | s -= m * 60 29 | return '%dm %ds' % (m, s) 30 | 31 | def timeSince(since, percent): 32 | now = time.time() 33 | s = now - since 34 | es = s / (percent) 35 | rs = es - s 36 | return '%s (- %s)' % (asMinutes(s), asMinutes(rs)) 37 | 38 | 39 | ###################################################################### 40 | # Training the Model 41 | # context = input_variable 42 | def train(discriminator, criterion, optimizer, batch_size, embeddings_size, 43 | embeddings_index, word2index, index2word, triplets, 44 | to_file, path_to_loss_f, path_to_sample_out_f, path_to_exp_out, 45 | n_iters=10, print_every=10, plot_every=100): 46 | 47 | begin_time = time.time() 48 | 
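# Overview of the loop below: each iteration draws a batch mixing real and fake
# (context, question, answer) triples via get_random_batch(..., with_fake=True), embeds and
# concatenates every example as context + question + answer (concat_opt='cqa'), and updates
# the discriminator against the real/fake labels that prepare_batch_var returns as the
# last element of the batch.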
49 | # open the files 50 | if to_file: 51 | loss_f = open(path_to_loss_f,'w+') 52 | sample_out_f = open(path_to_sample_out_f, 'w+') 53 | 54 | # plot_losses = [] 55 | print_loss_total = 0 # Reset every print_every 56 | plot_loss_total = 0 # Reset every plot_every 57 | 58 | print() 59 | 60 | for iter in range(1, n_iters + 1): 61 | 62 | # prepare batch 63 | training_batch, seq_lens, fake_training_batch, fake_seq_lens = get_random_batch(triplets, batch_size, with_fake=True) 64 | # concat the context_ans batch with the question batch 65 | # each element in the training batch is context + question + answer 66 | training_batch, _, seq_lens = prepare_batch_var(training_batch, seq_lens, fake_training_batch, fake_seq_lens, 67 | batch_size, word2index, embeddings_index, embeddings_size, 68 | mode = ['word'], concat_opt='cqa', with_fake=True) 69 | 70 | train_input = Variable(training_batch[0].cuda()) if use_cuda else Variable(training_batch[0]) # embeddings vectors, size = [seq len x batch size x embedding dim] 71 | # the labels are the last element of training_batch; see prepare_batch_var in data_proc.py for detail 72 | train_label = Variable(torch.FloatTensor(training_batch[-1]).cuda()) if use_cuda else Variable(torch.FloatTensor(training_batch[-1])) 73 | 74 | optimizer.zero_grad() 75 | loss = 0 76 | outputs = discriminator.forward(train_input, seq_lens[0]) 77 | loss += discriminator.backward(outputs, train_label, criterion, optimizer) 78 | 79 | print_loss_total += loss.data[0] 80 | plot_loss_total += loss.data[0] 81 | 82 | # log on console 83 | if iter % print_every == 0: 84 | print_loss_avg = print_loss_total / print_every 85 | print_loss_total = 0 86 | print('%s (%d %d%%) %.4f' % (timeSince(begin_time, iter / float(n_iters)), 87 | iter, iter / n_iters * 100, print_loss_avg)) 88 | evaluate(discriminator, triplets, word2index, embeddings_index, embeddings_size, eval_batch_size=100) 89 | print('-------------------------------') 90 | print('-------------------------------') 91 | print() 92 | 93 | # save error to file for plotting later 94 | if iter % plot_every == 0: 95 | plot_loss_avg = plot_loss_total / plot_every 96 | # plot_losses.append(plot_loss_avg) 97 | plot_loss_total = 0 98 | if to_file: 99 | loss_f.write(unicode(plot_loss_avg)) 100 | loss_f.write(unicode('\n')) 101 | 102 | # showPlot(plot_losses) 103 | if to_file: 104 | loss_f.close() 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /src/util/util.py: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | # Plotting results 3 | # ---------------- 4 | # 5 | # Plotting is done with matplotlib, using the array of loss values 6 | # ``plot_losses`` saved while training. 7 | # 8 | import matplotlib 9 | matplotlib.use('Agg') 10 | import matplotlib.pyplot as plt 11 | import matplotlib.ticker as ticker 12 | import numpy as np 13 | import difflib 14 | 15 | 16 | def showPlot(points): 17 | plt.figure() 18 | fig, ax = plt.subplots() 19 | # this locator puts ticks at regular intervals 20 | loc = ticker.MultipleLocator(base=0.2) 21 | ax.yaxis.set_major_locator(loc) 22 | plt.plot(points) 23 | 24 | 25 | 26 | def extract(v): 27 | return v.data.storage().tolist() 28 | 29 | 30 | 31 | ###################################################################### 32 | # This is a helper function to print time elapsed and estimated time 33 | # remaining given the current time and progress %. 
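# e.g. timeSince(since, 0.25) with 192 s already elapsed returns '3m 12s (- 9m 36s)':
# 3m 12s spent, an estimated 9m 36s left to reach 100%.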
34 | # 35 | 36 | import time 37 | import math 38 | 39 | def asMinutes(s): 40 | m = math.floor(s / 60) 41 | s -= m * 60 42 | return '%dm %ds' % (m, s) 43 | 44 | def timeSince(since, percent): 45 | now = time.time() 46 | s = now - since 47 | es = s / (percent) 48 | rs = es - s 49 | return '%s (- %s)' % (asMinutes(s), asMinutes(rs)) 50 | 51 | 52 | 53 | ###################################################################### 54 | # show loss function 55 | def plotLoss(loss_f, plot_every, save_path=None, from_file=True, f_name='loss.png', title='training loss'): 56 | if from_file: 57 | loss_vec = [] 58 | with open(loss_f) as f: 59 | content = f.readlines() 60 | content = [x.strip() for x in content] # list of every line, each a string 61 | for line in content: 62 | try: 63 | loss_vec.append(float(line)) 64 | except ValueError: 65 | pass 66 | else: 67 | loss_vec = loss_f 68 | # plot 69 | plt.figure() 70 | plt.title(title) 71 | plt.xlabel('training iterations') 72 | plt.ylabel('loss') 73 | plt.grid() 74 | plt.plot([x*plot_every for x in range(1, len(loss_vec)+1)], loss_vec) 75 | if save_path == None: 76 | plt.savefig(f_name) 77 | else: 78 | plt.savefig(save_path + '/' + f_name) 79 | 80 | # test 81 | # from util import * 82 | # plotLoss('../../../exp_results_temp/G_c_a_sep_pretrain_exp_0902/loss_temp.txt', 30) 83 | 84 | 85 | ###################################################################### 86 | # check if the generated question already exist in the corpus 87 | def generated_q_novelty(triplets, generated_q): 88 | # input - tokenized triplets, each one a list of strings 89 | # input - generated question 90 | # output - a similarity score vector for each of the questions in the triplets 91 | scores = [] 92 | if not (isinstance(generated_q, str) or isinstance(generated_q, unicode)): 93 | generated_q = ' '.join(generated_q) 94 | for idx in range(len(triplets)): 95 | q = ' '.join(triplets[idx][1]) 96 | scores.append(difflib.SequenceMatcher(None, generated_q, q).ratio) 97 | return np.array(scores) 98 | # test 99 | 100 | 101 | # ###################################################################### 102 | # # For a better viewing experience we will do the extra work of adding axes 103 | # # and labels: 104 | # # 105 | # def showAttention(input_sentence, output_words, attentions): 106 | # # Set up figure with colorbar 107 | # fig = plt.figure() 108 | # ax = fig.add_subplot(111) 109 | # cax = ax.matshow(attentions.numpy(), cmap='bone') 110 | # fig.colorbar(cax) 111 | 112 | # # Set up axes 113 | # ax.set_xticklabels([''] + input_sentence.split(' ') + 114 | # [''], rotation=90) 115 | # ax.set_yticklabels([''] + output_words) 116 | 117 | # # Show label at every tick 118 | # ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) 119 | # ax.yaxis.set_major_locator(ticker.MultipleLocator(1)) 120 | 121 | # plt.show() 122 | 123 | 124 | # def evaluateAndShowAttention(input_sentence): 125 | # output_words, attentions = evaluate( 126 | # encoder1, attn_decoder1, input_sentence) 127 | # print('input =', input_sentence) 128 | # print('output =', ' '.join(output_words)) 129 | # showAttention(input_sentence, output_words, attentions) 130 | 131 | 132 | -------------------------------------------------------------------------------- /src/G_c_a_sep/G_main.py: -------------------------------------------------------------------------------- 1 | # from __future__ import print_function 2 | # from __future__ import division 3 | 4 | import sys 5 | import os 6 | sys.path.append(os.path.abspath(__file__ + "/../../")) 7 | 8 | from 
G_train import * 9 | from G_c_a_sep import * 10 | # import numpy as np 11 | from torch import optim 12 | 13 | global use_cuda 14 | use_cuda = torch.cuda.is_available() 15 | teacher_forcing_ratio = 0.75 # default in original code is 0.5 16 | 17 | 18 | ######### set paths 19 | # TODO: to run properly, change the following paths and filenames 20 | # default values for the dataset and the path to the project/dataset 21 | dataset = 'squad' 22 | f_name = 'train-v1.1.json' 23 | path_to_dataset = '/home/jack/Documents/QA_QG/data/' 24 | path_to_data = path_to_dataset + dataset + '/' + f_name 25 | GLOVE_DIR = path_to_dataset + 'glove.6B/' 26 | 27 | 28 | ######### first load the pretrained word embeddings 29 | path_to_glove = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt') 30 | embeddings_index, embeddings_size = readGlove(path_to_glove) 31 | 32 | 33 | ######### read corpus - only the sentence containing the answer as context 34 | # raw_triplets = read_raw_squad(path_to_data) 35 | # sent_c_triplets = get_ans_sentence(raw_triplets) 36 | # sent_c_triplets = tokenize_squad(sent_c_triplets, embeddings_index, opt='sent') 37 | import pickle 38 | load_path = '/home/jack/Documents/QA_QG/data/processed/' 39 | # triplets = pickle.load(open(load_path+'triplets.txt', 'rb')) 40 | # sent_c_triplets = pickle.load(open(load_path+'sent_c_triplets.txt', 'rb')) 41 | windowed_c_triplets_30_noEOS = pickle.load(open(load_path+'windowed_c_triplets_30_noEOS.txt', 'rb')) 42 | # triplets = sent_c_triplets 43 | triplets = windowed_c_triplets_30_noEOS 44 | 45 | # find max length of context, question, answer, respectively 46 | # max_len_c, max_len_q, max_len_a = max_length(triplets) 47 | 48 | ######### corpus preprocessing 49 | # words that do not appear in embeddings, etc 50 | 51 | ## find all unique tokens in the data (should be a subset of the number of embeddings) 52 | effective_tokens, effective_num_tokens = count_effective_num_tokens(triplets, embeddings_index) 53 | print('effective number of tokens: ' + str(effective_num_tokens)) 54 | print('expected initial loss: ' + str(-np.log(1/float(effective_num_tokens))) + '\n') 55 | # build word2index dictionary and index2word dictionary 56 | word2index, index2word = generate_look_up_table(effective_tokens, effective_num_tokens) 57 | 58 | 59 | print('reading and preprocessing data complete.') 60 | print('found %s unique tokens in the intersection of corpus and word embeddings.' 
% effective_num_tokens) 61 | if use_cuda: 62 | print('GPU ready.') 63 | print('') 64 | print('start training...') 65 | print('') 66 | 67 | 68 | ######### set up model 69 | enc_hidden_size = 256 70 | enc_n_layers = 1 71 | enc_num_directions = 2 72 | dec_hidden_size = 256 73 | dec_n_layers = 1 74 | dec_num_directions = 2 75 | batch_size = 5 76 | learning_rate = 0.001 77 | 78 | generator = G(embeddings_size, enc_hidden_size, enc_n_layers, enc_num_directions, 79 | embeddings_size, dec_hidden_size, effective_num_tokens, dec_n_layers, dec_num_directions, 80 | batch_size) 81 | 82 | if use_cuda: 83 | generator = generator.cuda() 84 | 85 | optimizer = optim.Adam(generator.parameters(), lr=learning_rate) 86 | criterion = nn.NLLLoss() 87 | 88 | # max_length of generated question 89 | max_length = 100 90 | to_file = True 91 | 92 | # open the files 93 | if to_file: 94 | exp_name = 'G_c_a_sep_pretrain_exp_windowed_c_noEOS_0911' 95 | path_to_exp_out = '/home/jack/Documents/QA_QG/exp_results_temp/' 96 | if not os.path.exists(path_to_exp_out+exp_name): 97 | os.mkdir(path_to_exp_out+exp_name) 98 | loss_f = 'loss_temp.txt' 99 | sample_out_f = 'sample_outputs_temp.txt' 100 | path_to_loss_f = path_to_exp_out + exp_name + '/' + loss_f 101 | path_to_sample_out_f = path_to_exp_out + exp_name + '/' + sample_out_f 102 | loss_f = open(path_to_loss_f,'w+') 103 | sample_out_f = open(path_to_sample_out_f, 'w+') 104 | else: 105 | loss_f = None 106 | sample_out_f = None 107 | path_to_exp_out = None 108 | 109 | trainIters(generator, optimizer, batch_size, embeddings_size, 110 | embeddings_index, word2index, index2word, max_length, triplets, teacher_forcing_ratio, 111 | to_file, loss_f, sample_out_f, path_to_exp_out, 112 | n_iters=30000, print_every=300, plot_every=30, checkpoint_every=6000) 113 | 114 | # save the final model 115 | if to_file: 116 | torch.save(generator, path_to_exp_out + exp_name +'/generator_temp.pth.tar') 117 | 118 | 119 | -------------------------------------------------------------------------------- /src/G_baseline/G_train.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function 3 | from __future__ import division 4 | 5 | import sys 6 | import os 7 | sys.path.append(os.path.abspath(__file__ + "/../../")) 8 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 9 | from data_proc import * 10 | # FIXME: had some problem importing util.py; importing successful but 11 | # functions cannot be called (NameError: global name XXX is not defined) 12 | # fast solution: copied asMinutes and timeSince functions herefrom util import * 13 | from G_eval import * 14 | 15 | import torch 16 | import torch.nn as nn 17 | from torch import optim 18 | from torch.autograd import Variable 19 | import torch.nn.functional as F 20 | import time 21 | 22 | use_cuda = torch.cuda.is_available() 23 | 24 | 25 | import time 26 | import math 27 | 28 | # FIXME: added these two functions because import util does not seem to work (see above) 29 | def asMinutes(s): 30 | m = math.floor(s / 60) 31 | s -= m * 60 32 | return '%dm %ds' % (m, s) 33 | 34 | def timeSince(since, percent): 35 | now = time.time() 36 | s = now - since 37 | es = s / (percent) 38 | rs = es - s 39 | return '%s (- %s)' % (asMinutes(s), asMinutes(rs)) 40 | 41 | 42 | 43 | def trainIters(generator, optimizer, batch_size, embeddings_size, 44 | embeddings_index, word2index, index2word, max_length, triplets, teacher_forcing_ratio, 45 | to_file, loss_f, sample_out_f, 46 | n_iters=5, 
print_every=10, plot_every=100): 47 | 48 | begin_time = time.time() 49 | 50 | # plot_losses = [] 51 | print_loss_total = 0 # Reset every print_every 52 | plot_loss_total = 0 # Reset every plot_every 53 | 54 | print() 55 | 56 | for iter in range(1, n_iters + 1): 57 | 58 | # prepare batch 59 | training_batch, seq_lens = get_random_batch(triplets, batch_size) 60 | training_batch, _, seq_lens = prepare_batch_var(training_batch, seq_lens, batch_size, word2index, embeddings_index, embeddings_size, use_cuda=1, mode=['word', 'index'], concat_opt='ca') 61 | inputs_ca = Variable(training_batch[0].cuda()) if use_cuda else Variable(training_batch[0]) # embeddings vectors, size = [seq len x batch size x embedding dim] 62 | inputs_q = Variable(training_batch[1].cuda()) if use_cuda else Variable(training_batch[1]) # represented as indices, size = [seq len x batch size] 63 | 64 | max_c_a_len = max(seq_lens[0]) # max seq length of context + ans combined 65 | max_q_len = max(seq_lens[1]) # max seq length of question 66 | 67 | optimizer.zero_grad() 68 | loss = 0 69 | all_decoder_outputs = generator.forward(inputs_ca, inputs_q, seq_lens[0], batch_size, max_q_len, 70 | embeddings_index, embeddings_size, word2index, index2word, 71 | teacher_forcing_ratio) 72 | loss += generator.backward(all_decoder_outputs, inputs_q, seq_lens[1], optimizer) 73 | 74 | print_loss_total += loss.data[0] 75 | plot_loss_total += loss.data[0] 76 | 77 | if iter % print_every == 0: 78 | print_loss_avg = print_loss_total / print_every 79 | print_loss_total = 0 80 | print('%s (%d %d%%) %.4f' % (timeSince(begin_time, iter / float(n_iters)), 81 | iter, iter / n_iters * 100, print_loss_avg)) 82 | print('---sample generated question---') 83 | # sample a triple and print the generated question 84 | evaluate(generator, triplets, embeddings_index, embeddings_size, word2index, index2word, max_length) 85 | print('-------------------------------') 86 | print('-------------------------------') 87 | print() 88 | 89 | if iter % plot_every == 0: 90 | plot_loss_avg = plot_loss_total / plot_every 91 | # plot_losses.append(plot_loss_avg) 92 | plot_loss_total = 0 93 | if to_file: 94 | loss_f.write(unicode('%s (%d %d%%)\n' % (timeSince(begin_time, iter / float(n_iters)), iter, float(iter) / float(n_iters) * 100))) 95 | loss_f.write(unicode(plot_loss_avg)) 96 | loss_f.write(unicode('\n')) 97 | sample_out_f.write(unicode('%s (%d %d%%)\n' % (timeSince(begin_time, iter / float(n_iters)), iter, float(iter) / float(n_iters) * 100))) 98 | evaluate(generator, triplets, embeddings_index, embeddings_size, word2index, index2word, max_length, to_file, sample_out_f) 99 | sample_out_f.write(unicode('\n')) 100 | 101 | 102 | 103 | # showPlot(plot_losses) 104 | if to_file: 105 | loss_f.close() 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /src/G_baseline/G_eval.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.abspath(__file__ + "/../../")) 4 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 5 | from data_proc import * 6 | from util import * 7 | 8 | import torch 9 | import torch.nn as nn 10 | from torch import optim 11 | from torch.autograd import Variable 12 | import torch.nn.functional as F 13 | import time 14 | 15 | use_cuda = torch.cuda.is_available() 16 | 17 | 18 | # max_length constrains the maximum length of the generated question 19 | def evaluate(generator, triplets, embeddings_index, 
embeddings_size, word2index, index2word, max_length, 20 | to_file = False, sample_out_f = None): 21 | 22 | # prepare test input 23 | batch_size = 1 24 | training, seq_lens = get_random_batch(triplets, batch_size) 25 | context_words = training[0] 26 | answer_words = training[2] 27 | question_words = training[1] 28 | training, _, seq_lens = prepare_batch_var(training, seq_lens, batch_size, word2index, embeddings_index, embeddings_size, mode=['word', 'index'], concat_opt='ca') 29 | inputs_ca = Variable(training[0].cuda()) if use_cuda else Variable(training[0]) # embeddings vectors, size = [seq len x batch size x embedding dim] 30 | # inputs_q = Variable(training[1].cuda()) if use_cuda else Variable(training[1]) # represented as indices, size = [seq len x batch size] 31 | inputs_q = None 32 | 33 | all_decoder_outputs = generator.forward(inputs_ca, inputs_q, seq_lens[0], batch_size, max_length, 34 | embeddings_index, embeddings_size, word2index, index2word, 35 | teacher_forcing_ratio=0) 36 | 37 | decoded_sentences = [] 38 | decoded_words = [] 39 | for b in range(batch_size): 40 | # get the word token and add to the list of words 41 | for di in range(max_length): 42 | # top value and index of every batch 43 | topv, topi = all_decoder_outputs[di,b].data.topk(1) 44 | ni = topi[0] 45 | if (ni == word2index['EOS']) or (ni == word2index['PAD']): 46 | decoded_words.append('EOS') 47 | # decoder_attentions[di] = decoder_attention[0].data 48 | break 49 | else: 50 | decoded_words.append(index2word[ni]) 51 | decoded_sentences.append(decoded_words) 52 | 53 | # print results 54 | if not to_file: 55 | print('context > ' + ' '.join(context_words[0]).encode('utf-8').strip()) 56 | print('answer > ' + ' '.join(answer_words[0]).encode('utf-8').strip()) 57 | print('question > ' + ' '.join(question_words[0]).encode('utf-8').strip()) 58 | # true_q = [] 59 | # for i in range(seq_lens[1][0]): 60 | # true_q.append(index2word[inputs_q[i][0].data[0]]) 61 | # print('question with padding> ' + ' '.join(true_q)) 62 | print('generated question > ' + ' '.join(decoded_words)) 63 | else: 64 | sample_out_f.write(unicode('context > ' + ' '.join(context_words[0]) + '\n')) 65 | sample_out_f.write(unicode('answer > ' + ' '.join(answer_words[0]) + '\n')) 66 | sample_out_f.write(unicode('question > ' + ' '.join(question_words[0]) + '\n')) 67 | sample_out_f.write(unicode('generated question > ' + ' '.join(decoded_words) + '\n')) 68 | 69 | # TODO: uncomment the following return if you want to record the decoder outputs in file 70 | # (note: need to modify this function call in G_train.py) 71 | # return decoded_sentences 72 | 73 | 74 | def G_sampler(generator, ca, embeddings_index, embeddings_size, word2index, index2word, max_length): 75 | # NOTE currently only generate one question at a time. 
multiple questions not yet supported 76 | 77 | var = torch.FloatTensor(len(ca), embeddings_size) 78 | for j in range(len(ca)): 79 | var[j] = embeddings_index[ca[j]] 80 | var = var.unsqueeze(1) 81 | if use_cuda: 82 | var = Variable(var.cuda()) 83 | else: 84 | var = Variable(var) 85 | 86 | decoder_output = generator.forward(var, None, [len(ca)], 1, max_length, 87 | embeddings_index, embeddings_size, word2index, index2word, 88 | teacher_forcing_ratio=0).detach() 89 | decoder_output = decoder_output.squeeze(1) 90 | 91 | decoded_words = [] 92 | for di in range(max_length): 93 | # top value and index of every batch 94 | topv, topi = decoder_output[di].data.topk(1) 95 | ni = topi[0] 96 | if (ni == word2index['EOS']) or (ni == word2index['PAD']): 97 | decoded_words.append('EOS') 98 | # decoder_attentions[di] = decoder_attention[0].data 99 | break 100 | else: 101 | decoded_words.append(index2word[ni]) 102 | 103 | return decoded_words 104 | 105 | -------------------------------------------------------------------------------- /src/G_baseline/G_model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import random 4 | sys.path.append(os.path.abspath(__file__ + "/../../")) 5 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 6 | 7 | from model_zoo import * 8 | from masked_cross_entropy import * 9 | import torch 10 | import torch.nn as nn 11 | from torch.autograd import Variable 12 | 13 | use_cuda = torch.cuda.is_available() 14 | 15 | 16 | class G(nn.Module): 17 | def __init__(self, enc_input_size, enc_hidden_size, enc_n_layers, enc_num_directions, 18 | dec_input_size, dec_hidden_size, output_size, dec_n_layers, dec_num_directions, 19 | batch_size, use_attn=True): 20 | super(G, self).__init__() 21 | self.encoder = EncoderRNN(enc_input_size, enc_hidden_size, batch_size, enc_n_layers, enc_num_directions) 22 | if use_attn: 23 | self.decoder = AttnDecoderRNN(dec_input_size, dec_hidden_size, output_size, self.encoder, 24 | dec_n_layers, dec_num_directions) 25 | else: 26 | # TODO: complete case when not using attention (add decoder class in model zoo) 27 | pass 28 | 29 | 30 | def forward(self, inputs_ca, inputs_q, seq_lens, batch_size, max_q_len, 31 | embeddings_index, embeddings_size, word2index, index2word, teacher_forcing_ratio): 32 | # context encoding 33 | # output size: (seq_len, batch, hidden_size) 34 | # hidden size: (num_layers, batch, hidden_size) 35 | # the collection of all hidden states per batch is of size (seq_len, batch, hidden_size * num_directions) 36 | encoder_hiddens, encoder_hidden = self.encoder(inputs_ca, seq_lens, None) 37 | 38 | print(type(encoder_hiddens.data)) 39 | print(encoder_hiddens.size()) 40 | 41 | # decoder 42 | # prepare decoder inputs as word embeddings in a batch 43 | # decoder_input size: (1, batch size, embedding size); first dim is 1 because only one time step; 44 | # nee to have a 3D tensor for input to nn.GRU module 45 | decoder_input = Variable(embeddings_index['SOS'].repeat(batch_size, 1).unsqueeze(0)) 46 | # init all decoder outputs 47 | all_decoder_outputs = Variable(torch.zeros(max_q_len, batch_size, self.decoder.output_size)) 48 | if use_cuda: 49 | decoder_input = decoder_input.cuda() 50 | all_decoder_outputs = all_decoder_outputs.cuda() 51 | 52 | use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False 53 | 54 | if use_teacher_forcing: 55 | # Teacher forcing: Feed the target as the next input 56 | for di in range(max_q_len): 57 | decoder_output, 
decoder_hidden, decoder_attention = self.decoder( 58 | decoder_input, encoder_hiddens, embeddings_index) 59 | 60 | all_decoder_outputs[di] = decoder_output 61 | 62 | # change next time step input to current target output, in embedding format 63 | decoder_input = Variable(torch.FloatTensor(1, batch_size, embeddings_size).cuda()) if use_cuda else \ 64 | Variable(torch.FloatTensor(1, batch_size, embeddings_size)) 65 | for b in range(batch_size): 66 | decoder_input[0, b] = embeddings_index[index2word[inputs_q[di, b].data[0]]].cuda() if use_cuda else \ 67 | embeddings_index[index2word[inputs_q[di, b].data[0]]] # Teacher forcing 68 | 69 | else: 70 | # Without teacher forcing: use its own predictions as the next input 71 | for di in range(max_q_len): 72 | decoder_output, decoder_hidden, decoder_attention = self.decoder( 73 | decoder_input, encoder_hiddens, embeddings_index) 74 | 75 | all_decoder_outputs[di] = decoder_output 76 | 77 | # top value and index of every batch 78 | # size of both topv, topi = (batch size, 1) 79 | topv, topi = decoder_output.data.topk(1) 80 | 81 | # get the output word for every batch 82 | decoder_input = Variable(torch.FloatTensor(1, batch_size, embeddings_size).cuda()) if use_cuda else \ 83 | Variable(torch.FloatTensor(1, batch_size, embeddings_size)) 84 | for b in range(batch_size): 85 | decoder_input[0, b] = embeddings_index[index2word[topi[0][0]]].cuda() if use_cuda else \ 86 | embeddings_index[index2word[topi[0][0]]] 87 | 88 | return all_decoder_outputs 89 | 90 | 91 | def backward(self, out, labels, true_lens, optimizer): 92 | loss = masked_cross_entropy( 93 | out.transpose(0, 1).contiguous(), # -> batch x seq 94 | labels.transpose(0, 1).contiguous(), # -> batch x seq 95 | true_lens 96 | ) 97 | loss.backward() 98 | optimizer.step() 99 | return loss 100 | -------------------------------------------------------------------------------- /src/G_c_a_sep/G_c_a_sep.py: -------------------------------------------------------------------------------- 1 | # the encoder in the generator process the context and answer separately. 2 | 3 | import sys 4 | import os 5 | import random 6 | sys.path.append(os.path.abspath(__file__ + "/../../")) 7 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 8 | 9 | from model_zoo import * 10 | from masked_cross_entropy import * 11 | import torch 12 | import torch.nn as nn 13 | from torch.autograd import Variable 14 | 15 | use_cuda = torch.cuda.is_available() 16 | 17 | 18 | class G(nn.Module): 19 | def __init__(self, enc_input_size, enc_hidden_size, enc_n_layers, enc_num_directions, 20 | dec_input_size, dec_hidden_size, output_size, dec_n_layers, dec_num_directions, 21 | batch_size, use_attn=True): 22 | super(G, self).__init__() 23 | self.c_encoder = EncoderRNN(enc_input_size, enc_hidden_size, batch_size, enc_n_layers, enc_num_directions) 24 | self.a_encoder = EncoderRNN(enc_input_size, enc_hidden_size, batch_size, enc_n_layers, enc_num_directions) 25 | if use_attn: 26 | self.decoder = AttnDecoderRNN(dec_input_size, dec_hidden_size, output_size, self.a_encoder, 27 | dec_n_layers, dec_num_directions) 28 | else: 29 | # TODO: complete case when not using attention (add decoder class in model zoo) 30 | pass 31 | 32 | 33 | def forward(self, inputs, seq_lens, batch_size, max_q_len, 34 | embeddings_index, embeddings_size, word2index, index2word, teacher_forcing_ratio): 35 | # inputs is a collection of c, a, q. 
index by 0,2,1 36 | # output size: (seq_len, batch, hidden_size) 37 | # hidden size: (num_layers, batch, hidden_size) 38 | # the collection of all hidden states per batch is of size (seq_len, batch, hidden_size * num_directions) 39 | c_encoder_hiddens, c_encoder_hidden = self.c_encoder(inputs[0], seq_lens[0]) 40 | a_encoder_hiddens, a_encoder_hidden = self.a_encoder(inputs[2], seq_lens[2]) 41 | 42 | # TODO: the below code of how to use/combine hidden states from context/answer can be changed 43 | encoder_hiddens = torch.cat((c_encoder_hiddens, a_encoder_hiddens), 0) # concat along the first dimension (seq len) 44 | 45 | # decoder 46 | # prepare decoder inputs as word embeddings in a batch 47 | # decoder_input size: (1, batch size, embedding size); first dim is 1 because only one time step; 48 | # nee to have a 3D tensor for input to nn.GRU module 49 | decoder_input = Variable(embeddings_index['SOS'].repeat(batch_size, 1).unsqueeze(0)) 50 | # init all decoder outputs 51 | all_decoder_outputs = Variable(torch.zeros(max_q_len, batch_size, self.decoder.output_size)) 52 | if use_cuda: 53 | decoder_input = decoder_input.cuda() 54 | all_decoder_outputs = all_decoder_outputs.cuda() 55 | 56 | use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False 57 | 58 | if use_teacher_forcing: 59 | # Teacher forcing: Feed the target as the next input 60 | for di in range(max_q_len): 61 | decoder_output, decoder_hidden, decoder_attention = self.decoder( 62 | decoder_input, c_encoder_hiddens, embeddings_index) 63 | 64 | all_decoder_outputs[di] = decoder_output 65 | 66 | # change next time step input to current target output, in embedding format 67 | decoder_input = Variable(torch.FloatTensor(1, batch_size, embeddings_size).cuda()) if use_cuda else \ 68 | Variable(torch.FloatTensor(1, batch_size, embeddings_size)) 69 | for b in range(batch_size): 70 | decoder_input[0, b] = embeddings_index[index2word[inputs[1][di, b].data[0]]].cuda() if use_cuda else \ 71 | embeddings_index[index2word[inputs[1][di, b].data[0]]] # Teacher forcing 72 | 73 | else: 74 | # Without teacher forcing: use its own predictions as the next input 75 | for di in range(max_q_len): 76 | decoder_output, decoder_hidden, decoder_attention = self.decoder( 77 | decoder_input, encoder_hiddens, embeddings_index) 78 | 79 | all_decoder_outputs[di] = decoder_output 80 | 81 | # top value and index of every batch 82 | # size of both topv, topi = (batch size, 1) 83 | topv, topi = decoder_output.data.topk(1) 84 | 85 | # get the output word for every batch 86 | decoder_input = Variable(torch.FloatTensor(1, batch_size, embeddings_size).cuda()) if use_cuda else \ 87 | Variable(torch.FloatTensor(1, batch_size, embeddings_size)) 88 | for b in range(batch_size): 89 | decoder_input[0, b] = embeddings_index[index2word[topi[0][0]]].cuda() if use_cuda else \ 90 | embeddings_index[index2word[topi[0][0]]] 91 | 92 | return all_decoder_outputs 93 | 94 | 95 | def backward(self, out, labels, true_lens, optimizer): 96 | loss = masked_cross_entropy( 97 | out.transpose(0, 1).contiguous(), # -> batch x seq 98 | labels.transpose(0, 1).contiguous(), # -> batch x seq 99 | true_lens 100 | ) 101 | loss.backward() 102 | optimizer.step() 103 | return loss 104 | -------------------------------------------------------------------------------- /references/code/vanilla-gan.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # code from 
https://github.com/devnag/pytorch-generative-adversarial-networks/blob/master/gan_pytorch.py 4 | 5 | # Generative Adversarial Networks (GAN) example in PyTorch. 6 | # See related blog post at https://medium.com/@devnag/generative-adversarial-networks-gans-in-50-lines-of-code-pytorch-e81b79659e3f#.sch4xgsa9 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | import torch.optim as optim 12 | from torch.autograd import Variable 13 | 14 | # Data params 15 | data_mean = 4 16 | data_stddev = 1.25 17 | 18 | # Model params 19 | g_input_size = 1 # Random noise dimension coming into generator, per output vector 20 | g_hidden_size = 50 # Generator complexity 21 | g_output_size = 1 # size of generated output vector 22 | d_input_size = 100 # Minibatch size - cardinality of distributions 23 | d_hidden_size = 50 # Discriminator complexity 24 | d_output_size = 1 # Single dimension for 'real' vs. 'fake' 25 | minibatch_size = d_input_size 26 | 27 | d_learning_rate = 2e-4 # 2e-4 28 | g_learning_rate = 2e-4 29 | optim_betas = (0.9, 0.999) 30 | num_epochs = 30000 31 | print_interval = 200 32 | d_steps = 1 # 'k' steps in the original GAN paper. Can put the discriminator on higher training freq than generator 33 | g_steps = 1 34 | 35 | # ### Uncomment only one of these 36 | #(name, preprocess, d_input_func) = ("Raw data", lambda data: data, lambda x: x) 37 | (name, preprocess, d_input_func) = ("Data and variances", lambda data: decorate_with_diffs(data, 2.0), lambda x: x * 2) 38 | 39 | print("Using data [%s]" % (name)) 40 | 41 | # ##### DATA: Target data and generator input data 42 | 43 | def get_distribution_sampler(mu, sigma): 44 | return lambda n: torch.Tensor(np.random.normal(mu, sigma, (1, n))) # Gaussian 45 | 46 | def get_generator_input_sampler(): 47 | return lambda m, n: torch.rand(m, n) # Uniform-dist data into generator, _NOT_ Gaussian 48 | 49 | # ##### MODELS: Generator model and discriminator model 50 | 51 | class Generator(nn.Module): 52 | def __init__(self, input_size, hidden_size, output_size): 53 | super(Generator, self).__init__() 54 | self.map1 = nn.Linear(input_size, hidden_size) 55 | self.map2 = nn.Linear(hidden_size, hidden_size) 56 | self.map3 = nn.Linear(hidden_size, output_size) 57 | 58 | def forward(self, x): 59 | x = F.elu(self.map1(x)) 60 | x = F.sigmoid(self.map2(x)) 61 | return self.map3(x) 62 | 63 | class Discriminator(nn.Module): 64 | def __init__(self, input_size, hidden_size, output_size): 65 | super(Discriminator, self).__init__() 66 | self.map1 = nn.Linear(input_size, hidden_size) 67 | self.map2 = nn.Linear(hidden_size, hidden_size) 68 | self.map3 = nn.Linear(hidden_size, output_size) 69 | 70 | def forward(self, x): 71 | x = F.elu(self.map1(x)) 72 | x = F.elu(self.map2(x)) 73 | return F.sigmoid(self.map3(x)) 74 | 75 | def extract(v): 76 | return v.data.storage().tolist() 77 | 78 | def stats(d): 79 | return [np.mean(d), np.std(d)] 80 | 81 | def decorate_with_diffs(data, exponent): 82 | mean = torch.mean(data.data, 1) 83 | mean_broadcast = torch.mul(torch.ones(data.size()), mean.tolist()[0][0]) 84 | diffs = torch.pow(data - Variable(mean_broadcast), exponent) 85 | return torch.cat([data, diffs], 1) 86 | 87 | d_sampler = get_distribution_sampler(data_mean, data_stddev) 88 | gi_sampler = get_generator_input_sampler() 89 | G = Generator(input_size=g_input_size, hidden_size=g_hidden_size, output_size=g_output_size) 90 | D = Discriminator(input_size=d_input_func(d_input_size), hidden_size=d_hidden_size, output_size=d_output_size) 
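# ##### TRAINING: BCE loss and Adam for both nets; each epoch runs d_steps of D updates
# (real batch labeled 1, detached fake batch labeled 0) followed by g_steps of G updates,
# where G is trained to make D output 1 on its own samples.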
91 | criterion = nn.BCELoss() # Binary cross entropy: http://pytorch.org/docs/nn.html#bceloss 92 | d_optimizer = optim.Adam(D.parameters(), lr=d_learning_rate, betas=optim_betas) 93 | g_optimizer = optim.Adam(G.parameters(), lr=g_learning_rate, betas=optim_betas) 94 | 95 | for epoch in range(num_epochs): 96 | for d_index in range(d_steps): 97 | # 1. Train D on real+fake 98 | D.zero_grad() 99 | 100 | # 1A: Train D on real 101 | d_real_data = Variable(d_sampler(d_input_size)) 102 | d_real_decision = D(preprocess(d_real_data)) 103 | d_real_error = criterion(d_real_decision, Variable(torch.ones(1))) # ones = true 104 | d_real_error.backward() # compute/store gradients, but don't change params 105 | 106 | # 1B: Train D on fake 107 | d_gen_input = Variable(gi_sampler(minibatch_size, g_input_size)) 108 | d_fake_data = G(d_gen_input).detach() # detach to avoid training G on these labels 109 | d_fake_decision = D(preprocess(d_fake_data.t())) 110 | d_fake_error = criterion(d_fake_decision, Variable(torch.zeros(1))) # zeros = fake 111 | d_fake_error.backward() 112 | d_optimizer.step() # Only optimizes D's parameters; changes based on stored gradients from backward() 113 | 114 | for g_index in range(g_steps): 115 | # 2. Train G on D's response (but DO NOT train D on these labels) 116 | G.zero_grad() 117 | 118 | gen_input = Variable(gi_sampler(minibatch_size, g_input_size)) 119 | g_fake_data = G(gen_input) 120 | dg_fake_decision = D(preprocess(g_fake_data.t())) 121 | g_error = criterion(dg_fake_decision, Variable(torch.ones(1))) # we want to fool, so pretend it's all genuine 122 | 123 | g_error.backward() 124 | g_optimizer.step() # Only optimizes G's parameters 125 | 126 | if epoch % print_interval == 0: 127 | print("%s: D: %s/%s G: %s (Real: %s, Fake: %s) " % (epoch, 128 | extract(d_real_error)[0], 129 | extract(d_fake_error)[0], 130 | extract(g_error)[0], 131 | stats(extract(d_real_data)), 132 | stats(extract(d_fake_data)))) -------------------------------------------------------------------------------- /src/GAN_model/GAN_main.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from __future__ import division 3 | 4 | import sys, os 5 | # sys.path.append(os.path.abspath(__file__ + "/../../")) 6 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 7 | # sys.path.append(os.path.abspath(__file__ + "/../../") + '/G_c_a_sep') 8 | # sys.path.append(os.path.abspath(__file__ + "/../../") + '/D_baseline') 9 | 10 | from data_proc import * 11 | # from G_train import * 12 | # from G_c_a_sep import * 13 | from GAN_model import * 14 | import numpy as np 15 | 16 | from torch import optim 17 | 18 | global use_cuda 19 | use_cuda = torch.cuda.is_available() 20 | teacher_forcing_ratio = 0.5 # default in original code is 0.5 21 | 22 | 23 | ######### set paths 24 | # TODO: to run properly, change the following paths and filenames 25 | # path variables 26 | path_to_dataset = '/home/jack/Documents/QA_QG/data/' # path to original dataset 27 | load_path = '/home/jack/Documents/QA_QG/data/processed/' # path to processed dataset 28 | G_path = '/home/jack/Documents/QA_QG/exp_results_temp/G_c_a_sep_pretrain_exp_0902(2)/generator_temp.pth' # path to saved generator model 29 | path_to_exp = '/home/jack/Documents/QA_QG/exp_results_temp/' # path to experiment folder 30 | 31 | 32 | # default values for the dataset and the path to the project/dataset 33 | dataset = 'squad' 34 | f_name = 'train-v1.1.json' 35 | path_to_data = path_to_dataset + 
dataset + '/' + f_name 36 | GLOVE_DIR = path_to_dataset + 'glove.6B/' 37 | 38 | ######### first load the pretrained word embeddings 39 | path_to_glove = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt') 40 | embeddings_index, embeddings_size = readGlove(path_to_glove) 41 | 42 | 43 | import pickle 44 | # triplets = pickle.load(open(load_path+'triplets.txt', 'rb')) 45 | sent_c_triplets = pickle.load(open(load_path+'sent_c_triplets.txt', 'rb')) 46 | # windowed_c_triplets_10 = pickle.load(open(load_path+'windowed_c_triplets_10.txt', 'rb')) 47 | triplets = sent_c_triplets 48 | # ######### read corpus 49 | # raw_triplets = read_raw_squad(path_to_data) 50 | # triplets = tokenize_squad(raw_triplets, embeddings_index) 51 | 52 | # # find max length of context, question, answer, respectively 53 | # max_len_c, max_len_q, max_len_a = max_length(triplets) 54 | 55 | ######### corpus preprocessing 56 | # words that do not appear in embeddings, etc 57 | 58 | ## find all unique tokens in the data (should be a subset of the number of embeddings) 59 | effective_tokens, effective_num_tokens = count_effective_num_tokens(triplets, embeddings_index) 60 | print('effective number of tokens: ' + str(effective_num_tokens)) 61 | print('expected initial loss: ' + str(-np.log(1/float(effective_num_tokens))) + '\n') 62 | # build word2index dictionary and index2word dictionary 63 | word2index, index2word = generate_look_up_table(effective_tokens, effective_num_tokens) 64 | 65 | 66 | print('reading and preprocessing data complete.') 67 | print('found %s unique tokens in the intersection of corpus and word embeddings.' % effective_num_tokens) 68 | if use_cuda: 69 | print('GPU ready.') 70 | print('') 71 | print('start training...') 72 | print('') 73 | 74 | 75 | ######### set up model 76 | G_enc_input_size = embeddings_size 77 | G_enc_hidden_size = 256 78 | G_enc_n_layers = 1 79 | G_enc_num_directions = 1 80 | G_dec_input_size = embeddings_size 81 | G_dec_hidden_size = 256 82 | G_output_size = effective_num_tokens 83 | G_dec_n_layers = 1 84 | G_dec_num_directions = 1 85 | D_enc_input_size = embeddings_size 86 | D_enc_hidden_size = 256 87 | D_enc_n_layers = 1 88 | D_num_directions = 1 89 | D_mlp_hidden_size = 64 90 | D_num_attn_weights = 1 91 | D_mlp_output_size = 1 92 | use_attn = True 93 | batch_size = 5 94 | 95 | 96 | vanilla_gan = GAN_model(G_enc_input_size, G_enc_hidden_size, G_enc_n_layers, G_enc_num_directions, 97 | G_dec_input_size, G_dec_hidden_size, G_output_size, G_dec_n_layers, G_dec_num_directions, 98 | D_enc_input_size, D_enc_hidden_size, D_enc_n_layers, D_num_directions, 99 | D_mlp_hidden_size, D_num_attn_weights, D_mlp_output_size, 100 | use_attn, batch_size, G_path=G_path, pretrain=True) 101 | if use_cuda: 102 | vanilla_gan = vanilla_gan.cuda() 103 | 104 | learning_rate = 1e-3 105 | d_optimizer = optim.Adam(vanilla_gan.D.parameters(), lr=learning_rate) 106 | g_optimizer = optim.Adam(vanilla_gan.G.parameters(), lr=learning_rate) 107 | criterion = nn.BCELoss() 108 | 109 | # max_length of generated question 110 | max_len = 100 111 | to_file = True 112 | print_every = 500 113 | plot_every = 50 114 | checkpoint_every = 2000 115 | n_iters = 10000 116 | d_steps = 1 117 | g_steps = 5 118 | 119 | # open the files 120 | exp_name = 'GAN_0911' 121 | path_to_exp_out = path_to_exp + exp_name 122 | if to_file: 123 | if not os.path.exists(path_to_exp_out): 124 | os.mkdir(path_to_exp_out) 125 | loss_f = 'loss_temp.txt' 126 | sample_out_f = 'sample_outputs_temp.txt' 127 | path_to_loss_f = path_to_exp_out + '/' + loss_f 128 | 
path_to_sample_out_f = path_to_exp_out + '/' + sample_out_f 129 | loss_f = open(path_to_loss_f,'w+') 130 | sample_out_f = open(path_to_sample_out_f, 'w+') 131 | # else: 132 | # loss_f = None 133 | # sample_out_f = None 134 | # path_to_exp_out = None 135 | 136 | # # load a pre-trained model 137 | # model_fname = 'checkpoint_iter_1.pth.tar' 138 | # path_to_model = path_to_exp_out + '/' + model_fname 139 | # checkpoint = torch.load(path_to_model) 140 | # vanilla_gan.D.load_state_dict(checkpoint['d_state_dict']) 141 | # vanilla_gan.G.load_state_dict(checkpoint['g_state_dict']) 142 | # d_optimizer.load_state_dict(checkpoint['d_optimizer']) 143 | # g_optimizer.load_state_dict(checkpoint['g_optimizer']) 144 | 145 | # train 146 | vanilla_gan.train(triplets, n_iters, d_steps, d_optimizer, g_steps, g_optimizer, batch_size, max_len, 147 | criterion, word2index, index2word, embeddings_index, embeddings_size, print_every, plot_every, checkpoint_every, 148 | to_file=to_file, loss_f=loss_f, sample_out_f=sample_out_f, path_to_exp_out=path_to_exp_out) 149 | 150 | if to_file: 151 | loss_f.close() 152 | sample_out_f.close() 153 | torch.save(vanilla_gan, path_to_exp_out + exp_name + '/GAN_model.pth.tar') -------------------------------------------------------------------------------- /src/G_c_a_sep/G_train.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | import os 4 | sys.path.append(os.path.abspath(__file__ + "/../../")) 5 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 6 | # from util import timeSince, asMinutes, plotLoss 7 | from data_proc import * 8 | # FIXME: had some problem importing util.py; importing successful but 9 | # functions cannot be called (NameError: global name XXX is not defined) 10 | # fast solution: copied asMinutes and timeSince functions herefrom util import * 11 | from G_eval import * 12 | 13 | import torch 14 | from torch.autograd import Variable 15 | 16 | use_cuda = torch.cuda.is_available() 17 | 18 | 19 | ######################################################################################################################## 20 | import matplotlib 21 | matplotlib.use('Agg') 22 | import matplotlib.pyplot as plt 23 | import time 24 | import math 25 | 26 | def asMinutes(s): 27 | m = math.floor(s / 60) 28 | s -= m * 60 29 | return '%dm %ds' % (m, s) 30 | 31 | def timeSince(since, percent): 32 | now = time.time() 33 | s = now - since 34 | es = s / (percent) 35 | rs = es - s 36 | return '%s (- %s)' % (asMinutes(s), asMinutes(rs)) 37 | 38 | 39 | 40 | ###################################################################### 41 | # show loss function 42 | def plotLoss(loss_f, plot_every, save_path=None, from_file=True, f_name='loss.png', title='training loss'): 43 | if from_file: 44 | loss_vec = [] 45 | with open(loss_f) as f: 46 | content = f.readlines() 47 | content = [x.strip() for x in content] # list of every line, each a string 48 | for line in content: 49 | try: 50 | loss_vec.append(float(line)) 51 | except ValueError: 52 | pass 53 | else: 54 | loss_vec = loss_f 55 | # plot 56 | plt.figure() 57 | plt.title(title) 58 | plt.xlabel('training iterations') 59 | plt.ylabel('loss') 60 | plt.grid() 61 | plt.plot([x*plot_every for x in range(1, len(loss_vec)+1)], loss_vec) 62 | if save_path == None: 63 | plt.savefig(f_name) 64 | else: 65 | plt.savefig(save_path + '/' + f_name) 66 | ######################################################################################################################## 67 | 68 | 69 | def 
trainIters(generator, optimizer, batch_size, embeddings_size, 70 | embeddings_index, word2index, index2word, max_length, triplets, teacher_forcing_ratio, 71 | to_file, loss_f, sample_out_f, path_to_exp_out, 72 | n_iters=1, print_every=1, plot_every=1, checkpoint_every=1): 73 | 74 | begin_time = time.time() 75 | 76 | # plot_losses = [] 77 | print_loss_total = 0 # Reset every print_every 78 | plot_loss_total = 0 # Reset every plot_every 79 | plot_loss_avgs = [] 80 | 81 | print() 82 | 83 | for iter in range(1, n_iters + 1): 84 | 85 | # prepare batch 86 | training_batch, seq_lens = get_random_batch(triplets, batch_size) 87 | training_batch, _, seq_lens = prepare_batch_var( 88 | training_batch, seq_lens, batch_size, word2index, embeddings_index, embeddings_size) 89 | 90 | # print(type(training_batch)) 91 | # print(type(training_batch[0])) 92 | 93 | # prepare inputs (load to cuda) 94 | inputs = [] 95 | for var in training_batch: 96 | if not isinstance(var, list): 97 | inputs.append(Variable(var.cuda())) if use_cuda else inputs.append(Variable(var)) 98 | # NOTE not currently appending start and end index to inputs because model does not use them. 99 | # NOTE if want to apend, make sure these are changed from list to LongTensor 100 | # else: 101 | # inputs.append(Variable(var)) 102 | 103 | max_c_a_len = max(seq_lens[0]) # max seq length of context + ans combined 104 | max_q_len = max(seq_lens[1]) # max seq length of question 105 | 106 | optimizer.zero_grad() 107 | loss = 0 108 | all_decoder_outputs = generator.forward(inputs, seq_lens, batch_size, max_q_len, 109 | embeddings_index, embeddings_size, word2index, index2word, 110 | teacher_forcing_ratio) 111 | loss += generator.backward(all_decoder_outputs, inputs[1], seq_lens[1], optimizer) 112 | 113 | print_loss_total += loss.data[0] 114 | plot_loss_total += loss.data[0] 115 | 116 | if iter % print_every == 0: 117 | print_loss_avg = print_loss_total / print_every 118 | print_loss_total = 0 119 | print('%s (%d %d%%) %.4f' % (timeSince(begin_time, iter / float(n_iters)), 120 | iter, iter / n_iters * 100, print_loss_avg)) 121 | print('---sample generated question---') 122 | # sample a triple and print the generated question 123 | evaluate(generator, triplets, embeddings_index, embeddings_size, word2index, index2word, max_length) 124 | print('-------------------------------') 125 | print('-------------------------------') 126 | print() 127 | if to_file: 128 | sample_out_f.write(unicode('%s (%d %d%%)\n' % (timeSince(begin_time, iter / float(n_iters)), iter, float(iter) / float(n_iters) * 100))) 129 | evaluate(generator, triplets, embeddings_index, embeddings_size, word2index, index2word, max_length, to_file, sample_out_f) 130 | sample_out_f.write(unicode('\n')) 131 | if iter % plot_every == 0: 132 | plot_loss_avg = plot_loss_total / plot_every 133 | plot_loss_avgs.append(plot_loss_avg) 134 | # plot_losses.append(plot_loss_avg) 135 | plot_loss_total = 0 136 | if to_file: 137 | loss_f.write(unicode('%s (%d %d%%)\n' % (timeSince(begin_time, iter / float(n_iters)), iter, float(iter) / float(n_iters) * 100))) 138 | loss_f.write(unicode(plot_loss_avg)) 139 | loss_f.write(unicode('\n')) 140 | if to_file and ((iter % checkpoint_every == 0) or (iter == n_iters)): 141 | checkpoint_fname = 'checkpoint_iter_' + str(iter) + '.pth.tar' 142 | state = { 143 | 'iteration': iter + 1, 144 | 'g_state_dict': generator.state_dict(), 145 | 'g_optimizer' : optimizer.state_dict(), 146 | } 147 | torch.save(state, path_to_exp_out+'/'+checkpoint_fname) 148 | 
plotLoss(plot_loss_avgs, plot_every, save_path=path_to_exp_out, f_name='d_loss_itr_'+str(iter)+'.png', 149 | title='training loss', from_file=False) 150 | 151 | # showPlot(plot_losses) 152 | if to_file: 153 | loss_f.close() 154 | 155 | 156 | 157 | -------------------------------------------------------------------------------- /src/util/test.py: -------------------------------------------------------------------------------- 1 | # various test cases 2 | 3 | # load model 4 | import sys, os 5 | __file__ = '/home/jack/Documents/QA_QG/GAN-QA/src/util/' 6 | sys.path.append(os.path.abspath(__file__)) 7 | import data_proc 8 | reload(data_proc) 9 | from data_proc import * 10 | import util 11 | reload(util) 12 | from util import * 13 | sys.path.append(os.path.abspath(__file__ + "/../../")) 14 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/D_baseline') 15 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/G_baseline') 16 | from G_model import * 17 | from model_zoo import * 18 | from G_eval import * 19 | import torch 20 | import numpy as np 21 | 22 | global use_cuda 23 | use_cuda = torch.cuda.is_available() 24 | 25 | ###################################################################### 26 | ###################################################################### 27 | # test for various util functions 28 | # uncomment this for much of the later unit tests in this file 29 | ######### set paths 30 | # TODO: to run properly, change the following paths and filenames 31 | # default values for the dataset and the path to the project/dataset 32 | dataset = 'squad' 33 | f_name = 'train-v1.1.json' 34 | path_to_dataset = '/home/jack/Documents/QA_QG/data/' 35 | path_to_data = path_to_dataset + dataset + '/' + f_name 36 | GLOVE_DIR = path_to_dataset + 'glove.6B/' 37 | # path for experiment outputs 38 | # exp_name = 'QG_seq2seq_baseline' 39 | path_to_exp_out = '/home/jack/Documents/QA_QG/exp_results_temp/' 40 | loss_f = 'loss_temp.txt' 41 | sample_out_f = 'sample_outputs_temp.txt' 42 | path_to_loss_f = path_to_exp_out + '/' + loss_f 43 | path_to_sample_out_f = path_to_exp_out + '/' + sample_out_f 44 | 45 | ######### first load the pretrained word embeddings 46 | path_to_glove = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt') 47 | embeddings_index, embeddings_size = readGlove(path_to_glove) 48 | 49 | ######### read corpus 50 | raw_triplets = read_raw_squad(path_to_data) 51 | 52 | # # test of windowed triplets 53 | # window_size = 10 54 | # test_idx = 250 55 | # windowed_c_triplets_10 = get_windowed_ans(raw_triplets, window_size) 56 | # print(raw_triplets[test_idx][0]) 57 | # print(raw_triplets[test_idx][2]) 58 | # print(windowed_c_triplets[0][0]) 59 | 60 | # test of selecting the sentence containing answer from context 61 | # test_idx = 0 62 | sent_window = 1 63 | sent_c_triplets, unmatch = get_ans_sentence(raw_triplets) 64 | # print(raw_triplets[test_idx][0]) 65 | # print(raw_triplets[test_idx][2]) 66 | # print('ans start idx: %d' % raw_triplets[test_idx][3]) 67 | # print('ans end idx: %d' % raw_triplets[test_idx][4]) 68 | # print(sent_c_triplets[0][0]) 69 | # windowed_c_triplets_10_noEOS = tokenize_squad(windowed_c_triplets_10, embeddings_index, opt='window', a_EOS=False, c_EOS=False) 70 | # triplets = windowed_c_triplets_30_noEOS 71 | # windowed_c_triplets_10_noEOS = tokenize_squad(windowed_c_triplets_10_noEOS, embeddings_index, opt='window') 72 | # sent_c_triplets = tokenize_squad(sent_c_triplets, embeddings_index, opt='sent') 73 | # triplets = tokenize_squad(raw_triplets, embeddings_index) 74 
| 75 | # print(raw_triplets[test_idx][0]) 76 | # print(' '.join(triplets[test_idx][0])) 77 | # print(raw_triplets[test_idx][1]) 78 | # print(' '.join(triplets[test_idx][1])) 79 | # print(raw_triplets[test_idx][2]) 80 | # print(' '.join(triplets[test_idx][2])) 81 | 82 | # # save to files 83 | # import pickle 84 | # save_path = '/home/jack/Documents/QA_QG/data/processed/' 85 | # if not os.path.exists(save_path): 86 | # os.mkdir(save_path) 87 | # with open(save_path+'windowed_c_triplets_10_noEOS.txt', 'wb') as fp: 88 | # pickle.dump(windowed_c_triplets_10_noEOS, fp) 89 | # with open(save_path+'sent_c_triplets.txt', 'wb') as fp: 90 | # pickle.dump(sent_c_triplets, fp) 91 | # with open(save_path+'triplets.txt', 'wb') as fp: 92 | # pickle.dump(triplets, fp) 93 | 94 | # # test pickle load 95 | import pickle 96 | load_path = '/home/jack/Documents/QA_QG/data/processed/' 97 | # triplets = pickle.load(open(load_path+'triplets.txt', 'rb')) 98 | sent_c_triplets = pickle.load(open(load_path+'sent_c_triplets.txt', 'rb')) 99 | # windowed_c_triplets_10 = pickle.load(open(load_path+'windowed_c_triplets_10.txt', 'rb')) 100 | 101 | # # find max length of context, question, answer, respectively 102 | # # max_len_c, max_len_q, max_len_a = max_length(triplets) 103 | # 104 | # effective_tokens, effective_num_tokens = count_effective_num_tokens(triplets, embeddings_index) 105 | # # print('effective number of tokens: ' + str(effective_num_tokens)) 106 | # # print('expected initial loss: ' + str(-np.log(1/float(effective_num_tokens))) + '\n') 107 | # # # build word2index dictionary and index2word dictionary 108 | # word2index, index2word = generate_look_up_table(effective_tokens, effective_num_tokens) 109 | 110 | # test similarity test 111 | q = 'what is the language spoken in germany ? 
EOS' 112 | scores = generated_q_novelty(sent_c_triplets, q) 113 | idx = np.argpartition(scores, -10)[-10:] 114 | scores[idx] 115 | for i in idx: 116 | print(sent_c_triplets[i][1]) 117 | 118 | ###################################################################### 119 | ###################################################################### 120 | # test case of get_random_batch and prepare_batch_var functions in data_proc.py 121 | # (uncomment code below to test) 122 | # test and time 123 | # to run this test, you need to have these things ready: 124 | # 1) triplet processed by tokenize_squad, 125 | # 2) embeddings_index 126 | # 3) a mini batch processed by get_random_batch 127 | # batch_size = 500 128 | # start = time.time() 129 | # batch, seq_lens, fake_batch, fake_seq_lens = get_random_batch(triplets, batch_size, with_fake=True) 130 | # batch, seq_lens = get_random_batch(triplets, batch_size) 131 | # 132 | # # temp, temp_orig, seq_lens_cqa = prepare_batch_var(batch, seq_lens, fake_batch, fake_seq_lens, batch_size, word2index, embeddings_index, embeddings_size, 133 | # # mode = ['word', 'index'], concat_opt='cqa', with_fake=True) 134 | # batch_vars, batch_paddings, seq_lens = prepare_batch_var(batch, seq_lens, batch_size, word2index, embeddings_index, embeddings_size) 135 | 136 | # end = time.time() 137 | # print('time elapsed: ' + str(end-start)) 138 | # # the following check if the batched data matches with the original data 139 | # batch_idx = random.choice(range(batch_size)) 140 | # print(batch_idx) 141 | # 142 | # print('context > ', ' '.join(temp_orig[0][batch_idx])) 143 | # print('question > ', ' '.join(temp_orig[1][batch_idx])) 144 | # print('answer > ', ' '.join(temp_orig[2][batch_idx])) 145 | # 146 | # idx = batch[0].index(temp_orig[0][batch_idx]) 147 | # print('context > ', ' '.join(batch[0][idx])) 148 | # print('question > ', ' '.join(batch[1][idx])) 149 | # print('answer > ', ' '.join(batch[2][idx])) 150 | 151 | # seq_idx = random.choice(range(min(seq_lens[0]))) 152 | # print(seq_idx) 153 | # word1 = embeddings_index[batch[0][seq_lens[0].index(heapq.nlargest(batch_idx, seq_lens[0])[-1])][seq_idx]] 154 | # word2 = temp[0][seq_idx, batch_idx,] 155 | # set(word1) == set(word2.data.cpu()) 156 | 157 | 158 | ###################################################################### 159 | ###################################################################### 160 | # # test case to load the G model and sample from G 161 | # teacher_forcing_ratio = 0.5 # default in original code is 0.5 162 | 163 | # # param for G 164 | # enc_hidden_size = 256 165 | # enc_n_layers = 1 166 | # enc_num_directions = 1 167 | # dec_hidden_size = 256 168 | # dec_n_layers = 1 169 | # dec_num_directions = 1 170 | # batch_size = 5 171 | # learning_rate = 0.0005 172 | 173 | # generator = G(embeddings_size, enc_hidden_size, enc_n_layers, enc_num_directions, 174 | # embeddings_size, dec_hidden_size, effective_num_tokens, dec_n_layers, dec_num_directions, 175 | # batch_size) 176 | # if use_cuda: 177 | # generator = generator.cuda() 178 | 179 | # # prepare G input 180 | # training_batch, seq_lens = get_random_batch(triplets, batch_size) 181 | # ca = training_batch[0][0] + training_batch[2][0] 182 | # # sample from G 183 | # max_len = 100 184 | # sample_q = G_sampler(generator, ca, embeddings_index, embeddings_size, word2index, index2word, max_len) 185 | # print(' '.join(sample_q)) 186 | -------------------------------------------------------------------------------- /src/model_zoo.py: 
--------------------------------------------------------------------------------
1 | 
2 | import sys
3 | import os
4 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util')
5 | from data_proc import *
6 | 
7 | import torch
8 | import torch.nn as nn
9 | from torch.autograd import Variable
10 | import torch.nn.functional as F
11 | 
12 | use_cuda = torch.cuda.is_available()
13 | 
14 | ######################################################################
15 | # The Encoder
16 | # -----------
17 | class EncoderRNN(nn.Module):
18 |     # output is the same dimension as input (dimension defined by the external word embedding model)
19 |     def __init__(self, input_size, hidden_size, batch_size, n_layers=1, num_directions=1):
20 |         super(EncoderRNN, self).__init__()
21 |         self.n_layers = n_layers
22 |         self.hidden_size = hidden_size
23 |         self.input_size = input_size
24 |         self.num_directions = num_directions
25 |         self.batch_size = batch_size
26 |         # print('batch size is: %d' % batch_size)
27 | 
28 |         if self.num_directions == 1:
29 |             self.gru = nn.GRU(input_size, hidden_size, n_layers, bidirectional=False)
30 |         elif self.num_directions == 2:
31 |             self.gru = nn.GRU(input_size, hidden_size, n_layers, bidirectional=True)
32 |         else:
33 |             raise Exception('input num_directions is wrong - needs to be either 1 or 2')
34 | 
35 |     def forward(self, input, seq_lens, hidden=None):
36 | 
37 |         # # prepare encoder input
38 |         # if self.batch_size > 1:
39 |         #     # to see how pack_padded_sequence works, take a look here (this is a wrong example): https://goo.gl/oN9uc9
40 |         #     input = nn.utils.rnn.pack_padded_sequence(input, seq_lens)
41 |         #     # input = pack_sequence(input, seq_lens)
42 | 
43 |         # input is a matrix of size [max seq len x batch size x embedding dimension]
44 |         encoder_outputs, hidden = self.gru(input, hidden)
45 | 
46 |         # # unpack the sequence
47 |         # # size of unpacked sequence: (seq_len, batch size, hidden_size*num_directions)
48 |         # if self.batch_size > 1:
49 |         #     encoder_outputs, output_lens = torch.nn.utils.rnn.pad_packed_sequence(encoder_outputs)
50 | 
51 |         # FIXME: do I need to sum the encoder_outputs when the network is bidirectional?
52 |         # e.g. outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
53 | 
54 |         return encoder_outputs, hidden
55 | 
56 | 
57 | ######################################################################
58 | # Vanilla Decoder
59 | # ^^^^^^^^^^^^^^^^^
60 | # TODO: take another look at the decoder implementation; there might be some errors
61 | class DecoderRNN(nn.Module):
62 |     def __init__(self, input_size, hidden_size, output_size, n_layers=1, num_directions=1, dropout_p=0.1):
63 |         super(DecoderRNN, self).__init__()
64 |         self.input_size = input_size
65 |         self.hidden_size = hidden_size
66 |         self.output_size = output_size
67 |         self.n_layers = n_layers
68 |         self.dropout_p = dropout_p
69 |         self.bidi = True if num_directions == 2 else False
70 | 
71 |         # recurrent model
72 |         self.dropout = nn.Dropout(self.dropout_p)
73 |         self.gru = nn.GRU(self.input_size, self.hidden_size, num_layers=self.n_layers, bidirectional=self.bidi)
74 |         self.out = nn.Linear(self.hidden_size, self.output_size)
75 | 
76 |     # forward for each time step.
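    # --- Editor's note (hedged sketch, not original code): the per-step design means the
    # caller drives the time loop itself, choosing at every step between the gold token
    # and the model's own previous prediction. A typical (hypothetical) loop looks like:
    #
    #   decoder_input = sos_embedding.view(1, batch_size, -1)   # placeholder start-token embedding
    #   for t in range(max_target_len):
    #       output, hidden = decoder(decoder_input, encoder_hidden, embeddings_index, hidden)
    #       use_gold = random.random() < teacher_forcing_ratio
    #       next_token = target_seq[t] if use_gold else output.topk(1)[1]
    #       decoder_input = embed(next_token).view(1, batch_size, -1)
    #
    #   (sos_embedding, target_seq, embed() and teacher_forcing_ratio are placeholders,
    #   not names defined in this module.)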
77 | # need to do this because of teacher forcing at each time step 78 | def forward(self, input, encoder_hidden, embeddings_index, hidden=None): 79 | 80 | # get the output 81 | # hidden: (num_layers * num_directions, batch, hidden_size) 82 | # note: for each time step, output and hidden are the same 83 | output, hidden = self.gru(input, hidden) 84 | 85 | # if bidirectional, sum decoder hidden states of both directions 86 | if self.bidi: 87 | hidden = hidden[2*self.n_layer - 1] + hidden[2*self.n_layer] 88 | hidden = hidden.unsqueeze(0) 89 | 90 | # output size: (batch size, vocab size) 91 | output = F.log_softmax(self.out(output)) 92 | 93 | return output, hidden 94 | 95 | 96 | ###################################################################### 97 | # Attention Decoder 98 | # ^^^^^^^^^^^^^^^^^ 99 | # TODO: take another look at the attn implementation; there might be some errors 100 | class AttnDecoderRNN(nn.Module): 101 | def __init__(self, input_size, hidden_size, output_size, encoder, n_layers=1, num_directions=1, dropout_p=0.1): 102 | super(AttnDecoderRNN, self).__init__() 103 | self.input_size = input_size 104 | self.hidden_size = hidden_size 105 | self.output_size = output_size 106 | self.n_layers = n_layers 107 | self.dropout_p = dropout_p 108 | self.num_directions = num_directions 109 | 110 | # recurrent model 111 | self.dropout = nn.Dropout(self.dropout_p) 112 | self.gru = nn.GRU(self.input_size, self.hidden_size) 113 | self.out = nn.Linear(self.hidden_size + encoder.num_directions * encoder.hidden_size, self.output_size) 114 | 115 | # attention mechanism 116 | self.attn = nn.Linear(self.hidden_size + encoder.num_directions * encoder.hidden_size, self.hidden_size) 117 | 118 | # forward for each time step. 119 | # need to do this because of teacher forcing at each time step 120 | def forward(self, input, encoder_outputs, embeddings_index, hidden=None): 121 | 122 | # get the output 123 | # hidden: (num_layers * num_directions, batch, hidden_size) 124 | # note: for each time step, output and hidden are the same 125 | output, hidden = self.gru(input, hidden) 126 | 127 | # # unpack the sequence 128 | # # decoder_outputs size (seq len, batch, hidden_size * num_directions) 129 | # # --> collection of hidden states at every time step 130 | # TODO: should figure out how to do this in a batch 131 | # current implementation is one token at a time using a forloop 132 | # decoder_outputs, output_lens = torch.nn.utils.rnn.pad_packed_sequence(decoder_outputs) 133 | 134 | # init attention weights 135 | # length = batch_size x encoder output lens 136 | attn_weights = Variable(torch.zeros(encoder_outputs.size(1), encoder_outputs.size(0))) 137 | if use_cuda: 138 | attn_weights = attn_weights.cuda() 139 | 140 | for b in range(encoder_outputs.size(1)): 141 | # copy the decoder output at the present time step to N rows, where N = num encoder outputs 142 | # first dimension of append = first dimension of encoder_outputs[:,b] = seq_len of encoder 143 | # the scores for calculating attention weights of all encoder outputs for one time step of decoder output 144 | for i in range(encoder_outputs.size(0)): 145 | attn_weights[b,i] = hidden[:,b].squeeze(0).dot(self.attn(torch.cat((hidden[:,b], encoder_outputs[i,b].unsqueeze(0)), 1)).t()) 146 | # Below is an alternative implementation using matrices instead of for loop 147 | # not sure which one is more space efficient 148 | # (the out of memory error most likely comes from here) 149 | # attn_weights[i,b] = torch.mm(hidden[:, b], 150 | # 
self.attn(torch.cat((append, encoder_outputs[:, b]), 1)).t()) 151 | 152 | attn_weights = F.softmax(attn_weights) 153 | 154 | # input to bmm: 155 | # weights size: (batch size, 1, seq_len) 156 | # hidden states size: (seq_len, batch, hidden_size * num_directions) 157 | # transpose hidden state size: (batch, seq len, hidden_size * num_directions) 158 | # output size: (batch size, 1, hidden_size * num_directions) 159 | context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs.transpose(0, 1)) 160 | 161 | # calculate 162 | decoder_output = torch.cat((hidden.squeeze(0), context.squeeze(1)), 1) 163 | 164 | # output size: (batch size, vocab size) 165 | decoder_output = F.log_softmax(self.out(decoder_output)) 166 | 167 | return decoder_output, hidden, attn_weights 168 | 169 | 170 | ###################################################################### 171 | # multi-layer perceptron 172 | # ^^^^^^^^^^^^^^^^^^^^^^ 173 | # code adapted from pytorch tutorial 174 | class MLP(nn.Module): 175 | # FIXME: the number of attention weights here is hard coded for tensor multiplication instead of using for loops 176 | def __init__(self, hidden_size, output_size, encoder, num_attn_weights, use_attn = True): 177 | # maximum input length it can take (for attention mechanism) 178 | super(MLP, self).__init__() 179 | self.hidden_size = hidden_size 180 | self.use_attn = use_attn 181 | self.num_attn_weights = num_attn_weights 182 | self.output_size = output_size 183 | 184 | # fully connected layers (2) and non-linearity 185 | self.layer1 = nn.Linear(encoder.num_directions * encoder.hidden_size, self.hidden_size) 186 | self.relu = nn.ReLU() 187 | self.layer2 = nn.Linear(self.hidden_size, self.output_size) 188 | self.sigmoid = nn.Sigmoid() 189 | 190 | # attention 191 | if self.use_attn: 192 | self.tanh = nn.Tanh() 193 | self.attn = nn.Linear(encoder.hidden_size*encoder.num_directions, self.num_attn_weights) 194 | 195 | def forward(self, inputs): 196 | # inputs size (seq len, batch size, hidden size * num directions) 197 | # if use attention, the output vector is a weighted combination of input hidden states 198 | # if not use attention, the output vector is simply a feedforward network operated on input's last hidden state 199 | # TODO: write the attn function into another module??? 
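        # --- Editor's note (hedged, not original code): the branch below implements a
        # learned attention pooling of the encoder states: each time step gets a score
        # from self.attn, the scores are softmaxed over the sequence, and the context is
        # the weighted sum of hidden states. With num_attn_weights == 1 an equivalent
        # vectorized form (avoiding the per-batch loop) would be roughly:
        #
        #   scores = self.attn(inputs).squeeze(2).t()            # (batch, seq_len)
        #   attn_weights = F.softmax(scores)                     # softmax over time steps
        #   context = torch.bmm(attn_weights.unsqueeze(1),       # (batch, 1, seq_len)
        #                       inputs.transpose(0, 1)).squeeze(1)
        #
        # This is only a sketch of an equivalent computation, not a change to the code below.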
200 | if self.use_attn: 201 | 202 | # reshape input to be 2D tensor instead of 3D 203 | seq_len = inputs.size(0) 204 | batch_size = inputs.size(1) 205 | inputs_for_attn_calc = inputs.view(-1, inputs.size(-1)) 206 | 207 | attn_weights = Variable(torch.zeros(inputs.size(1), inputs.size(0))) 208 | if use_cuda: 209 | attn_weights = attn_weights.cuda() 210 | 211 | # calculate attention weight for each output time step 212 | # remember encoder_outputs size: (seq_len, batch, hidden_size * num_directions) 213 | # for each token in the decoder output sequences: 214 | for b in range(inputs.size(1)): 215 | # the scores for calculating attention weights of all encoder outputs for one time step of decoder output 216 | attn_weights[b] = self.attn(inputs[:, b]).t() 217 | 218 | attn_weights = F.softmax(attn_weights) 219 | 220 | # input to bmm: 221 | # weights size: (batch size, 1, seq_len) 222 | # hidden states size: (seq_len, batch, hidden_size * num_directions) 223 | # transpose hidden state size: (batch, seq len, hidden_size * num_directions) 224 | # output size: (batch size, 1, hidden_size * num_directions) 225 | context = torch.bmm(attn_weights.unsqueeze(1), inputs.transpose(0, 1)).squeeze(1) 226 | else: 227 | context = torch.sum( inputs.transpose(0,1), 1 ).squeeze(1) 228 | 229 | # feedforward 230 | out = self.layer1(context) 231 | out = self.relu(out) 232 | out = self.layer2(out) 233 | out = self.sigmoid(out) 234 | 235 | return out 236 | -------------------------------------------------------------------------------- /src/GAN_model/GAN_model.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/util') 3 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/G_c_a_sep') 4 | # sys.path.append(os.path.abspath(__file__ + "/../../") + '/G_baseline/') 5 | sys.path.append(os.path.abspath(__file__ + "/../../") + '/D_baseline') 6 | from util import asMinutes, timeSince 7 | from data_proc import * 8 | from G_c_a_sep import G 9 | from G_eval import * 10 | # from G_model import G 11 | from D_model import * 12 | 13 | import torch 14 | import torch.nn as nn 15 | import numpy as np 16 | import torch.autograd as autograd 17 | from torch.autograd import Variable 18 | 19 | ################################################################## 20 | 21 | use_cuda = torch.cuda.is_available() 22 | if use_cuda: 23 | gpu = 0 24 | 25 | def to_var(x): 26 | if use_cuda: 27 | x = x.cuda() 28 | return Variable(x) 29 | 30 | 31 | class GAN_model(nn.Module): 32 | def __init__(self, G_enc_input_size, G_enc_hidden_size, G_enc_n_layers, G_enc_num_directions, 33 | G_dec_input_size, G_dec_hidden_size, G_output_size, G_dec_n_layers, G_dec_num_directions, 34 | D_enc_input_size, D_enc_hidden_size, D_enc_n_layers, D_num_directions, 35 | D_mlp_hidden_size, D_num_attn_weights, D_mlp_output_size, 36 | use_attn, batch_size, pretrain=False, G_path=None 37 | ): 38 | 39 | super(GAN_model, self).__init__() 40 | 41 | self.G = G(G_enc_input_size, G_enc_hidden_size, G_enc_n_layers, G_enc_num_directions, G_dec_input_size, 42 | G_dec_hidden_size, G_output_size, G_dec_n_layers, G_dec_num_directions, batch_size) 43 | if pretrain: 44 | # load the G model from G_path 45 | self.G = torch.load(G_path) 46 | 47 | self.D = D(D_enc_input_size, D_enc_hidden_size, D_enc_n_layers, D_num_directions, D_mlp_hidden_size, 48 | D_num_attn_weights, D_mlp_output_size, use_attn, batch_size) 49 | 50 | def train(self, triplets, n_iters, d_steps, d_optimizer, 
g_steps, g_optimizer, batch_size, max_len, 51 | criterion, word2index, index2word, embeddings_index, embeddings_size, print_every, plot_every, checkpoint_every, 52 | to_file=False, loss_f=None, sample_out_f=None, path_to_exp_out=None): 53 | # criterion is for both G and D 54 | 55 | # record start time for logging 56 | begin_time = time.time() 57 | print_d_loss_total = 0 # Reset every print_every 58 | plot_d_loss_total = 0 # Reset every plot_every 59 | print_g_loss_total = 0 # Reset every print_every 60 | plot_g_loss_total = 0 # Reset every plot_every 61 | plot_d_loss_avgs = [] 62 | plot_g_loss_avgs = [] 63 | 64 | for iter in range(1, n_iters + 1): 65 | 66 | # train D 67 | for d_train_idx in range(d_steps): 68 | # 1. Train D on real+fake 69 | self.D.zero_grad() 70 | 71 | # 1A: Train D on real 72 | # get data 73 | # prepare batch 74 | training_batch, seq_lens = get_random_batch(triplets, batch_size) 75 | # concat the context_ans batch with the question batch 76 | # each element in the training batch is context + question + answer 77 | cqa_batch, _, cqa_lens = prepare_batch_var(training_batch, seq_lens, 78 | batch_size, word2index, embeddings_index, 79 | embeddings_size, mode=['word'], concat_opt='cqa') 80 | 81 | train_input = Variable(cqa_batch[0].cuda()) if use_cuda else Variable( 82 | cqa_batch[0]) # embeddings vectors, size = [seq len x batch size x embedding dim] 83 | 84 | d_real_decision = self.D.forward(train_input, cqa_lens[0]) 85 | real_target = Variable(torch.FloatTensor([1]*batch_size)).cuda() if use_cuda else \ 86 | Variable(torch.FloatTensor([1]*batch_size)) 87 | d_real_error = criterion(d_real_decision, real_target) # ones = true 88 | d_real_error.backward() # compute/store gradients, but don't change params 89 | 90 | # 1B: Train D on fake 91 | fake_cqa_batch, fake_cqa_lens = prepare_fake_batch_var(self.G, training_batch, max_len, batch_size, 92 | word2index, index2word, embeddings_index, 93 | embeddings_size, mode = ('word')) 94 | 95 | # # sanity check: rpepare fake batch and prepare batch have the same order 96 | # print(fake_cqa_batch[0][12] == cqa_batch[0][12]) 97 | 98 | d_fake_data = Variable(fake_cqa_batch[0].cuda()) if use_cuda else Variable(fake_cqa_batch[0]) 99 | d_fake_decision = self.D.forward(d_fake_data, fake_cqa_lens[0]) 100 | fake_target = Variable(torch.FloatTensor([0]*batch_size)).cuda() if use_cuda else \ 101 | Variable(torch.FloatTensor([0]*batch_size)) 102 | # d_fake_error = criterion(d_fake_decision, fake_target) # zeros = fake 103 | # d_fake_error.backward() 104 | # d_optimizer.step() 105 | 106 | # accumulate loss 107 | # FIXME I dont think below implementation works for batch version 108 | d_error = torch.mean(d_fake_decision) - torch.mean(d_real_decision) # W_GAN loss 109 | # d_error = -torch.mean(self.log(1 - d_fake_decision)) - torch.mean(self.log(d_real_decision)) # GAN loss 110 | d_error.backward() 111 | d_optimizer.step() 112 | 113 | # d_error = d_real_error + d_fake_error 114 | 115 | # train G 116 | for g_train_idx in range(g_steps): 117 | self.G.zero_grad() 118 | 119 | # conditional data for generator 120 | training_batch, seq_lens = get_random_batch(triplets, batch_size) 121 | fake_cqa_batch, fake_cqa_lens = prepare_fake_batch_var(self.G, training_batch, max_len, batch_size, 122 | word2index, index2word, embeddings_index, 123 | embeddings_size, mode=('word'), detach=False) 124 | g_fake_data = Variable(fake_cqa_batch[0].cuda()) if use_cuda else Variable(fake_cqa_batch[0]) 125 | dg_fake_decision = self.D.forward(g_fake_data, fake_cqa_lens[0]) 126 | 
target = Variable(torch.FloatTensor([1]*batch_size).cuda()) if use_cuda else \ 127 | Variable(torch.FloatTensor([1]*batch_size)) 128 | # g_error = criterion(dg_fake_decision, target) 129 | g_error = -torch.mean(dg_fake_decision) # wgan loss 130 | # G_error = -torch.mean(self.log(dg_fake_decision)) # gan loss 131 | g_error.backward() 132 | g_optimizer.step() # Only optimizes G's parameters 133 | 134 | # log error 135 | print_d_loss_total += d_error.data[0] 136 | print_g_loss_total += g_error.data[0] 137 | plot_d_loss_total += d_error.data[0] 138 | plot_g_loss_total += g_error.data[0] 139 | if iter % print_every == 0: 140 | print_d_loss_avg = print_d_loss_total / print_every 141 | print_g_loss_avg = print_g_loss_total / print_every 142 | print_d_loss_total = 0 143 | print_g_loss_total = 0 144 | 145 | if not to_file: 146 | print('%s (%d %d%%)' % (timeSince(begin_time, iter / float(n_iters)), iter, iter / n_iters * 100)) 147 | # print("errors: D: real-%s/fake-%s G: %s " % ( d_real_error.data[0], d_fake_error.data[0], g_error.data[0]) ) 148 | print("errors: D: %s G: %s " % (print_d_loss_avg, print_g_loss_avg)) 149 | print('---sample generated question---') 150 | # sample a triple and print the generated question 151 | evaluate(self.G, triplets, embeddings_index, embeddings_size, word2index, index2word, max_len) 152 | else: 153 | sample_out_f.write(unicode('%s (%d %d%%)\n' % (timeSince(begin_time, iter / float(n_iters)), iter, float(iter) / float(n_iters) * 100))) 154 | evaluate(self.G, triplets, embeddings_index, embeddings_size, word2index, index2word, max_len, 155 | to_file, sample_out_f) 156 | sample_out_f.write(unicode('\n')) 157 | 158 | if iter % plot_every == 0: 159 | plot_d_loss_avg = plot_d_loss_total / plot_every 160 | plot_d_loss_avgs.append(plot_d_loss_avg) 161 | plot_g_loss_avg = plot_g_loss_total / plot_every 162 | plot_g_loss_avgs.append(plot_g_loss_avg) 163 | plot_d_loss_total = 0 164 | plot_g_loss_total = 0 165 | 166 | if to_file: 167 | loss_f.write(unicode('%s (%d %d%%)\n' % (timeSince(begin_time, iter / float(n_iters)), iter, float(iter) / float(n_iters) * 100))) 168 | loss_f.write(unicode("errors: D: %s G: %s " % (print_d_loss_avg, print_g_loss_avg))) 169 | loss_f.write(unicode('\n')) 170 | 171 | if (iter % checkpoint_every == 0) or (iter == n_iters): 172 | checkpoint_fname = 'checkpoint_iter_' + str(iter) + '.pth.tar' 173 | state = { 174 | 'iteration': iter + 1, 175 | 'd_state_dict': self.D.state_dict(), 176 | 'g_state_dict': self.G.state_dict(), 177 | 'd_optimizer' : d_optimizer.state_dict(), 178 | 'g_optimizer' : g_optimizer.state_dict(), 179 | } 180 | torch.save(state, path_to_exp_out+'/'+checkpoint_fname) 181 | plotLoss(plot_d_loss_avgs, plot_every, save_path=path_to_exp_out, f_name='d_loss_itr_'+str(iter)+'.png', 182 | title='training loss D (monitoring purpose)', from_file=False) 183 | plotLoss(plot_g_loss_avgs, plot_every, save_path=path_to_exp_out, f_name='g_loss_itr_'+str(iter)+'.png', 184 | title='training loss G (monitoring purpose)', from_file=False) 185 | 186 | # def train(self, **kwargs): 187 | # pass 188 | # 189 | # def test(self): 190 | # pass 191 | 192 | # L2 loss instead of Binary cross entropy loss (this is optional for stable training) 193 | # FIXME: is L2 loss the same as MSELoss in torch loss module? 194 | # FIXME: these losses don't work with minibatch yet? 
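    # --- Editor's note (hedged, not original code) ---
    # On the FIXMEs above: torch.mean((x - 1) ** 2) in the LSGAN branches computes the
    # same quantity as nn.MSELoss() evaluated against a target tensor of ones, so using
    # MSELoss there would be an equivalent formulation. For the WGAN objective actually
    # used in train(), the batched losses reduce to one-liners over the critic scores:
    #
    #   d_loss = d_fake_scores.mean() - d_real_scores.mean() + lmd * grad_penalty
    #   g_loss = -dg_fake_scores.mean()
    #
    # (d_fake_scores / d_real_scores / dg_fake_scores are placeholder names for the
    # critic outputs on fake and real batches, not variables defined in this class.)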
195 | def loss(self, D_real, D_fake, gen_params, disc_params, cond_real_data, cond_fake_data, mode, lr=None): 196 | mode = mode.lower() 197 | if mode == 'gan': 198 | G_loss = -torch.mean(self.log(D_fake)) 199 | # FIXME G_loss.backward() 200 | D_loss = -torch.mean(self.log(1 - D_fake)) - torch.mean(self.log(D_real)) 201 | # FIXME D_loss.backward() 202 | metric = -D_loss / 2 + np.log(2) # JS divergence 203 | 204 | G_solver = torch.optim.Adam(gen_params, lr=lr if lr else 1e-3) 205 | D_solver = torch.optim.Adam(disc_params, lr=lr if lr else 1e-3) 206 | 207 | elif mode == 'lsgan-1': 208 | G_loss = torch.mean(D_fake ** 2) 209 | D_loss = torch.mean((D_real - 1) ** 2) 210 | metric = 0 # TBD 211 | 212 | G_solver = torch.optim.Adam(gen_params, lr=lr if lr else 1e-3) 213 | D_solver = torch.optim.Adam(disc_params, lr=lr if lr else 1e-3) 214 | 215 | elif mode == 'lsgan-2': 216 | G_loss = torch.mean((D_fake - 1) ** 2) 217 | D_loss = torch.mean((D_real - 1) ** 2) + torch.mean(D_fake ** 2) 218 | metric = D_loss / 2 # Pearson Chi-Square divergence 219 | 220 | G_solver = torch.optim.Adam(gen_params, lr=lr if lr else 1e-3) 221 | D_solver = torch.optim.Adam(disc_params, lr=lr if lr else 1e-3) 222 | 223 | elif mode == 'wgan': 224 | G_loss = -torch.mean(D_fake) 225 | D_loss = torch.mean(D_fake) - torch.mean(D_real) 226 | metric = -D_loss # Earth-mover distance 227 | 228 | grad_penalty = self.cal_grad_penalty(cond_real_data, cond_fake_data) 229 | D_loss += self.lmd * grad_penalty 230 | 231 | G_solver = torch.optim.Adam(gen_params, lr=lr if lr else 1e-3) 232 | D_solver = torch.optim.Adam(disc_params, lr=lr if lr else 1e-3) 233 | 234 | elif mode == 'bgan': 235 | G_loss = 0.5 * torch.mean((self.log(D_fake) - self.log(1 - D_fake)) ** 2) 236 | D_loss = -torch.mean(self.log(D_real) + self.log(1 - D_fake)) 237 | metric = 0 # TBD 238 | G_solver = torch.optim.Adam(gen_params, lr=lr if lr else 1e-3) 239 | D_solver = torch.optim.Adam(disc_params, lr=lr if lr else 1e-3) 240 | 241 | else: 242 | raise ValueError('Unknown mode: {}'.format(mode)) 243 | 244 | return G_loss, D_loss, metric, G_solver, D_solver 245 | 246 | def cal_grad_penalty(self, cond_real_data, cond_fake_data): 247 | epsilon = to_var(torch.rand(self.batch_size, 1)) 248 | epsilon = epsilon.expand(cond_real_data.size()) 249 | 250 | data_diff = cond_fake_data - cond_real_data 251 | cond_interp_data = cond_real_data + epsilon * data_diff 252 | disc_interp = self.D(self.d_net, cond_interp_data, reuse=True) # TODO: change the arguments 253 | 254 | grad_interp = autograd.grad(outputs=disc_interp, inputs=cond_interp_data, 255 | grad_outputs=torch.ones(disc_interp.size()).cuda( 256 | gpu) if use_cuda else torch.ones( 257 | disc_interp.size()), 258 | create_graph=True, retain_graph=True, only_inputs=True)[0] 259 | 260 | grad_interp_flat = grad_interp.view([self.batch_size, -1]) 261 | slope = grad_interp_flat.norm(p=2, dim=1) 262 | 263 | grad_penalty = torch.mean((slope - 1.) 
** 2) 264 | return grad_penalty 265 | 266 | 267 | 268 | # same context and answer as in the real batch, but generated question 269 | def prepare_fake_batch_var(generator, batch, max_len, batch_size, word2index, index2word, 270 | embeddings_index, embeddings_size, sort=False, mode = ('word'), detach=True, concat=None): 271 | 272 | batch_vars = [] 273 | batch_var_orig = [] 274 | 275 | cqa = [] 276 | cqa_len = [] 277 | labels = torch.LongTensor([0] * batch_size) # all fake labels, thus all 0's 278 | for b in range(batch_size): 279 | if concat=='ca': 280 | ca = batch[0][b] + batch[2][b] 281 | fake_q_sample = G_sampler(generator, ca, embeddings_index, embeddings_size, word2index, index2word, max_len, detach=detach, concat=concat) 282 | elif concat==None: 283 | inputs = [batch[0][b], batch[1][b], batch[2][b]] 284 | fake_q_sample = G_sampler(generator, inputs, embeddings_index, embeddings_size, word2index, index2word, max_len, detach=detach) 285 | cqa.append(batch[0][b] + fake_q_sample + batch[2][b]) 286 | cqa_len.append(len(batch[0][b] + fake_q_sample + batch[2][b])) 287 | 288 | batch = [cqa, batch[3], batch[4], labels] 289 | seq_lens = [cqa_len] 290 | 291 | # sort this batch_var in descending order according to the values of the lengths of the first element in batch 292 | num_batch = len(batch) 293 | 294 | if sort: 295 | all = batch + seq_lens 296 | all = sorted(zip(*all), key=lambda p: len(p[0]), reverse=True) 297 | all = zip(*all) 298 | batch = all[0:num_batch] 299 | seq_lens = all[num_batch:] 300 | batch_orig = batch 301 | 302 | for b in range(num_batch): 303 | 304 | batch_var = batch[b] 305 | 306 | # if element in batch is float, i.e. indices, then do nothing 307 | if isinstance(batch_var[0], int): 308 | batch_var = list(batch_var) 309 | pass 310 | else: 311 | # pad each context, question, answer to their respective max length 312 | if mode[b] == 'index': 313 | batch_padded = [pad_sequence(s, max(seq_lens[b]), word2index, mode='index') for s in batch_var] 314 | else: 315 | batch_padded = [pad_sequence(s, max(seq_lens[b]), word2index) for s in batch_var] 316 | 317 | # init variable matrices 318 | if mode[b] == 'index': 319 | batch_var = torch.LongTensor(max(seq_lens[b]), batch_size) # long tensor for module loss criterion 320 | else: 321 | batch_var = torch.FloatTensor(max(seq_lens[b]), batch_size, embeddings_size) 322 | 323 | # FIXME: very stupid embedded for loop implementation 324 | for i in range(batch_size): 325 | for j in range(max(seq_lens[b])): 326 | if mode[b] == 'index': 327 | batch_var[j, i] = batch_padded[i][j] 328 | else: 329 | batch_var[j, i,] = embeddings_index[batch_padded[i][j]] 330 | 331 | batch_vars.append(batch_var) 332 | 333 | # the second output is for debugging purpose 334 | if sort: 335 | return batch_vars, batch_orig, seq_lens 336 | else: 337 | return batch_vars, seq_lens 338 | 339 | 340 | # function to sample generator output 341 | def G_sampler(generator, input, embeddings_index, embeddings_size, word2index, index2word, max_length, concat=None, detach=True): 342 | # NOTE currently only generate one question at a time. 
multiple questions not yet supported
343 | 
344 |     if concat == 'ca':
345 |         var = torch.FloatTensor(len(input), embeddings_size)
346 |         for j in range(len(input)):
347 |             var[j] = embeddings_index[input[j]]
348 |         var = var.unsqueeze(1)
349 |         if use_cuda:
350 |             var = Variable(var.cuda())
351 |         else:
352 |             var = Variable(var)
353 | 
354 |         decoder_output = generator.forward(var, None, [len(input)], 1, max_length,
355 |                                            embeddings_index, embeddings_size, word2index, index2word,
356 |                                            teacher_forcing_ratio=0).detach()
357 |         decoder_output = decoder_output.squeeze(1)
358 |     elif concat == None:
359 |         # NOTE: hardcode indices of c, q, a, in the line - for i in range(0,3)
360 |         inputs = []
361 |         for i in range(0,3):
362 |             # print(input[i])
363 |             var = torch.FloatTensor(len(input[i]), embeddings_size)
364 |             for j in range(len(input[i])):
365 |                 var[j] = embeddings_index[input[i][j]]
366 |             var = var.unsqueeze(1)
367 |             if use_cuda:
368 |                 var = Variable(var.cuda())
369 |             else:
370 |                 var = Variable(var)
371 |             inputs.append(var)
372 | 
373 |         decoder_output = generator.forward(inputs, [len(x) for x in input], 1, max_length,
374 |                                            embeddings_index, embeddings_size, word2index, index2word,
375 |                                            teacher_forcing_ratio=0)
376 |         if detach:
377 |             decoder_output = decoder_output.detach()
378 |         decoder_output = decoder_output.squeeze(1)
379 | 
380 | 
381 | 
382 |     decoded_words = []
383 |     for di in range(max_length):
384 |         # top value and index of every batch
385 |         topv, topi = decoder_output[di].data.topk(1)
386 |         ni = topi[0]
387 |         if (ni == word2index['EOS']) or (ni == word2index['PAD']):
388 |             decoded_words.append('EOS')
389 |             # decoder_attentions[di] = decoder_attention[0].data
390 |             break
391 |         else:
392 |             decoded_words.append(index2word[ni])
393 | 
394 |     return decoded_words
395 | 
--------------------------------------------------------------------------------
/src/util/data_proc.py:
--------------------------------------------------------------------------------
1 | #-----------------------------------------------------------------------------------------------#
2 | #-----------------------------------------------------------------------------------------------#
3 | # data loading helper functions
4 | #-----------------------------------------------------------------------------------------------#
5 | #-----------------------------------------------------------------------------------------------#
6 | from __future__ import unicode_literals, print_function, division
7 | from io import open
8 | import unicodedata
9 | import random
10 | 
11 | # import spacy
12 | from spacy.en import English
13 | spacynlp = English()
14 | 
15 | import torch
16 | from torch.autograd import Variable
17 | 
18 | # FIXME: import spacy again below to avoid an error encountered when importing torch and spacy
19 | # it seems that spacy needs to be imported before torch. However, on the Baylor cluster,
20 | # you need to import spacy again here for it to actually be imported without error.
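# --- Editor's note (hedged, not original code): the repeated import below is an
# environment-specific workaround (spacy needing to be imported before torch on one
# cluster). If the spacy 1.x `spacy.en.English` entry point is unavailable in a newer
# environment, the commonly used equivalent is `spacy.load('en')`; a defensive variant
# of the same workaround could therefore look like:
#
#   try:
#       from spacy.en import English
#       spacynlp = English()
#   except ImportError:
#       import spacy
#       spacynlp = spacy.load('en')   # fallback for environments without spacy.en
#
# Both forms assume an English model is installed; the fallback is illustrative only.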
21 | from spacy.en import English 22 | spacynlp = English() 23 | 24 | import json 25 | import numpy as np 26 | 27 | # import sys, os 28 | # sys.path.append(os.path.abspath(__file__ + "/../../") + '/G_baseline') 29 | # from G_eval import * 30 | 31 | 32 | ###################################################################### 33 | # The files are all in Unicode, to simplify we will turn Unicode 34 | # characters to ASCII, make everything lowercase 35 | # 36 | 37 | # Turn a Unicode string to plain ASCII, thanks to 38 | # http://stackoverflow.com/a/518232/2809427 39 | def unicodeToAscii(s): 40 | return ''.join( 41 | c for c in unicodedata.normalize('NFD', s) 42 | if unicodedata.category(c) != 'Mn' 43 | ) 44 | 45 | # Lowercase, trim, and remove non-letter characters 46 | def normalizeString(s): 47 | s = unicodeToAscii(s.lower().strip()) 48 | # s = re.sub(r"([.!?])", r" \1", s) 49 | # s = re.sub(r"[^a-zA-Z.!?]+", r" ", s) 50 | return s 51 | 52 | 53 | 54 | ###################################################################### 55 | # read GLOVE word embeddings 56 | def readGlove(path_to_data): 57 | embeddings_index = {} 58 | f = open(path_to_data) 59 | for line in f: 60 | values = line.split() 61 | word = values[0] 62 | coefs = np.asarray(values[1:], dtype='float32') 63 | coefs = torch.from_numpy(coefs) 64 | embeddings_index[word] = coefs 65 | f.close() 66 | 67 | print('Found %s word vectors.' % len(embeddings_index)) 68 | 69 | # get dimension from a random sample in the dict 70 | embeddings_size = random.sample( embeddings_index.items(), 1 )[0][1].size(-1) 71 | print('dimension of word embeddings: ' + str(embeddings_size)) 72 | 73 | SOS_token = -torch.ones(embeddings_size) # start of sentence token, all zerons 74 | EOS_token = torch.ones(embeddings_size) # end of sentence token, all ones 75 | UNK_token = torch.ones(embeddings_size) + torch.ones(embeddings_size) # these choices are pretty random 76 | PAD_token = torch.zeros(embeddings_size) 77 | 78 | # add special tokens to the embeddings 79 | embeddings_index['SOS'] = SOS_token 80 | embeddings_index['EOS'] = EOS_token 81 | embeddings_index['UNK'] = UNK_token 82 | embeddings_index['PAD'] = PAD_token 83 | 84 | return embeddings_index, embeddings_size 85 | 86 | 87 | ###################################################################### 88 | # read data specific for SQUAD dataset 89 | 90 | def read_raw_squad(path_to_data, normalize=True): 91 | # output (context, question, answer, ans_start_idx, ans_end_idx) triplets 92 | print("Reading dataset...") 93 | triplets = [] 94 | with open(path_to_data) as f: 95 | train = json.load(f) 96 | train = train['data'] 97 | for s in range(0, len(train)): 98 | samples = train[s]['paragraphs'] 99 | for p in range(0, len(samples)): 100 | context = samples[p]['context'] 101 | qas = samples[p]['qas'] 102 | for i in range(0, len(qas)): 103 | # print('current s,p,i are: ' + str(s)+str(p)+str(i)) 104 | answers = qas[i]['answers'] 105 | question = qas[i]['question'] 106 | for a in range(0, len(answers)): 107 | ans_text = answers[a]['text'] 108 | ans_start_idx = answers[a]['answer_start'] 109 | ans_end_idx = ans_start_idx + len(ans_text) 110 | 111 | if normalize: 112 | # turn from unicode to ascii and lower case everything 113 | context = unicodeToAscii(context) 114 | question = unicodeToAscii(question) 115 | ans_text = unicodeToAscii(ans_text) 116 | 117 | triplets.append((context, question, ans_text, ans_start_idx, ans_end_idx)) 118 | return triplets 119 | 120 | 121 | # helper function to tokenize the raw squad data 122 | # 
e.g. the context is read as a string; this function produces a list of word tokens from context string 123 | # and return as the processed tuple (context, question, ans_text, ans_start_idx, ans_end_idx) 124 | # the first three are lists, the last two are LongTensor 125 | def tokenize_squad(squad, embeddings_index, opt='raw', c_EOS=True, a_EOS=True): 126 | tokenized_triplets = [] 127 | if opt == 'raw': 128 | for triple in squad: 129 | tokenized_triplets.append( ( tokenize_sentence(triple[0], embeddings_index, EOS=c_EOS), 130 | tokenize_sentence(triple[1], embeddings_index), 131 | tokenize_sentence(triple[2], embeddings_index, EOS=a_EOS), 132 | triple[3], 133 | triple[4] ) ) 134 | elif opt == 'window': 135 | for triple in squad: 136 | tokenized_triplets.append( ( tokenize_sentence(triple[0], embeddings_index, spacy=False, EOS=c_EOS), 137 | tokenize_sentence(triple[1], embeddings_index), 138 | tokenize_sentence(triple[2], embeddings_index, spacy=False, EOS=a_EOS), 139 | triple[3], 140 | triple[4] ) ) 141 | elif opt == 'sent': 142 | for triple in squad: 143 | tokenized_triplets.append( ( tokenize_sentence(triple[0], embeddings_index, spacy=False, EOS=c_EOS), 144 | tokenize_sentence(triple[1], embeddings_index), 145 | tokenize_sentence(triple[2], embeddings_index, EOS=a_EOS), 146 | triple[3], 147 | triple[4] ) ) 148 | else: 149 | raise Exception('unknown option. should be one of "raw", "window", or "sent".') 150 | return tokenized_triplets 151 | 152 | 153 | # helper function to get the sentence of where the answer appear in the context 154 | # based on tokenized_squad, first element in output 155 | # output seq of tokens only from the answer sentence (same format as element in tokenize_squad output) 156 | def get_ans_sentence(raw_squad, sent_window=0): 157 | 158 | sent_c_triplets = [] # now each context in 159 | unmatch = [] # for debug 160 | for t in range(len(raw_squad)): 161 | sent = None 162 | c = raw_squad[t][0] 163 | a = raw_squad[t][2] 164 | sent_c = list(spacynlp(c).sents) 165 | tokenized_a = spacynlp.tokenizer(a) 166 | # sanity check 167 | # if len(sent_c) == 1: 168 | # print('WARNING: sentence segmentation may not work in this triple') 169 | # print(sent_c) 170 | # print(tokenized_c) 171 | ans_start_idx = raw_squad[t][3] 172 | ans_end_idx = raw_squad[t][4] 173 | 174 | # print(ans_start_idx) 175 | # print(ans_end_idx) 176 | 177 | idx = 0 178 | for s in sent_c: 179 | print(idx) 180 | # print('currenet index: %d' % idx) 181 | if idx <= ans_start_idx and idx+len(s.string)>=ans_end_idx: 182 | # print('enter if statement') 183 | # print(s) 184 | sent = s 185 | # print(sent_c.index(sent)) 186 | # if isinstance(sent, unicode): 187 | # raise Exception('unicode detected, where expecting spacy span object.') 188 | if tokenized_a[0].string not in sent.string: 189 | # print('c') 190 | # print(idx) 191 | # print(idx+len(s.string)) 192 | # print(ans_start_idx) 193 | # print(ans_end_idx) 194 | print(type(tokenized_a[0])) 195 | print(type(sent)) 196 | unmatch.append(t) 197 | # raise Exception('answer token not in current sentence') 198 | break 199 | else: 200 | idx += len(s.string) 201 | 202 | try: 203 | idx_temp = sent_c.index(sent) 204 | except: 205 | 206 | print(sent_c) 207 | print(sent) 208 | print(tokenized_a) 209 | print('\n') 210 | unmatch.append(t) 211 | 212 | #TODO: multiple sentences as context 213 | if sent_window > 0: 214 | ans_sent_idx = sent_c.index(sent) 215 | # print(ans_sent_idx) 216 | for i in range(1,sent_window): 217 | if ans_sent_idx-i > 0 and ans_sent_idx+i < len(sent_c): 218 | 
sent = [sent_c[ans_sent_idx-i], sent, sent_c[ans_sent_idx+i]] 219 | elif ans_sent_idx-1 <= 0 and ans_sent_idx+1 < len(sent_c): 220 | sent = [sent, sent_c[ans_sent_idx+i]] 221 | elif ans_sent_idx-1 > 0 and ans_sent_idx+1 >= len(sent_c): 222 | sent = [sent_c[ans_sent_idx-i], sent] 223 | sent_c_triplets.append( ( sent, raw_squad[t][1], raw_squad[t][2], raw_squad[t][3], raw_squad[t][4] ) ) 224 | 225 | return sent_c_triplets, set(unmatch) 226 | 227 | 228 | # helper function to get a window of tokens around the answer 229 | # similar to get_ans_sentence; only difference is the span of tokens 230 | # NOTE: here the number of window operates on crude tokens: there's = one token. 231 | # in proc_tokenized_sent, there's = 3 tokens. therefore, the actual 232 | # number of tokens before and after the answer may exceed the set window size 233 | def get_windowed_ans(raw_squad, window_size): 234 | 235 | windowed_c_triplets = [] 236 | 237 | for triple in raw_squad: 238 | c = triple[0] 239 | a = triple[2] 240 | tokenized_c = spacynlp.tokenizer(c) 241 | # sanity check 242 | # print(tokenized_c) 243 | tokenized_a = spacynlp.tokenizer(a) 244 | ans_start_idx = triple[3] 245 | ans_end_idx = triple[4] 246 | c_sub = c[:ans_start_idx] 247 | # print('first token in answer = %s' % tokenized_a[0]) 248 | 249 | # find the start token of the answer in context 250 | idx = 0 251 | t = 0 252 | for token in tokenized_c: 253 | if idx+c_sub.count(' ') == ans_start_idx and unicode(token) == unicode(tokenized_a[0]): 254 | break 255 | else: 256 | idx += len(token) 257 | t += 1 258 | if t < window_size: 259 | left_window = 0 260 | else: 261 | left_window = t - window_size 262 | if t + window_size + len(tokenized_a) > len(tokenized_c): 263 | right_window = len(tokenized_c) 264 | else: 265 | right_window = t + window_size + len(tokenized_a) 266 | 267 | windowed_c = tokenized_c[left_window:right_window] 268 | # # sanity check 269 | # if tokenized_a[0] not in windowed_c: 270 | # print('ERROR: windowed context does not contain answer token') 271 | 272 | windowed_c_triplets.append( ( windowed_c , triple[1], tokenized_a, triple[3], triple[4] ) ) 273 | 274 | return windowed_c_triplets 275 | 276 | 277 | def annotate_context_w_ans(raw_squad): 278 | pass 279 | 280 | 281 | 282 | 283 | # turns a sentence into individual tokens 284 | # this function takes care of word tokens that does not appear in pre trained embeddings 285 | # solution is to turn those word tokens into 'UNK' 286 | def tokenize_sentence(sentence, data_tokens, spacy=True, EOS=True): 287 | if spacy: 288 | tokenized_sentence = spacynlp.tokenizer(sentence) 289 | else: 290 | tokenized_sentence = sentence 291 | # # an additional preprocessing step to separate words and non-words when they appear together 292 | proc_tokenized_sentence = post_proc_tokenize_sentence(tokenized_sentence) 293 | 294 | token_num = len(proc_tokenized_sentence) 295 | 296 | var = [] 297 | 298 | for t in range(0, token_num): 299 | # the first if loop only for experimental use to aviod large vocab size 300 | if proc_tokenized_sentence[t] not in data_tokens: 301 | var.append('UNK') 302 | else: 303 | var.append(proc_tokenized_sentence[t]) 304 | 305 | if EOS: 306 | var.append('EOS') 307 | return var 308 | 309 | 310 | # helper function for post processing tokenizer 311 | # separate all punctuations into single tokens 312 | # e.g. 
"(they're)" --> "they", "'", "re" 313 | # outputs a list of strings 314 | def post_proc_tokenize_sentence(tokenized_sentence): 315 | proc_tokenized_sentence = [] 316 | for t in range(0, len(tokenized_sentence)): 317 | # try: 318 | # token = tokenized_sentence[t].string.lower().strip() 319 | # except: 320 | # print(tokenized_sentence) 321 | token = tokenized_sentence[t].string.lower().strip() 322 | # first check if the string is number or alphabet only 323 | if token.isdigit() or token.isalpha(): 324 | proc_tokenized_sentence.append(token) 325 | # sepatate this token into substrings of only words, numbers, or individual symbols 326 | else: 327 | index = -1 328 | for s in range(0, len(token)): 329 | if s > index: 330 | if token[s].isdigit(): 331 | # print('find digit') 332 | for i in range(s,len(token)): 333 | if (not token[i].isdigit()): 334 | proc_tokenized_sentence.append(token[s:i]) 335 | index = i-1 336 | break 337 | elif (token[i].isdigit()) and (i == len(token)-1): 338 | proc_tokenized_sentence.append(token[s:i+1]) 339 | index = i 340 | break 341 | elif token[s].isalpha(): 342 | # print('find alphabet') 343 | for i in range(s,len(token)): 344 | if (not token[i].isalpha()): 345 | proc_tokenized_sentence.append(token[s:i]) 346 | index = i-1 347 | break 348 | elif (token[i].isalpha()) and (i == len(token)-1): 349 | proc_tokenized_sentence.append(token[s:i+1]) 350 | index = i 351 | break 352 | else: 353 | # print('find symbol') 354 | proc_tokenized_sentence.append(token[s]) 355 | index += 1 356 | # print(index) 357 | return proc_tokenized_sentence 358 | # test 359 | # x = post_proc_tokenizer(spacynlp.tokenizer(u'mid-1960s')) 360 | 361 | 362 | # # find the max length of context, answer, and question 363 | # def max_length(triplets): 364 | 365 | # max_len_c = 0 366 | # max_len_q = 0 367 | # max_len_a = 0 368 | 369 | # for triple in triplets: 370 | # len_c = len(triple[0]) 371 | # len_q = len(triple[1]) 372 | # len_a = len(triple[2]) 373 | # if len_c > max_len_c: 374 | # max_len_c = len_c 375 | # if len_q > max_len_q: 376 | # max_len_q = len_q 377 | # if len_a > max_len_a: 378 | # max_len_a = len_a 379 | 380 | # return max_len_c, max_len_q, max_len_a 381 | 382 | 383 | ###################################################################### 384 | # count the number of tokens in both the word embeddings and the corpus 385 | def count_effective_num_tokens(triplets, embeddings_index, sos_eos = True): 386 | ## find all unique tokens in the data (should be a subset of the number of embeddings) 387 | data_tokens = [] 388 | for triple in triplets: 389 | data_tokens += triple[0] + triple[1] + triple[2] 390 | data_tokens = list(set(data_tokens)) # find unique 391 | if sos_eos: 392 | data_tokens = ['SOS', 'EOS', 'UNK', 'PAD'] + data_tokens 393 | else: 394 | data_tokens = ['UNK', 'PAD'] 395 | 396 | effective_tokens = list(set(data_tokens).intersection(embeddings_index.keys())) 397 | effective_num_tokens = len(effective_tokens) 398 | 399 | return effective_tokens, effective_num_tokens 400 | 401 | 402 | ###################################################################### 403 | # generate word index and index word look up tables 404 | def generate_look_up_table(effective_tokens, effective_num_tokens, use_cuda = True): 405 | word2index = {} 406 | index2word = {} 407 | for i in range(effective_num_tokens): 408 | index2word[i] = effective_tokens[i] 409 | word2index[effective_tokens[i]] = i 410 | return word2index, index2word 411 | 412 | 413 | 
###################################################################### 414 | # prepare minibatch of data 415 | # output is (contexts, questions, answers, answer_start_idxs, answer_end_idxs) 416 | # each is of dimension [batch_size x their respective max length] 417 | def get_random_batch(triplets, batch_size, with_fake = False): 418 | 419 | # init values 420 | contexts = [] 421 | questions = [] 422 | answers = [] 423 | ans_start_idxs = [] 424 | ans_end_idxs = [] 425 | 426 | # inside this forloop, all word tokens are turned into their respective index according to word2index lookup table 427 | for i in range(batch_size): 428 | triple = random.choice(triplets) 429 | contexts.append(triple[0]) 430 | questions.append( triple[1] ) 431 | answers.append(triple[2]) 432 | ans_start_idxs.append( triple[3] ) 433 | ans_end_idxs.append( triple[4] ) 434 | 435 | # get lengths of each context, question, answer in their respective arrays 436 | context_lens = [len(s) for s in contexts] 437 | question_lens = [len(s) for s in questions] 438 | answer_lens = [len(s) for s in answers] 439 | 440 | if with_fake: 441 | idx = int(batch_size/2) 442 | return [contexts[:idx], questions[:idx], answers[:idx], ans_start_idxs[:idx], ans_end_idxs[:idx]], \ 443 | [context_lens[:idx], question_lens[:idx], answer_lens[:idx]],\ 444 | [contexts[idx:], questions, answers[idx:], ans_start_idxs[idx:], ans_end_idxs[idx:]], \ 445 | [context_lens[idx:], question_lens[idx:], answer_lens[idx:]] 446 | else: 447 | return [contexts, questions, answers, ans_start_idxs, ans_end_idxs], \ 448 | [context_lens, question_lens, answer_lens] 449 | 450 | 451 | # - prepare batch training data 452 | # - training_batch contains five pieces of data. The first three with size [batch size x max seq len], 453 | # - the last two with size [batch size]. 454 | # - seq_lens contains lengths of the first three sequences, each of size [batch size] 455 | # - the output would be matrices of size [max seq len x batch size x embedding size] 456 | # - if question is represented as index, then its size is [max seq len x batch size] --> this is transpose of the input 457 | # from get_random_batch in order to fit NLLLoss function (indexing and selecting the whole batch of a single token) is 458 | # easier. e.g. 
you can do question[i] which selects the whole sequence of the first dimension 459 | def prepare_batch_var(batch, seq_lens, batch_size, word2index, embeddings_index, embeddings_size, 460 | use_cuda=1, sort=False, mode=('word', 'index', 'word'), concat_opt=None, 461 | with_fake=False, fake_batch=None, fake_seq_lens=None): 462 | 463 | batch_vars = [] 464 | batch_var_orig = [] 465 | batch_paddings = [] 466 | 467 | if with_fake: 468 | batch_size = int(batch_size/2) 469 | fake_q = fake_batch[1] 470 | fake_q_lens = fake_seq_lens[1] 471 | 472 | #TODO (for different applications): change the below code (before for loop) to concat different portions of the batch_triplets 473 | if concat_opt == None: 474 | pass 475 | 476 | elif concat_opt == 'ca': 477 | ca = [] 478 | ca_len = [] 479 | for b in range(batch_size): 480 | ca.append(batch[0][b] + batch[2][b]) 481 | ca_len.append(len(batch[0][b] + batch[2][b])) 482 | batch = [ca, batch[1], batch[3], batch[4]] 483 | seq_lens = [ca_len] + seq_lens 484 | 485 | elif concat_opt == 'qa': 486 | pass 487 | 488 | # FIXME: only this following elif implemented fake question 489 | elif concat_opt == 'cqa': 490 | cqa = [] 491 | cqa_len = [] 492 | labels = [] 493 | for b in range(batch_size): 494 | cqa.append(batch[0][b] + batch[1][b] + batch[2][b]) # append real 495 | cqa_len.append(len(batch[0][b] + batch[1][b] + batch[2][b])) # append real 496 | labels.append(1) 497 | if with_fake: # append fake 498 | fake_q_sample = random.sample(fake_q,1)[0] 499 | cqa.append(batch[0][b] + fake_q_sample + batch[2][b]) 500 | cqa_len.append(len(batch[0][b] + fake_q_sample + batch[2][b])) 501 | labels.append(0) 502 | if with_fake: 503 | batch = [cqa, batch[3]+fake_batch[3], batch[4]+fake_batch[4], labels] 504 | else: 505 | batch = [cqa, batch[3], batch[4]] 506 | seq_lens = [cqa_len] 507 | elif concat_opt == 'qca': 508 | pass 509 | 510 | else: 511 | raise ValueError('not a valid concat option.') 512 | 513 | num_batch = len(batch) 514 | # sort this batch_var in descending order according to the values of the lengths of the first element in batch 515 | if sort: 516 | all = batch + seq_lens 517 | all = sorted(zip(*all), key=lambda p: len(p[0]), reverse=True) 518 | all = zip(*all) 519 | batch = all[0:num_batch] 520 | seq_lens = all[num_batch:] 521 | batch_orig = batch 522 | 523 | # get bacth size back to 2x if with fake 524 | if with_fake: 525 | batch_size = batch_size * 2 526 | 527 | for b in range(num_batch): 528 | 529 | batch_var = batch[b] 530 | 531 | # if element in batch is float, i.e. 
indices, then do nothing 532 | if isinstance(batch_var[0], int): 533 | batch_var = list(batch_var) 534 | pass 535 | else: 536 | # pad each context, question, answer to their respective max length 537 | if mode[b] == 'index': 538 | batch_padded = [pad_sequence(s, max(seq_lens[b]), word2index, mode='index') for s in batch_var] 539 | else: 540 | batch_padded = [pad_sequence(s, max(seq_lens[b]), word2index) for s in batch_var] 541 | 542 | # init variable matrices 543 | if mode[b] == 'index': 544 | batch_var = torch.LongTensor(max(seq_lens[b]), batch_size) # long tensor for module loss criterion 545 | else: 546 | batch_var = torch.FloatTensor(max(seq_lens[b]), batch_size, embeddings_size) 547 | 548 | # FIXME: very stupid embedded for loop implementation 549 | for i in range(batch_size): 550 | for j in range(max(seq_lens[b])): 551 | if mode[b] == 'index': 552 | batch_var[j, i] = batch_padded[i][j] 553 | else: 554 | batch_var[j, i,] = embeddings_index[batch_padded[i][j]] 555 | 556 | batch_vars.append(batch_var) 557 | batch_paddings.append(batch_padded) 558 | 559 | # the second output is for debugging purpose 560 | return batch_vars, batch_paddings, seq_lens 561 | 562 | # helper function to zero pad context, question, answer to their respective maximum length 563 | def pad_sequence(s, max_len, word2index, mode = 'word'): 564 | if mode == 'word': 565 | return s + ['PAD' for i in range(max_len - len(s))] 566 | elif mode == 'index': 567 | return [word2index[i] for i in s] + [word2index['PAD'] for i in range(max_len - len(s))] 568 | 569 | 570 | ###################################################################### 571 | # TODO: need a function to sample some (c, q, a) triplets from the generator 572 | def sample_generated_triples(triplets, G, batch_size): 573 | 574 | # should return the same thing as get_random_batch with with_fake = False 575 | return None 576 | 577 | 578 | ###################################################################### 579 | # test function for examining the output of the batch 580 | # primarily see whether the context, question, answer triplets make sense 581 | def print_batch(batch, batch_size, index2word): 582 | idx = random.choice(range(batch_size)) 583 | context = [ index2word[i] for i in batch[0][idx,] ] 584 | question = [ index2word[i] for i in batch[1][idx,] ] 585 | answer = [ index2word[i] for i in batch[2][idx,] ] 586 | return (' '.join(context), ' '.join(question), ' '.join(answer)) 587 | 588 | 589 | 590 | -------------------------------------------------------------------------------- /references/code/seq2seq_translation_tutorial.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Translation with a Sequence to Sequence Network and Attention 4 | ************************************************************* 5 | **Author**: `Sean Robertson `_ 6 | 7 | In this project we will be teaching a neural network to translate from 8 | French to English. 9 | 10 | :: 11 | 12 | [KEY: > input, = target, < output] 13 | 14 | > il est en train de peindre un tableau . 15 | = he is painting a picture . 16 | < he is painting a picture . 17 | 18 | > pourquoi ne pas essayer ce vin delicieux ? 19 | = why not try that delicious wine ? 20 | < why not try that delicious wine ? 21 | 22 | > elle n est pas poete mais romanciere . 23 | = she is not a poet but a novelist . 24 | < she not not a poet but a novelist . 25 | 26 | > vous etes trop maigre . 27 | = you re too skinny . 28 | < you re all alone . 29 | 30 | ... 
to varying degrees of success. 31 | 32 | This is made possible by the simple but powerful idea of the `sequence 33 | to sequence network `__, in which two 34 | recurrent neural networks work together to transform one sequence to 35 | another. An encoder network condenses an input sequence into a vector, 36 | and a decoder network unfolds that vector into a new sequence. 37 | 38 | .. figure:: /_static/img/seq-seq-images/seq2seq.png 39 | :alt: 40 | 41 | To improve upon this model we'll use an `attention 42 | mechanism `__, which lets the decoder 43 | learn to focus over a specific range of the input sequence. 44 | 45 | **Recommended Reading:** 46 | 47 | I assume you have at least installed PyTorch, know Python, and 48 | understand Tensors: 49 | 50 | - http://pytorch.org/ For installation instructions 51 | - :doc:`/beginner/deep_learning_60min_blitz` to get started with PyTorch in general 52 | - :doc:`/beginner/pytorch_with_examples` for a wide and deep overview 53 | - :doc:`/beginner/former_torchies_tutorial` if you are former Lua Torch user 54 | 55 | 56 | It would also be useful to know about Sequence to Sequence networks and 57 | how they work: 58 | 59 | - `Learning Phrase Representations using RNN Encoder-Decoder for 60 | Statistical Machine Translation `__ 61 | - `Sequence to Sequence Learning with Neural 62 | Networks `__ 63 | - `Neural Machine Translation by Jointly Learning to Align and 64 | Translate `__ 65 | - `A Neural Conversational Model `__ 66 | 67 | You will also find the previous tutorials on 68 | :doc:`/intermediate/char_rnn_classification_tutorial` 69 | and :doc:`/intermediate/char_rnn_generation_tutorial` 70 | helpful as those concepts are very similar to the Encoder and Decoder 71 | models, respectively. 72 | 73 | And for more, read the papers that introduced these topics: 74 | 75 | - `Learning Phrase Representations using RNN Encoder-Decoder for 76 | Statistical Machine Translation `__ 77 | - `Sequence to Sequence Learning with Neural 78 | Networks `__ 79 | - `Neural Machine Translation by Jointly Learning to Align and 80 | Translate `__ 81 | - `A Neural Conversational Model `__ 82 | 83 | 84 | **Requirements** 85 | """ 86 | from __future__ import unicode_literals, print_function, division 87 | from io import open 88 | import unicodedata 89 | import string 90 | import re 91 | import random 92 | 93 | import torch 94 | import torch.nn as nn 95 | from torch.autograd import Variable 96 | from torch import optim 97 | import torch.nn.functional as F 98 | 99 | use_cuda = torch.cuda.is_available() 100 | 101 | ###################################################################### 102 | # Loading data files 103 | # ================== 104 | # 105 | # The data for this project is a set of many thousands of English to 106 | # French translation pairs. 107 | # 108 | # `This question on Open Data Stack 109 | # Exchange `__ 110 | # pointed me to the open translation site http://tatoeba.org/ which has 111 | # downloads available at http://tatoeba.org/eng/downloads - and better 112 | # yet, someone did the extra work of splitting language pairs into 113 | # individual text files here: http://www.manythings.org/anki/ 114 | # 115 | # The English to French pairs are too big to include in the repo, so 116 | # download to ``data/eng-fra.txt`` before continuing. The file is a tab 117 | # separated list of translation pairs: 118 | # 119 | # :: 120 | # 121 | # I am cold. Je suis froid. 122 | # 123 | # .. 
Note:: 124 | # Download the data from 125 | # `here `_ 126 | # and extract it to the current directory. 127 | 128 | ###################################################################### 129 | # Similar to the character encoding used in the character-level RNN 130 | # tutorials, we will be representing each word in a language as a one-hot 131 | # vector, or giant vector of zeros except for a single one (at the index 132 | # of the word). Compared to the dozens of characters that might exist in a 133 | # language, there are many many more words, so the encoding vector is much 134 | # larger. We will however cheat a bit and trim the data to only use a few 135 | # thousand words per language. 136 | # 137 | # .. figure:: /_static/img/seq-seq-images/word-encoding.png 138 | # :alt: 139 | # 140 | # 141 | 142 | 143 | ###################################################################### 144 | # We'll need a unique index per word to use as the inputs and targets of 145 | # the networks later. To keep track of all this we will use a helper class 146 | # called ``Lang`` which has word → index (``word2index``) and index → word 147 | # (``index2word``) dictionaries, as well as a count of each word 148 | # ``word2count`` to use to later replace rare words. 149 | # 150 | 151 | SOS_token = 0 152 | EOS_token = 1 153 | 154 | 155 | class Lang: 156 | def __init__(self, name): 157 | self.name = name 158 | self.word2index = {} 159 | self.word2count = {} 160 | self.index2word = {0: "SOS", 1: "EOS"} 161 | self.n_words = 2 # Count SOS and EOS 162 | 163 | def addSentence(self, sentence): 164 | for word in sentence.split(' '): 165 | self.addWord(word) 166 | 167 | def addWord(self, word): 168 | if word not in self.word2index: 169 | self.word2index[word] = self.n_words 170 | self.word2count[word] = 1 171 | self.index2word[self.n_words] = word 172 | self.n_words += 1 173 | else: 174 | self.word2count[word] += 1 175 | 176 | 177 | ###################################################################### 178 | # The files are all in Unicode, to simplify we will turn Unicode 179 | # characters to ASCII, make everything lowercase, and trim most 180 | # punctuation. 181 | # 182 | 183 | # Turn a Unicode string to plain ASCII, thanks to 184 | # http://stackoverflow.com/a/518232/2809427 185 | def unicodeToAscii(s): 186 | return ''.join( 187 | c for c in unicodedata.normalize('NFD', s) 188 | if unicodedata.category(c) != 'Mn' 189 | ) 190 | 191 | 192 | # Lowercase, trim, and remove non-letter characters 193 | 194 | 195 | def normalizeString(s): 196 | s = unicodeToAscii(s.lower().strip()) 197 | s = re.sub(r"([.!?])", r" \1", s) 198 | s = re.sub(r"[^a-zA-Z.!?]+", r" ", s) 199 | return s 200 | 201 | 202 | ###################################################################### 203 | # To read the data file we will split the file into lines, and then split 204 | # lines into pairs. The files are all English → Other Language, so if we 205 | # want to translate from Other Language → English I added the ``reverse`` 206 | # flag to reverse the pairs. 207 | # 208 | 209 | def readLangs(lang1, lang2, reverse=False): 210 | print("Reading lines...") 211 | 212 | # Read the file and split into lines 213 | lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8'). 
\ 214 | read().strip().split('\n') 215 | 216 | # Split every line into pairs and normalize 217 | pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines] 218 | 219 | # Reverse pairs, make Lang instances 220 | if reverse: 221 | pairs = [list(reversed(p)) for p in pairs] 222 | input_lang = Lang(lang2) 223 | output_lang = Lang(lang1) 224 | else: 225 | input_lang = Lang(lang1) 226 | output_lang = Lang(lang2) 227 | 228 | return input_lang, output_lang, pairs 229 | 230 | 231 | ###################################################################### 232 | # Since there are a *lot* of example sentences and we want to train 233 | # something quickly, we'll trim the data set to only relatively short and 234 | # simple sentences. Here the maximum length is 10 words (that includes 235 | # ending punctuation) and we're filtering to sentences that translate to 236 | # the form "I am" or "He is" etc. (accounting for apostrophes replaced 237 | # earlier). 238 | # 239 | 240 | MAX_LENGTH = 10 241 | 242 | eng_prefixes = ( 243 | "i am ", "i m ", 244 | "he is", "he s ", 245 | "she is", "she s", 246 | "you are", "you re ", 247 | "we are", "we re ", 248 | "they are", "they re " 249 | ) 250 | 251 | 252 | def filterPair(p): 253 | return len(p[0].split(' ')) < MAX_LENGTH and \ 254 | len(p[1].split(' ')) < MAX_LENGTH and \ 255 | p[1].startswith(eng_prefixes) 256 | 257 | 258 | def filterPairs(pairs): 259 | return [pair for pair in pairs if filterPair(pair)] 260 | 261 | 262 | ###################################################################### 263 | # The full process for preparing the data is: 264 | # 265 | # - Read text file and split into lines, split lines into pairs 266 | # - Normalize text, filter by length and content 267 | # - Make word lists from sentences in pairs 268 | # 269 | 270 | def prepareData(lang1, lang2, reverse=False): 271 | input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse) 272 | print("Read %s sentence pairs" % len(pairs)) 273 | pairs = filterPairs(pairs) 274 | print("Trimmed to %s sentence pairs" % len(pairs)) 275 | print("Counting words...") 276 | for pair in pairs: 277 | input_lang.addSentence(pair[0]) 278 | output_lang.addSentence(pair[1]) 279 | print("Counted words:") 280 | print(input_lang.name, input_lang.n_words) 281 | print(output_lang.name, output_lang.n_words) 282 | return input_lang, output_lang, pairs 283 | 284 | 285 | input_lang, output_lang, pairs = prepareData('eng', 'fra', True) 286 | print(random.choice(pairs)) 287 | 288 | 289 | ###################################################################### 290 | # The Seq2Seq Model 291 | # ================= 292 | # 293 | # A Recurrent Neural Network, or RNN, is a network that operates on a 294 | # sequence and uses its own output as input for subsequent steps. 295 | # 296 | # A `Sequence to Sequence network `__, or 297 | # seq2seq network, or `Encoder Decoder 298 | # network `__, is a model 299 | # consisting of two RNNs called the encoder and decoder. The encoder reads 300 | # an input sequence and outputs a single vector, and the decoder reads 301 | # that vector to produce an output sequence. 302 | # 303 | # .. figure:: /_static/img/seq-seq-images/seq2seq.png 304 | # :alt: 305 | # 306 | # Unlike sequence prediction with a single RNN, where every input 307 | # corresponds to an output, the seq2seq model frees us from sequence 308 | # length and order, which makes it ideal for translation between two 309 | # languages. 
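# Editor's note (shape sketch, matching the ``EncoderRNN`` and ``DecoderRNN``
# classes defined further below; sizes are illustrative):
#
#     encoder: n input word indices  ->  one context vector of size (1, 1, hidden_size)
#     decoder: that context vector   ->  m output word indices
#
# n and m need not be equal, which is what "frees us from sequence length and
# order" means in practice.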
310 | # 311 | # Consider the sentence "Je ne suis pas le chat noir" → "I am not the 312 | # black cat". Most of the words in the input sentence have a direct 313 | # translation in the output sentence, but are in slightly different 314 | # orders, e.g. "chat noir" and "black cat". Because of the "ne/pas" 315 | # construction there is also one more word in the input sentence. It would 316 | # be difficult to produce a correct translation directly from the sequence 317 | # of input words. 318 | # 319 | # With a seq2seq model the encoder creates a single vector which, in the 320 | # ideal case, encodes the "meaning" of the input sequence into a single 321 | # vector — a single point in some N dimensional space of sentences. 322 | # 323 | 324 | 325 | ###################################################################### 326 | # The Encoder 327 | # ----------- 328 | # 329 | # The encoder of a seq2seq network is a RNN that outputs some value for 330 | # every word from the input sentence. For every input word the encoder 331 | # outputs a vector and a hidden state, and uses the hidden state for the 332 | # next input word. 333 | # 334 | # .. figure:: /_static/img/seq-seq-images/encoder-network.png 335 | # :alt: 336 | # 337 | # 338 | 339 | class EncoderRNN(nn.Module): 340 | def __init__(self, input_size, hidden_size, n_layers=1): 341 | super(EncoderRNN, self).__init__() 342 | self.n_layers = n_layers 343 | self.hidden_size = hidden_size 344 | 345 | self.embedding = nn.Embedding(input_size, hidden_size) 346 | self.gru = nn.GRU(hidden_size, hidden_size) 347 | 348 | def forward(self, input, hidden): 349 | embedded = self.embedding(input).view(1, 1, -1) 350 | output = embedded 351 | for i in range(self.n_layers): 352 | output, hidden = self.gru(output, hidden) 353 | return output, hidden 354 | 355 | def initHidden(self): 356 | result = Variable(torch.zeros(1, 1, self.hidden_size)) 357 | if use_cuda: 358 | return result.cuda() 359 | else: 360 | return result 361 | 362 | 363 | ###################################################################### 364 | # The Decoder 365 | # ----------- 366 | # 367 | # The decoder is another RNN that takes the encoder output vector(s) and 368 | # outputs a sequence of words to create the translation. 369 | # 370 | 371 | 372 | ###################################################################### 373 | # Simple Decoder 374 | # ^^^^^^^^^^^^^^ 375 | # 376 | # In the simplest seq2seq decoder we use only last output of the encoder. 377 | # This last output is sometimes called the *context vector* as it encodes 378 | # context from the entire sequence. This context vector is used as the 379 | # initial hidden state of the decoder. 380 | # 381 | # At every step of decoding, the decoder is given an input token and 382 | # hidden state. The initial input token is the start-of-string ```` 383 | # token, and the first hidden state is the context vector (the encoder's 384 | # last hidden state). 385 | # 386 | # .. 
figure:: /_static/img/seq-seq-images/decoder-network.png 387 | # :alt: 388 | # 389 | # 390 | 391 | class DecoderRNN(nn.Module): 392 | def __init__(self, hidden_size, output_size, n_layers=1): 393 | super(DecoderRNN, self).__init__() 394 | self.n_layers = n_layers 395 | self.hidden_size = hidden_size 396 | 397 | self.embedding = nn.Embedding(output_size, hidden_size) 398 | self.gru = nn.GRU(hidden_size, hidden_size) 399 | self.out = nn.Linear(hidden_size, output_size) 400 | self.softmax = nn.LogSoftmax() 401 | 402 | def forward(self, input, hidden): 403 | output = self.embedding(input).view(1, 1, -1) 404 | for i in range(self.n_layers): 405 | output = F.relu(output) 406 | output, hidden = self.gru(output, hidden) 407 | output = self.softmax(self.out(output[0])) 408 | return output, hidden 409 | 410 | def initHidden(self): 411 | result = Variable(torch.zeros(1, 1, self.hidden_size)) 412 | if use_cuda: 413 | return result.cuda() 414 | else: 415 | return result 416 | 417 | 418 | ###################################################################### 419 | # I encourage you to train and observe the results of this model, but to 420 | # save space we'll be going straight for the gold and introducing the 421 | # Attention Mechanism. 422 | # 423 | 424 | 425 | ###################################################################### 426 | # Attention Decoder 427 | # ^^^^^^^^^^^^^^^^^ 428 | # 429 | # If only the context vector is passed betweeen the encoder and decoder, 430 | # that single vector carries the burden of encoding the entire sentence. 431 | # 432 | # Attention allows the decoder network to "focus" on a different part of 433 | # the encoder's outputs for every step of the decoder's own outputs. First 434 | # we calculate a set of *attention weights*. These will be multiplied by 435 | # the encoder output vectors to create a weighted combination. The result 436 | # (called ``attn_applied`` in the code) should contain information about 437 | # that specific part of the input sequence, and thus help the decoder 438 | # choose the right output words. 439 | # 440 | # .. figure:: https://i.imgur.com/1152PYf.png 441 | # :alt: 442 | # 443 | # Calculating the attention weights is done with another feed-forward 444 | # layer ``attn``, using the decoder's input and hidden state as inputs. 445 | # Because there are sentences of all sizes in the training data, to 446 | # actually create and train this layer we have to choose a maximum 447 | # sentence length (input length, for encoder outputs) that it can apply 448 | # to. Sentences of the maximum length will use all the attention weights, 449 | # while shorter sentences will only use the first few. 450 | # 451 | # .. 
figure:: /_static/img/seq-seq-images/attention-decoder-network.png 452 | # :alt: 453 | # 454 | # 455 | 456 | class AttnDecoderRNN(nn.Module): 457 | def __init__(self, hidden_size, output_size, n_layers=1, dropout_p=0.1, max_length=MAX_LENGTH): 458 | super(AttnDecoderRNN, self).__init__() 459 | self.hidden_size = hidden_size 460 | self.output_size = output_size 461 | self.n_layers = n_layers 462 | self.dropout_p = dropout_p 463 | self.max_length = max_length 464 | 465 | self.embedding = nn.Embedding(self.output_size, self.hidden_size) 466 | self.attn = nn.Linear(self.hidden_size * 2, self.max_length) 467 | self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size) 468 | self.dropout = nn.Dropout(self.dropout_p) 469 | self.gru = nn.GRU(self.hidden_size, self.hidden_size) 470 | self.out = nn.Linear(self.hidden_size, self.output_size) 471 | 472 | def forward(self, input, hidden, encoder_output, encoder_outputs): 473 | embedded = self.embedding(input).view(1, 1, -1) 474 | embedded = self.dropout(embedded) 475 | 476 | attn_weights = F.softmax( 477 | self.attn(torch.cat((embedded[0], hidden[0]), 1))) 478 | attn_applied = torch.bmm(attn_weights.unsqueeze(0), 479 | encoder_outputs.unsqueeze(0)) 480 | 481 | output = torch.cat((embedded[0], attn_applied[0]), 1) 482 | output = self.attn_combine(output).unsqueeze(0) 483 | 484 | for i in range(self.n_layers): 485 | output = F.relu(output) 486 | output, hidden = self.gru(output, hidden) 487 | 488 | output = F.log_softmax(self.out(output[0])) 489 | return output, hidden, attn_weights 490 | 491 | def initHidden(self): 492 | result = Variable(torch.zeros(1, 1, self.hidden_size)) 493 | if use_cuda: 494 | return result.cuda() 495 | else: 496 | return result 497 | 498 | 499 | ###################################################################### 500 | # .. note:: There are other forms of attention that work around the length 501 | # limitation by using a relative position approach. Read about "local 502 | # attention" in `Effective Approaches to Attention-based Neural Machine 503 | # Translation `__. 504 | # 505 | # Training 506 | # ======== 507 | # 508 | # Preparing Training Data 509 | # ----------------------- 510 | # 511 | # To train, for each pair we will need an input tensor (indexes of the 512 | # words in the input sentence) and target tensor (indexes of the words in 513 | # the target sentence). While creating these vectors we will append the 514 | # EOS token to both sequences. 515 | # 516 | 517 | def indexesFromSentence(lang, sentence): 518 | return [lang.word2index[word] for word in sentence.split(' ')] 519 | 520 | 521 | def variableFromSentence(lang, sentence): 522 | indexes = indexesFromSentence(lang, sentence) 523 | indexes.append(EOS_token) 524 | result = Variable(torch.LongTensor(indexes).view(-1, 1)) 525 | if use_cuda: 526 | return result.cuda() 527 | else: 528 | return result 529 | 530 | 531 | def variablesFromPair(pair): 532 | input_variable = variableFromSentence(input_lang, pair[0]) 533 | target_variable = variableFromSentence(output_lang, pair[1]) 534 | return (input_variable, target_variable) 535 | 536 | 537 | ###################################################################### 538 | # Training the Model 539 | # ------------------ 540 | # 541 | # To train we run the input sentence through the encoder, and keep track 542 | # of every output and the latest hidden state. Then the decoder is given 543 | # the ```` token as its first input, and the last hidden state of the 544 | # encoder as its first hidden state. 
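# Editor's note: concretely, each training example consumed by ``train`` below
# comes from ``variablesFromPair`` and is a pair of index column vectors
# (example pair and sizes are illustrative):
#
#     inp, tgt = variablesFromPair(('je suis froid .', 'i am cold .'))
#     # inp: Variable wrapping a (5, 1) LongTensor -- 4 French word indices + EOS_token
#     # tgt: Variable wrapping a (5, 1) LongTensor -- 4 English word indices + EOS_token
#
# The encoder and decoder each read one row (one word index) per time step.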
545 | # 546 | # "Teacher forcing" is the concept of using the real target outputs as 547 | # each next input, instead of using the decoder's guess as the next input. 548 | # Using teacher forcing causes it to converge faster but `when the trained 549 | # network is exploited, it may exhibit 550 | # instability `__. 551 | # 552 | # You can observe outputs of teacher-forced networks that read with 553 | # coherent grammar but wander far from the correct translation - 554 | # intuitively it has learned to represent the output grammar and can "pick 555 | # up" the meaning once the teacher tells it the first few words, but it 556 | # has not properly learned how to create the sentence from the translation 557 | # in the first place. 558 | # 559 | # Because of the freedom PyTorch's autograd gives us, we can randomly 560 | # choose to use teacher forcing or not with a simple if statement. Turn 561 | # ``teacher_forcing_ratio`` up to use more of it. 562 | # 563 | 564 | teacher_forcing_ratio = 0.5 565 | 566 | 567 | def train(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, 568 | max_length=MAX_LENGTH): 569 | encoder_hidden = encoder.initHidden() 570 | 571 | encoder_optimizer.zero_grad() 572 | decoder_optimizer.zero_grad() 573 | 574 | input_length = input_variable.size()[0] 575 | target_length = target_variable.size()[0] 576 | 577 | encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size)) 578 | encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs 579 | 580 | loss = 0 581 | 582 | for ei in range(input_length): 583 | encoder_output, encoder_hidden = encoder( 584 | input_variable[ei], encoder_hidden) 585 | encoder_outputs[ei] = encoder_output[0][0] 586 | 587 | decoder_input = Variable(torch.LongTensor([[SOS_token]])) 588 | decoder_input = decoder_input.cuda() if use_cuda else decoder_input 589 | 590 | decoder_hidden = encoder_hidden 591 | 592 | use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False 593 | 594 | if use_teacher_forcing: 595 | # Teacher forcing: Feed the target as the next input 596 | for di in range(target_length): 597 | decoder_output, decoder_hidden, decoder_attention = decoder( 598 | decoder_input, decoder_hidden, encoder_output, encoder_outputs) 599 | loss += criterion(decoder_output[0], target_variable[di]) 600 | decoder_input = target_variable[di] # Teacher forcing 601 | 602 | else: 603 | # Without teacher forcing: use its own predictions as the next input 604 | for di in range(target_length): 605 | decoder_output, decoder_hidden, decoder_attention = decoder( 606 | decoder_input, decoder_hidden, encoder_output, encoder_outputs) 607 | topv, topi = decoder_output.data.topk(1) 608 | ni = topi[0][0] 609 | 610 | decoder_input = Variable(torch.LongTensor([[ni]])) 611 | decoder_input = decoder_input.cuda() if use_cuda else decoder_input 612 | 613 | loss += criterion(decoder_output[0], target_variable[di]) 614 | if ni == EOS_token: 615 | break 616 | 617 | loss.backward() 618 | 619 | encoder_optimizer.step() 620 | decoder_optimizer.step() 621 | 622 | return loss.data[0] / target_length 623 | 624 | 625 | ###################################################################### 626 | # This is a helper function to print time elapsed and estimated time 627 | # remaining given the current time and progress %. 
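# For example, if ``start`` was recorded 90 seconds ago and training is 30%
# complete, then (values illustrative):
#
#     timeSince(start, 0.3)   # -> '1m 30s (- 3m 30s)'
#
# i.e. the elapsed time, followed in parentheses by the estimated time still
# remaining (total estimate 90 / 0.3 = 300 seconds, minus the 90 already spent).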
628 | # 629 | 630 | import time 631 | import math 632 | 633 | 634 | def asMinutes(s): 635 | m = math.floor(s / 60) 636 | s -= m * 60 637 | return '%dm %ds' % (m, s) 638 | 639 | 640 | def timeSince(since, percent): 641 | now = time.time() 642 | s = now - since 643 | es = s / (percent) 644 | rs = es - s 645 | return '%s (- %s)' % (asMinutes(s), asMinutes(rs)) 646 | 647 | 648 | ###################################################################### 649 | # The whole training process looks like this: 650 | # 651 | # - Start a timer 652 | # - Initialize optimizers and criterion 653 | # - Create set of training pairs 654 | # - Start empty losses array for plotting 655 | # 656 | # Then we call ``train`` many times and occasionally print the progress (% 657 | # of examples, time so far, estimated time) and average loss. 658 | # 659 | 660 | def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01): 661 | start = time.time() 662 | plot_losses = [] 663 | print_loss_total = 0 # Reset every print_every 664 | plot_loss_total = 0 # Reset every plot_every 665 | 666 | encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate) 667 | decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate) 668 | training_pairs = [variablesFromPair(random.choice(pairs)) 669 | for i in range(n_iters)] 670 | criterion = nn.NLLLoss() 671 | 672 | for iter in range(1, n_iters + 1): 673 | training_pair = training_pairs[iter - 1] 674 | input_variable = training_pair[0] 675 | target_variable = training_pair[1] 676 | 677 | loss = train(input_variable, target_variable, encoder, 678 | decoder, encoder_optimizer, decoder_optimizer, criterion) 679 | print_loss_total += loss 680 | plot_loss_total += loss 681 | 682 | if iter % print_every == 0: 683 | print_loss_avg = print_loss_total / print_every 684 | print_loss_total = 0 685 | print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters), 686 | iter, iter / n_iters * 100, print_loss_avg)) 687 | 688 | if iter % plot_every == 0: 689 | plot_loss_avg = plot_loss_total / plot_every 690 | plot_losses.append(plot_loss_avg) 691 | plot_loss_total = 0 692 | 693 | showPlot(plot_losses) 694 | 695 | 696 | ###################################################################### 697 | # Plotting results 698 | # ---------------- 699 | # 700 | # Plotting is done with matplotlib, using the array of loss values 701 | # ``plot_losses`` saved while training. 702 | # 703 | 704 | import matplotlib.pyplot as plt 705 | import matplotlib.ticker as ticker 706 | import numpy as np 707 | 708 | 709 | def showPlot(points): 710 | plt.figure() 711 | fig, ax = plt.subplots() 712 | # this locator puts ticks at regular intervals 713 | loc = ticker.MultipleLocator(base=0.2) 714 | ax.yaxis.set_major_locator(loc) 715 | plt.plot(points) 716 | 717 | 718 | ###################################################################### 719 | # Evaluation 720 | # ========== 721 | # 722 | # Evaluation is mostly the same as training, but there are no targets so 723 | # we simply feed the decoder's predictions back to itself for each step. 724 | # Every time it predicts a word we add it to the output string, and if it 725 | # predicts the EOS token we stop there. We also store the decoder's 726 | # attention outputs for display later. 
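# Editor's note: decoding in ``evaluate`` below is greedy -- at each step only
# the single most probable word survives. The relevant step, mirrored here as
# a sketch, is:
#
#     topv, topi = decoder_output.data.topk(1)    # best log-probability and its index
#     ni = topi[0][0]                             # plain int word index
#     decoder_input = Variable(torch.LongTensor([[ni]]))
#
# A beam search would instead keep the k best partial translations at every
# step, which this tutorial does not do.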
727 | # 728 | 729 | def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH): 730 | input_variable = variableFromSentence(input_lang, sentence) 731 | input_length = input_variable.size()[0] 732 | encoder_hidden = encoder.initHidden() 733 | 734 | encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size)) 735 | encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs 736 | 737 | for ei in range(input_length): 738 | encoder_output, encoder_hidden = encoder(input_variable[ei], 739 | encoder_hidden) 740 | encoder_outputs[ei] = encoder_outputs[ei] + encoder_output[0][0] 741 | 742 | decoder_input = Variable(torch.LongTensor([[SOS_token]])) # SOS 743 | decoder_input = decoder_input.cuda() if use_cuda else decoder_input 744 | 745 | decoder_hidden = encoder_hidden 746 | 747 | decoded_words = [] 748 | decoder_attentions = torch.zeros(max_length, max_length) 749 | 750 | for di in range(max_length): 751 | decoder_output, decoder_hidden, decoder_attention = decoder( 752 | decoder_input, decoder_hidden, encoder_output, encoder_outputs) 753 | decoder_attentions[di] = decoder_attention.data 754 | topv, topi = decoder_output.data.topk(1) 755 | ni = topi[0][0] 756 | if ni == EOS_token: 757 | decoded_words.append('') 758 | break 759 | else: 760 | decoded_words.append(output_lang.index2word[ni]) 761 | 762 | decoder_input = Variable(torch.LongTensor([[ni]])) 763 | decoder_input = decoder_input.cuda() if use_cuda else decoder_input 764 | 765 | return decoded_words, decoder_attentions[:di + 1] 766 | 767 | 768 | ###################################################################### 769 | # We can evaluate random sentences from the training set and print out the 770 | # input, target, and output to make some subjective quality judgements: 771 | # 772 | 773 | def evaluateRandomly(encoder, decoder, n=10): 774 | for i in range(n): 775 | pair = random.choice(pairs) 776 | print('>', pair[0]) 777 | print('=', pair[1]) 778 | output_words, attentions = evaluate(encoder, decoder, pair[0]) 779 | output_sentence = ' '.join(output_words) 780 | print('<', output_sentence) 781 | print('') 782 | 783 | 784 | ###################################################################### 785 | # Training and Evaluating 786 | # ======================= 787 | # 788 | # With all these helper functions in place (it looks like extra work, but 789 | # it's easier to run multiple experiments easier) we can actually 790 | # initialize a network and start training. 791 | # 792 | # Remember that the input sentences were heavily filtered. For this small 793 | # dataset we can use relatively small networks of 256 hidden nodes and a 794 | # single GRU layer. After about 40 minutes on a MacBook CPU we'll get some 795 | # reasonable results. 796 | # 797 | # .. Note:: 798 | # If you run this notebook you can train, interrupt the kernel, 799 | # evaluate, and continue training later. Comment out the lines where the 800 | # encoder and decoder are initialized and run ``trainIters`` again. 
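# Editor's note: the workflow described above keeps the models alive in the
# notebook kernel. To also persist progress across sessions, a minimal
# checkpointing sketch (file names are placeholders, not part of the original
# tutorial) is:
#
#     torch.save(encoder1.state_dict(), 'encoder1.pt')
#     torch.save(attn_decoder1.state_dict(), 'attn_decoder1.pt')
#     # ... later, after re-creating models with the same sizes:
#     encoder1.load_state_dict(torch.load('encoder1.pt'))
#     attn_decoder1.load_state_dict(torch.load('attn_decoder1.pt'))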
801 | # 802 | 803 | hidden_size = 256 804 | encoder1 = EncoderRNN(input_lang.n_words, hidden_size) 805 | attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, 806 | 1, dropout_p=0.1) 807 | 808 | if use_cuda: 809 | encoder1 = encoder1.cuda() 810 | attn_decoder1 = attn_decoder1.cuda() 811 | 812 | trainIters(encoder1, attn_decoder1, 75000, print_every=5000) 813 | 814 | ###################################################################### 815 | # 816 | 817 | evaluateRandomly(encoder1, attn_decoder1) 818 | 819 | ###################################################################### 820 | # Visualizing Attention 821 | # --------------------- 822 | # 823 | # A useful property of the attention mechanism is its highly interpretable 824 | # outputs. Because it is used to weight specific encoder outputs of the 825 | # input sequence, we can imagine looking where the network is focused most 826 | # at each time step. 827 | # 828 | # You could simply run ``plt.matshow(attentions)`` to see attention output 829 | # displayed as a matrix, with the columns being input steps and rows being 830 | # output steps: 831 | # 832 | 833 | output_words, attentions = evaluate( 834 | encoder1, attn_decoder1, "je suis trop froid .") 835 | plt.matshow(attentions.numpy()) 836 | 837 | 838 | ###################################################################### 839 | # For a better viewing experience we will do the extra work of adding axes 840 | # and labels: 841 | # 842 | 843 | def showAttention(input_sentence, output_words, attentions): 844 | # Set up figure with colorbar 845 | fig = plt.figure() 846 | ax = fig.add_subplot(111) 847 | cax = ax.matshow(attentions.numpy(), cmap='bone') 848 | fig.colorbar(cax) 849 | 850 | # Set up axes 851 | ax.set_xticklabels([''] + input_sentence.split(' ') + 852 | [''], rotation=90) 853 | ax.set_yticklabels([''] + output_words) 854 | 855 | # Show label at every tick 856 | ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) 857 | ax.yaxis.set_major_locator(ticker.MultipleLocator(1)) 858 | 859 | plt.show() 860 | 861 | 862 | def evaluateAndShowAttention(input_sentence): 863 | output_words, attentions = evaluate( 864 | encoder1, attn_decoder1, input_sentence) 865 | print('input =', input_sentence) 866 | print('output =', ' '.join(output_words)) 867 | showAttention(input_sentence, output_words, attentions) 868 | 869 | 870 | evaluateAndShowAttention("elle a cinq ans de moins que moi .") 871 | 872 | evaluateAndShowAttention("elle est trop petit .") 873 | 874 | evaluateAndShowAttention("je ne crains pas de mourir .") 875 | 876 | evaluateAndShowAttention("c est un jeune directeur plein de talent .") 877 | 878 | 879 | ###################################################################### 880 | # Exercises 881 | # ========= 882 | # 883 | # - Try with a different dataset 884 | # 885 | # - Another language pair 886 | # - Human → Machine (e.g. IOT commands) 887 | # - Chat → Response 888 | # - Question → Answer 889 | # 890 | # - Replace the embeddings with pre-trained word embeddings such as word2vec or 891 | # GloVe 892 | # - Try with more layers, more hidden units, and more sentences. Compare 893 | # the training time and results. 894 | # - If you use a translation file where pairs have two of the same phrase 895 | # (``I am test \t I am test``), you can use this as an autoencoder. 
Try 896 | # this: 897 | # 898 | # - Train as an autoencoder 899 | # - Save only the Encoder network 900 | # - Train a new Decoder for translation from there 901 | # --------------------------------------------------------------------------------