├── model ├── __init__.py ├── rvae.pyc ├── __init__.pyc ├── decoder.pyc ├── encoder.pyc ├── encoder.py ├── decoder.py ├── rvae_previous.py └── rvae.py ├── utils ├── __init__.py ├── __init__.pyc ├── tensor.pyc ├── functional.pyc ├── parameters.pyc ├── batch_loader.pyc ├── parameters.py ├── functional.py ├── visualize_word_embeddings.py ├── tensor.py └── batch_loader.py ├── selfModules ├── __init__.py ├── neg.pyc ├── tdnn.pyc ├── highway.pyc ├── __init__.pyc ├── embedding.pyc ├── highway.py ├── tdnn.py ├── embedding.py └── neg.py ├── __init__.py ├── beam_search.pyc ├── ce_result_.npy ├── kld_result_npy_.npy ├── README.md ├── train_word_embeddings.py ├── train_word_embeddings_2.py ├── beam_search.py ├── test.py ├── train.py └── sample_3.py /model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /selfModules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from . import nn_layers 2 | from . import utility 3 | -------------------------------------------------------------------------------- /beam_search.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/beam_search.pyc -------------------------------------------------------------------------------- /ce_result_.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/ce_result_.npy -------------------------------------------------------------------------------- /model/rvae.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/model/rvae.pyc -------------------------------------------------------------------------------- /model/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/model/__init__.pyc -------------------------------------------------------------------------------- /model/decoder.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/model/decoder.pyc -------------------------------------------------------------------------------- /model/encoder.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/model/encoder.pyc -------------------------------------------------------------------------------- /utils/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/utils/__init__.pyc -------------------------------------------------------------------------------- /utils/tensor.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/utils/tensor.pyc -------------------------------------------------------------------------------- /kld_result_npy_.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/kld_result_npy_.npy -------------------------------------------------------------------------------- /selfModules/neg.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/selfModules/neg.pyc -------------------------------------------------------------------------------- /selfModules/tdnn.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/selfModules/tdnn.pyc -------------------------------------------------------------------------------- /utils/functional.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/utils/functional.pyc -------------------------------------------------------------------------------- /utils/parameters.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/utils/parameters.pyc -------------------------------------------------------------------------------- /selfModules/highway.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/selfModules/highway.pyc -------------------------------------------------------------------------------- /utils/batch_loader.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/utils/batch_loader.pyc -------------------------------------------------------------------------------- /selfModules/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/selfModules/__init__.pyc -------------------------------------------------------------------------------- /selfModules/embedding.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/selfModules/embedding.pyc -------------------------------------------------------------------------------- /utils/parameters.py: -------------------------------------------------------------------------------- 1 | from .functional import * 2 | 3 | 4 | class Parameters: 5 | def __init__(self, max_word_len, max_seq_len, word_vocab_size, char_vocab_size): 6 | self.max_word_len = int(max_word_len) 7 | self.max_seq_len = int(max_seq_len) + 1 # go or eos token 8 | 9 | self.word_vocab_size = int(word_vocab_size) 10 | self.char_vocab_size = int(char_vocab_size) 11 | 12 | self.word_embed_size = 300 13 | self.char_embed_size = 15 14 | 15 | self.kernels = [(1, 25), (2, 50), (3, 75), (4, 100), (5, 125), (6, 150)] 16 | self.sum_depth = fold(lambda x, y: x + y, [depth for _, depth in self.kernels], 0) 17 | 18 | self.encoder_rnn_size = 600 19 | self.encoder_num_layers = 1 20 | 21 | self.latent_variable_size = 1100 22 | 23 | self.decoder_rnn_size = 600 24 | self.decoder_num_layers = 2 25 | 
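A side note on the configuration above: `sum_depth` is the total number of character-CNN feature maps, and added to the 300-dimensional word embedding it gives the encoder's RNN input width. A minimal sketch of that arithmetic (reusing the `fold` helper from `utils/functional.py`; the standalone variable names here are illustrative only):

```python
# Illustrative only: recomputes Parameters.sum_depth from the kernel spec above.
kernels = [(1, 25), (2, 50), (3, 75), (4, 100), (5, 125), (6, 150)]

def fold(f, l, a):
    # same recursive left-fold as in utils/functional.py
    return a if len(l) == 0 else fold(f, l[1:], f(a, l[0]))

sum_depth = fold(lambda x, y: x + y, [depth for _, depth in kernels], 0)
print(sum_depth)        # 525
print(300 + sum_depth)  # 825 = word_embed_size + sum_depth, the encoder input size
```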
--------------------------------------------------------------------------------
/utils/functional.py:
--------------------------------------------------------------------------------
 1 | def fold(f, l, a):
 2 |     return a if (len(l) == 0) else fold(f, l[1:], f(a, l[0]))
 3 | 
 4 | def f_and(x, y):
 5 |     return convert(x) and convert(y)
 6 | 
 7 | def f_or(x, y):
 8 |     return convert(x) or convert(y)
 9 | 
10 | def convert(x):
11 |     if type(x) != bool:
12 |         return len(x) > 0
13 |     else:
14 |         return x
15 | 
16 | def parameters_allocation_check(module):
17 |     parameters = list(module.parameters())
18 |     return fold(f_and, parameters, True) or not fold(f_or, parameters, False)
19 | 
20 | def handle_inputs(inputs, use_cuda):
21 |     import torch as t
22 |     from torch.autograd import Variable
23 | 
24 |     result = [Variable(t.from_numpy(var)) for var in inputs]
25 |     result = [var.cuda() if use_cuda else var for var in result]
26 | 
27 |     return result
28 | 
29 | def kld_coef(i):
30 |     import math
31 |     return (math.tanh((i - 3500)/1000) + 1)/2
32 | 
--------------------------------------------------------------------------------
/utils/visualize_word_embeddings.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import matplotlib.pyplot as plt
 4 | import numpy as np
 5 | from sklearn.decomposition import PCA
 6 | 
 7 | from utils.batch_loader import BatchLoader
 8 | 
 9 | if __name__ == "__main__":
10 |     if not os.path.exists('../../data/word_embeddings.npy'):
11 |         raise FileNotFoundError("word embeddings file wasn't found")
12 | 
13 |     pca = PCA(n_components=2)
14 |     word_embeddings = np.load('../../data/word_embeddings.npy')
15 |     word_embeddings_pca = pca.fit_transform(word_embeddings)
16 | 
17 |     batch_loader = BatchLoader()
18 |     words = batch_loader.idx_to_word
19 | 
20 |     fig, ax = plt.subplots()
21 |     fig.set_size_inches(150, 150)
22 |     x = word_embeddings_pca[:, 0]
23 |     y = word_embeddings_pca[:, 1]
24 |     ax.scatter(x, y)
25 | 
26 |     for i, word in enumerate(words):
27 |         ax.annotate(word, (x[i], y[i]))
28 | 
29 |     fig.savefig('word_embedding.png', dpi=100)
30 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # A Deep Generative Framework for Paraphrase Generation
 2 | 
 3 | ## Model:
 4 | This is an implementation of [A Deep Generative Framework for Paraphrase Generation](https://arxiv.org/pdf/1709.05074) by Gupta et al. (AAAI 2018), using the character-aware token embeddings from Kim's [Character-Aware Neural Language Models](https://arxiv.org/abs/1508.06615). The code is built on the PyTorch implementation of Samuel Bowman's [Generating Sentences from a Continuous Space](https://arxiv.org/abs/1511.06349#) available [here](https://github.com/kefirski/pytorch_RVAE).
 5 | 
 6 | 
 7 | 
 8 | ## Usage
 9 | ### Before training the model, it is necessary to train word embeddings for both the questions and their paraphrases:
10 | ```
11 | $ python train_word_embeddings.py --num-iterations 1200000
12 | $ python train_word_embeddings_2.py --num-iterations 1200000
13 | ```
14 | 
15 | These scripts train word embeddings as described in [Mikolov et al., Distributed Representations of Words and Phrases](https://arxiv.org/abs/1310.4546).
16 | 
17 | #### Parameters:
18 | `--use-cuda`
19 | 
20 | `--num-iterations`
21 | 
22 | `--batch-size`
23 | 
24 | `--num-sample` –– number of tokens sampled from the noise distribution
25 | 
26 | 
27 | ### To train the model, use:
28 | ```
29 | $ python train.py --num-iterations 140000
30 | ```
31 | 
32 | #### Parameters:
33 | `--use-cuda`
34 | 
35 | `--num-iterations`
36 | 
37 | `--batch-size`
38 | 
39 | `--learning-rate`
40 | 
41 | `--dropout` –– probability of decoder-input units being zeroed (dropout)
42 | 
43 | `--use-trained` –– resume from a previously trained model
44 | 
45 | ### To sample from the model after training, use:
46 | ```
47 | $ python test.py
48 | ```
49 | #### Parameters:
50 | `--use-cuda`
51 | 
52 | `--num-sample`
53 | 
54 | 
--------------------------------------------------------------------------------
/utils/tensor.py:
--------------------------------------------------------------------------------
 1 | import collections
 2 | import os
 3 | import re
 4 | 
 5 | import numpy as np
 6 | from six.moves import cPickle
 7 | 
 8 | from .functional import *
 9 | 
10 | idx_files = ['data/words_vocab.pkl',
11 |              'data/characters_vocab.pkl']
12 | 
13 | [idx_to_word, idx_to_char] = [cPickle.load(open(file, "rb")) for file in idx_files]
14 | [word_to_idx, char_to_idx] = [dict(zip(idx, range(len(idx)))) for idx in
15 |                               [idx_to_word, idx_to_char]]
16 | 
17 | max_word_len = np.amax([len(word) for word in idx_to_word])
18 | 
19 | def encode_characters(characters):
20 |     word_len = len(characters)
21 |     to_add = max_word_len - word_len
22 |     characters_idx = [char_to_idx[i] for i in characters] + to_add * [char_to_idx['']]
23 |     return characters_idx
24 | 
25 | def preprocess_data(data_files, idx_files, tensor_files, file, str=''):
26 | 
27 |     # print 'Preprocessing the test file\n'
28 |     if file:
29 |         data = [open(file, "r").read() for file in data_files]
30 |     else:
31 |         data = [str + '\n']
32 | 
33 |     data_words = [[line.split() for line in target.split('\n')] for target in data]
34 |     data_words = [[[word for word in target if word in idx_to_word] for target in yo] for yo in data_words]
35 | 
36 |     word_tensor = np.array(
37 |         [[list(map(word_to_idx.get, line)) for line in target] for target in data_words])
38 |     np.save(tensor_files[0][0], word_tensor[0])
39 |     # print(word_tensor.shape)
40 |     character_tensor = np.array(
41 |         [[list(map(encode_characters, line)) for line in target] for target in data_words])
42 |     np.save(tensor_files[1][0], character_tensor[0])
43 | 
--------------------------------------------------------------------------------
/selfModules/highway.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import torch.nn as nn
 3 | import torch.nn.functional as F
 4 | 
 5 | 
 6 | class Highway(nn.Module):
 7 |     def __init__(self, size, num_layers, f):
 8 | 
 9 |         super(Highway, self).__init__()
10 | 
11 |         self.num_layers = num_layers
12 | 
13 |         self.nonlinear = [nn.Linear(size, size) for _ in range(num_layers)]
14 |         for i, module in enumerate(self.nonlinear):
15 |             self._add_to_parameters(module.parameters(), 'nonlinear_module_{}'.format(i))
16 | 
17 |         self.linear = [nn.Linear(size, size) for _ in range(num_layers)]
18 |         for i, module in enumerate(self.linear):
19 |             self._add_to_parameters(module.parameters(), 'linear_module_{}'.format(i))
20 | 
21 |         self.gate = [nn.Linear(size, size) for _ in range(num_layers)]
22 |         for i, module in enumerate(self.gate):
23 |             self._add_to_parameters(module.parameters(), 'gate_module_{}'.format(i))
24 | 
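        # Note: the nonlinear/linear/gate layers are kept in plain Python lists, which
        # nn.Module does not register automatically. The _add_to_parameters() calls above
        # register each weight by hand so it shows up in .parameters() and gets optimized.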
25 | self.f = f 26 | 27 | def forward(self, x): 28 | """ 29 | :param x: tensor with shape of [batch_size, size] 30 | 31 | :return: tensor with shape of [batch_size, size] 32 | 33 | applies σ(x) ⨀ (f(G(x))) + (1 - σ(x)) ⨀ (Q(x)) transformation | G and Q is affine transformation, 34 | f is non-linear transformation, σ(x) is affine transformation with sigmoid non-linearition 35 | and ⨀ is element-wise multiplication 36 | """ 37 | 38 | for layer in range(self.num_layers): 39 | gate = F.sigmoid(self.gate[layer](x)) 40 | 41 | nonlinear = self.f(self.nonlinear[layer](x)) 42 | linear = self.linear[layer](x) 43 | 44 | x = gate * nonlinear + (1 - gate) * linear 45 | 46 | return x 47 | 48 | def _add_to_parameters(self, parameters, name): 49 | for i, parameter in enumerate(parameters): 50 | self.register_parameter(name='{}-{}'.format(name, i), param=parameter) 51 | -------------------------------------------------------------------------------- /model/encoder.py: -------------------------------------------------------------------------------- 1 | import torch as t 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from selfModules.highway import Highway 6 | from utils.functional import parameters_allocation_check 7 | 8 | 9 | class Encoder(nn.Module): 10 | def __init__(self, params): 11 | super(Encoder, self).__init__() 12 | 13 | self.params = params 14 | 15 | self.hw1 = Highway(self.params.sum_depth + self.params.word_embed_size, 2, F.relu) 16 | 17 | self.rnn = nn.LSTM(input_size=self.params.word_embed_size + self.params.sum_depth, 18 | hidden_size=self.params.encoder_rnn_size, 19 | num_layers=self.params.encoder_num_layers, 20 | batch_first=True, 21 | bidirectional=True) 22 | 23 | def forward(self, input, State): 24 | """ 25 | :param input: [batch_size, seq_len, embed_size] tensor 26 | :return: context of input sentenses with shape of [batch_size, latent_variable_size] 27 | """ 28 | #print "Three" 29 | [batch_size, seq_len, embed_size] = input.size() 30 | 31 | input = input.view(-1, embed_size) 32 | input = self.hw1(input) 33 | input = input.view(batch_size, seq_len, embed_size) 34 | 35 | assert parameters_allocation_check(self), \ 36 | 'Invalid CUDA options. 
Parameters should be allocated in the same memory' 37 | 38 | ''' Unfold rnn with zero initial state and get its final state from the last layer 39 | ''' 40 | _, (transfer_state_1, final_state) = self.rnn(input, State) 41 | transfer_state_2 = final_state 42 | 43 | final_state = final_state.view(self.params.encoder_num_layers, 2, batch_size, self.params.encoder_rnn_size) 44 | final_state = final_state[-1] 45 | h_1, h_2 = final_state[0], final_state[1] 46 | final_state = t.cat([h_1, h_2], 1) 47 | 48 | return final_state, transfer_state_1, transfer_state_2 49 | -------------------------------------------------------------------------------- /selfModules/tdnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch as t 3 | from torch.nn import Parameter 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | class TDNN(nn.Module): 9 | def __init__(self, params): 10 | super(TDNN, self).__init__() 11 | 12 | self.params = params 13 | 14 | self.kernels = [Parameter(t.Tensor(out_dim, self.params.char_embed_size, kW).uniform_(-1, 1)) 15 | for kW, out_dim in params.kernels] 16 | self._add_to_parameters(self.kernels, 'TDNN_kernel') 17 | 18 | def forward(self, x): 19 | """ 20 | :param x: tensor with shape [batch_size, max_seq_len, max_word_len, char_embed_size] 21 | 22 | :return: tensor with shape [batch_size, max_seq_len, depth_sum] 23 | 24 | applies multikenrel 1d-conv layer along every word in input with max-over-time pooling 25 | to emit fixed-size output 26 | """ 27 | 28 | input_size = x.size() 29 | input_size_len = len(input_size) 30 | 31 | assert input_size_len == 4, \ 32 | 'Wrong input rang, must be equal to 4, but {} found'.format(input_size_len) 33 | 34 | [batch_size, seq_len, _, embed_size] = input_size 35 | 36 | assert embed_size == self.params.char_embed_size, \ 37 | 'Wrong embedding size, must be equal to {}, but {} found'.format(self.params.char_embed_size, embed_size) 38 | 39 | # leaps with shape 40 | x = x.view(-1, self.params.max_word_len, self.params.char_embed_size).transpose(1, 2).contiguous() 41 | 42 | xs = [F.tanh(F.conv1d(x, kernel)) for kernel in self.kernels] 43 | xs = [x.max(2,keepdim=True)[0].squeeze(2) for x in xs] 44 | 45 | x = t.cat(xs, 1) 46 | x = x.view(batch_size, seq_len, -1) 47 | 48 | return x 49 | 50 | def _add_to_parameters(self, parameters, name): 51 | for i, parameter in enumerate(parameters): 52 | self.register_parameter(name='{}-{}'.format(name, i), param=parameter) 53 | -------------------------------------------------------------------------------- /selfModules/embedding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import torch as t 4 | import torch.nn as nn 5 | from torch.nn import Parameter 6 | 7 | from .tdnn import TDNN 8 | 9 | 10 | class Embedding(nn.Module): 11 | def __init__(self, params, path='../../../', flag=False): 12 | super(Embedding, self).__init__() 13 | 14 | self.params = params 15 | 16 | if flag == True: 17 | word_embed = np.load(path + 'data/super/word_embeddings.npy') 18 | else : 19 | word_embed = np.load(path + 'data/word_embeddings.npy') 20 | 21 | self.word_embed = nn.Embedding(self.params.word_vocab_size, self.params.word_embed_size) 22 | self.char_embed = nn.Embedding(self.params.char_vocab_size, self.params.char_embed_size) 23 | self.word_embed.weight = Parameter(t.from_numpy(word_embed).float(), requires_grad=False) 24 | self.char_embed.weight = 
Parameter( 25 | t.Tensor(self.params.char_vocab_size, self.params.char_embed_size).uniform_(-1, 1)) 26 | 27 | self.TDNN = TDNN(self.params) 28 | 29 | def forward(self, word_input, character_input): 30 | """ 31 | :param word_input: [batch_size, seq_len] tensor of Long type 32 | :param character_input: [batch_size, seq_len, max_word_len] tensor of Long type 33 | :return: input embedding with shape of [batch_size, seq_len, word_embed_size + sum_depth] 34 | """ 35 | 36 | assert word_input.size()[:2] == character_input.size()[:2], \ 37 | 'Word input and character input must have the same sizes, but {} and {} found'.format( 38 | word_input.size(), character_input.size()) 39 | 40 | [batch_size, seq_len] = word_input.size() 41 | 42 | word_input = self.word_embed(word_input) 43 | 44 | character_input = character_input.view(-1, self.params.max_word_len) 45 | character_input = self.char_embed(character_input) 46 | character_input = character_input.view(batch_size, 47 | seq_len, 48 | self.params.max_word_len, 49 | self.params.char_embed_size) 50 | 51 | character_input = self.TDNN(character_input) 52 | 53 | result = t.cat([word_input, character_input], 2) 54 | 55 | return result 56 | -------------------------------------------------------------------------------- /selfModules/neg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch as t 3 | import torch.nn as nn 4 | from torch.autograd import Variable 5 | from torch.nn import Parameter 6 | 7 | from utils.functional import * 8 | 9 | 10 | class NEG_loss(nn.Module): 11 | def __init__(self, num_classes, embed_size): 12 | """ 13 | :param num_classes: An int. The number of possible classes. 14 | :param embed_size: An int. Embedding size 15 | """ 16 | 17 | super(NEG_loss, self).__init__() 18 | 19 | self.num_classes = num_classes 20 | self.embed_size = embed_size 21 | 22 | self.out_embed = nn.Embedding(self.num_classes, self.embed_size) 23 | self.out_embed.weight = Parameter(t.FloatTensor(self.num_classes, self.embed_size).uniform_(-1, 1)) 24 | 25 | self.in_embed = nn.Embedding(self.num_classes, self.embed_size) 26 | self.in_embed.weight = Parameter(t.FloatTensor(self.num_classes, self.embed_size).uniform_(-1, 1)) 27 | 28 | def forward(self, input_labes, out_labels, num_sampled): 29 | """ 30 | :param input_labes: Tensor with shape of [batch_size] of Long type 31 | :param out_labels: Tensor with shape of [batch_size] of Long type 32 | :param num_sampled: An int. The number of sampled from noise examples 33 | 34 | :return: Loss estimation with shape of [batch_size] 35 | loss defined in Mikolov et al. Distributed Representations of Words and Phrases and their Compositionality 36 | papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf 37 | """ 38 | 39 | assert parameters_allocation_check(self), \ 40 | """ 41 | Invalid CUDA options. 
out_embed and in_embed parameters both should be stored in the same memory 42 | got out_embed.is_cuda = {}, in_embed.is_cuda = {} 43 | """.format(self.out_embed.weight.is_cuda, self.in_embed.weight.is_cuda) 44 | 45 | use_cuda = self.out_embed.weight.is_cuda 46 | 47 | [batch_size] = input_labes.size() 48 | 49 | input = self.in_embed(input_labes) 50 | output = self.out_embed(out_labels) 51 | 52 | noise = Variable(t.Tensor(batch_size, num_sampled).uniform_(0, self.num_classes - 1).long()) 53 | if use_cuda: 54 | noise = noise.cuda() 55 | noise = self.out_embed(noise).neg() 56 | 57 | log_target = (input * output).sum(1).squeeze().sigmoid().log() 58 | 59 | ''' ∑[batch_size, num_sampled, embed_size] * [batch_size, embed_size, 1] -> 60 | ∑[batch_size, num_sampled] -> [batch_size] ''' 61 | sum_log_sampled = t.bmm(noise, input.unsqueeze(2)).sigmoid().log().sum(1).squeeze() 62 | 63 | loss = log_target + sum_log_sampled 64 | 65 | return -loss 66 | 67 | def input_embeddings(self): 68 | return self.in_embed.weight.data.cpu().numpy() 69 | -------------------------------------------------------------------------------- /train_word_embeddings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import argparse 3 | 4 | import numpy as np 5 | import torch as t 6 | from torch.autograd import Variable 7 | from torch.optim import SGD 8 | 9 | from utils.batch_loader import BatchLoader 10 | from utils.parameters import Parameters 11 | from selfModules.neg import NEG_loss 12 | 13 | if __name__ == '__main__': 14 | 15 | parser = argparse.ArgumentParser(description='word2vec') 16 | parser.add_argument('--num-iterations', type=int, default=1000000, metavar='NI', 17 | help='num iterations (default: 1000000)') 18 | parser.add_argument('--batch-size', type=int, default=10, metavar='BS', 19 | help='batch size (default: 10)') 20 | parser.add_argument('--num-sample', type=int, default=5, metavar='NS', 21 | help='num sample (default: 5)') 22 | parser.add_argument('--use-cuda', type=bool, default=True, metavar='CUDA', 23 | help='use cuda (default: True)') 24 | args = parser.parse_args() 25 | 26 | 27 | path='' 28 | 29 | data_files = [path + 'data/train.txt', 30 | path + 'data/test.txt'] 31 | 32 | idx_files = [path + 'data/words_vocab.pkl', 33 | path + 'data/characters_vocab.pkl'] 34 | 35 | tensor_files = [[path + 'data/train_word_tensor.npy', 36 | path + 'data/valid_word_tensor.npy'], 37 | [path + 'data/train_character_tensor.npy', 38 | path + 'data/valid_character_tensor.npy']] 39 | 40 | batch_loader = BatchLoader(data_files, idx_files, tensor_files, path) 41 | 42 | # batch_loader = BatchLoader('') 43 | params = Parameters(batch_loader.max_word_len, 44 | batch_loader.max_seq_len, 45 | batch_loader.words_vocab_size, 46 | batch_loader.chars_vocab_size) 47 | 48 | neg_loss = NEG_loss(params.word_vocab_size, params.word_embed_size) 49 | if args.use_cuda: 50 | neg_loss = neg_loss.cuda() 51 | 52 | # NEG_loss is defined over two embedding matrixes with shape of [params.word_vocab_size, params.word_embed_size] 53 | optimizer = SGD(neg_loss.parameters(), 0.1) 54 | 55 | for iteration in range(args.num_iterations): 56 | 57 | input_idx, target_idx = batch_loader.next_embedding_seq(args.batch_size) 58 | 59 | input = Variable(t.from_numpy(input_idx).long()) 60 | target = Variable(t.from_numpy(target_idx).long()) 61 | if args.use_cuda: 62 | input, target = input.cuda(), target.cuda() 63 | 64 | out = neg_loss(input, target, args.num_sample).mean() 65 | 66 | optimizer.zero_grad() 
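# Gradients are cleared before each backward pass because PyTorch accumulates them
# across calls; backward() then fills gradients for both embedding matrices inside
# NEG_loss, and the SGD step applies the update.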
67 | out.backward() 68 | optimizer.step() 69 | 70 | if iteration % 500 == 0: 71 | out = out.cpu().data.numpy()[0] 72 | print('iteration = {}, loss = {}'.format(iteration, out)) 73 | 74 | word_embeddings = neg_loss.input_embeddings() 75 | #Saves the word embeddings at the end of this programs 76 | np.save('data/word_embeddings.npy', word_embeddings) 77 | -------------------------------------------------------------------------------- /train_word_embeddings_2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import argparse 3 | 4 | import numpy as np 5 | import torch as t 6 | from torch.autograd import Variable 7 | from torch.optim import SGD 8 | 9 | from utils.batch_loader import BatchLoader 10 | from utils.parameters import Parameters 11 | from selfModules.neg import NEG_loss 12 | 13 | if __name__ == '__main__': 14 | 15 | parser = argparse.ArgumentParser(description='word2vec') 16 | parser.add_argument('--num-iterations', type=int, default=1000000, metavar='NI', 17 | help='num iterations (default: 1000000)') 18 | parser.add_argument('--batch-size', type=int, default=10, metavar='BS', 19 | help='batch size (default: 10)') 20 | parser.add_argument('--num-sample', type=int, default=5, metavar='NS', 21 | help='num sample (default: 5)') 22 | parser.add_argument('--use-cuda', type=bool, default=True, metavar='CUDA', 23 | help='use cuda (default: True)') 24 | args = parser.parse_args() 25 | 26 | 27 | 28 | 29 | path='' 30 | 31 | data_files = [path + 'data/super/train_2.txt', 32 | path + 'data/super/test_2.txt'] 33 | 34 | idx_files = [path + 'data/super/words_vocab_2.pkl', 35 | path + 'data/super/characters_vocab_2.pkl'] 36 | 37 | tensor_files = [[path + 'data/super/train_word_tensor_2.npy', 38 | path + 'data/super/valid_word_tensor_2.npy'], 39 | [path + 'data/super/train_character_tensor_2.npy', 40 | path + 'data/super/valid_character_tensor_2.npy']] 41 | batch_loader_2 = BatchLoader(data_files, idx_files, tensor_files, path) 42 | 43 | 44 | 45 | 46 | # batch_loader_2 = BatchLoader('') 47 | params = Parameters(batch_loader_2.max_word_len, 48 | batch_loader_2.max_seq_len, 49 | batch_loader_2.words_vocab_size, 50 | batch_loader_2.chars_vocab_size) 51 | 52 | neg_loss = NEG_loss(params.word_vocab_size, params.word_embed_size) 53 | if args.use_cuda: 54 | neg_loss = neg_loss.cuda() 55 | 56 | # NEG_loss is defined over two embedding matrixes with shape of [params.word_vocab_size, params.word_embed_size] 57 | optimizer = SGD(neg_loss.parameters(), 0.1) 58 | 59 | for iteration in range(args.num_iterations): 60 | 61 | input_idx, target_idx = batch_loader_2.next_embedding_seq(args.batch_size) 62 | 63 | input = Variable(t.from_numpy(input_idx).long()) 64 | target = Variable(t.from_numpy(target_idx).long()) 65 | if args.use_cuda: 66 | input, target = input.cuda(), target.cuda() 67 | 68 | out = neg_loss(input, target, args.num_sample).mean() 69 | 70 | optimizer.zero_grad() 71 | out.backward() 72 | optimizer.step() 73 | 74 | if iteration % 500 == 0: 75 | out = out.cpu().data.numpy()[0] 76 | print('iteration = {}, loss = {}'.format(iteration, out)) 77 | 78 | word_embeddings = neg_loss.input_embeddings() 79 | #Saves the word embeddings at the end of this programs 80 | np.save('data/super/word_embeddings.npy', word_embeddings) 81 | -------------------------------------------------------------------------------- /model/decoder.py: -------------------------------------------------------------------------------- 1 | import torch as t 2 | import torch.nn 
as nn 3 | import torch.nn.functional as F 4 | 5 | from utils.functional import parameters_allocation_check 6 | 7 | 8 | class Decoder(nn.Module): 9 | def __init__(self, params): 10 | super(Decoder, self).__init__() 11 | 12 | self.params = params 13 | 14 | self.rnn = nn.LSTM(input_size=self.params.latent_variable_size + self.params.word_embed_size, 15 | hidden_size=self.params.decoder_rnn_size, 16 | num_layers=self.params.decoder_num_layers, 17 | batch_first=True) 18 | 19 | self.fc = nn.Linear(self.params.decoder_rnn_size, self.params.word_vocab_size) 20 | 21 | 22 | def only_decoder_beam(self, decoder_input, z, drop_prob, initial_state=None): 23 | 24 | assert parameters_allocation_check(self), \ 25 | 'Invalid CUDA options. Parameters should be allocated in the same memory' 26 | 27 | # print decoder_input.size() 28 | 29 | [beam_batch_size, _, _] = decoder_input.size() 30 | 31 | ''' 32 | decoder rnn is conditioned on context via additional bias = W_cond * z to every input token 33 | ''' 34 | decoder_input = F.dropout(decoder_input, drop_prob) 35 | 36 | z = z.unsqueeze(0) 37 | 38 | # print z.size() 39 | 40 | z = t.cat([z] * beam_batch_size, 0) 41 | 42 | # print z.size() 43 | # z = z.contiguous().view(1, -1) 44 | 45 | # z = z.view(beam_batch_size, self.params.latent_variable_size) 46 | 47 | # print z.size() 48 | 49 | decoder_input = t.cat([decoder_input, z], 2) 50 | 51 | # print "decoder_input:",decoder_input.size() 52 | 53 | rnn_out, final_state = self.rnn(decoder_input, initial_state) 54 | 55 | # print "rnn_out:",rnn_out.size() 56 | # print "final_state_1:",final_state[0].size() 57 | # print "final_state_1:",final_state[1].size() 58 | 59 | return rnn_out, final_state 60 | 61 | 62 | def forward(self, decoder_input, z, drop_prob, initial_state=None): 63 | """ 64 | :param decoder_input: tensor with shape of [batch_size, seq_len, embed_size] 65 | :param z: sequence context with shape of [batch_size, latent_variable_size] 66 | :param drop_prob: probability of an element of decoder input to be zeroed in sense of dropout 67 | :param initial_state: initial state of decoder rnn 68 | 69 | :return: unnormalized logits of sentense words distribution probabilities 70 | with shape of [batch_size, seq_len, word_vocab_size] 71 | final rnn state with shape of [num_layers, batch_size, decoder_rnn_size] 72 | """ 73 | 74 | assert parameters_allocation_check(self), \ 75 | 'Invalid CUDA options. 
Parameters should be allocated in the same memory' 76 | 77 | [batch_size, seq_len, _] = decoder_input.size() 78 | 79 | ''' 80 | decoder rnn is conditioned on context via additional bias = W_cond * z to every input token 81 | ''' 82 | decoder_input = F.dropout(decoder_input, drop_prob) 83 | 84 | z = t.cat([z] * seq_len, 1).view(batch_size, seq_len, self.params.latent_variable_size) 85 | decoder_input = t.cat([decoder_input, z], 2) 86 | 87 | rnn_out, final_state = self.rnn(decoder_input, initial_state) 88 | rnn_out = rnn_out.contiguous().view(-1, self.params.decoder_rnn_size) 89 | 90 | 91 | result = self.fc(rnn_out) 92 | result = result.view(batch_size, seq_len, self.params.word_vocab_size) 93 | 94 | return result, final_state 95 | -------------------------------------------------------------------------------- /beam_search.py: -------------------------------------------------------------------------------- 1 | """Beam search implementation in PyTorch.""" 2 | # 3 | # 4 | # hyp1#-hyp1---hyp1 -hyp1 5 | # \ / 6 | # hyp2 \-hyp2 /-hyp2#hyp2 7 | # / \ 8 | # hyp3#-hyp3---hyp3 -hyp3 9 | # ======================== 10 | # 11 | # Takes care of beams, back pointers, and scores. 12 | 13 | # Code borrowed from PyTorch OpenNMT example 14 | # https://github.com/pytorch/examples/blob/master/OpenNMT/onmt/Beam.py 15 | 16 | import torch 17 | 18 | 19 | class Beam(object): 20 | """Ordered beam of candidate outputs.""" 21 | 22 | def __init__(self, size, batch_loader, cuda=False): 23 | """Initialize params.""" 24 | self.size = size 25 | self.done = False 26 | self.pad = batch_loader.word_to_idx[batch_loader.pad_token] 27 | self.bos = batch_loader.word_to_idx[batch_loader.go_token] 28 | self.eos = batch_loader.word_to_idx[batch_loader.end_token] 29 | 30 | 31 | self.tt = torch.cuda if cuda else torch 32 | 33 | # The score for each translation on the beam. 34 | self.scores = self.tt.FloatTensor(size).zero_() 35 | 36 | # The backpointers at each time-step. 37 | self.prevKs = [] 38 | 39 | # The outputs at each time-step. 40 | self.nextYs = [self.tt.LongTensor(size).fill_(self.pad)] 41 | self.nextYs[0][0] = self.bos 42 | 43 | # The attentions (matrix) for each time. 44 | self.attn = [] 45 | 46 | # Get the outputs for the current timestep. 47 | def get_current_state(self): 48 | """Get state of beam.""" 49 | return self.nextYs[-1] 50 | 51 | # Get the backpointers for the current timestep. 52 | def get_current_origin(self): 53 | """Get the backpointer to the beam at this step.""" 54 | return self.prevKs[-1] 55 | 56 | # Given prob over words for every last beam `wordLk` and attention 57 | # `attnOut`: Compute and update the beam search. 58 | # 59 | # Parameters: 60 | # 61 | # * `wordLk`- probs of advancing from the last step (K x words) 62 | # * `attnOut`- attention at the last step 63 | # 64 | # Returns: True if beam search is complete. 65 | 66 | def advance(self, workd_lk): 67 | """Advance the beam.""" 68 | num_words = workd_lk.size(1) 69 | 70 | # Sum the previous scores. 
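# Each beam's running score is broadcast-added to the per-word log-probabilities,
# the resulting beam-by-vocabulary matrix is flattened, and topk keeps the `size`
# best continuations; the division / remainder by the vocabulary size below recover
# which beam each winner extends and which word it emits.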
71 | if len(self.prevKs) > 0: 72 | beam_lk = workd_lk + self.scores.unsqueeze(1).expand_as(workd_lk) 73 | else: 74 | beam_lk = workd_lk[0] 75 | 76 | flat_beam_lk = beam_lk.view(-1) 77 | 78 | bestScores, bestScoresId = flat_beam_lk.topk(self.size, 0, True, True) 79 | self.scores = bestScores 80 | 81 | # bestScoresId is flattened beam x word array, so calculate which 82 | # word and beam each score came from 83 | prev_k = bestScoresId / num_words 84 | self.prevKs.append(prev_k) 85 | self.nextYs.append(bestScoresId - prev_k * num_words) 86 | 87 | # End condition is when top-of-beam is EOS. 88 | if self.nextYs[-1][0] == self.eos: 89 | self.done = True 90 | 91 | return self.done 92 | 93 | def sort_best(self): 94 | """Sort the beam.""" 95 | return torch.sort(self.scores, 0, True) 96 | 97 | # Get the score of the best in the beam. 98 | def get_best(self): 99 | """Get the most likely candidate.""" 100 | scores, ids = self.sort_best() 101 | return scores[1], ids[1] 102 | 103 | # Walk back to construct the full hypothesis. 104 | # 105 | # Parameters. 106 | # 107 | # * `k` - the position in the beam to construct. 108 | # 109 | # Returns. 110 | # 111 | # 1. The hypothesis 112 | # 2. The attention at each time step. 113 | def get_hyp(self, k): 114 | """Get hypotheses.""" 115 | hyp = [] 116 | # print(len(self.prevKs), len(self.nextYs), len(self.attn)) 117 | for j in range(len(self.prevKs) - 1, -1, -1): 118 | hyp.append(self.nextYs[j + 1][k]) 119 | k = self.prevKs[j][k] 120 | # print "inside:", hyp 121 | 122 | return hyp[::-1] -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import time 4 | 5 | import numpy as np 6 | import torch as t 7 | 8 | from utils.batch_loader import BatchLoader 9 | from utils.tensor import preprocess_data 10 | from utils.parameters import Parameters 11 | from model.rvae import RVAE 12 | from torch.autograd import Variable 13 | from six.moves import cPickle 14 | 15 | if __name__ == '__main__': 16 | 17 | assert os.path.exists('./trained_RVAE'), \ 18 | 'trained model not found' 19 | 20 | parser = argparse.ArgumentParser(description='Sampler') 21 | parser.add_argument('--use-cuda', type=bool, default=True, metavar='CUDA', 22 | help='use cuda (default: True)') 23 | parser.add_argument('--num-sample', type=int, default=5, metavar='NS', 24 | help='num samplings (default: 5)') 25 | parser.add_argument('--num-sentence', type=int, default=10, metavar='NS', 26 | help='num samplings (default: 10)') 27 | parser.add_argument('--beam-top', type=int, default=3, metavar='NS', 28 | help='beam top (default: 1)') 29 | parser.add_argument('--beam-size', type=int, default=10, metavar='NS', 30 | help='beam size (default: 10)') 31 | parser.add_argument('--use-file', type=bool, default=True, metavar='NS', 32 | help='use file (default: False)') 33 | #Path to test file --- 34 | parser.add_argument('--test-file', type=str, default='data/test.txt', metavar='NS', 35 | help='test file path (default: data/test.txt)') 36 | parser.add_argument('--save-model', type=str, default='./trained_RVAE', metavar='NS', 37 | help='trained model save path (default: ./trained_models/trained_RVAE_quora)') 38 | args = parser.parse_args() 39 | 40 | #Removing, is already some previous files exist from last execution of program 41 | if os.path.exists('data/test_word_tensor.npy'): 42 | os.remove('data/test_word_tensor.npy') 43 | if 
os.path.exists('data/test_character_tensor.npy'): 44 | os.remove('data/test_character_tensor.npy') 45 | 46 | str ='' 47 | if not args.use_file: 48 | str = raw_input("Input Question : ") 49 | else: 50 | file_1 = open(args.test_file, 'r') 51 | data = file_1.readlines() 52 | 53 | ''' ================================= BatchLoader loading =============================================== 54 | ''' 55 | data_files = [args.test_file] 56 | 57 | idx_files = ['data/words_vocab.pkl', 58 | 'data/characters_vocab.pkl'] 59 | 60 | tensor_files = [['data/test_word_tensor.npy'], 61 | ['data/test_character_tensor.npy']] 62 | 63 | preprocess_data(data_files, idx_files, tensor_files, args.use_file, str) 64 | 65 | batch_loader = BatchLoader(data_files, idx_files, tensor_files) 66 | parameters = Parameters(batch_loader.max_word_len, 67 | batch_loader.max_seq_len, 68 | batch_loader.words_vocab_size, 69 | batch_loader.chars_vocab_size) 70 | 71 | 72 | ''' ============================ BatchLoader for Question-2 =============================================== 73 | ''' 74 | data_files = ['data/super/train_2.txt'] 75 | 76 | idx_files = ['data/super/words_vocab_2.pkl', 77 | 'data/super/characters_vocab_2.pkl'] 78 | 79 | tensor_files = [['data/super/train_word_tensor_2.npy'], 80 | ['data/super/train_character_tensor_2.npy']] 81 | batch_loader_2 = BatchLoader(data_files, idx_files, tensor_files) 82 | parameters_2 = Parameters(batch_loader_2.max_word_len, 83 | batch_loader_2.max_seq_len, 84 | batch_loader_2.words_vocab_size, 85 | batch_loader_2.chars_vocab_size) 86 | 87 | 88 | '''======================================== RVAE loading ================================================== 89 | ''' 90 | print 'Started loading' 91 | start_time = time.time() 92 | rvae = RVAE(parameters,parameters_2) 93 | rvae.load_state_dict(t.load(args.save_model)) 94 | if args.use_cuda: 95 | rvae = rvae.cuda() 96 | loading_time=time.time() - start_time 97 | print 'Time elapsed in loading model =' , loading_time 98 | print 'Finished loading' 99 | 100 | ''' ==================================== Parameters Initialising =========================================== 101 | ''' 102 | n_best = args.beam_top 103 | beam_size =args.beam_size 104 | 105 | assert n_best <= beam_size 106 | use_cuda = args.use_cuda 107 | 108 | if args.use_file: 109 | num_sentence = args.num_sentence 110 | else: 111 | num_sentence = 1 112 | 113 | ''' ======================================================================================================= 114 | ''' 115 | 116 | for i in range(len(data)): 117 | if args.use_file: 118 | print (data[i]) 119 | else: 120 | print str + '\n' 121 | for iteration in range(args.num_sample): 122 | 123 | seed = Variable(t.randn([1, parameters.latent_variable_size])) 124 | seed = seed.cuda() 125 | 126 | results, scores = rvae.sampler(batch_loader,batch_loader_2, 50, seed, args.use_cuda,i,beam_size,n_best) 127 | 128 | for tt in results: 129 | for k in xrange(n_best): 130 | sen = " ". 
join([batch_loader_2.decode_word(x[k]) for x in tt]) 131 | if batch_loader.end_token in sen: 132 | print sen[:sen.index(batch_loader.end_token)] 133 | else : 134 | print sen 135 | print '\n' -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import numpy as np 5 | import torch as t 6 | from torch.optim import Adam 7 | 8 | from utils.batch_loader import BatchLoader 9 | from utils.parameters import Parameters 10 | from model.rvae import RVAE 11 | 12 | if __name__ == "__main__": 13 | 14 | if not os.path.exists('data/word_embeddings.npy'): 15 | raise FileNotFoundError("word embeddings file was't found") 16 | 17 | parser = argparse.ArgumentParser(description='RVAE') 18 | parser.add_argument('--num-iterations', type=int, default=120000, metavar='NI', 19 | help='num iterations (default: 120000)') 20 | parser.add_argument('--batch-size', type=int, default=32, metavar='BS', 21 | help='batch size (default: 32)') 22 | parser.add_argument('--use-cuda', type=bool, default=True, metavar='CUDA', 23 | help='use cuda (default: True)') 24 | parser.add_argument('--learning-rate', type=float, default=0.00005, metavar='LR', 25 | help='learning rate (default: 0.00005)') 26 | parser.add_argument('--dropout', type=float, default=0.3, metavar='DR', 27 | help='dropout (default: 0.3)') 28 | parser.add_argument('--use-trained', type=bool, default=False, metavar='UT', 29 | help='load pretrained model (default: False)') 30 | parser.add_argument('--ce-result', default='', metavar='CE', 31 | help='ce result path (default: '')') 32 | parser.add_argument('--kld-result', default='', metavar='KLD', 33 | help='ce result path (default: '')') 34 | 35 | args = parser.parse_args() 36 | 37 | 38 | path='' 39 | 40 | ''' =================== Creating batch_loader for encoder-1 ========================================= 41 | ''' 42 | data_files = [path + 'data/train.txt', 43 | path + 'data/test.txt'] 44 | 45 | idx_files = [path + 'data/words_vocab.pkl', 46 | path + 'data/characters_vocab.pkl'] 47 | 48 | tensor_files = [[path + 'data/train_word_tensor.npy', 49 | path + 'data/valid_word_tensor.npy'], 50 | [path + 'data/train_character_tensor.npy', 51 | path + 'data/valid_character_tensor.npy']] 52 | 53 | batch_loader = BatchLoader(data_files, idx_files, tensor_files, path) 54 | parameters = Parameters(batch_loader.max_word_len, 55 | batch_loader.max_seq_len, 56 | batch_loader.words_vocab_size, 57 | batch_loader.chars_vocab_size) 58 | 59 | 60 | ''' =================== Doing the same for encoder-2 =============================================== 61 | ''' 62 | data_files = [path + 'data/super/train_2.txt', 63 | path + 'data/super/test_2.txt'] 64 | 65 | idx_files = [path + 'data/super/words_vocab_2.pkl', 66 | path + 'data/super/characters_vocab_2.pkl'] 67 | 68 | tensor_files = [[path + 'data/super/train_word_tensor_2.npy', 69 | path + 'data/super/valid_word_tensor_2.npy'], 70 | [path + 'data/super/train_character_tensor_2.npy', 71 | path + 'data/super/valid_character_tensor_2.npy']] 72 | batch_loader_2 = BatchLoader(data_files, idx_files, tensor_files, path) 73 | parameters_2 = Parameters(batch_loader_2.max_word_len, 74 | batch_loader_2.max_seq_len, 75 | batch_loader_2.words_vocab_size, 76 | batch_loader_2.chars_vocab_size) 77 | '''================================================================================================= 78 | ''' 79 | 80 | 81 | rvae = 
RVAE(parameters,parameters_2) 82 | if args.use_trained: 83 | rvae.load_state_dict(t.load('trained_RVAE')) 84 | if args.use_cuda: 85 | rvae = rvae.cuda() 86 | 87 | optimizer = Adam(rvae.learnable_parameters(), args.learning_rate) 88 | 89 | train_step = rvae.trainer(optimizer,batch_loader, batch_loader_2) 90 | validate = rvae.validater(batch_loader,batch_loader_2) 91 | 92 | ce_result = [] 93 | kld_result = [] 94 | 95 | start_index = 0 96 | # start_index_2 = 0 97 | 98 | for iteration in range(args.num_iterations): 99 | #This needs to be changed 100 | #start_index = (start_index+1)%50000 101 | start_index = (start_index+args.batch_size)%149163 102 | cross_entropy, kld, coef = train_step(iteration, args.batch_size, args.use_cuda, args.dropout, start_index) 103 | 104 | # exit() 105 | 106 | if iteration % 5 == 0: 107 | print('\n') 108 | print('------------TRAIN-------------') 109 | print('----------ITERATION-----------') 110 | print(iteration) 111 | print('--------CROSS-ENTROPY---------') 112 | print(cross_entropy.data.cpu().numpy()[0]) 113 | print('-------------KLD--------------') 114 | print(kld.data.cpu().numpy()[0]) 115 | print('-----------KLD-coef-----------') 116 | print(coef) 117 | print('------------------------------') 118 | 119 | # if iteration % 10 == 0: 120 | # start_index_2 = (start_index_2+args.batch_size)%3900 121 | # cross_entropy, kld = validate(args.batch_size, args.use_cuda, start_index_2) 122 | 123 | # cross_entropy = cross_entropy.data.cpu().numpy()[0] 124 | # kld = kld.data.cpu().numpy()[0] 125 | 126 | # print('\n') 127 | # print('------------VALID-------------') 128 | # print('--------CROSS-ENTROPY---------') 129 | # print(cross_entropy) 130 | # print('-------------KLD--------------') 131 | # print(kld) 132 | # print('------------------------------') 133 | 134 | # ce_result += [cross_entropy] 135 | # kld_result += [kld] 136 | ''' 137 | if iteration % 20 == 0: 138 | seed = np.random.normal(size=[1, parameters.latent_variable_size]) 139 | 140 | sample = rvae.sample(batch_loader_2, 50, seed, args.use_cuda) 141 | 142 | print('\n') 143 | print('------------SAMPLE------------') 144 | print('------------------------------') 145 | print(sample) 146 | print('------------------------------') 147 | ''' 148 | t.save(rvae.state_dict(), 'trained_RVAE') 149 | 150 | np.save('ce_result_{}.npy'.format(args.ce_result), np.array(ce_result)) 151 | np.save('kld_result_npy_{}'.format(args.kld_result), np.array(kld_result)) 152 | -------------------------------------------------------------------------------- /sample_3.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import numpy as np 5 | import torch as t 6 | 7 | from utils.batch_loader import BatchLoader 8 | from utils.parameters import Parameters 9 | from model.rvae import RVAE 10 | from torch.autograd import Variable 11 | 12 | if __name__ == '__main__': 13 | 14 | assert os.path.exists('trained_RVAE'), \ 15 | 'trained model not found' 16 | 17 | parser = argparse.ArgumentParser(description='Sampler') 18 | parser.add_argument('--use-cuda', type=bool, default=True, metavar='CUDA', 19 | help='use cuda (default: True)') 20 | parser.add_argument('--num-sample', type=int, default=5, metavar='NS', 21 | help='num samplings (default: 5)') 22 | parser.add_argument('--num-sentence', type=int, default=10, metavar='NS', 23 | help='num samplings (default: 10)') 24 | args = parser.parse_args() 25 | 26 | file_1 = open('test.txt', 'r') 27 | data = file_1.readlines() 28 | 29 | file_2 = 
open('test_2.txt', 'r') 30 | data_2 = file_2.readlines() 31 | 32 | path='' 33 | 34 | ''' ============================= BatchLoader loading =============================================== 35 | ''' 36 | data_files = [path + 'data/train.txt', 37 | path + 'data/test.txt'] 38 | 39 | idx_files = [path + 'data/words_vocab.pkl', 40 | path + 'data/characters_vocab.pkl'] 41 | 42 | tensor_files = [[path + 'data/train_word_tensor.npy', 43 | path + 'data/valid_word_tensor.npy'], 44 | [path + 'data/train_character_tensor.npy', 45 | path + 'data/valid_character_tensor.npy']] 46 | 47 | batch_loader = BatchLoader(data_files, idx_files, tensor_files, path) 48 | parameters = Parameters(batch_loader.max_word_len, 49 | batch_loader.max_seq_len, 50 | batch_loader.words_vocab_size, 51 | batch_loader.chars_vocab_size) 52 | 53 | ''' ============================= BatchLoader loading =============================================== 54 | ''' 55 | 56 | data_files = [path + 'data/super/train_2.txt', 57 | path + 'data/super/test_2.txt'] 58 | 59 | idx_files = [path + 'data/super/words_vocab_2.pkl', 60 | path + 'data/super/characters_vocab_2.pkl'] 61 | 62 | tensor_files = [[path + 'data/super/train_word_tensor_2.npy', 63 | path + 'data/super/valid_word_tensor_2.npy'], 64 | [path + 'data/super/train_character_tensor_2.npy', 65 | path + 'data/super/valid_character_tensor_2.npy']] 66 | 67 | batch_loader_2 = BatchLoader(data_files, idx_files, tensor_files, path) 68 | parameters_2 = Parameters(batch_loader_2.max_word_len, 69 | batch_loader_2.max_seq_len, 70 | batch_loader_2.words_vocab_size, 71 | batch_loader_2.chars_vocab_size) 72 | 73 | '''======================================== RVAE creation ================================================== 74 | ''' 75 | 76 | rvae = RVAE(parameters,parameters_2) 77 | rvae.load_state_dict(t.load('trained_RVAE')) 78 | if args.use_cuda: 79 | rvae = rvae.cuda() 80 | 81 | n_best = 3 82 | beam_size=10 83 | 84 | assert n_best <= beam_size 85 | 86 | for i in range(args.num_sentence): 87 | 88 | '''================================================== Input Encoder-1 ======================================================== 89 | ''' 90 | use_cuda = 1 91 | input = batch_loader.next_batch(1, 'valid', i) 92 | input = [Variable(t.from_numpy(var)) for var in input] 93 | input = [var.long() for var in input] 94 | input = [var.cuda() if use_cuda else var for var in input] 95 | 96 | [encoder_word_input, encoder_character_input, decoder_word_input, decoder_character_input, target] = input 97 | 98 | 99 | ''' =================================================== Input for Encoder-2 ======================================================== 100 | ''' 101 | 102 | input_2 = batch_loader_2.next_batch(1, 'valid', i) 103 | input_2 = [Variable(t.from_numpy(var)) for var in input_2] 104 | input_2 = [var.long() for var in input_2] 105 | input_2 = [var.cuda() if use_cuda else var for var in input_2] 106 | 107 | [encoder_word_input_2, encoder_character_input_2, decoder_word_input_2, decoder_character_input_2, target] = input_2 108 | 109 | ''' ================================================== Forward pass =========================================================== 110 | ''' 111 | # exit() 112 | 113 | logits,_,kld,mu,std = rvae.forward(0., 114 | encoder_word_input, encoder_character_input, 115 | encoder_word_input_2,encoder_character_input_2, 116 | decoder_word_input_2, decoder_character_input_2, 117 | z=None) 118 | 119 | ''' 
================================================================================================================================ 120 | ''' 121 | 122 | # print '============' 123 | print (data[i]) 124 | print (data_2[i]) 125 | # print '------------------------------------' 126 | 127 | 128 | 129 | 130 | for iteration in range(args.num_sample): 131 | # seed = np.random.normal(size=[1, parameters.latent_variable_size]) 132 | seed = Variable(t.randn([1, parameters.latent_variable_size])) 133 | # seed = Variable(t.from_numpy(seed).float()) 134 | # exit() 135 | # seed = mu 136 | # if use_cuda: 137 | seed = seed.cuda() 138 | 139 | seed = seed * std + mu 140 | # seed = seed*std + mu 141 | # print 'Multiplication done' 142 | # seed = seed.cuda() 143 | # print seed.size 144 | # print type(seed) 145 | # print seed 146 | # exit() 147 | results, scores = rvae.sampler(batch_loader,batch_loader_2, 50, seed, args.use_cuda,i,beam_size,n_best) 148 | # exit() 149 | # print(results) 150 | for tt in results: 151 | for k in xrange(n_best): 152 | sen = " ". join([batch_loader_2.decode_word(x[k]) for x in tt]) 153 | # print sen 154 | if batch_loader.end_token in sen: 155 | print sen[:sen.index(batch_loader.end_token)] 156 | else : 157 | print sen 158 | # exit() 159 | print '\n' 160 | 161 | 162 | # print 'words_vocab_size BatchLoader ----------->' 163 | # print batch_loader.words_vocab_size 164 | # print '-----------------------------------------' 165 | 166 | # print 'words_vocab_size BatchLoader_2 ----------->' 167 | # print batch_loader_2.words_vocab_size 168 | # print '-----------------------------------------' 169 | 170 | -------------------------------------------------------------------------------- /model/rvae_previous.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as t 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | 7 | from .decoder import Decoder 8 | from .encoder import Encoder 9 | 10 | from selfModules.embedding import Embedding 11 | 12 | from utils.functional import kld_coef, parameters_allocation_check, fold 13 | 14 | 15 | class RVAE(nn.Module): 16 | def __init__(self, params,params_2): 17 | super(RVAE, self).__init__() 18 | 19 | self.params = params 20 | self.params_2 = params_2 #Encoder-2 parameters 21 | 22 | self.embedding = Embedding(self.params, '') 23 | self.embedding_2 = Embedding(self.params_2, '') 24 | 25 | self.encoder = Encoder(self.params) 26 | self.encoder_2 = Encoder(self.params_2) 27 | 28 | 29 | self.context_to_mu = nn.Linear(self.params.encoder_rnn_size * 2, self.params.latent_variable_size) 30 | self.context_to_logvar = nn.Linear(self.params.encoder_rnn_size * 2, self.params.latent_variable_size) 31 | 32 | self.encoder_3 = Encoder(self.params) 33 | self.decoder = Decoder(self.params_2) #change this to params_2 34 | 35 | def forward(self, drop_prob, 36 | encoder_word_input=None, encoder_character_input=None, 37 | encoder_word_input_2=None, encoder_character_input_2=None, 38 | decoder_word_input_2=None, decoder_character_input_2=None, 39 | z=None, initial_state=None): 40 | 41 | #Modified the parameters of forward function according to Encoder-2 42 | """ 43 | :param encoder_word_input: An tensor with shape of [batch_size, seq_len] of Long type 44 | :param encoder_character_input: An tensor with shape of [batch_size, seq_len, max_word_len] of Long type 45 | :param decoder_word_input: An tensor with shape of [batch_size, max_seq_len + 1] of Long type 46 | 
:param initial_state: initial state of decoder rnn in order to perform sampling 47 | 48 | :param drop_prob: probability of an element of decoder input to be zeroed in sense of dropout 49 | 50 | :param z: context if sampling is performing 51 | 52 | :return: unnormalized logits of sentence words distribution probabilities 53 | with shape of [batch_size, seq_len, word_vocab_size] 54 | final rnn state with shape of [num_layers, batch_size, decoder_rnn_size] 55 | """ 56 | 57 | assert parameters_allocation_check(self), \ 58 | 'Invalid CUDA options. Parameters should be allocated in the same memory' 59 | use_cuda = self.embedding.word_embed.weight.is_cuda 60 | 61 | assert z is None and fold(lambda acc, parameter: acc and parameter is not None, 62 | [encoder_word_input, encoder_character_input, decoder_word_input_2], 63 | True) \ 64 | or (z is not None and decoder_word_input_2 is not None), \ 65 | "Invalid input. If z is None then encoder and decoder inputs should be passed as arguments" 66 | 67 | if z is None: 68 | ''' Get context from encoder and sample z ~ N(mu, std) 69 | ''' 70 | [batch_size, _] = encoder_word_input.size() 71 | 72 | encoder_input = self.embedding(encoder_word_input, encoder_character_input) 73 | 74 | ''' ===================================================Doing the same for encoder-2=================================================== 75 | ''' 76 | [batch_size_2, _] = encoder_word_input_2.size() 77 | 78 | encoder_input_2 = self.embedding_2(encoder_word_input_2, encoder_character_input_2) 79 | 80 | ''' ================================================================================================================================== 81 | ''' 82 | 83 | context , h_0 , c_0 = self.encoder(encoder_input, None) 84 | 85 | State = (h_0,c_0) #Final state of Encoder-1 86 | context_2 , _ , _ = self.encoder_2( encoder_input_2, State ) #Encoder_2 for Ques_2 87 | 88 | mu = self.context_to_mu(context_2) 89 | logvar = self.context_to_logvar(context_2) 90 | std = t.exp(0.5 * logvar) 91 | 92 | z = Variable(t.randn([batch_size, self.params.latent_variable_size])) 93 | if use_cuda: 94 | z = z.cuda() 95 | 96 | z = z * std + mu 97 | 98 | kld = (-0.5 * t.sum(logvar - t.pow(mu, 2) - t.exp(logvar) + 1, 1)).mean().squeeze() 99 | 100 | encoder_input = self.embedding(encoder_word_input, encoder_character_input) 101 | _ , h_0 , c_0 = self.encoder_3(encoder_input, None) 102 | initial_state = (h_0,c_0) #Final state of Encoder-1 103 | 104 | else: 105 | kld = None 106 | 107 | 108 | 109 | 110 | decoder_input_2 = self.embedding.word_embed(decoder_word_input_2) # What to do with this decoder input ? 
--> Slightly resolved 111 | out, final_state = self.decoder(decoder_input_2, z, drop_prob, initial_state) # Take a look at the decoder 112 | 113 | return out, final_state, kld 114 | 115 | def learnable_parameters(self): 116 | 117 | # word_embedding is constant parameter thus it must be dropped from list of parameters for optimizer 118 | return [p for p in self.parameters() if p.requires_grad] 119 | 120 | def trainer(self, optimizer, batch_loader, batch_loader_2): 121 | def train(i, batch_size, use_cuda, dropout, start_index): 122 | input = batch_loader.next_batch(batch_size, 'train', start_index) 123 | input = [Variable(t.from_numpy(var)) for var in input] 124 | input = [var.long() for var in input] 125 | input = [var.cuda() if use_cuda else var for var in input] 126 | 127 | [encoder_word_input, encoder_character_input, decoder_word_input, decoder_character_input, target] = input 128 | 129 | 130 | ''' =================================================== Input for Encoder-2 ======================================================== 131 | ''' 132 | 133 | input_2 = batch_loader_2.next_batch(batch_size, 'train', start_index) 134 | input_2 = [Variable(t.from_numpy(var)) for var in input_2] 135 | input_2 = [var.long() for var in input_2] 136 | input_2 = [var.cuda() if use_cuda else var for var in input_2] 137 | 138 | [encoder_word_input_2, encoder_character_input_2, decoder_word_input_2, decoder_character_input_2, target] = input_2 139 | 140 | ''' ================================================================================================================================ 141 | ''' 142 | # exit() 143 | 144 | logits, _, kld = self(dropout, 145 | encoder_word_input, encoder_character_input, 146 | encoder_word_input_2,encoder_character_input_2, 147 | decoder_word_input_2, decoder_character_input_2, 148 | z=None) 149 | 150 | # logits = logits.view(-1, self.params.word_vocab_size) 151 | logits = logits.view(-1, self.params_2.word_vocab_size) 152 | target = target.view(-1) 153 | cross_entropy = F.cross_entropy(logits, target) 154 | 155 | loss = 79 * cross_entropy + kld_coef(i) * kld 156 | 157 | optimizer.zero_grad() 158 | loss.backward() 159 | optimizer.step() 160 | 161 | return cross_entropy, kld, kld_coef(i) 162 | 163 | return train 164 | 165 | def validater(self, batch_loader,batch_loader_2): 166 | def validate(batch_size, use_cuda, start_index): 167 | input = batch_loader.next_batch(batch_size, 'valid', start_index) 168 | input = [Variable(t.from_numpy(var)) for var in input] 169 | input = [var.long() for var in input] 170 | input = [var.cuda() if use_cuda else var for var in input] 171 | 172 | [encoder_word_input, encoder_character_input, decoder_word_input, decoder_character_input, target] = input 173 | 174 | ''' ==================================================== Input for Encoder-2 ======================================================== 175 | ''' 176 | 177 | input_2 = batch_loader_2.next_batch(batch_size, 'valid', start_index) 178 | input_2 = [Variable(t.from_numpy(var)) for var in input_2] 179 | input_2 = [var.long() for var in input_2] 180 | input_2 = [var.cuda() if use_cuda else var for var in input_2] 181 | [encoder_word_input_2, encoder_character_input_2, decoder_word_input_2, decoder_character_input_2, target] = input_2 182 | 183 | ''' ================================================================================================================================== 184 | ''' 185 | 186 | logits, _, kld = self(0., 187 | encoder_word_input, encoder_character_input, 188 | 
encoder_word_input_2,encoder_character_input_2, 189 | decoder_word_input_2, decoder_character_input_2, 190 | z=None) 191 | 192 | # logits = logits.view(-1, self.params.word_vocab_size) 193 | logits = logits.view(-1, self.params_2.word_vocab_size) 194 | target = target.view(-1) 195 | cross_entropy = F.cross_entropy(logits, target) 196 | 197 | return cross_entropy, kld 198 | 199 | return validate 200 | 201 | def sample(self, batch_loader, seq_len, seed, use_cuda, State): 202 | seed = Variable(t.from_numpy(seed).float()) 203 | if use_cuda: 204 | seed = seed.cuda() 205 | 206 | decoder_word_input_np, decoder_character_input_np = batch_loader.go_input(1) 207 | 208 | decoder_word_input = Variable(t.from_numpy(decoder_word_input_np).long()) 209 | decoder_character_input = Variable(t.from_numpy(decoder_character_input_np).long()) 210 | 211 | if use_cuda: 212 | decoder_word_input, decoder_character_input = decoder_word_input.cuda(), decoder_character_input.cuda() 213 | 214 | result = '' 215 | 216 | initial_state = State 217 | 218 | for i in range(seq_len): 219 | logits, initial_state, _ = self(0., None, None, 220 | None, None, 221 | decoder_word_input, decoder_character_input, 222 | seed, initial_state) 223 | 224 | 225 | # forward(self, drop_prob, 226 | # encoder_word_input=None, encoder_character_input=None, 227 | # encoder_word_input_2=None, encoder_character_input_2=None, 228 | # decoder_word_input_2=None, decoder_character_input_2=None, 229 | # z=None, initial_state=None): 230 | 231 | # logits = logits.view(-1, self.params.word_vocab_size) 232 | # logits = logits.view(-1, self.params.word_vocab_size) 233 | logits = logits.view(-1, self.params_2.word_vocab_size) 234 | # print '---------------------------------------' 235 | # print 'Printing logits' 236 | # print logits 237 | # print '------------------------------------------' 238 | 239 | prediction = F.softmax(logits) 240 | 241 | word = batch_loader.sample_word_from_distribution(prediction.data.cpu().numpy()[-1]) 242 | 243 | if word == batch_loader.end_token: 244 | break 245 | 246 | result += ' ' + word 247 | 248 | decoder_word_input_np = np.array([[batch_loader.word_to_idx[word]]]) 249 | decoder_character_input_np = np.array([[batch_loader.encode_characters(word)]]) 250 | 251 | decoder_word_input = Variable(t.from_numpy(decoder_word_input_np).long()) 252 | decoder_character_input = Variable(t.from_numpy(decoder_character_input_np).long()) 253 | 254 | if use_cuda: 255 | decoder_word_input, decoder_character_input = decoder_word_input.cuda(), decoder_character_input.cuda() 256 | 257 | return result 258 | 259 | def sampler(self, batch_loader, seq_len, seed, use_cuda): 260 | input = batch_loader.next_batch(1, 'valid', 1) 261 | input = [Variable(t.from_numpy(var)) for var in input] 262 | input = [var.long() for var in input] 263 | input = [var.cuda() if use_cuda else var for var in input] 264 | [encoder_word_input, encoder_character_input, decoder_word_input, decoder_character_input, target] = input 265 | 266 | encoder_input = self.embedding(encoder_word_input, encoder_character_input) 267 | 268 | _ , h0 , c0 = self.encoder_3(encoder_input, None) 269 | State = (h0,c0) 270 | 271 | # print '----------------------' 272 | # print 'Printing h0 ---------->' 273 | # print h0 274 | # print '----------------------' 275 | 276 | # State = None 277 | result = self.sample(batch_loader, seq_len, seed, use_cuda, State) 278 | 279 | return result 280 | -------------------------------------------------------------------------------- /utils/batch_loader.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import collections 4 | import os 5 | import re 6 | 7 | import numpy as np 8 | from six.moves import cPickle 9 | 10 | from .functional import * 11 | 12 | 13 | class BatchLoader: 14 | def __init__(self, data_files, idx_files, tensor_files, path='../../'): 15 | 16 | ''' 17 | :properties 18 | 19 | data_files - array containing paths to data sources 20 | 21 | idx_files - array of paths to vocabulary files 22 | 23 | tensor_files - matrix with shape of [2, target_num] containing paths to files 24 | with data represented as tensors, 25 | where the first index corresponds to the type of representation, 26 | i.e. word representation and character-aware representation 27 | 28 | blind_symbol - special symbol used to pad every word in the character-aware representation 29 | so that all words have the same length 30 | pad_token - the same kind of padding symbol as blind_symbol, but for lines of words 31 | go_token - start of sequence symbol 32 | end_token - end of sequence symbol 33 | 34 | chars_vocab_size - number of unique characters 35 | idx_to_char - array of shape [chars_vocab_size] containing the ordered list of unique characters 36 | char_to_idx - dictionary of shape [chars_vocab_size] 37 | such that idx_to_char[char_to_idx[some_char]] = some_char 38 | where some_char is such that idx_to_char contains it 39 | 40 | words_vocab_size, idx_to_word, word_to_idx - same as for characters 41 | 42 | max_word_len - maximum word length 43 | max_seq_len - maximum sequence length 44 | num_lines - number of lines in data, with shape [target_num] 45 | 46 | word_tensor - tensor of shape [target_num, num_lines, line_length] that 47 | contains word indexes instead of the words themselves 48 | 49 | character_tensor - tensor of shape [target_num, num_lines, line_length, max_word_len]. 50 | Rows contain character indexes for every word in data 51 | 52 | :methods 53 | 54 | build_character_vocab(self, data) -> chars_vocab_size, idx_to_char, char_to_idx 55 | chars_vocab_size - number of unique characters in the corpus 56 | idx_to_char - array of shape [chars_vocab_size] containing the ordered list of unique characters 57 | char_to_idx - dictionary of shape [chars_vocab_size] 58 | such that idx_to_char[char_to_idx[some_char]] = some_char 59 | where some_char is such that idx_to_char contains it 60 | 61 | build_word_vocab(self, sentences) -> words_vocab_size, idx_to_word, word_to_idx 62 | same as for characters 63 | 64 | preprocess(self, data_files, idx_files, tensor_files) -> Void 65 | preprocesses and initializes the properties above, then saves them 66 | 67 | load_preprocessed(self, data_files, idx_files, tensor_files) -> Void 68 | loads and initializes previously saved properties 69 | 70 | next_batch(self, batch_size, target_str, start_index) -> encoder_word_input, encoder_character_input, input_seq_len, 71 | decoder_input, decoder_output 72 | returns batch_size sequences for the target given by target_str, starting at start_index. 73 | sequences are filled with pad tokens to make them the same length. 74 | encoder_word_input and encoder_character_input have the words in reversed order 75 | for performance reasons 76 | ''' 77 | 78 | self.data_files = data_files 79 | self.idx_files = idx_files 80 | self.tensor_files = tensor_files 81 | 82 | 83 | self.blind_symbol = '' 84 | self.pad_token = '_' 85 | self.go_token = '>' 86 | self.end_token = '|' 87 | self.a_token = '?'
88 | 89 | idx_exists = fold(f_and, 90 | [os.path.exists(file) for file in self.idx_files], 91 | True) 92 | 93 | tensors_exists = fold(f_and, 94 | [os.path.exists(file) for target in self.tensor_files 95 | for file in target], 96 | True) 97 | 98 | if idx_exists and tensors_exists: 99 | self.load_preprocessed(self.data_files, 100 | self.idx_files, 101 | self.tensor_files) 102 | print('preprocessed data was found and loaded') 103 | else: 104 | self.preprocess(self.data_files, 105 | self.idx_files, 106 | self.tensor_files) 107 | print('data have preprocessed') 108 | 109 | self.word_embedding_index = 0 110 | 111 | def clean_whole_data(self, string): 112 | string = re.sub('^[\d\:]+ ', '', string, 0, re.M) 113 | string = re.sub('\n\s{11}', ' ', string, 0, re.M) 114 | string = re.sub('\n{2}', '\n', string, 0, re.M) 115 | 116 | return string.lower() 117 | 118 | def clean_str(self, string): 119 | ''' 120 | Tokenization/string cleaning for all datasets except for SST. 121 | Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data 122 | ''' 123 | 124 | string = re.sub(r"[^가-힣A-Za-z0-9(),!?:;.\'\`]", " ", string) 125 | string = re.sub(r"\'s", " \'s", string) 126 | string = re.sub(r"\'ve", " \'ve", string) 127 | string = re.sub(r"n\'t", " n\'t", string) 128 | string = re.sub(r"\'re", " \'re", string) 129 | string = re.sub(r"\'d", " \'d", string) 130 | string = re.sub(r"\'ll", " \'ll", string) 131 | string = re.sub(r"\.", " . ", string) 132 | string = re.sub(r",", " , ", string) 133 | string = re.sub(r":", " : ", string) 134 | string = re.sub(r";", " ; ", string) 135 | string = re.sub(r"!", " ! ", string) 136 | string = re.sub(r"\(", " ( ", string) 137 | string = re.sub(r"\)", " ) ", string) 138 | string = re.sub(r"\?", " ? ", string) 139 | string = re.sub(r"\s{2,}", " ", string) 140 | return string.strip() 141 | 142 | def build_character_vocab(self, data): 143 | 144 | # unique characters with blind symbol 145 | chars = list(set(data)) + [self.blind_symbol, self.pad_token, self.go_token, self.end_token] 146 | chars_vocab_size = len(chars) 147 | 148 | # mappings itself 149 | idx_to_char = chars 150 | char_to_idx = {x: i for i, x in enumerate(idx_to_char)} 151 | 152 | return chars_vocab_size, idx_to_char, char_to_idx 153 | 154 | def build_word_vocab(self, sentences): 155 | 156 | # Build vocabulary 157 | word_counts = collections.Counter(sentences) 158 | 159 | # Mapping from index to word 160 | idx_to_word = [x[0] for x in word_counts.most_common()] 161 | idx_to_word = list(sorted(idx_to_word)) + [self.pad_token, self.go_token, self.end_token] 162 | 163 | words_vocab_size = len(idx_to_word) 164 | 165 | # Mapping from word to index 166 | word_to_idx = {x: i for i, x in enumerate(idx_to_word)} 167 | 168 | return words_vocab_size, idx_to_word, word_to_idx 169 | 170 | def preprocess(self, data_files, idx_files, tensor_files): 171 | 172 | data = [open(file, "r").read() for file in data_files] 173 | merged_data = data[0] + '\n' + data[1] 174 | 175 | self.chars_vocab_size, self.idx_to_char, self.char_to_idx = self.build_character_vocab(merged_data) 176 | 177 | with open(idx_files[1], 'wb') as f: 178 | cPickle.dump(self.idx_to_char, f) 179 | 180 | data_words = [[line.split() for line in target.split('\n')] for target in data] 181 | merged_data_words = merged_data.split() 182 | 183 | self.words_vocab_size, self.idx_to_word, self.word_to_idx = self.build_word_vocab(merged_data_words) 184 | self.max_word_len = np.amax([len(word) for word in self.idx_to_word]) 185 | self.max_seq_len = 
np.amax([len(line) for target in data_words for line in target]) 186 | self.num_lines = [len(target) for target in data_words] 187 | 188 | with open(idx_files[0], 'wb') as f: 189 | cPickle.dump(self.idx_to_word, f) 190 | 191 | self.word_tensor = np.array( 192 | [[list(map(self.word_to_idx.get, line)) for line in target] for target in data_words]) 193 | print(self.word_tensor.shape) 194 | for i, path in enumerate(tensor_files[0]): 195 | np.save(path, self.word_tensor[i]) 196 | 197 | self.character_tensor = np.array( 198 | [[list(map(self.encode_characters, line)) for line in target] for target in data_words]) 199 | for i, path in enumerate(tensor_files[1]): 200 | np.save(path, self.character_tensor[i]) 201 | 202 | self.just_words = [word for line in self.word_tensor[0] for word in line] 203 | 204 | def load_preprocessed(self, data_files, idx_files, tensor_files): 205 | 206 | data = [open(file, "r").read() for file in data_files] 207 | data_words = [[line.split() for line in target.split('\n')] for target in data] 208 | self.max_seq_len = np.amax([len(line) for target in data_words for line in target]) 209 | self.num_lines = [len(target) for target in data_words] 210 | 211 | [self.idx_to_word, self.idx_to_char] = [cPickle.load(open(file, "rb")) for file in idx_files] 212 | 213 | [self.words_vocab_size, self.chars_vocab_size] = [len(idx) for idx in [self.idx_to_word, self.idx_to_char]] 214 | 215 | [self.word_to_idx, self.char_to_idx] = [dict(zip(idx, range(len(idx)))) for idx in 216 | [self.idx_to_word, self.idx_to_char]] 217 | 218 | self.max_word_len = np.amax([len(word) for word in self.idx_to_word]) 219 | 220 | [self.word_tensor, self.character_tensor] = [np.array([np.load(target) for target in input_type]) 221 | for input_type in tensor_files] 222 | 223 | self.just_words = [word for line in self.word_tensor[0] for word in line] 224 | 225 | def next_batch(self, batch_size, target_str,start_index): 226 | # target = 0 if target_str == 'train' else 1 227 | target=0 228 | # indexes = np.array(np.random.randint(self.num_lines[target], size=batch_size)) 229 | # indexes = np.array([10]) 230 | 231 | # print '-----------------Printing ? identity----------------------' 232 | # temp = self.word_to_idx[self.a_token] 233 | # print temp 234 | # print 'DONE!' 
235 | # exit() 236 | 237 | indexes = np.array(range(start_index, start_index+batch_size)) 238 | # print '======================' 239 | # print indexes 240 | # print '======================' 241 | # print self.num_lines 242 | 243 | # print 'Printing indexes ------------->' 244 | # print indexes 245 | # print '-------------------------------' 246 | 247 | encoder_word_input = [self.word_tensor[target][index] for index in indexes] 248 | 249 | # print 'Printing encoder_word_input ------------->' 250 | # print encoder_word_input 251 | # print '-------------------------------' 252 | 253 | encoder_character_input = [self.character_tensor[target][index] for index in indexes] 254 | input_seq_len = [len(line) for line in encoder_word_input] 255 | max_input_seq_len = np.amax(input_seq_len) 256 | 257 | encoded_words = [[idx for idx in line] for line in encoder_word_input] 258 | decoder_word_input = [[self.word_to_idx[self.go_token]] + line for line in encoder_word_input] 259 | decoder_character_input = [[self.encode_characters(self.go_token)] + line for line in encoder_character_input] 260 | decoder_output = [line + [self.word_to_idx[self.end_token]] for line in encoded_words] 261 | 262 | # sorry 263 | for i, line in enumerate(decoder_word_input): 264 | line_len = input_seq_len[i] 265 | to_add = max_input_seq_len - line_len 266 | decoder_word_input[i] = line + [self.word_to_idx[self.pad_token]] * to_add 267 | 268 | for i, line in enumerate(decoder_character_input): 269 | line_len = input_seq_len[i] 270 | to_add = max_input_seq_len - line_len 271 | decoder_character_input[i] = line + [self.encode_characters(self.pad_token)] * to_add 272 | 273 | for i, line in enumerate(decoder_output): 274 | line_len = input_seq_len[i] 275 | to_add = max_input_seq_len - line_len 276 | decoder_output[i] = line + [self.word_to_idx[self.pad_token]] * to_add 277 | 278 | for i, line in enumerate(encoder_word_input): 279 | line_len = input_seq_len[i] 280 | to_add = max_input_seq_len - line_len 281 | encoder_word_input[i] = [self.word_to_idx[self.pad_token]] * to_add + line[::-1] 282 | 283 | for i, line in enumerate(encoder_character_input): 284 | line_len = input_seq_len[i] 285 | to_add = max_input_seq_len - line_len 286 | encoder_character_input[i] = [self.encode_characters(self.pad_token)] * to_add + line[::-1] 287 | 288 | return np.array(encoder_word_input), np.array(encoder_character_input), \ 289 | np.array(decoder_word_input), np.array(decoder_character_input), np.array(decoder_output) 290 | 291 | def next_embedding_seq(self, seq_len): 292 | """ 293 | :return: 294 | tuple of input and output for word embedding learning, 295 | where input = [b, b, c, c, d, d, e, e] 296 | and output = [a, c, b, d, d, e, d, g] 297 | for line [a, b, c, d, e, g] at index i 298 | """ 299 | 300 | words_len = len(self.just_words) 301 | seq = [self.just_words[i % words_len] 302 | for i in np.arange(self.word_embedding_index, self.word_embedding_index + seq_len)] 303 | 304 | result = [] 305 | for i in range(seq_len - 2): 306 | result.append([seq[i + 1], seq[i]]) 307 | result.append([seq[i + 1], seq[i + 2]]) 308 | 309 | self.word_embedding_index = (self.word_embedding_index + seq_len) % words_len - 2 310 | 311 | # input and target 312 | result = np.array(result) 313 | #print result 314 | #print "---------------------print is coming --------------" 315 | #print len(result[0]) 316 | return result[:, 0], result[:, 1] 317 | 318 | def go_input(self, batch_size): 319 | go_word_input = [[self.word_to_idx[self.go_token]] for _ in range(batch_size)] 320 | 
go_character_input = [[self.encode_characters(self.go_token)] for _ in range(batch_size)] 321 | 322 | return np.array(go_word_input), np.array(go_character_input) 323 | 324 | def encode_word(self, idx): 325 | result = np.zeros(self.words_vocab_size) 326 | result[idx] = 1 327 | return result 328 | 329 | def decode_word(self, word_idx): 330 | word = self.idx_to_word[word_idx] 331 | return word 332 | 333 | def sample_word_from_distribution(self, distribution): 334 | ix = np.random.choice(range(self.words_vocab_size), p=distribution.ravel()) 335 | x = np.zeros((self.words_vocab_size, 1)) 336 | x[ix] = 1 337 | return self.idx_to_word[np.argmax(x)] 338 | 339 | def encode_characters(self, characters): 340 | word_len = len(characters) 341 | to_add = self.max_word_len - word_len 342 | characters_idx = [self.char_to_idx[i] for i in characters] + to_add * [self.char_to_idx['']] 343 | return characters_idx 344 | 345 | def decode_characters(self, characters_idx): 346 | characters = [self.idx_to_char[i] for i in characters_idx] 347 | return ''.join(characters) 348 | -------------------------------------------------------------------------------- /model/rvae.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as t 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | 7 | from .decoder import Decoder 8 | from .encoder import Encoder 9 | 10 | from selfModules.embedding import Embedding 11 | 12 | from utils.functional import kld_coef, parameters_allocation_check, fold 13 | from beam_search import Beam 14 | 15 | class RVAE(nn.Module): 16 | def __init__(self, params,params_2): 17 | super(RVAE, self).__init__() 18 | 19 | self.params = params 20 | self.params_2 = params_2 #Encoder-2 parameters 21 | 22 | self.embedding = Embedding(self.params, '') 23 | self.embedding_2 = Embedding(self.params_2, '',True) 24 | 25 | self.encoder = Encoder(self.params) 26 | self.encoder_2 = Encoder(self.params_2) 27 | 28 | 29 | self.context_to_mu = nn.Linear(self.params.encoder_rnn_size * 2, self.params.latent_variable_size) 30 | self.context_to_logvar = nn.Linear(self.params.encoder_rnn_size * 2, self.params.latent_variable_size) 31 | 32 | # self.encoder_3 = Encoder(self.params) 33 | self.decoder = Decoder(self.params_2) #change this to params_2 34 | 35 | def forward(self, drop_prob, 36 | encoder_word_input=None, encoder_character_input=None, 37 | encoder_word_input_2=None, encoder_character_input_2=None, 38 | decoder_word_input_2=None, decoder_character_input_2=None, 39 | z=None, initial_state=None): 40 | 41 | #Modified the parameters of forward function according to Encoder-2 42 | """ 43 | :param encoder_word_input: An tensor with shape of [batch_size, seq_len] of Long type 44 | :param encoder_character_input: An tensor with shape of [batch_size, seq_len, max_word_len] of Long type 45 | :param decoder_word_input: An tensor with shape of [batch_size, max_seq_len + 1] of Long type 46 | :param initial_state: initial state of decoder rnn in order to perform sampling 47 | 48 | :param drop_prob: probability of an element of decoder input to be zeroed in sense of dropout 49 | 50 | :param z: context if sampling is performing 51 | 52 | :return: unnormalized logits of sentence words distribution probabilities 53 | with shape of [batch_size, seq_len, word_vocab_size] 54 | final rnn state with shape of [num_layers, batch_size, decoder_rnn_size] 55 | """ 56 | 57 | assert parameters_allocation_check(self), \ 58 | 
'Invalid CUDA options. Parameters should be allocated in the same memory' 59 | use_cuda = self.embedding.word_embed.weight.is_cuda 60 | 61 | assert z is None and fold(lambda acc, parameter: acc and parameter is not None, 62 | [encoder_word_input, encoder_character_input, decoder_word_input_2], 63 | True) \ 64 | or (z is not None and decoder_word_input_2 is not None), \ 65 | "Invalid input. If z is None then encoder and decoder inputs should be passed as arguments" 66 | 67 | if z is None: 68 | ''' Get context from encoder and sample z ~ N(mu, std) 69 | ''' 70 | [batch_size, _] = encoder_word_input.size() 71 | 72 | encoder_input = self.embedding(encoder_word_input, encoder_character_input) 73 | 74 | ''' ===================================================Doing the same for encoder-2=================================================== 75 | ''' 76 | [batch_size_2, _] = encoder_word_input_2.size() 77 | 78 | encoder_input_2 = self.embedding_2(encoder_word_input_2, encoder_character_input_2) 79 | 80 | ''' ================================================================================================================================== 81 | ''' 82 | 83 | context , h_0 , c_0 = self.encoder(encoder_input, None) 84 | 85 | State = (h_0,c_0) #Final state of Encoder-1 86 | context_2 , _ , _ = self.encoder_2( encoder_input_2, State ) #Encoder_2 for Ques_2 87 | 88 | mu = self.context_to_mu(context_2) 89 | logvar = self.context_to_logvar(context_2) 90 | std = t.exp(0.5 * logvar) 91 | 92 | z = Variable(t.randn([batch_size, self.params.latent_variable_size])) 93 | if use_cuda: 94 | z = z.cuda() 95 | 96 | z = z * std + mu 97 | 98 | kld = (-0.5 * t.sum(logvar - t.pow(mu, 2) - t.exp(logvar) + 1, 1)).mean().squeeze() 99 | 100 | # encoder_input = self.embedding(encoder_word_input, encoder_character_input) 101 | # _ , h_0 , c_0 = self.encoder_3(encoder_input, None) 102 | initial_state = State #Final state of Encoder-1 103 | 104 | else: 105 | kld = None 106 | mu = None 107 | std = None 108 | 109 | 110 | 111 | decoder_input_2 = self.embedding_2.word_embed(decoder_word_input_2) # What to do with this decoder input ? 
--> Slightly resolved 112 | out, final_state = self.decoder(decoder_input_2, z, drop_prob, initial_state) # Take a look at the decoder 113 | 114 | return out, final_state, kld, mu, std 115 | 116 | def learnable_parameters(self): 117 | 118 | # word_embedding is constant parameter thus it must be dropped from list of parameters for optimizer 119 | return [p for p in self.parameters() if p.requires_grad] 120 | 121 | def trainer(self, optimizer, batch_loader, batch_loader_2): 122 | def train(i, batch_size, use_cuda, dropout, start_index): 123 | input = batch_loader.next_batch(batch_size, 'train', start_index) 124 | input = [Variable(t.from_numpy(var)) for var in input] 125 | input = [var.long() for var in input] 126 | input = [var.cuda() if use_cuda else var for var in input] 127 | 128 | [encoder_word_input, encoder_character_input, decoder_word_input, decoder_character_input, target] = input 129 | 130 | 131 | ''' =================================================== Input for Encoder-2 ======================================================== 132 | ''' 133 | 134 | input_2 = batch_loader_2.next_batch(batch_size, 'train', start_index) 135 | input_2 = [Variable(t.from_numpy(var)) for var in input_2] 136 | input_2 = [var.long() for var in input_2] 137 | input_2 = [var.cuda() if use_cuda else var for var in input_2] 138 | 139 | [encoder_word_input_2, encoder_character_input_2, decoder_word_input_2, decoder_character_input_2, target] = input_2 140 | 141 | ''' ================================================================================================================================ 142 | ''' 143 | # exit() 144 | 145 | logits, _, kld,_ ,_ = self(dropout, 146 | encoder_word_input, encoder_character_input, 147 | encoder_word_input_2,encoder_character_input_2, 148 | decoder_word_input_2, decoder_character_input_2, 149 | z=None) 150 | 151 | # logits = logits.view(-1, self.params.word_vocab_size) 152 | logits = logits.view(-1, self.params_2.word_vocab_size) 153 | target = target.view(-1) 154 | cross_entropy = F.cross_entropy(logits, target) 155 | 156 | loss = 79 * cross_entropy + kld_coef(i) * kld 157 | 158 | optimizer.zero_grad() 159 | loss.backward() 160 | optimizer.step() 161 | 162 | return cross_entropy, kld, kld_coef(i) 163 | 164 | return train 165 | 166 | def validater(self, batch_loader,batch_loader_2): 167 | def validate(batch_size, use_cuda, start_index): 168 | input = batch_loader.next_batch(batch_size, 'valid', start_index) 169 | input = [Variable(t.from_numpy(var)) for var in input] 170 | input = [var.long() for var in input] 171 | input = [var.cuda() if use_cuda else var for var in input] 172 | 173 | [encoder_word_input, encoder_character_input, decoder_word_input, decoder_character_input, target] = input 174 | 175 | ''' ==================================================== Input for Encoder-2 ======================================================== 176 | ''' 177 | 178 | input_2 = batch_loader_2.next_batch(batch_size, 'valid', start_index) 179 | input_2 = [Variable(t.from_numpy(var)) for var in input_2] 180 | input_2 = [var.long() for var in input_2] 181 | input_2 = [var.cuda() if use_cuda else var for var in input_2] 182 | [encoder_word_input_2, encoder_character_input_2, decoder_word_input_2, decoder_character_input_2, target] = input_2 183 | 184 | ''' ================================================================================================================================== 185 | ''' 186 | 187 | logits, _, kld,_ ,_ = self(0., 188 | encoder_word_input, encoder_character_input, 189 | 
encoder_word_input_2,encoder_character_input_2, 190 | decoder_word_input_2, decoder_character_input_2, 191 | z=None) 192 | 193 | # logits = logits.view(-1, self.params.word_vocab_size) 194 | logits = logits.view(-1, self.params_2.word_vocab_size) 195 | target = target.view(-1) 196 | cross_entropy = F.cross_entropy(logits, target) 197 | 198 | return cross_entropy, kld 199 | 200 | return validate 201 | 202 | def sample(self, batch_loader, seq_len, seed, use_cuda, State): 203 | # seed = Variable(t.from_numpy(seed).float()) 204 | if use_cuda: 205 | seed = seed.cuda() 206 | 207 | decoder_word_input_np, decoder_character_input_np = batch_loader.go_input(1) 208 | 209 | decoder_word_input = Variable(t.from_numpy(decoder_word_input_np).long()) 210 | decoder_character_input = Variable(t.from_numpy(decoder_character_input_np).long()) 211 | 212 | if use_cuda: 213 | decoder_word_input, decoder_character_input = decoder_word_input.cuda(), decoder_character_input.cuda() 214 | 215 | result = '' 216 | 217 | initial_state = State 218 | 219 | for i in range(seq_len): 220 | logits, initial_state, _ ,_,_= self(0., None, None, 221 | None, None, 222 | decoder_word_input, decoder_character_input, 223 | seed, initial_state) 224 | 225 | 226 | # forward(self, drop_prob, 227 | # encoder_word_input=None, encoder_character_input=None, 228 | # encoder_word_input_2=None, encoder_character_input_2=None, 229 | # decoder_word_input_2=None, decoder_character_input_2=None, 230 | # z=None, initial_state=None): 231 | 232 | # logits = logits.view(-1, self.params.word_vocab_size) 233 | # logits = logits.view(-1, self.params.word_vocab_size) 234 | logits = logits.view(-1, self.params_2.word_vocab_size) 235 | # print '---------------------------------------' 236 | # print 'Printing logits' 237 | # print logits 238 | # print '------------------------------------------' 239 | 240 | prediction = F.softmax(logits) 241 | 242 | word = batch_loader.sample_word_from_distribution(prediction.data.cpu().numpy()[-1]) 243 | 244 | if word == batch_loader.end_token: 245 | break 246 | 247 | result += ' ' + word 248 | 249 | decoder_word_input_np = np.array([[batch_loader.word_to_idx[word]]]) 250 | decoder_character_input_np = np.array([[batch_loader.encode_characters(word)]]) 251 | 252 | decoder_word_input = Variable(t.from_numpy(decoder_word_input_np).long()) 253 | decoder_character_input = Variable(t.from_numpy(decoder_character_input_np).long()) 254 | 255 | if use_cuda: 256 | decoder_word_input, decoder_character_input = decoder_word_input.cuda(), decoder_character_input.cuda() 257 | 258 | return result 259 | 260 | def sampler(self, batch_loader,batch_loader_2, seq_len, seed, use_cuda,i,beam_size,n_best): 261 | input = batch_loader.next_batch(1, 'valid', i) 262 | input = [Variable(t.from_numpy(var)) for var in input] 263 | input = [var.long() for var in input] 264 | input = [var.cuda() if use_cuda else var for var in input] 265 | [encoder_word_input, encoder_character_input, decoder_word_input, decoder_character_input, target] = input 266 | 267 | encoder_input = self.embedding(encoder_word_input, encoder_character_input) 268 | 269 | _ , h0 , c0 = self.encoder(encoder_input, None) 270 | State = (h0,c0) 271 | 272 | # print '----------------------' 273 | # print 'Printing h0 ---------->' 274 | # print h0 275 | # print '----------------------' 276 | 277 | # State = None 278 | results, scores = self.sample_beam(batch_loader_2, seq_len, seed, use_cuda, State, beam_size, n_best) 279 | 280 | return results, scores 281 | 282 | 283 | def sample_beam(self, 
batch_loader, seq_len, seed, use_cuda, State, beam_size, n_best): 284 | # seed = Variable(t.from_numpy(seed).float()) 285 | if use_cuda: 286 | seed = seed.cuda() 287 | 288 | decoder_word_input_np, decoder_character_input_np = batch_loader.go_input(1) 289 | 290 | decoder_word_input = Variable(t.from_numpy(decoder_word_input_np).long()) 291 | decoder_character_input = Variable(t.from_numpy(decoder_character_input_np).long()) 292 | 293 | if use_cuda: 294 | decoder_word_input, decoder_character_input = decoder_word_input.cuda(), decoder_character_input.cuda() 295 | 296 | 297 | dec_states = State 298 | 299 | # print '========= Before ================' 300 | # print "dec_states:", dec_states[0].size() 301 | # print "dec_states:", dec_states[1].size() 302 | # print '==================================' 303 | 304 | # dec_states = [ 305 | # Variable(dec_states[0].repeat(1, beam_size, 1)), 306 | # Variable(dec_states[1].repeat(1, beam_size, 1)) 307 | # ] 308 | dec_states = [ 309 | dec_states[0].repeat(1, beam_size, 1), 310 | dec_states[1].repeat(1, beam_size, 1) 311 | ] 312 | 313 | # print'========== After ==================' 314 | # print "dec_states:", dec_states[0].size() 315 | # print "dec_states:", dec_states[1].size() 316 | # print '==================================' 317 | # exit() 318 | 319 | drop_prob = 0.0 320 | beam_size = beam_size 321 | batch_size = 1 322 | 323 | beam = [Beam(beam_size, batch_loader, cuda=True) for k in range(batch_size)] 324 | 325 | batch_idx = list(range(batch_size)) 326 | remaining_sents = batch_size 327 | 328 | 329 | for i in range(seq_len): 330 | 331 | input = t.stack( 332 | [b.get_current_state() for b in beam if not b.done] 333 | ).t().contiguous().view(1, -1) 334 | 335 | trg_emb = self.embedding_2.word_embed(Variable(input).transpose(1, 0)) 336 | 337 | # print trg_emb.size() 338 | # print seed.size() 339 | 340 | trg_h, dec_states = self.decoder.only_decoder_beam(trg_emb, seed, drop_prob, dec_states) 341 | 342 | # trg_h, (trg_h_t, trg_c_t) = self.model.decoder(trg_emb, (dec_states[0].squeeze(0), dec_states[1].squeeze(0)), context ) 343 | 344 | # print trg_h.size() 345 | # print trg_h_t.size() 346 | # print trg_c_t.size() 347 | 348 | # dec_states = (trg_h_t, trg_c_t) 349 | 350 | # print 'State dimension ----------->' 351 | # print State[0].size() 352 | # print State[1].size() 353 | # print '=======================================' 354 | # print "dec_states:", dec_states[0].size() 355 | # print "dec_states:", dec_states[1].size() 356 | # print '========== Things successful ===========' 357 | 358 | # exit() 359 | 360 | dec_out = trg_h.squeeze(1) 361 | 362 | # print "dec_out:", dec_out.size() 363 | 364 | out = F.softmax(self.decoder.fc(dec_out)).unsqueeze(0) 365 | 366 | word_lk = out.view( 367 | beam_size, 368 | remaining_sents, 369 | -1 370 | ).transpose(0, 1).contiguous() 371 | 372 | active = [] 373 | for b in range(batch_size): 374 | if beam[b].done: 375 | continue 376 | 377 | idx = batch_idx[b] 378 | if not beam[b].advance(word_lk.data[idx]): 379 | active += [b] 380 | 381 | for dec_state in dec_states: # iterate over h, c 382 | # layers x beam*sent x dim 383 | sent_states = dec_state.view( 384 | -1, beam_size, remaining_sents, dec_state.size(2) 385 | )[:, :, idx] 386 | sent_states.data.copy_( 387 | sent_states.data.index_select( 388 | 1, 389 | beam[b].get_current_origin() 390 | ) 391 | ) 392 | 393 | if not active: 394 | break 395 | 396 | # in this section, the sentences that are still active are 397 | # compacted so that the decoder is not run on completed 
sentences 398 | active_idx = t.cuda.LongTensor([batch_idx[k] for k in active]) 399 | batch_idx = {beam: idx for idx, beam in enumerate(active)} 400 | 401 | def update_active(t): 402 | # select only the remaining active sentences 403 | view = t.data.view( 404 | -1, remaining_sents, 405 | self.params.decoder_rnn_size 406 | ) 407 | new_size = list(t.size()) 408 | new_size[-2] = new_size[-2] * len(active_idx) \ 409 | // remaining_sents 410 | return Variable(view.index_select( 411 | 1, active_idx 412 | ).view(*new_size)) 413 | 414 | dec_states = ( 415 | update_active(dec_states[0]), 416 | update_active(dec_states[1]) 417 | ) 418 | dec_out = update_active(dec_out) 419 | # context = update_active(context) 420 | 421 | remaining_sents = len(active) 422 | 423 | # (4) package everything up 424 | 425 | allHyp, allScores = [], [] 426 | 427 | 428 | for b in range(batch_size): 429 | scores, ks = beam[b].sort_best() 430 | # print scores 431 | # print ks 432 | allScores += [scores[:n_best]] 433 | hyps = zip(*[beam[b].get_hyp(k) for k in ks[:n_best]]) 434 | # print hyps 435 | # print "------------------" 436 | allHyp += [hyps] 437 | 438 | # print '==== Complete =========' 439 | 440 | return allHyp, allScores 441 | --------------------------------------------------------------------------------
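For reference, the sampling step in RVAE.forward above draws z with the reparameterisation trick (std = exp(0.5 * logvar); z = eps * std + mu) and computes the KL term against a standard normal in closed form. The snippet below is a minimal, self-contained sketch of just that step, written against a recent PyTorch API (torch.randn_like, no Variable wrapper); the helper name reparameterize and the toy sizes are illustrative assumptions, not repository code.

import torch as t
import torch.nn as nn

def reparameterize(context, context_to_mu, context_to_logvar):
    # Sample z ~ N(mu, std) with the reparameterisation trick, mirroring RVAE.forward.
    mu = context_to_mu(context)
    logvar = context_to_logvar(context)
    std = t.exp(0.5 * logvar)
    eps = t.randn_like(std)
    z = eps * std + mu
    # KL(q(z|x) || N(0, I)) averaged over the batch -- the same closed form used in rvae.py.
    kld = (-0.5 * t.sum(logvar - t.pow(mu, 2) - t.exp(logvar) + 1, 1)).mean()
    return z, kld

# Toy usage: a fake "context" of size 2 * encoder_rnn_size feeding a small latent space.
context = t.randn(4, 64)
to_mu = nn.Linear(64, 16)
to_logvar = nn.Linear(64, 16)
z, kld = reparameterize(context, to_mu, to_logvar)
print(z.size(), kld.item())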