├── model ├── __init__.py ├── rvae.pyc ├── __init__.pyc ├── decoder.pyc ├── encoder.pyc ├── encoder.py ├── decoder.py ├── rvae_previous.py └── rvae.py ├── utils ├── __init__.py ├── __init__.pyc ├── tensor.pyc ├── functional.pyc ├── parameters.pyc ├── batch_loader.pyc ├── parameters.py ├── functional.py ├── visualize_word_embeddings.py ├── tensor.py └── batch_loader.py ├── selfModules ├── __init__.py ├── neg.pyc ├── tdnn.pyc ├── highway.pyc ├── __init__.pyc ├── embedding.pyc ├── highway.py ├── tdnn.py ├── embedding.py └── neg.py ├── __init__.py ├── beam_search.pyc ├── ce_result_.npy ├── kld_result_npy_.npy ├── README.md ├── train_word_embeddings.py ├── train_word_embeddings_2.py ├── beam_search.py ├── test.py ├── train.py └── sample_3.py /model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /selfModules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from . import nn_layers 2 | from . import utility 3 | -------------------------------------------------------------------------------- /beam_search.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/beam_search.pyc -------------------------------------------------------------------------------- /ce_result_.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/ce_result_.npy -------------------------------------------------------------------------------- /model/rvae.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/model/rvae.pyc -------------------------------------------------------------------------------- /model/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/model/__init__.pyc -------------------------------------------------------------------------------- /model/decoder.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/model/decoder.pyc -------------------------------------------------------------------------------- /model/encoder.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/model/encoder.pyc -------------------------------------------------------------------------------- /utils/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/utils/__init__.pyc -------------------------------------------------------------------------------- /utils/tensor.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/utils/tensor.pyc -------------------------------------------------------------------------------- /kld_result_npy_.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/kld_result_npy_.npy -------------------------------------------------------------------------------- /selfModules/neg.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/selfModules/neg.pyc -------------------------------------------------------------------------------- /selfModules/tdnn.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/selfModules/tdnn.pyc -------------------------------------------------------------------------------- /utils/functional.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/utils/functional.pyc -------------------------------------------------------------------------------- /utils/parameters.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/utils/parameters.pyc -------------------------------------------------------------------------------- /selfModules/highway.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/selfModules/highway.pyc -------------------------------------------------------------------------------- /utils/batch_loader.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/utils/batch_loader.pyc -------------------------------------------------------------------------------- /selfModules/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/selfModules/__init__.pyc -------------------------------------------------------------------------------- /selfModules/embedding.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arvind385801/paraphraseGen/HEAD/selfModules/embedding.pyc -------------------------------------------------------------------------------- /utils/parameters.py: -------------------------------------------------------------------------------- 1 | from .functional import * 2 | 3 | 4 | class Parameters: 5 | def __init__(self, max_word_len, max_seq_len, word_vocab_size, char_vocab_size): 6 | self.max_word_len = int(max_word_len) 7 | self.max_seq_len = int(max_seq_len) + 1 # go or eos token 8 | 9 | self.word_vocab_size = int(word_vocab_size) 10 | self.char_vocab_size = int(char_vocab_size) 11 | 12 | self.word_embed_size = 300 13 | self.char_embed_size = 15 14 | 15 | self.kernels = [(1, 25), (2, 50), (3, 75), (4, 100), (5, 125), (6, 150)] 16 | self.sum_depth = fold(lambda x, y: x + y, [depth for _, depth in self.kernels], 0) 17 | 18 | self.encoder_rnn_size = 600 19 | self.encoder_num_layers = 1 20 | 21 | self.latent_variable_size = 1100 22 | 23 | self.decoder_rnn_size = 600 24 | self.decoder_num_layers = 2 25 | 
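A side note on the configuration above: `sum_depth` is the total number of character-CNN feature maps, and added to the 300-dimensional word embedding it gives the encoder's RNN input width. A minimal sketch of that arithmetic (reusing the `fold` helper from `utils/functional.py`; the standalone variable names here are illustrative only):

```python
# Illustrative only: recomputes Parameters.sum_depth from the kernel spec above.
kernels = [(1, 25), (2, 50), (3, 75), (4, 100), (5, 125), (6, 150)]

def fold(f, l, a):
    # same recursive left-fold as in utils/functional.py
    return a if len(l) == 0 else fold(f, l[1:], f(a, l[0]))

sum_depth = fold(lambda x, y: x + y, [depth for _, depth in kernels], 0)
print(sum_depth)        # 525
print(300 + sum_depth)  # 825 = word_embed_size + sum_depth, the encoder input size
```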
--------------------------------------------------------------------------------
/utils/functional.py:
--------------------------------------------------------------------------------
 1 | def fold(f, l, a):
 2 |     return a if (len(l) == 0) else fold(f, l[1:], f(a, l[0]))
 3 | 
 4 | def f_and(x, y):
 5 |     return convert(x) and convert(y)
 6 | 
 7 | def f_or(x, y):
 8 |     return convert(x) or convert(y)
 9 | 
10 | def convert(x):
11 |     if type(x) != bool:
12 |         return len(x) > 0
13 |     else:
14 |         return x
15 | 
16 | def parameters_allocation_check(module):
17 |     parameters = list(module.parameters())
18 |     return fold(f_and, parameters, True) or not fold(f_or, parameters, False)
19 | 
20 | def handle_inputs(inputs, use_cuda):
21 |     import torch as t
22 |     from torch.autograd import Variable
23 | 
24 |     result = [Variable(t.from_numpy(var)) for var in inputs]
25 |     result = [var.cuda() if use_cuda else var for var in result]
26 | 
27 |     return result
28 | 
29 | def kld_coef(i):
30 |     import math
31 |     return (math.tanh((i - 3500)/1000) + 1)/2
32 | 
--------------------------------------------------------------------------------
/utils/visualize_word_embeddings.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import matplotlib.pyplot as plt
 4 | import numpy as np
 5 | from sklearn.decomposition import PCA
 6 | 
 7 | from utils.batch_loader import BatchLoader
 8 | 
 9 | if __name__ == "__main__":
10 |     if not os.path.exists('../../data/word_embeddings.npy'):
11 |         raise FileNotFoundError("word embeddings file wasn't found")
12 | 
13 |     pca = PCA(n_components=2)
14 |     word_embeddings = np.load('../../data/word_embeddings.npy')
15 |     word_embeddings_pca = pca.fit_transform(word_embeddings)
16 | 
17 |     batch_loader = BatchLoader()
18 |     words = batch_loader.idx_to_word
19 | 
20 |     fig, ax = plt.subplots()
21 |     fig.set_size_inches(150, 150)
22 |     x = word_embeddings_pca[:, 0]
23 |     y = word_embeddings_pca[:, 1]
24 |     ax.scatter(x, y)
25 | 
26 |     for i, word in enumerate(words):
27 |         ax.annotate(word, (x[i], y[i]))
28 | 
29 |     fig.savefig('word_embedding.png', dpi=100)
30 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # A Deep Generative Framework for Paraphrase Generation
 2 | 
 3 | ## Model:
 4 | This is an implementation of [A Deep Generative Framework for Paraphrase Generation](https://arxiv.org/pdf/1709.05074) by Gupta et al. (AAAI 2018), using the character-aware token embeddings from Kim's [Character-Aware Neural Language Models](https://arxiv.org/abs/1508.06615). The code is built on the PyTorch implementation of Samuel Bowman's [Generating Sentences from a Continuous Space](https://arxiv.org/abs/1511.06349#) available [here](https://github.com/kefirski/pytorch_RVAE).
 5 | 
 6 | 
 7 | 
 8 | ## Usage
 9 | ### Before training the model, it is necessary to train word embeddings for both the questions and their paraphrases:
10 | ```
11 | $ python train_word_embeddings.py --num-iterations 1200000
12 | $ python train_word_embeddings_2.py --num-iterations 1200000
13 | ```
14 | 
15 | These scripts train word embeddings as described in [Mikolov et al., Distributed Representations of Words and Phrases](https://arxiv.org/abs/1310.4546).
16 | 
17 | #### Parameters:
18 | `--use-cuda`
19 | 
20 | `--num-iterations`
21 | 
22 | `--batch-size`
23 | 
24 | `--num-sample` –– number of tokens sampled from the noise distribution
25 | 
26 | 
27 | ### To train the model, use:
28 | ```
29 | $ python train.py --num-iterations 140000
30 | ```
31 | 
32 | #### Parameters:
33 | `--use-cuda`
34 | 
35 | `--num-iterations`
36 | 
37 | `--batch-size`
38 | 
39 | `--learning-rate`
40 | 
41 | `--dropout` –– probability of decoder-input units being zeroed (dropout)
42 | 
43 | `--use-trained` –– resume from a previously trained model
44 | 
45 | ### To sample from the model after training, use:
46 | ```
47 | $ python test.py
48 | ```
49 | #### Parameters:
50 | `--use-cuda`
51 | 
52 | `--num-sample`
53 | 
54 | 
--------------------------------------------------------------------------------
/utils/tensor.py:
--------------------------------------------------------------------------------
 1 | import collections
 2 | import os
 3 | import re
 4 | 
 5 | import numpy as np
 6 | from six.moves import cPickle
 7 | 
 8 | from .functional import *
 9 | 
10 | idx_files = ['data/words_vocab.pkl',
11 |              'data/characters_vocab.pkl']
12 | 
13 | [idx_to_word, idx_to_char] = [cPickle.load(open(file, "rb")) for file in idx_files]
14 | [word_to_idx, char_to_idx] = [dict(zip(idx, range(len(idx)))) for idx in
15 |                               [idx_to_word, idx_to_char]]
16 | 
17 | max_word_len = np.amax([len(word) for word in idx_to_word])
18 | 
19 | def encode_characters(characters):
20 |     word_len = len(characters)
21 |     to_add = max_word_len - word_len
22 |     characters_idx = [char_to_idx[i] for i in characters] + to_add * [char_to_idx['']]
23 |     return characters_idx
24 | 
25 | def preprocess_data(data_files, idx_files, tensor_files, file, str=''):
26 | 
27 |     # print 'Preprocessing the test file\n'
28 |     if file:
29 |         data = [open(file, "r").read() for file in data_files]
30 |     else:
31 |         data = [str + '\n']
32 | 
33 |     data_words = [[line.split() for line in target.split('\n')] for target in data]
34 |     data_words = [[[word for word in target if word in idx_to_word] for target in yo] for yo in data_words]
35 | 
36 |     word_tensor = np.array(
37 |         [[list(map(word_to_idx.get, line)) for line in target] for target in data_words])
38 |     np.save(tensor_files[0][0], word_tensor[0])
39 |     # print(word_tensor.shape)
40 |     character_tensor = np.array(
41 |         [[list(map(encode_characters, line)) for line in target] for target in data_words])
42 |     np.save(tensor_files[1][0], character_tensor[0])
43 | 
--------------------------------------------------------------------------------
/selfModules/highway.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import torch.nn as nn
 3 | import torch.nn.functional as F
 4 | 
 5 | 
 6 | class Highway(nn.Module):
 7 |     def __init__(self, size, num_layers, f):
 8 | 
 9 |         super(Highway, self).__init__()
10 | 
11 |         self.num_layers = num_layers
12 | 
13 |         self.nonlinear = [nn.Linear(size, size) for _ in range(num_layers)]
14 |         for i, module in enumerate(self.nonlinear):
15 |             self._add_to_parameters(module.parameters(), 'nonlinear_module_{}'.format(i))
16 | 
17 |         self.linear = [nn.Linear(size, size) for _ in range(num_layers)]
18 |         for i, module in enumerate(self.linear):
19 |             self._add_to_parameters(module.parameters(), 'linear_module_{}'.format(i))
20 | 
21 |         self.gate = [nn.Linear(size, size) for _ in range(num_layers)]
22 |         for i, module in enumerate(self.gate):
23 |             self._add_to_parameters(module.parameters(), 'gate_module_{}'.format(i))
24 | 
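        # Note: the nonlinear/linear/gate layers are kept in plain Python lists, which
        # nn.Module does not register automatically. The _add_to_parameters() calls above
        # register each weight by hand so it shows up in .parameters() and gets optimized.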
25 | self.f = f 26 | 27 | def forward(self, x): 28 | """ 29 | :param x: tensor with shape of [batch_size, size] 30 | 31 | :return: tensor with shape of [batch_size, size] 32 | 33 | applies σ(x) ⨀ (f(G(x))) + (1 - σ(x)) ⨀ (Q(x)) transformation | G and Q is affine transformation, 34 | f is non-linear transformation, σ(x) is affine transformation with sigmoid non-linearition 35 | and ⨀ is element-wise multiplication 36 | """ 37 | 38 | for layer in range(self.num_layers): 39 | gate = F.sigmoid(self.gate[layer](x)) 40 | 41 | nonlinear = self.f(self.nonlinear[layer](x)) 42 | linear = self.linear[layer](x) 43 | 44 | x = gate * nonlinear + (1 - gate) * linear 45 | 46 | return x 47 | 48 | def _add_to_parameters(self, parameters, name): 49 | for i, parameter in enumerate(parameters): 50 | self.register_parameter(name='{}-{}'.format(name, i), param=parameter) 51 | -------------------------------------------------------------------------------- /model/encoder.py: -------------------------------------------------------------------------------- 1 | import torch as t 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from selfModules.highway import Highway 6 | from utils.functional import parameters_allocation_check 7 | 8 | 9 | class Encoder(nn.Module): 10 | def __init__(self, params): 11 | super(Encoder, self).__init__() 12 | 13 | self.params = params 14 | 15 | self.hw1 = Highway(self.params.sum_depth + self.params.word_embed_size, 2, F.relu) 16 | 17 | self.rnn = nn.LSTM(input_size=self.params.word_embed_size + self.params.sum_depth, 18 | hidden_size=self.params.encoder_rnn_size, 19 | num_layers=self.params.encoder_num_layers, 20 | batch_first=True, 21 | bidirectional=True) 22 | 23 | def forward(self, input, State): 24 | """ 25 | :param input: [batch_size, seq_len, embed_size] tensor 26 | :return: context of input sentenses with shape of [batch_size, latent_variable_size] 27 | """ 28 | #print "Three" 29 | [batch_size, seq_len, embed_size] = input.size() 30 | 31 | input = input.view(-1, embed_size) 32 | input = self.hw1(input) 33 | input = input.view(batch_size, seq_len, embed_size) 34 | 35 | assert parameters_allocation_check(self), \ 36 | 'Invalid CUDA options. 
Parameters should be allocated in the same memory' 37 | 38 | ''' Unfold rnn with zero initial state and get its final state from the last layer 39 | ''' 40 | _, (transfer_state_1, final_state) = self.rnn(input, State) 41 | transfer_state_2 = final_state 42 | 43 | final_state = final_state.view(self.params.encoder_num_layers, 2, batch_size, self.params.encoder_rnn_size) 44 | final_state = final_state[-1] 45 | h_1, h_2 = final_state[0], final_state[1] 46 | final_state = t.cat([h_1, h_2], 1) 47 | 48 | return final_state, transfer_state_1, transfer_state_2 49 | -------------------------------------------------------------------------------- /selfModules/tdnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch as t 3 | from torch.nn import Parameter 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | class TDNN(nn.Module): 9 | def __init__(self, params): 10 | super(TDNN, self).__init__() 11 | 12 | self.params = params 13 | 14 | self.kernels = [Parameter(t.Tensor(out_dim, self.params.char_embed_size, kW).uniform_(-1, 1)) 15 | for kW, out_dim in params.kernels] 16 | self._add_to_parameters(self.kernels, 'TDNN_kernel') 17 | 18 | def forward(self, x): 19 | """ 20 | :param x: tensor with shape [batch_size, max_seq_len, max_word_len, char_embed_size] 21 | 22 | :return: tensor with shape [batch_size, max_seq_len, depth_sum] 23 | 24 | applies multikenrel 1d-conv layer along every word in input with max-over-time pooling 25 | to emit fixed-size output 26 | """ 27 | 28 | input_size = x.size() 29 | input_size_len = len(input_size) 30 | 31 | assert input_size_len == 4, \ 32 | 'Wrong input rang, must be equal to 4, but {} found'.format(input_size_len) 33 | 34 | [batch_size, seq_len, _, embed_size] = input_size 35 | 36 | assert embed_size == self.params.char_embed_size, \ 37 | 'Wrong embedding size, must be equal to {}, but {} found'.format(self.params.char_embed_size, embed_size) 38 | 39 | # leaps with shape 40 | x = x.view(-1, self.params.max_word_len, self.params.char_embed_size).transpose(1, 2).contiguous() 41 | 42 | xs = [F.tanh(F.conv1d(x, kernel)) for kernel in self.kernels] 43 | xs = [x.max(2,keepdim=True)[0].squeeze(2) for x in xs] 44 | 45 | x = t.cat(xs, 1) 46 | x = x.view(batch_size, seq_len, -1) 47 | 48 | return x 49 | 50 | def _add_to_parameters(self, parameters, name): 51 | for i, parameter in enumerate(parameters): 52 | self.register_parameter(name='{}-{}'.format(name, i), param=parameter) 53 | -------------------------------------------------------------------------------- /selfModules/embedding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import numpy as np 3 | import torch as t 4 | import torch.nn as nn 5 | from torch.nn import Parameter 6 | 7 | from .tdnn import TDNN 8 | 9 | 10 | class Embedding(nn.Module): 11 | def __init__(self, params, path='../../../', flag=False): 12 | super(Embedding, self).__init__() 13 | 14 | self.params = params 15 | 16 | if flag == True: 17 | word_embed = np.load(path + 'data/super/word_embeddings.npy') 18 | else : 19 | word_embed = np.load(path + 'data/word_embeddings.npy') 20 | 21 | self.word_embed = nn.Embedding(self.params.word_vocab_size, self.params.word_embed_size) 22 | self.char_embed = nn.Embedding(self.params.char_vocab_size, self.params.char_embed_size) 23 | self.word_embed.weight = Parameter(t.from_numpy(word_embed).float(), requires_grad=False) 24 | self.char_embed.weight = 
Parameter( 25 | t.Tensor(self.params.char_vocab_size, self.params.char_embed_size).uniform_(-1, 1)) 26 | 27 | self.TDNN = TDNN(self.params) 28 | 29 | def forward(self, word_input, character_input): 30 | """ 31 | :param word_input: [batch_size, seq_len] tensor of Long type 32 | :param character_input: [batch_size, seq_len, max_word_len] tensor of Long type 33 | :return: input embedding with shape of [batch_size, seq_len, word_embed_size + sum_depth] 34 | """ 35 | 36 | assert word_input.size()[:2] == character_input.size()[:2], \ 37 | 'Word input and character input must have the same sizes, but {} and {} found'.format( 38 | word_input.size(), character_input.size()) 39 | 40 | [batch_size, seq_len] = word_input.size() 41 | 42 | word_input = self.word_embed(word_input) 43 | 44 | character_input = character_input.view(-1, self.params.max_word_len) 45 | character_input = self.char_embed(character_input) 46 | character_input = character_input.view(batch_size, 47 | seq_len, 48 | self.params.max_word_len, 49 | self.params.char_embed_size) 50 | 51 | character_input = self.TDNN(character_input) 52 | 53 | result = t.cat([word_input, character_input], 2) 54 | 55 | return result 56 | -------------------------------------------------------------------------------- /selfModules/neg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch as t 3 | import torch.nn as nn 4 | from torch.autograd import Variable 5 | from torch.nn import Parameter 6 | 7 | from utils.functional import * 8 | 9 | 10 | class NEG_loss(nn.Module): 11 | def __init__(self, num_classes, embed_size): 12 | """ 13 | :param num_classes: An int. The number of possible classes. 14 | :param embed_size: An int. Embedding size 15 | """ 16 | 17 | super(NEG_loss, self).__init__() 18 | 19 | self.num_classes = num_classes 20 | self.embed_size = embed_size 21 | 22 | self.out_embed = nn.Embedding(self.num_classes, self.embed_size) 23 | self.out_embed.weight = Parameter(t.FloatTensor(self.num_classes, self.embed_size).uniform_(-1, 1)) 24 | 25 | self.in_embed = nn.Embedding(self.num_classes, self.embed_size) 26 | self.in_embed.weight = Parameter(t.FloatTensor(self.num_classes, self.embed_size).uniform_(-1, 1)) 27 | 28 | def forward(self, input_labes, out_labels, num_sampled): 29 | """ 30 | :param input_labes: Tensor with shape of [batch_size] of Long type 31 | :param out_labels: Tensor with shape of [batch_size] of Long type 32 | :param num_sampled: An int. The number of sampled from noise examples 33 | 34 | :return: Loss estimation with shape of [batch_size] 35 | loss defined in Mikolov et al. Distributed Representations of Words and Phrases and their Compositionality 36 | papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf 37 | """ 38 | 39 | assert parameters_allocation_check(self), \ 40 | """ 41 | Invalid CUDA options. 
out_embed and in_embed parameters both should be stored in the same memory 42 | got out_embed.is_cuda = {}, in_embed.is_cuda = {} 43 | """.format(self.out_embed.weight.is_cuda, self.in_embed.weight.is_cuda) 44 | 45 | use_cuda = self.out_embed.weight.is_cuda 46 | 47 | [batch_size] = input_labes.size() 48 | 49 | input = self.in_embed(input_labes) 50 | output = self.out_embed(out_labels) 51 | 52 | noise = Variable(t.Tensor(batch_size, num_sampled).uniform_(0, self.num_classes - 1).long()) 53 | if use_cuda: 54 | noise = noise.cuda() 55 | noise = self.out_embed(noise).neg() 56 | 57 | log_target = (input * output).sum(1).squeeze().sigmoid().log() 58 | 59 | ''' ∑[batch_size, num_sampled, embed_size] * [batch_size, embed_size, 1] -> 60 | ∑[batch_size, num_sampled] -> [batch_size] ''' 61 | sum_log_sampled = t.bmm(noise, input.unsqueeze(2)).sigmoid().log().sum(1).squeeze() 62 | 63 | loss = log_target + sum_log_sampled 64 | 65 | return -loss 66 | 67 | def input_embeddings(self): 68 | return self.in_embed.weight.data.cpu().numpy() 69 | -------------------------------------------------------------------------------- /train_word_embeddings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import argparse 3 | 4 | import numpy as np 5 | import torch as t 6 | from torch.autograd import Variable 7 | from torch.optim import SGD 8 | 9 | from utils.batch_loader import BatchLoader 10 | from utils.parameters import Parameters 11 | from selfModules.neg import NEG_loss 12 | 13 | if __name__ == '__main__': 14 | 15 | parser = argparse.ArgumentParser(description='word2vec') 16 | parser.add_argument('--num-iterations', type=int, default=1000000, metavar='NI', 17 | help='num iterations (default: 1000000)') 18 | parser.add_argument('--batch-size', type=int, default=10, metavar='BS', 19 | help='batch size (default: 10)') 20 | parser.add_argument('--num-sample', type=int, default=5, metavar='NS', 21 | help='num sample (default: 5)') 22 | parser.add_argument('--use-cuda', type=bool, default=True, metavar='CUDA', 23 | help='use cuda (default: True)') 24 | args = parser.parse_args() 25 | 26 | 27 | path='' 28 | 29 | data_files = [path + 'data/train.txt', 30 | path + 'data/test.txt'] 31 | 32 | idx_files = [path + 'data/words_vocab.pkl', 33 | path + 'data/characters_vocab.pkl'] 34 | 35 | tensor_files = [[path + 'data/train_word_tensor.npy', 36 | path + 'data/valid_word_tensor.npy'], 37 | [path + 'data/train_character_tensor.npy', 38 | path + 'data/valid_character_tensor.npy']] 39 | 40 | batch_loader = BatchLoader(data_files, idx_files, tensor_files, path) 41 | 42 | # batch_loader = BatchLoader('') 43 | params = Parameters(batch_loader.max_word_len, 44 | batch_loader.max_seq_len, 45 | batch_loader.words_vocab_size, 46 | batch_loader.chars_vocab_size) 47 | 48 | neg_loss = NEG_loss(params.word_vocab_size, params.word_embed_size) 49 | if args.use_cuda: 50 | neg_loss = neg_loss.cuda() 51 | 52 | # NEG_loss is defined over two embedding matrixes with shape of [params.word_vocab_size, params.word_embed_size] 53 | optimizer = SGD(neg_loss.parameters(), 0.1) 54 | 55 | for iteration in range(args.num_iterations): 56 | 57 | input_idx, target_idx = batch_loader.next_embedding_seq(args.batch_size) 58 | 59 | input = Variable(t.from_numpy(input_idx).long()) 60 | target = Variable(t.from_numpy(target_idx).long()) 61 | if args.use_cuda: 62 | input, target = input.cuda(), target.cuda() 63 | 64 | out = neg_loss(input, target, args.num_sample).mean() 65 | 66 | optimizer.zero_grad() 
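# Gradients are cleared before each backward pass because PyTorch accumulates them
# across calls; backward() then fills gradients for both embedding matrices inside
# NEG_loss, and the SGD step applies the update.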
67 | out.backward() 68 | optimizer.step() 69 | 70 | if iteration % 500 == 0: 71 | out = out.cpu().data.numpy()[0] 72 | print('iteration = {}, loss = {}'.format(iteration, out)) 73 | 74 | word_embeddings = neg_loss.input_embeddings() 75 | #Saves the word embeddings at the end of this programs 76 | np.save('data/word_embeddings.npy', word_embeddings) 77 | -------------------------------------------------------------------------------- /train_word_embeddings_2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import argparse 3 | 4 | import numpy as np 5 | import torch as t 6 | from torch.autograd import Variable 7 | from torch.optim import SGD 8 | 9 | from utils.batch_loader import BatchLoader 10 | from utils.parameters import Parameters 11 | from selfModules.neg import NEG_loss 12 | 13 | if __name__ == '__main__': 14 | 15 | parser = argparse.ArgumentParser(description='word2vec') 16 | parser.add_argument('--num-iterations', type=int, default=1000000, metavar='NI', 17 | help='num iterations (default: 1000000)') 18 | parser.add_argument('--batch-size', type=int, default=10, metavar='BS', 19 | help='batch size (default: 10)') 20 | parser.add_argument('--num-sample', type=int, default=5, metavar='NS', 21 | help='num sample (default: 5)') 22 | parser.add_argument('--use-cuda', type=bool, default=True, metavar='CUDA', 23 | help='use cuda (default: True)') 24 | args = parser.parse_args() 25 | 26 | 27 | 28 | 29 | path='' 30 | 31 | data_files = [path + 'data/super/train_2.txt', 32 | path + 'data/super/test_2.txt'] 33 | 34 | idx_files = [path + 'data/super/words_vocab_2.pkl', 35 | path + 'data/super/characters_vocab_2.pkl'] 36 | 37 | tensor_files = [[path + 'data/super/train_word_tensor_2.npy', 38 | path + 'data/super/valid_word_tensor_2.npy'], 39 | [path + 'data/super/train_character_tensor_2.npy', 40 | path + 'data/super/valid_character_tensor_2.npy']] 41 | batch_loader_2 = BatchLoader(data_files, idx_files, tensor_files, path) 42 | 43 | 44 | 45 | 46 | # batch_loader_2 = BatchLoader('') 47 | params = Parameters(batch_loader_2.max_word_len, 48 | batch_loader_2.max_seq_len, 49 | batch_loader_2.words_vocab_size, 50 | batch_loader_2.chars_vocab_size) 51 | 52 | neg_loss = NEG_loss(params.word_vocab_size, params.word_embed_size) 53 | if args.use_cuda: 54 | neg_loss = neg_loss.cuda() 55 | 56 | # NEG_loss is defined over two embedding matrixes with shape of [params.word_vocab_size, params.word_embed_size] 57 | optimizer = SGD(neg_loss.parameters(), 0.1) 58 | 59 | for iteration in range(args.num_iterations): 60 | 61 | input_idx, target_idx = batch_loader_2.next_embedding_seq(args.batch_size) 62 | 63 | input = Variable(t.from_numpy(input_idx).long()) 64 | target = Variable(t.from_numpy(target_idx).long()) 65 | if args.use_cuda: 66 | input, target = input.cuda(), target.cuda() 67 | 68 | out = neg_loss(input, target, args.num_sample).mean() 69 | 70 | optimizer.zero_grad() 71 | out.backward() 72 | optimizer.step() 73 | 74 | if iteration % 500 == 0: 75 | out = out.cpu().data.numpy()[0] 76 | print('iteration = {}, loss = {}'.format(iteration, out)) 77 | 78 | word_embeddings = neg_loss.input_embeddings() 79 | #Saves the word embeddings at the end of this programs 80 | np.save('data/super/word_embeddings.npy', word_embeddings) 81 | -------------------------------------------------------------------------------- /model/decoder.py: -------------------------------------------------------------------------------- 1 | import torch as t 2 | import torch.nn 
as nn 3 | import torch.nn.functional as F 4 | 5 | from utils.functional import parameters_allocation_check 6 | 7 | 8 | class Decoder(nn.Module): 9 | def __init__(self, params): 10 | super(Decoder, self).__init__() 11 | 12 | self.params = params 13 | 14 | self.rnn = nn.LSTM(input_size=self.params.latent_variable_size + self.params.word_embed_size, 15 | hidden_size=self.params.decoder_rnn_size, 16 | num_layers=self.params.decoder_num_layers, 17 | batch_first=True) 18 | 19 | self.fc = nn.Linear(self.params.decoder_rnn_size, self.params.word_vocab_size) 20 | 21 | 22 | def only_decoder_beam(self, decoder_input, z, drop_prob, initial_state=None): 23 | 24 | assert parameters_allocation_check(self), \ 25 | 'Invalid CUDA options. Parameters should be allocated in the same memory' 26 | 27 | # print decoder_input.size() 28 | 29 | [beam_batch_size, _, _] = decoder_input.size() 30 | 31 | ''' 32 | decoder rnn is conditioned on context via additional bias = W_cond * z to every input token 33 | ''' 34 | decoder_input = F.dropout(decoder_input, drop_prob) 35 | 36 | z = z.unsqueeze(0) 37 | 38 | # print z.size() 39 | 40 | z = t.cat([z] * beam_batch_size, 0) 41 | 42 | # print z.size() 43 | # z = z.contiguous().view(1, -1) 44 | 45 | # z = z.view(beam_batch_size, self.params.latent_variable_size) 46 | 47 | # print z.size() 48 | 49 | decoder_input = t.cat([decoder_input, z], 2) 50 | 51 | # print "decoder_input:",decoder_input.size() 52 | 53 | rnn_out, final_state = self.rnn(decoder_input, initial_state) 54 | 55 | # print "rnn_out:",rnn_out.size() 56 | # print "final_state_1:",final_state[0].size() 57 | # print "final_state_1:",final_state[1].size() 58 | 59 | return rnn_out, final_state 60 | 61 | 62 | def forward(self, decoder_input, z, drop_prob, initial_state=None): 63 | """ 64 | :param decoder_input: tensor with shape of [batch_size, seq_len, embed_size] 65 | :param z: sequence context with shape of [batch_size, latent_variable_size] 66 | :param drop_prob: probability of an element of decoder input to be zeroed in sense of dropout 67 | :param initial_state: initial state of decoder rnn 68 | 69 | :return: unnormalized logits of sentense words distribution probabilities 70 | with shape of [batch_size, seq_len, word_vocab_size] 71 | final rnn state with shape of [num_layers, batch_size, decoder_rnn_size] 72 | """ 73 | 74 | assert parameters_allocation_check(self), \ 75 | 'Invalid CUDA options. 
Parameters should be allocated in the same memory' 76 | 77 | [batch_size, seq_len, _] = decoder_input.size() 78 | 79 | ''' 80 | decoder rnn is conditioned on context via additional bias = W_cond * z to every input token 81 | ''' 82 | decoder_input = F.dropout(decoder_input, drop_prob) 83 | 84 | z = t.cat([z] * seq_len, 1).view(batch_size, seq_len, self.params.latent_variable_size) 85 | decoder_input = t.cat([decoder_input, z], 2) 86 | 87 | rnn_out, final_state = self.rnn(decoder_input, initial_state) 88 | rnn_out = rnn_out.contiguous().view(-1, self.params.decoder_rnn_size) 89 | 90 | 91 | result = self.fc(rnn_out) 92 | result = result.view(batch_size, seq_len, self.params.word_vocab_size) 93 | 94 | return result, final_state 95 | -------------------------------------------------------------------------------- /beam_search.py: -------------------------------------------------------------------------------- 1 | """Beam search implementation in PyTorch.""" 2 | # 3 | # 4 | # hyp1#-hyp1---hyp1 -hyp1 5 | # \ / 6 | # hyp2 \-hyp2 /-hyp2#hyp2 7 | # / \ 8 | # hyp3#-hyp3---hyp3 -hyp3 9 | # ======================== 10 | # 11 | # Takes care of beams, back pointers, and scores. 12 | 13 | # Code borrowed from PyTorch OpenNMT example 14 | # https://github.com/pytorch/examples/blob/master/OpenNMT/onmt/Beam.py 15 | 16 | import torch 17 | 18 | 19 | class Beam(object): 20 | """Ordered beam of candidate outputs.""" 21 | 22 | def __init__(self, size, batch_loader, cuda=False): 23 | """Initialize params.""" 24 | self.size = size 25 | self.done = False 26 | self.pad = batch_loader.word_to_idx[batch_loader.pad_token] 27 | self.bos = batch_loader.word_to_idx[batch_loader.go_token] 28 | self.eos = batch_loader.word_to_idx[batch_loader.end_token] 29 | 30 | 31 | self.tt = torch.cuda if cuda else torch 32 | 33 | # The score for each translation on the beam. 34 | self.scores = self.tt.FloatTensor(size).zero_() 35 | 36 | # The backpointers at each time-step. 37 | self.prevKs = [] 38 | 39 | # The outputs at each time-step. 40 | self.nextYs = [self.tt.LongTensor(size).fill_(self.pad)] 41 | self.nextYs[0][0] = self.bos 42 | 43 | # The attentions (matrix) for each time. 44 | self.attn = [] 45 | 46 | # Get the outputs for the current timestep. 47 | def get_current_state(self): 48 | """Get state of beam.""" 49 | return self.nextYs[-1] 50 | 51 | # Get the backpointers for the current timestep. 52 | def get_current_origin(self): 53 | """Get the backpointer to the beam at this step.""" 54 | return self.prevKs[-1] 55 | 56 | # Given prob over words for every last beam `wordLk` and attention 57 | # `attnOut`: Compute and update the beam search. 58 | # 59 | # Parameters: 60 | # 61 | # * `wordLk`- probs of advancing from the last step (K x words) 62 | # * `attnOut`- attention at the last step 63 | # 64 | # Returns: True if beam search is complete. 65 | 66 | def advance(self, workd_lk): 67 | """Advance the beam.""" 68 | num_words = workd_lk.size(1) 69 | 70 | # Sum the previous scores. 
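# Each beam's running score is broadcast-added to the per-word log-probabilities,
# the resulting beam-by-vocabulary matrix is flattened, and topk keeps the `size`
# best continuations; the division / remainder by the vocabulary size below recover
# which beam each winner extends and which word it emits.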
71 | if len(self.prevKs) > 0: 72 | beam_lk = workd_lk + self.scores.unsqueeze(1).expand_as(workd_lk) 73 | else: 74 | beam_lk = workd_lk[0] 75 | 76 | flat_beam_lk = beam_lk.view(-1) 77 | 78 | bestScores, bestScoresId = flat_beam_lk.topk(self.size, 0, True, True) 79 | self.scores = bestScores 80 | 81 | # bestScoresId is flattened beam x word array, so calculate which 82 | # word and beam each score came from 83 | prev_k = bestScoresId / num_words 84 | self.prevKs.append(prev_k) 85 | self.nextYs.append(bestScoresId - prev_k * num_words) 86 | 87 | # End condition is when top-of-beam is EOS. 88 | if self.nextYs[-1][0] == self.eos: 89 | self.done = True 90 | 91 | return self.done 92 | 93 | def sort_best(self): 94 | """Sort the beam.""" 95 | return torch.sort(self.scores, 0, True) 96 | 97 | # Get the score of the best in the beam. 98 | def get_best(self): 99 | """Get the most likely candidate.""" 100 | scores, ids = self.sort_best() 101 | return scores[1], ids[1] 102 | 103 | # Walk back to construct the full hypothesis. 104 | # 105 | # Parameters. 106 | # 107 | # * `k` - the position in the beam to construct. 108 | # 109 | # Returns. 110 | # 111 | # 1. The hypothesis 112 | # 2. The attention at each time step. 113 | def get_hyp(self, k): 114 | """Get hypotheses.""" 115 | hyp = [] 116 | # print(len(self.prevKs), len(self.nextYs), len(self.attn)) 117 | for j in range(len(self.prevKs) - 1, -1, -1): 118 | hyp.append(self.nextYs[j + 1][k]) 119 | k = self.prevKs[j][k] 120 | # print "inside:", hyp 121 | 122 | return hyp[::-1] -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import time 4 | 5 | import numpy as np 6 | import torch as t 7 | 8 | from utils.batch_loader import BatchLoader 9 | from utils.tensor import preprocess_data 10 | from utils.parameters import Parameters 11 | from model.rvae import RVAE 12 | from torch.autograd import Variable 13 | from six.moves import cPickle 14 | 15 | if __name__ == '__main__': 16 | 17 | assert os.path.exists('./trained_RVAE'), \ 18 | 'trained model not found' 19 | 20 | parser = argparse.ArgumentParser(description='Sampler') 21 | parser.add_argument('--use-cuda', type=bool, default=True, metavar='CUDA', 22 | help='use cuda (default: True)') 23 | parser.add_argument('--num-sample', type=int, default=5, metavar='NS', 24 | help='num samplings (default: 5)') 25 | parser.add_argument('--num-sentence', type=int, default=10, metavar='NS', 26 | help='num samplings (default: 10)') 27 | parser.add_argument('--beam-top', type=int, default=3, metavar='NS', 28 | help='beam top (default: 1)') 29 | parser.add_argument('--beam-size', type=int, default=10, metavar='NS', 30 | help='beam size (default: 10)') 31 | parser.add_argument('--use-file', type=bool, default=True, metavar='NS', 32 | help='use file (default: False)') 33 | #Path to test file --- 34 | parser.add_argument('--test-file', type=str, default='data/test.txt', metavar='NS', 35 | help='test file path (default: data/test.txt)') 36 | parser.add_argument('--save-model', type=str, default='./trained_RVAE', metavar='NS', 37 | help='trained model save path (default: ./trained_models/trained_RVAE_quora)') 38 | args = parser.parse_args() 39 | 40 | #Removing, is already some previous files exist from last execution of program 41 | if os.path.exists('data/test_word_tensor.npy'): 42 | os.remove('data/test_word_tensor.npy') 43 | if 
os.path.exists('data/test_character_tensor.npy'): 44 | os.remove('data/test_character_tensor.npy') 45 | 46 | str ='' 47 | if not args.use_file: 48 | str = raw_input("Input Question : ") 49 | else: 50 | file_1 = open(args.test_file, 'r') 51 | data = file_1.readlines() 52 | 53 | ''' ================================= BatchLoader loading =============================================== 54 | ''' 55 | data_files = [args.test_file] 56 | 57 | idx_files = ['data/words_vocab.pkl', 58 | 'data/characters_vocab.pkl'] 59 | 60 | tensor_files = [['data/test_word_tensor.npy'], 61 | ['data/test_character_tensor.npy']] 62 | 63 | preprocess_data(data_files, idx_files, tensor_files, args.use_file, str) 64 | 65 | batch_loader = BatchLoader(data_files, idx_files, tensor_files) 66 | parameters = Parameters(batch_loader.max_word_len, 67 | batch_loader.max_seq_len, 68 | batch_loader.words_vocab_size, 69 | batch_loader.chars_vocab_size) 70 | 71 | 72 | ''' ============================ BatchLoader for Question-2 =============================================== 73 | ''' 74 | data_files = ['data/super/train_2.txt'] 75 | 76 | idx_files = ['data/super/words_vocab_2.pkl', 77 | 'data/super/characters_vocab_2.pkl'] 78 | 79 | tensor_files = [['data/super/train_word_tensor_2.npy'], 80 | ['data/super/train_character_tensor_2.npy']] 81 | batch_loader_2 = BatchLoader(data_files, idx_files, tensor_files) 82 | parameters_2 = Parameters(batch_loader_2.max_word_len, 83 | batch_loader_2.max_seq_len, 84 | batch_loader_2.words_vocab_size, 85 | batch_loader_2.chars_vocab_size) 86 | 87 | 88 | '''======================================== RVAE loading ================================================== 89 | ''' 90 | print 'Started loading' 91 | start_time = time.time() 92 | rvae = RVAE(parameters,parameters_2) 93 | rvae.load_state_dict(t.load(args.save_model)) 94 | if args.use_cuda: 95 | rvae = rvae.cuda() 96 | loading_time=time.time() - start_time 97 | print 'Time elapsed in loading model =' , loading_time 98 | print 'Finished loading' 99 | 100 | ''' ==================================== Parameters Initialising =========================================== 101 | ''' 102 | n_best = args.beam_top 103 | beam_size =args.beam_size 104 | 105 | assert n_best <= beam_size 106 | use_cuda = args.use_cuda 107 | 108 | if args.use_file: 109 | num_sentence = args.num_sentence 110 | else: 111 | num_sentence = 1 112 | 113 | ''' ======================================================================================================= 114 | ''' 115 | 116 | for i in range(len(data)): 117 | if args.use_file: 118 | print (data[i]) 119 | else: 120 | print str + '\n' 121 | for iteration in range(args.num_sample): 122 | 123 | seed = Variable(t.randn([1, parameters.latent_variable_size])) 124 | seed = seed.cuda() 125 | 126 | results, scores = rvae.sampler(batch_loader,batch_loader_2, 50, seed, args.use_cuda,i,beam_size,n_best) 127 | 128 | for tt in results: 129 | for k in xrange(n_best): 130 | sen = " ". 
join([batch_loader_2.decode_word(x[k]) for x in tt]) 131 | if batch_loader.end_token in sen: 132 | print sen[:sen.index(batch_loader.end_token)] 133 | else : 134 | print sen 135 | print '\n' -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import numpy as np 5 | import torch as t 6 | from torch.optim import Adam 7 | 8 | from utils.batch_loader import BatchLoader 9 | from utils.parameters import Parameters 10 | from model.rvae import RVAE 11 | 12 | if __name__ == "__main__": 13 | 14 | if not os.path.exists('data/word_embeddings.npy'): 15 | raise FileNotFoundError("word embeddings file was't found") 16 | 17 | parser = argparse.ArgumentParser(description='RVAE') 18 | parser.add_argument('--num-iterations', type=int, default=120000, metavar='NI', 19 | help='num iterations (default: 120000)') 20 | parser.add_argument('--batch-size', type=int, default=32, metavar='BS', 21 | help='batch size (default: 32)') 22 | parser.add_argument('--use-cuda', type=bool, default=True, metavar='CUDA', 23 | help='use cuda (default: True)') 24 | parser.add_argument('--learning-rate', type=float, default=0.00005, metavar='LR', 25 | help='learning rate (default: 0.00005)') 26 | parser.add_argument('--dropout', type=float, default=0.3, metavar='DR', 27 | help='dropout (default: 0.3)') 28 | parser.add_argument('--use-trained', type=bool, default=False, metavar='UT', 29 | help='load pretrained model (default: False)') 30 | parser.add_argument('--ce-result', default='', metavar='CE', 31 | help='ce result path (default: '')') 32 | parser.add_argument('--kld-result', default='', metavar='KLD', 33 | help='ce result path (default: '')') 34 | 35 | args = parser.parse_args() 36 | 37 | 38 | path='' 39 | 40 | ''' =================== Creating batch_loader for encoder-1 ========================================= 41 | ''' 42 | data_files = [path + 'data/train.txt', 43 | path + 'data/test.txt'] 44 | 45 | idx_files = [path + 'data/words_vocab.pkl', 46 | path + 'data/characters_vocab.pkl'] 47 | 48 | tensor_files = [[path + 'data/train_word_tensor.npy', 49 | path + 'data/valid_word_tensor.npy'], 50 | [path + 'data/train_character_tensor.npy', 51 | path + 'data/valid_character_tensor.npy']] 52 | 53 | batch_loader = BatchLoader(data_files, idx_files, tensor_files, path) 54 | parameters = Parameters(batch_loader.max_word_len, 55 | batch_loader.max_seq_len, 56 | batch_loader.words_vocab_size, 57 | batch_loader.chars_vocab_size) 58 | 59 | 60 | ''' =================== Doing the same for encoder-2 =============================================== 61 | ''' 62 | data_files = [path + 'data/super/train_2.txt', 63 | path + 'data/super/test_2.txt'] 64 | 65 | idx_files = [path + 'data/super/words_vocab_2.pkl', 66 | path + 'data/super/characters_vocab_2.pkl'] 67 | 68 | tensor_files = [[path + 'data/super/train_word_tensor_2.npy', 69 | path + 'data/super/valid_word_tensor_2.npy'], 70 | [path + 'data/super/train_character_tensor_2.npy', 71 | path + 'data/super/valid_character_tensor_2.npy']] 72 | batch_loader_2 = BatchLoader(data_files, idx_files, tensor_files, path) 73 | parameters_2 = Parameters(batch_loader_2.max_word_len, 74 | batch_loader_2.max_seq_len, 75 | batch_loader_2.words_vocab_size, 76 | batch_loader_2.chars_vocab_size) 77 | '''================================================================================================= 78 | ''' 79 | 80 | 81 | rvae = 
RVAE(parameters,parameters_2) 82 | if args.use_trained: 83 | rvae.load_state_dict(t.load('trained_RVAE')) 84 | if args.use_cuda: 85 | rvae = rvae.cuda() 86 | 87 | optimizer = Adam(rvae.learnable_parameters(), args.learning_rate) 88 | 89 | train_step = rvae.trainer(optimizer,batch_loader, batch_loader_2) 90 | validate = rvae.validater(batch_loader,batch_loader_2) 91 | 92 | ce_result = [] 93 | kld_result = [] 94 | 95 | start_index = 0 96 | # start_index_2 = 0 97 | 98 | for iteration in range(args.num_iterations): 99 | #This needs to be changed 100 | #start_index = (start_index+1)%50000 101 | start_index = (start_index+args.batch_size)%149163 102 | cross_entropy, kld, coef = train_step(iteration, args.batch_size, args.use_cuda, args.dropout, start_index) 103 | 104 | # exit() 105 | 106 | if iteration % 5 == 0: 107 | print('\n') 108 | print('------------TRAIN-------------') 109 | print('----------ITERATION-----------') 110 | print(iteration) 111 | print('--------CROSS-ENTROPY---------') 112 | print(cross_entropy.data.cpu().numpy()[0]) 113 | print('-------------KLD--------------') 114 | print(kld.data.cpu().numpy()[0]) 115 | print('-----------KLD-coef-----------') 116 | print(coef) 117 | print('------------------------------') 118 | 119 | # if iteration % 10 == 0: 120 | # start_index_2 = (start_index_2+args.batch_size)%3900 121 | # cross_entropy, kld = validate(args.batch_size, args.use_cuda, start_index_2) 122 | 123 | # cross_entropy = cross_entropy.data.cpu().numpy()[0] 124 | # kld = kld.data.cpu().numpy()[0] 125 | 126 | # print('\n') 127 | # print('------------VALID-------------') 128 | # print('--------CROSS-ENTROPY---------') 129 | # print(cross_entropy) 130 | # print('-------------KLD--------------') 131 | # print(kld) 132 | # print('------------------------------') 133 | 134 | # ce_result += [cross_entropy] 135 | # kld_result += [kld] 136 | ''' 137 | if iteration % 20 == 0: 138 | seed = np.random.normal(size=[1, parameters.latent_variable_size]) 139 | 140 | sample = rvae.sample(batch_loader_2, 50, seed, args.use_cuda) 141 | 142 | print('\n') 143 | print('------------SAMPLE------------') 144 | print('------------------------------') 145 | print(sample) 146 | print('------------------------------') 147 | ''' 148 | t.save(rvae.state_dict(), 'trained_RVAE') 149 | 150 | np.save('ce_result_{}.npy'.format(args.ce_result), np.array(ce_result)) 151 | np.save('kld_result_npy_{}'.format(args.kld_result), np.array(kld_result)) 152 | -------------------------------------------------------------------------------- /sample_3.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import numpy as np 5 | import torch as t 6 | 7 | from utils.batch_loader import BatchLoader 8 | from utils.parameters import Parameters 9 | from model.rvae import RVAE 10 | from torch.autograd import Variable 11 | 12 | if __name__ == '__main__': 13 | 14 | assert os.path.exists('trained_RVAE'), \ 15 | 'trained model not found' 16 | 17 | parser = argparse.ArgumentParser(description='Sampler') 18 | parser.add_argument('--use-cuda', type=bool, default=True, metavar='CUDA', 19 | help='use cuda (default: True)') 20 | parser.add_argument('--num-sample', type=int, default=5, metavar='NS', 21 | help='num samplings (default: 5)') 22 | parser.add_argument('--num-sentence', type=int, default=10, metavar='NS', 23 | help='num samplings (default: 10)') 24 | args = parser.parse_args() 25 | 26 | file_1 = open('test.txt', 'r') 27 | data = file_1.readlines() 28 | 29 | file_2 = 
open('test_2.txt', 'r') 30 | data_2 = file_2.readlines() 31 | 32 | path='' 33 | 34 | ''' ============================= BatchLoader loading =============================================== 35 | ''' 36 | data_files = [path + 'data/train.txt', 37 | path + 'data/test.txt'] 38 | 39 | idx_files = [path + 'data/words_vocab.pkl', 40 | path + 'data/characters_vocab.pkl'] 41 | 42 | tensor_files = [[path + 'data/train_word_tensor.npy', 43 | path + 'data/valid_word_tensor.npy'], 44 | [path + 'data/train_character_tensor.npy', 45 | path + 'data/valid_character_tensor.npy']] 46 | 47 | batch_loader = BatchLoader(data_files, idx_files, tensor_files, path) 48 | parameters = Parameters(batch_loader.max_word_len, 49 | batch_loader.max_seq_len, 50 | batch_loader.words_vocab_size, 51 | batch_loader.chars_vocab_size) 52 | 53 | ''' ============================= BatchLoader loading =============================================== 54 | ''' 55 | 56 | data_files = [path + 'data/super/train_2.txt', 57 | path + 'data/super/test_2.txt'] 58 | 59 | idx_files = [path + 'data/super/words_vocab_2.pkl', 60 | path + 'data/super/characters_vocab_2.pkl'] 61 | 62 | tensor_files = [[path + 'data/super/train_word_tensor_2.npy', 63 | path + 'data/super/valid_word_tensor_2.npy'], 64 | [path + 'data/super/train_character_tensor_2.npy', 65 | path + 'data/super/valid_character_tensor_2.npy']] 66 | 67 | batch_loader_2 = BatchLoader(data_files, idx_files, tensor_files, path) 68 | parameters_2 = Parameters(batch_loader_2.max_word_len, 69 | batch_loader_2.max_seq_len, 70 | batch_loader_2.words_vocab_size, 71 | batch_loader_2.chars_vocab_size) 72 | 73 | '''======================================== RVAE creation ================================================== 74 | ''' 75 | 76 | rvae = RVAE(parameters,parameters_2) 77 | rvae.load_state_dict(t.load('trained_RVAE')) 78 | if args.use_cuda: 79 | rvae = rvae.cuda() 80 | 81 | n_best = 3 82 | beam_size=10 83 | 84 | assert n_best <= beam_size 85 | 86 | for i in range(args.num_sentence): 87 | 88 | '''================================================== Input Encoder-1 ======================================================== 89 | ''' 90 | use_cuda = 1 91 | input = batch_loader.next_batch(1, 'valid', i) 92 | input = [Variable(t.from_numpy(var)) for var in input] 93 | input = [var.long() for var in input] 94 | input = [var.cuda() if use_cuda else var for var in input] 95 | 96 | [encoder_word_input, encoder_character_input, decoder_word_input, decoder_character_input, target] = input 97 | 98 | 99 | ''' =================================================== Input for Encoder-2 ======================================================== 100 | ''' 101 | 102 | input_2 = batch_loader_2.next_batch(1, 'valid', i) 103 | input_2 = [Variable(t.from_numpy(var)) for var in input_2] 104 | input_2 = [var.long() for var in input_2] 105 | input_2 = [var.cuda() if use_cuda else var for var in input_2] 106 | 107 | [encoder_word_input_2, encoder_character_input_2, decoder_word_input_2, decoder_character_input_2, target] = input_2 108 | 109 | ''' ================================================== Forward pass =========================================================== 110 | ''' 111 | # exit() 112 | 113 | logits,_,kld,mu,std = rvae.forward(0., 114 | encoder_word_input, encoder_character_input, 115 | encoder_word_input_2,encoder_character_input_2, 116 | decoder_word_input_2, decoder_character_input_2, 117 | z=None) 118 | 119 | ''' 
================================================================================================================================ 120 | ''' 121 | 122 | # print '============' 123 | print (data[i]) 124 | print (data_2[i]) 125 | # print '------------------------------------' 126 | 127 | 128 | 129 | 130 | for iteration in range(args.num_sample): 131 | # seed = np.random.normal(size=[1, parameters.latent_variable_size]) 132 | seed = Variable(t.randn([1, parameters.latent_variable_size])) 133 | # seed = Variable(t.from_numpy(seed).float()) 134 | # exit() 135 | # seed = mu 136 | # if use_cuda: 137 | seed = seed.cuda() 138 | 139 | seed = seed * std + mu 140 | # seed = seed*std + mu 141 | # print 'Multiplication done' 142 | # seed = seed.cuda() 143 | # print seed.size 144 | # print type(seed) 145 | # print seed 146 | # exit() 147 | results, scores = rvae.sampler(batch_loader,batch_loader_2, 50, seed, args.use_cuda,i,beam_size,n_best) 148 | # exit() 149 | # print(results) 150 | for tt in results: 151 | for k in xrange(n_best): 152 | sen = " ". join([batch_loader_2.decode_word(x[k]) for x in tt]) 153 | # print sen 154 | if batch_loader.end_token in sen: 155 | print sen[:sen.index(batch_loader.end_token)] 156 | else : 157 | print sen 158 | # exit() 159 | print '\n' 160 | 161 | 162 | # print 'words_vocab_size BatchLoader ----------->' 163 | # print batch_loader.words_vocab_size 164 | # print '-----------------------------------------' 165 | 166 | # print 'words_vocab_size BatchLoader_2 ----------->' 167 | # print batch_loader_2.words_vocab_size 168 | # print '-----------------------------------------' 169 | 170 | -------------------------------------------------------------------------------- /model/rvae_previous.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as t 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | 7 | from .decoder import Decoder 8 | from .encoder import Encoder 9 | 10 | from selfModules.embedding import Embedding 11 | 12 | from utils.functional import kld_coef, parameters_allocation_check, fold 13 | 14 | 15 | class RVAE(nn.Module): 16 | def __init__(self, params,params_2): 17 | super(RVAE, self).__init__() 18 | 19 | self.params = params 20 | self.params_2 = params_2 #Encoder-2 parameters 21 | 22 | self.embedding = Embedding(self.params, '') 23 | self.embedding_2 = Embedding(self.params_2, '') 24 | 25 | self.encoder = Encoder(self.params) 26 | self.encoder_2 = Encoder(self.params_2) 27 | 28 | 29 | self.context_to_mu = nn.Linear(self.params.encoder_rnn_size * 2, self.params.latent_variable_size) 30 | self.context_to_logvar = nn.Linear(self.params.encoder_rnn_size * 2, self.params.latent_variable_size) 31 | 32 | self.encoder_3 = Encoder(self.params) 33 | self.decoder = Decoder(self.params_2) #change this to params_2 34 | 35 | def forward(self, drop_prob, 36 | encoder_word_input=None, encoder_character_input=None, 37 | encoder_word_input_2=None, encoder_character_input_2=None, 38 | decoder_word_input_2=None, decoder_character_input_2=None, 39 | z=None, initial_state=None): 40 | 41 | #Modified the parameters of forward function according to Encoder-2 42 | """ 43 | :param encoder_word_input: An tensor with shape of [batch_size, seq_len] of Long type 44 | :param encoder_character_input: An tensor with shape of [batch_size, seq_len, max_word_len] of Long type 45 | :param decoder_word_input: An tensor with shape of [batch_size, max_seq_len + 1] of Long type 46 | 
:param initial_state: initial state of decoder rnn in order to perform sampling 47 | 48 | :param drop_prob: probability of an element of decoder input to be zeroed in sense of dropout 49 | 50 | :param z: context if sampling is performing 51 | 52 | :return: unnormalized logits of sentence words distribution probabilities 53 | with shape of [batch_size, seq_len, word_vocab_size] 54 | final rnn state with shape of [num_layers, batch_size, decoder_rnn_size] 55 | """ 56 | 57 | assert parameters_allocation_check(self), \ 58 | 'Invalid CUDA options. Parameters should be allocated in the same memory' 59 | use_cuda = self.embedding.word_embed.weight.is_cuda 60 | 61 | assert z is None and fold(lambda acc, parameter: acc and parameter is not None, 62 | [encoder_word_input, encoder_character_input, decoder_word_input_2], 63 | True) \ 64 | or (z is not None and decoder_word_input_2 is not None), \ 65 | "Invalid input. If z is None then encoder and decoder inputs should be passed as arguments" 66 | 67 | if z is None: 68 | ''' Get context from encoder and sample z ~ N(mu, std) 69 | ''' 70 | [batch_size, _] = encoder_word_input.size() 71 | 72 | encoder_input = self.embedding(encoder_word_input, encoder_character_input) 73 | 74 | ''' ===================================================Doing the same for encoder-2=================================================== 75 | ''' 76 | [batch_size_2, _] = encoder_word_input_2.size() 77 | 78 | encoder_input_2 = self.embedding_2(encoder_word_input_2, encoder_character_input_2) 79 | 80 | ''' ================================================================================================================================== 81 | ''' 82 | 83 | context , h_0 , c_0 = self.encoder(encoder_input, None) 84 | 85 | State = (h_0,c_0) #Final state of Encoder-1 86 | context_2 , _ , _ = self.encoder_2( encoder_input_2, State ) #Encoder_2 for Ques_2 87 | 88 | mu = self.context_to_mu(context_2) 89 | logvar = self.context_to_logvar(context_2) 90 | std = t.exp(0.5 * logvar) 91 | 92 | z = Variable(t.randn([batch_size, self.params.latent_variable_size])) 93 | if use_cuda: 94 | z = z.cuda() 95 | 96 | z = z * std + mu 97 | 98 | kld = (-0.5 * t.sum(logvar - t.pow(mu, 2) - t.exp(logvar) + 1, 1)).mean().squeeze() 99 | 100 | encoder_input = self.embedding(encoder_word_input, encoder_character_input) 101 | _ , h_0 , c_0 = self.encoder_3(encoder_input, None) 102 | initial_state = (h_0,c_0) #Final state of Encoder-1 103 | 104 | else: 105 | kld = None 106 | 107 | 108 | 109 | 110 | decoder_input_2 = self.embedding.word_embed(decoder_word_input_2) # What to do with this decoder input ? 
--> Slightly resolved 111 | out, final_state = self.decoder(decoder_input_2, z, drop_prob, initial_state) # Take a look at the decoder 112 | 113 | return out, final_state, kld 114 | 115 | def learnable_parameters(self): 116 | 117 | # word_embedding is constant parameter thus it must be dropped from list of parameters for optimizer 118 | return [p for p in self.parameters() if p.requires_grad] 119 | 120 | def trainer(self, optimizer, batch_loader, batch_loader_2): 121 | def train(i, batch_size, use_cuda, dropout, start_index): 122 | input = batch_loader.next_batch(batch_size, 'train', start_index) 123 | input = [Variable(t.from_numpy(var)) for var in input] 124 | input = [var.long() for var in input] 125 | input = [var.cuda() if use_cuda else var for var in input] 126 | 127 | [encoder_word_input, encoder_character_input, decoder_word_input, decoder_character_input, target] = input 128 | 129 | 130 | ''' =================================================== Input for Encoder-2 ======================================================== 131 | ''' 132 | 133 | input_2 = batch_loader_2.next_batch(batch_size, 'train', start_index) 134 | input_2 = [Variable(t.from_numpy(var)) for var in input_2] 135 | input_2 = [var.long() for var in input_2] 136 | input_2 = [var.cuda() if use_cuda else var for var in input_2] 137 | 138 | [encoder_word_input_2, encoder_character_input_2, decoder_word_input_2, decoder_character_input_2, target] = input_2 139 | 140 | ''' ================================================================================================================================ 141 | ''' 142 | # exit() 143 | 144 | logits, _, kld = self(dropout, 145 | encoder_word_input, encoder_character_input, 146 | encoder_word_input_2,encoder_character_input_2, 147 | decoder_word_input_2, decoder_character_input_2, 148 | z=None) 149 | 150 | # logits = logits.view(-1, self.params.word_vocab_size) 151 | logits = logits.view(-1, self.params_2.word_vocab_size) 152 | target = target.view(-1) 153 | cross_entropy = F.cross_entropy(logits, target) 154 | 155 | loss = 79 * cross_entropy + kld_coef(i) * kld 156 | 157 | optimizer.zero_grad() 158 | loss.backward() 159 | optimizer.step() 160 | 161 | return cross_entropy, kld, kld_coef(i) 162 | 163 | return train 164 | 165 | def validater(self, batch_loader,batch_loader_2): 166 | def validate(batch_size, use_cuda, start_index): 167 | input = batch_loader.next_batch(batch_size, 'valid', start_index) 168 | input = [Variable(t.from_numpy(var)) for var in input] 169 | input = [var.long() for var in input] 170 | input = [var.cuda() if use_cuda else var for var in input] 171 | 172 | [encoder_word_input, encoder_character_input, decoder_word_input, decoder_character_input, target] = input 173 | 174 | ''' ==================================================== Input for Encoder-2 ======================================================== 175 | ''' 176 | 177 | input_2 = batch_loader_2.next_batch(batch_size, 'valid', start_index) 178 | input_2 = [Variable(t.from_numpy(var)) for var in input_2] 179 | input_2 = [var.long() for var in input_2] 180 | input_2 = [var.cuda() if use_cuda else var for var in input_2] 181 | [encoder_word_input_2, encoder_character_input_2, decoder_word_input_2, decoder_character_input_2, target] = input_2 182 | 183 | ''' ================================================================================================================================== 184 | ''' 185 | 186 | logits, _, kld = self(0., 187 | encoder_word_input, encoder_character_input, 188 | 
encoder_word_input_2,encoder_character_input_2, 189 | decoder_word_input_2, decoder_character_input_2, 190 | z=None) 191 | 192 | # logits = logits.view(-1, self.params.word_vocab_size) 193 | logits = logits.view(-1, self.params_2.word_vocab_size) 194 | target = target.view(-1) 195 | cross_entropy = F.cross_entropy(logits, target) 196 | 197 | return cross_entropy, kld 198 | 199 | return validate 200 | 201 | def sample(self, batch_loader, seq_len, seed, use_cuda, State): 202 | seed = Variable(t.from_numpy(seed).float()) 203 | if use_cuda: 204 | seed = seed.cuda() 205 | 206 | decoder_word_input_np, decoder_character_input_np = batch_loader.go_input(1) 207 | 208 | decoder_word_input = Variable(t.from_numpy(decoder_word_input_np).long()) 209 | decoder_character_input = Variable(t.from_numpy(decoder_character_input_np).long()) 210 | 211 | if use_cuda: 212 | decoder_word_input, decoder_character_input = decoder_word_input.cuda(), decoder_character_input.cuda() 213 | 214 | result = '' 215 | 216 | initial_state = State 217 | 218 | for i in range(seq_len): 219 | logits, initial_state, _ = self(0., None, None, 220 | None, None, 221 | decoder_word_input, decoder_character_input, 222 | seed, initial_state) 223 | 224 | 225 | # forward(self, drop_prob, 226 | # encoder_word_input=None, encoder_character_input=None, 227 | # encoder_word_input_2=None, encoder_character_input_2=None, 228 | # decoder_word_input_2=None, decoder_character_input_2=None, 229 | # z=None, initial_state=None): 230 | 231 | # logits = logits.view(-1, self.params.word_vocab_size) 232 | # logits = logits.view(-1, self.params.word_vocab_size) 233 | logits = logits.view(-1, self.params_2.word_vocab_size) 234 | # print '---------------------------------------' 235 | # print 'Printing logits' 236 | # print logits 237 | # print '------------------------------------------' 238 | 239 | prediction = F.softmax(logits) 240 | 241 | word = batch_loader.sample_word_from_distribution(prediction.data.cpu().numpy()[-1]) 242 | 243 | if word == batch_loader.end_token: 244 | break 245 | 246 | result += ' ' + word 247 | 248 | decoder_word_input_np = np.array([[batch_loader.word_to_idx[word]]]) 249 | decoder_character_input_np = np.array([[batch_loader.encode_characters(word)]]) 250 | 251 | decoder_word_input = Variable(t.from_numpy(decoder_word_input_np).long()) 252 | decoder_character_input = Variable(t.from_numpy(decoder_character_input_np).long()) 253 | 254 | if use_cuda: 255 | decoder_word_input, decoder_character_input = decoder_word_input.cuda(), decoder_character_input.cuda() 256 | 257 | return result 258 | 259 | def sampler(self, batch_loader, seq_len, seed, use_cuda): 260 | input = batch_loader.next_batch(1, 'valid', 1) 261 | input = [Variable(t.from_numpy(var)) for var in input] 262 | input = [var.long() for var in input] 263 | input = [var.cuda() if use_cuda else var for var in input] 264 | [encoder_word_input, encoder_character_input, decoder_word_input, decoder_character_input, target] = input 265 | 266 | encoder_input = self.embedding(encoder_word_input, encoder_character_input) 267 | 268 | _ , h0 , c0 = self.encoder_3(encoder_input, None) 269 | State = (h0,c0) 270 | 271 | # print '----------------------' 272 | # print 'Printing h0 ---------->' 273 | # print h0 274 | # print '----------------------' 275 | 276 | # State = None 277 | result = self.sample(batch_loader, seq_len, seed, use_cuda, State) 278 | 279 | return result 280 | -------------------------------------------------------------------------------- /utils/batch_loader.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import collections 4 | import os 5 | import re 6 | 7 | import numpy as np 8 | from six.moves import cPickle 9 | 10 | from .functional import * 11 | 12 | 13 | class BatchLoader: 14 | def __init__(self, data_files, idx_files, tensor_files, path='../../'): 15 | 16 | ''' 17 | :properties 18 | 19 | data_files - array containing paths to data sources 20 | 21 | idx_files - array of paths to vocabulary files 22 | 23 | tensor_files - matrix with shape of [2, target_num] containing paths to files 24 | with data represented as tensors, 25 | where the first index corresponds to the type of representation, 26 | i.e. word representation and character-aware representation 27 | 28 | blind_symbol - special symbol used to pad every word in the character-aware representation 29 | so that all words have the same length 30 | pad_token - the same kind of padding symbol as blind_symbol, but for lines of words 31 | go_token - start of sequence symbol 32 | end_token - end of sequence symbol 33 | 34 | chars_vocab_size - number of unique characters 35 | idx_to_char - array of shape [chars_vocab_size] containing the ordered list of unique characters 36 | char_to_idx - dictionary of shape [chars_vocab_size] 37 | such that idx_to_char[char_to_idx[some_char]] = some_char 38 | where some_char is such that idx_to_char contains it 39 | 40 | words_vocab_size, idx_to_word, word_to_idx - same as for characters 41 | 42 | max_word_len - maximum word length 43 | max_seq_len - maximum sequence length 44 | num_lines - number of lines in data, with shape [target_num] 45 | 46 | word_tensor - tensor of shape [target_num, num_lines, line_length] that 47 | contains word indexes instead of the words themselves 48 | 49 | character_tensor - tensor of shape [target_num, num_lines, line_length, max_word_len]. 50 | Rows contain character indexes for every word in data 51 | 52 | :methods 53 | 54 | build_character_vocab(self, data) -> chars_vocab_size, idx_to_char, char_to_idx 55 | chars_vocab_size - number of unique characters in the corpus 56 | idx_to_char - array of shape [chars_vocab_size] containing the ordered list of unique characters 57 | char_to_idx - dictionary of shape [chars_vocab_size] 58 | such that idx_to_char[char_to_idx[some_char]] = some_char 59 | where some_char is such that idx_to_char contains it 60 | 61 | build_word_vocab(self, sentences) -> words_vocab_size, idx_to_word, word_to_idx 62 | same as for characters 63 | 64 | preprocess(self, data_files, idx_files, tensor_files) -> Void 65 | preprocesses and initializes the properties above, then saves them 66 | 67 | load_preprocessed(self, data_files, idx_files, tensor_files) -> Void 68 | loads and initializes previously saved properties 69 | 70 | next_batch(self, batch_size, target_str, start_index) -> encoder_word_input, encoder_character_input, input_seq_len, 71 | decoder_input, decoder_output 72 | returns batch_size sequences for the target given by target_str, starting at start_index. 73 | sequences are filled with pad tokens to make them the same length. 74 | encoder_word_input and encoder_character_input have the words in reversed order 75 | for performance reasons 76 | ''' 77 | 78 | self.data_files = data_files 79 | self.idx_files = idx_files 80 | self.tensor_files = tensor_files 81 | 82 | 83 | self.blind_symbol = '' 84 | self.pad_token = '_' 85 | self.go_token = '>' 86 | self.end_token = '|' 87 | self.a_token = '?'
88 | 89 | idx_exists = fold(f_and, 90 | [os.path.exists(file) for file in self.idx_files], 91 | True) 92 | 93 | tensors_exists = fold(f_and, 94 | [os.path.exists(file) for target in self.tensor_files 95 | for file in target], 96 | True) 97 | 98 | if idx_exists and tensors_exists: 99 | self.load_preprocessed(self.data_files, 100 | self.idx_files, 101 | self.tensor_files) 102 | print('preprocessed data was found and loaded') 103 | else: 104 | self.preprocess(self.data_files, 105 | self.idx_files, 106 | self.tensor_files) 107 | print('data have preprocessed') 108 | 109 | self.word_embedding_index = 0 110 | 111 | def clean_whole_data(self, string): 112 | string = re.sub('^[\d\:]+ ', '', string, 0, re.M) 113 | string = re.sub('\n\s{11}', ' ', string, 0, re.M) 114 | string = re.sub('\n{2}', '\n', string, 0, re.M) 115 | 116 | return string.lower() 117 | 118 | def clean_str(self, string): 119 | ''' 120 | Tokenization/string cleaning for all datasets except for SST. 121 | Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data 122 | ''' 123 | 124 | string = re.sub(r"[^가-힣A-Za-z0-9(),!?:;.\'\`]", " ", string) 125 | string = re.sub(r"\'s", " \'s", string) 126 | string = re.sub(r"\'ve", " \'ve", string) 127 | string = re.sub(r"n\'t", " n\'t", string) 128 | string = re.sub(r"\'re", " \'re", string) 129 | string = re.sub(r"\'d", " \'d", string) 130 | string = re.sub(r"\'ll", " \'ll", string) 131 | string = re.sub(r"\.", " . ", string) 132 | string = re.sub(r",", " , ", string) 133 | string = re.sub(r":", " : ", string) 134 | string = re.sub(r";", " ; ", string) 135 | string = re.sub(r"!", " ! ", string) 136 | string = re.sub(r"\(", " ( ", string) 137 | string = re.sub(r"\)", " ) ", string) 138 | string = re.sub(r"\?", " ? ", string) 139 | string = re.sub(r"\s{2,}", " ", string) 140 | return string.strip() 141 | 142 | def build_character_vocab(self, data): 143 | 144 | # unique characters with blind symbol 145 | chars = list(set(data)) + [self.blind_symbol, self.pad_token, self.go_token, self.end_token] 146 | chars_vocab_size = len(chars) 147 | 148 | # mappings itself 149 | idx_to_char = chars 150 | char_to_idx = {x: i for i, x in enumerate(idx_to_char)} 151 | 152 | return chars_vocab_size, idx_to_char, char_to_idx 153 | 154 | def build_word_vocab(self, sentences): 155 | 156 | # Build vocabulary 157 | word_counts = collections.Counter(sentences) 158 | 159 | # Mapping from index to word 160 | idx_to_word = [x[0] for x in word_counts.most_common()] 161 | idx_to_word = list(sorted(idx_to_word)) + [self.pad_token, self.go_token, self.end_token] 162 | 163 | words_vocab_size = len(idx_to_word) 164 | 165 | # Mapping from word to index 166 | word_to_idx = {x: i for i, x in enumerate(idx_to_word)} 167 | 168 | return words_vocab_size, idx_to_word, word_to_idx 169 | 170 | def preprocess(self, data_files, idx_files, tensor_files): 171 | 172 | data = [open(file, "r").read() for file in data_files] 173 | merged_data = data[0] + '\n' + data[1] 174 | 175 | self.chars_vocab_size, self.idx_to_char, self.char_to_idx = self.build_character_vocab(merged_data) 176 | 177 | with open(idx_files[1], 'wb') as f: 178 | cPickle.dump(self.idx_to_char, f) 179 | 180 | data_words = [[line.split() for line in target.split('\n')] for target in data] 181 | merged_data_words = merged_data.split() 182 | 183 | self.words_vocab_size, self.idx_to_word, self.word_to_idx = self.build_word_vocab(merged_data_words) 184 | self.max_word_len = np.amax([len(word) for word in self.idx_to_word]) 185 | self.max_seq_len = 
np.amax([len(line) for target in data_words for line in target]) 186 | self.num_lines = [len(target) for target in data_words] 187 | 188 | with open(idx_files[0], 'wb') as f: 189 | cPickle.dump(self.idx_to_word, f) 190 | 191 | self.word_tensor = np.array( 192 | [[list(map(self.word_to_idx.get, line)) for line in target] for target in data_words]) 193 | print(self.word_tensor.shape) 194 | for i, path in enumerate(tensor_files[0]): 195 | np.save(path, self.word_tensor[i]) 196 | 197 | self.character_tensor = np.array( 198 | [[list(map(self.encode_characters, line)) for line in target] for target in data_words]) 199 | for i, path in enumerate(tensor_files[1]): 200 | np.save(path, self.character_tensor[i]) 201 | 202 | self.just_words = [word for line in self.word_tensor[0] for word in line] 203 | 204 | def load_preprocessed(self, data_files, idx_files, tensor_files): 205 | 206 | data = [open(file, "r").read() for file in data_files] 207 | data_words = [[line.split() for line in target.split('\n')] for target in data] 208 | self.max_seq_len = np.amax([len(line) for target in data_words for line in target]) 209 | self.num_lines = [len(target) for target in data_words] 210 | 211 | [self.idx_to_word, self.idx_to_char] = [cPickle.load(open(file, "rb")) for file in idx_files] 212 | 213 | [self.words_vocab_size, self.chars_vocab_size] = [len(idx) for idx in [self.idx_to_word, self.idx_to_char]] 214 | 215 | [self.word_to_idx, self.char_to_idx] = [dict(zip(idx, range(len(idx)))) for idx in 216 | [self.idx_to_word, self.idx_to_char]] 217 | 218 | self.max_word_len = np.amax([len(word) for word in self.idx_to_word]) 219 | 220 | [self.word_tensor, self.character_tensor] = [np.array([np.load(target) for target in input_type]) 221 | for input_type in tensor_files] 222 | 223 | self.just_words = [word for line in self.word_tensor[0] for word in line] 224 | 225 | def next_batch(self, batch_size, target_str,start_index): 226 | # target = 0 if target_str == 'train' else 1 227 | target=0 228 | # indexes = np.array(np.random.randint(self.num_lines[target], size=batch_size)) 229 | # indexes = np.array([10]) 230 | 231 | # print '-----------------Printing ? identity----------------------' 232 | # temp = self.word_to_idx[self.a_token] 233 | # print temp 234 | # print 'DONE!' 
235 | # exit() 236 | 237 | indexes = np.array(range(start_index, start_index+batch_size)) 238 | # print '======================' 239 | # print indexes 240 | # print '======================' 241 | # print self.num_lines 242 | 243 | # print 'Printing indexes ------------->' 244 | # print indexes 245 | # print '-------------------------------' 246 | 247 | encoder_word_input = [self.word_tensor[target][index] for index in indexes] 248 | 249 | # print 'Printing encoder_word_input ------------->' 250 | # print encoder_word_input 251 | # print '-------------------------------' 252 | 253 | encoder_character_input = [self.character_tensor[target][index] for index in indexes] 254 | input_seq_len = [len(line) for line in encoder_word_input] 255 | max_input_seq_len = np.amax(input_seq_len) 256 | 257 | encoded_words = [[idx for idx in line] for line in encoder_word_input] 258 | decoder_word_input = [[self.word_to_idx[self.go_token]] + line for line in encoder_word_input] 259 | decoder_character_input = [[self.encode_characters(self.go_token)] + line for line in encoder_character_input] 260 | decoder_output = [line + [self.word_to_idx[self.end_token]] for line in encoded_words] 261 | 262 | # sorry 263 | for i, line in enumerate(decoder_word_input): 264 | line_len = input_seq_len[i] 265 | to_add = max_input_seq_len - line_len 266 | decoder_word_input[i] = line + [self.word_to_idx[self.pad_token]] * to_add 267 | 268 | for i, line in enumerate(decoder_character_input): 269 | line_len = input_seq_len[i] 270 | to_add = max_input_seq_len - line_len 271 | decoder_character_input[i] = line + [self.encode_characters(self.pad_token)] * to_add 272 | 273 | for i, line in enumerate(decoder_output): 274 | line_len = input_seq_len[i] 275 | to_add = max_input_seq_len - line_len 276 | decoder_output[i] = line + [self.word_to_idx[self.pad_token]] * to_add 277 | 278 | for i, line in enumerate(encoder_word_input): 279 | line_len = input_seq_len[i] 280 | to_add = max_input_seq_len - line_len 281 | encoder_word_input[i] = [self.word_to_idx[self.pad_token]] * to_add + line[::-1] 282 | 283 | for i, line in enumerate(encoder_character_input): 284 | line_len = input_seq_len[i] 285 | to_add = max_input_seq_len - line_len 286 | encoder_character_input[i] = [self.encode_characters(self.pad_token)] * to_add + line[::-1] 287 | 288 | return np.array(encoder_word_input), np.array(encoder_character_input), \ 289 | np.array(decoder_word_input), np.array(decoder_character_input), np.array(decoder_output) 290 | 291 | def next_embedding_seq(self, seq_len): 292 | """ 293 | :return: 294 | tuple of input and output for word embedding learning, 295 | where input = [b, b, c, c, d, d, e, e] 296 | and output = [a, c, b, d, d, e, d, g] 297 | for line [a, b, c, d, e, g] at index i 298 | """ 299 | 300 | words_len = len(self.just_words) 301 | seq = [self.just_words[i % words_len] 302 | for i in np.arange(self.word_embedding_index, self.word_embedding_index + seq_len)] 303 | 304 | result = [] 305 | for i in range(seq_len - 2): 306 | result.append([seq[i + 1], seq[i]]) 307 | result.append([seq[i + 1], seq[i + 2]]) 308 | 309 | self.word_embedding_index = (self.word_embedding_index + seq_len) % words_len - 2 310 | 311 | # input and target 312 | result = np.array(result) 313 | #print result 314 | #print "---------------------print is coming --------------" 315 | #print len(result[0]) 316 | return result[:, 0], result[:, 1] 317 | 318 | def go_input(self, batch_size): 319 | go_word_input = [[self.word_to_idx[self.go_token]] for _ in range(batch_size)] 320 | 
go_character_input = [[self.encode_characters(self.go_token)] for _ in range(batch_size)] 321 | 322 | return np.array(go_word_input), np.array(go_character_input) 323 | 324 | def encode_word(self, idx): 325 | result = np.zeros(self.words_vocab_size) 326 | result[idx] = 1 327 | return result 328 | 329 | def decode_word(self, word_idx): 330 | word = self.idx_to_word[word_idx] 331 | return word 332 | 333 | def sample_word_from_distribution(self, distribution): 334 | ix = np.random.choice(range(self.words_vocab_size), p=distribution.ravel()) 335 | x = np.zeros((self.words_vocab_size, 1)) 336 | x[ix] = 1 337 | return self.idx_to_word[np.argmax(x)] 338 | 339 | def encode_characters(self, characters): 340 | word_len = len(characters) 341 | to_add = self.max_word_len - word_len 342 | characters_idx = [self.char_to_idx[i] for i in characters] + to_add * [self.char_to_idx['']] 343 | return characters_idx 344 | 345 | def decode_characters(self, characters_idx): 346 | characters = [self.idx_to_char[i] for i in characters_idx] 347 | return ''.join(characters) 348 | -------------------------------------------------------------------------------- /model/rvae.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as t 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | 7 | from .decoder import Decoder 8 | from .encoder import Encoder 9 | 10 | from selfModules.embedding import Embedding 11 | 12 | from utils.functional import kld_coef, parameters_allocation_check, fold 13 | from beam_search import Beam 14 | 15 | class RVAE(nn.Module): 16 | def __init__(self, params,params_2): 17 | super(RVAE, self).__init__() 18 | 19 | self.params = params 20 | self.params_2 = params_2 #Encoder-2 parameters 21 | 22 | self.embedding = Embedding(self.params, '') 23 | self.embedding_2 = Embedding(self.params_2, '',True) 24 | 25 | self.encoder = Encoder(self.params) 26 | self.encoder_2 = Encoder(self.params_2) 27 | 28 | 29 | self.context_to_mu = nn.Linear(self.params.encoder_rnn_size * 2, self.params.latent_variable_size) 30 | self.context_to_logvar = nn.Linear(self.params.encoder_rnn_size * 2, self.params.latent_variable_size) 31 | 32 | # self.encoder_3 = Encoder(self.params) 33 | self.decoder = Decoder(self.params_2) #change this to params_2 34 | 35 | def forward(self, drop_prob, 36 | encoder_word_input=None, encoder_character_input=None, 37 | encoder_word_input_2=None, encoder_character_input_2=None, 38 | decoder_word_input_2=None, decoder_character_input_2=None, 39 | z=None, initial_state=None): 40 | 41 | #Modified the parameters of forward function according to Encoder-2 42 | """ 43 | :param encoder_word_input: An tensor with shape of [batch_size, seq_len] of Long type 44 | :param encoder_character_input: An tensor with shape of [batch_size, seq_len, max_word_len] of Long type 45 | :param decoder_word_input: An tensor with shape of [batch_size, max_seq_len + 1] of Long type 46 | :param initial_state: initial state of decoder rnn in order to perform sampling 47 | 48 | :param drop_prob: probability of an element of decoder input to be zeroed in sense of dropout 49 | 50 | :param z: context if sampling is performing 51 | 52 | :return: unnormalized logits of sentence words distribution probabilities 53 | with shape of [batch_size, seq_len, word_vocab_size] 54 | final rnn state with shape of [num_layers, batch_size, decoder_rnn_size] 55 | """ 56 | 57 | assert parameters_allocation_check(self), \ 58 | 
'Invalid CUDA options. Parameters should be allocated in the same memory' 59 | use_cuda = self.embedding.word_embed.weight.is_cuda 60 | 61 | assert z is None and fold(lambda acc, parameter: acc and parameter is not None, 62 | [encoder_word_input, encoder_character_input, decoder_word_input_2], 63 | True) \ 64 | or (z is not None and decoder_word_input_2 is not None), \ 65 | "Invalid input. If z is None then encoder and decoder inputs should be passed as arguments" 66 | 67 | if z is None: 68 | ''' Get context from encoder and sample z ~ N(mu, std) 69 | ''' 70 | [batch_size, _] = encoder_word_input.size() 71 | 72 | encoder_input = self.embedding(encoder_word_input, encoder_character_input) 73 | 74 | ''' ===================================================Doing the same for encoder-2=================================================== 75 | ''' 76 | [batch_size_2, _] = encoder_word_input_2.size() 77 | 78 | encoder_input_2 = self.embedding_2(encoder_word_input_2, encoder_character_input_2) 79 | 80 | ''' ================================================================================================================================== 81 | ''' 82 | 83 | context , h_0 , c_0 = self.encoder(encoder_input, None) 84 | 85 | State = (h_0,c_0) #Final state of Encoder-1 86 | context_2 , _ , _ = self.encoder_2( encoder_input_2, State ) #Encoder_2 for Ques_2 87 | 88 | mu = self.context_to_mu(context_2) 89 | logvar = self.context_to_logvar(context_2) 90 | std = t.exp(0.5 * logvar) 91 | 92 | z = Variable(t.randn([batch_size, self.params.latent_variable_size])) 93 | if use_cuda: 94 | z = z.cuda() 95 | 96 | z = z * std + mu 97 | 98 | kld = (-0.5 * t.sum(logvar - t.pow(mu, 2) - t.exp(logvar) + 1, 1)).mean().squeeze() 99 | 100 | # encoder_input = self.embedding(encoder_word_input, encoder_character_input) 101 | # _ , h_0 , c_0 = self.encoder_3(encoder_input, None) 102 | initial_state = State #Final state of Encoder-1 103 | 104 | else: 105 | kld = None 106 | mu = None 107 | std = None 108 | 109 | 110 | 111 | decoder_input_2 = self.embedding_2.word_embed(decoder_word_input_2) # What to do with this decoder input ? 
--> Slightly resolved 112 | out, final_state = self.decoder(decoder_input_2, z, drop_prob, initial_state) # Take a look at the decoder 113 | 114 | return out, final_state, kld, mu, std 115 | 116 | def learnable_parameters(self): 117 | 118 | # word_embedding is constant parameter thus it must be dropped from list of parameters for optimizer 119 | return [p for p in self.parameters() if p.requires_grad] 120 | 121 | def trainer(self, optimizer, batch_loader, batch_loader_2): 122 | def train(i, batch_size, use_cuda, dropout, start_index): 123 | input = batch_loader.next_batch(batch_size, 'train', start_index) 124 | input = [Variable(t.from_numpy(var)) for var in input] 125 | input = [var.long() for var in input] 126 | input = [var.cuda() if use_cuda else var for var in input] 127 | 128 | [encoder_word_input, encoder_character_input, decoder_word_input, decoder_character_input, target] = input 129 | 130 | 131 | ''' =================================================== Input for Encoder-2 ======================================================== 132 | ''' 133 | 134 | input_2 = batch_loader_2.next_batch(batch_size, 'train', start_index) 135 | input_2 = [Variable(t.from_numpy(var)) for var in input_2] 136 | input_2 = [var.long() for var in input_2] 137 | input_2 = [var.cuda() if use_cuda else var for var in input_2] 138 | 139 | [encoder_word_input_2, encoder_character_input_2, decoder_word_input_2, decoder_character_input_2, target] = input_2 140 | 141 | ''' ================================================================================================================================ 142 | ''' 143 | # exit() 144 | 145 | logits, _, kld,_ ,_ = self(dropout, 146 | encoder_word_input, encoder_character_input, 147 | encoder_word_input_2,encoder_character_input_2, 148 | decoder_word_input_2, decoder_character_input_2, 149 | z=None) 150 | 151 | # logits = logits.view(-1, self.params.word_vocab_size) 152 | logits = logits.view(-1, self.params_2.word_vocab_size) 153 | target = target.view(-1) 154 | cross_entropy = F.cross_entropy(logits, target) 155 | 156 | loss = 79 * cross_entropy + kld_coef(i) * kld 157 | 158 | optimizer.zero_grad() 159 | loss.backward() 160 | optimizer.step() 161 | 162 | return cross_entropy, kld, kld_coef(i) 163 | 164 | return train 165 | 166 | def validater(self, batch_loader,batch_loader_2): 167 | def validate(batch_size, use_cuda, start_index): 168 | input = batch_loader.next_batch(batch_size, 'valid', start_index) 169 | input = [Variable(t.from_numpy(var)) for var in input] 170 | input = [var.long() for var in input] 171 | input = [var.cuda() if use_cuda else var for var in input] 172 | 173 | [encoder_word_input, encoder_character_input, decoder_word_input, decoder_character_input, target] = input 174 | 175 | ''' ==================================================== Input for Encoder-2 ======================================================== 176 | ''' 177 | 178 | input_2 = batch_loader_2.next_batch(batch_size, 'valid', start_index) 179 | input_2 = [Variable(t.from_numpy(var)) for var in input_2] 180 | input_2 = [var.long() for var in input_2] 181 | input_2 = [var.cuda() if use_cuda else var for var in input_2] 182 | [encoder_word_input_2, encoder_character_input_2, decoder_word_input_2, decoder_character_input_2, target] = input_2 183 | 184 | ''' ================================================================================================================================== 185 | ''' 186 | 187 | logits, _, kld,_ ,_ = self(0., 188 | encoder_word_input, encoder_character_input, 189 | 
encoder_word_input_2,encoder_character_input_2, 190 | decoder_word_input_2, decoder_character_input_2, 191 | z=None) 192 | 193 | # logits = logits.view(-1, self.params.word_vocab_size) 194 | logits = logits.view(-1, self.params_2.word_vocab_size) 195 | target = target.view(-1) 196 | cross_entropy = F.cross_entropy(logits, target) 197 | 198 | return cross_entropy, kld 199 | 200 | return validate 201 | 202 | def sample(self, batch_loader, seq_len, seed, use_cuda, State): 203 | # seed = Variable(t.from_numpy(seed).float()) 204 | if use_cuda: 205 | seed = seed.cuda() 206 | 207 | decoder_word_input_np, decoder_character_input_np = batch_loader.go_input(1) 208 | 209 | decoder_word_input = Variable(t.from_numpy(decoder_word_input_np).long()) 210 | decoder_character_input = Variable(t.from_numpy(decoder_character_input_np).long()) 211 | 212 | if use_cuda: 213 | decoder_word_input, decoder_character_input = decoder_word_input.cuda(), decoder_character_input.cuda() 214 | 215 | result = '' 216 | 217 | initial_state = State 218 | 219 | for i in range(seq_len): 220 | logits, initial_state, _ ,_,_= self(0., None, None, 221 | None, None, 222 | decoder_word_input, decoder_character_input, 223 | seed, initial_state) 224 | 225 | 226 | # forward(self, drop_prob, 227 | # encoder_word_input=None, encoder_character_input=None, 228 | # encoder_word_input_2=None, encoder_character_input_2=None, 229 | # decoder_word_input_2=None, decoder_character_input_2=None, 230 | # z=None, initial_state=None): 231 | 232 | # logits = logits.view(-1, self.params.word_vocab_size) 233 | # logits = logits.view(-1, self.params.word_vocab_size) 234 | logits = logits.view(-1, self.params_2.word_vocab_size) 235 | # print '---------------------------------------' 236 | # print 'Printing logits' 237 | # print logits 238 | # print '------------------------------------------' 239 | 240 | prediction = F.softmax(logits) 241 | 242 | word = batch_loader.sample_word_from_distribution(prediction.data.cpu().numpy()[-1]) 243 | 244 | if word == batch_loader.end_token: 245 | break 246 | 247 | result += ' ' + word 248 | 249 | decoder_word_input_np = np.array([[batch_loader.word_to_idx[word]]]) 250 | decoder_character_input_np = np.array([[batch_loader.encode_characters(word)]]) 251 | 252 | decoder_word_input = Variable(t.from_numpy(decoder_word_input_np).long()) 253 | decoder_character_input = Variable(t.from_numpy(decoder_character_input_np).long()) 254 | 255 | if use_cuda: 256 | decoder_word_input, decoder_character_input = decoder_word_input.cuda(), decoder_character_input.cuda() 257 | 258 | return result 259 | 260 | def sampler(self, batch_loader,batch_loader_2, seq_len, seed, use_cuda,i,beam_size,n_best): 261 | input = batch_loader.next_batch(1, 'valid', i) 262 | input = [Variable(t.from_numpy(var)) for var in input] 263 | input = [var.long() for var in input] 264 | input = [var.cuda() if use_cuda else var for var in input] 265 | [encoder_word_input, encoder_character_input, decoder_word_input, decoder_character_input, target] = input 266 | 267 | encoder_input = self.embedding(encoder_word_input, encoder_character_input) 268 | 269 | _ , h0 , c0 = self.encoder(encoder_input, None) 270 | State = (h0,c0) 271 | 272 | # print '----------------------' 273 | # print 'Printing h0 ---------->' 274 | # print h0 275 | # print '----------------------' 276 | 277 | # State = None 278 | results, scores = self.sample_beam(batch_loader_2, seq_len, seed, use_cuda, State, beam_size, n_best) 279 | 280 | return results, scores 281 | 282 | 283 | def sample_beam(self, 
batch_loader, seq_len, seed, use_cuda, State, beam_size, n_best): 284 | # seed = Variable(t.from_numpy(seed).float()) 285 | if use_cuda: 286 | seed = seed.cuda() 287 | 288 | decoder_word_input_np, decoder_character_input_np = batch_loader.go_input(1) 289 | 290 | decoder_word_input = Variable(t.from_numpy(decoder_word_input_np).long()) 291 | decoder_character_input = Variable(t.from_numpy(decoder_character_input_np).long()) 292 | 293 | if use_cuda: 294 | decoder_word_input, decoder_character_input = decoder_word_input.cuda(), decoder_character_input.cuda() 295 | 296 | 297 | dec_states = State 298 | 299 | # print '========= Before ================' 300 | # print "dec_states:", dec_states[0].size() 301 | # print "dec_states:", dec_states[1].size() 302 | # print '==================================' 303 | 304 | # dec_states = [ 305 | # Variable(dec_states[0].repeat(1, beam_size, 1)), 306 | # Variable(dec_states[1].repeat(1, beam_size, 1)) 307 | # ] 308 | dec_states = [ 309 | dec_states[0].repeat(1, beam_size, 1), 310 | dec_states[1].repeat(1, beam_size, 1) 311 | ] 312 | 313 | # print'========== After ==================' 314 | # print "dec_states:", dec_states[0].size() 315 | # print "dec_states:", dec_states[1].size() 316 | # print '==================================' 317 | # exit() 318 | 319 | drop_prob = 0.0 320 | beam_size = beam_size 321 | batch_size = 1 322 | 323 | beam = [Beam(beam_size, batch_loader, cuda=True) for k in range(batch_size)] 324 | 325 | batch_idx = list(range(batch_size)) 326 | remaining_sents = batch_size 327 | 328 | 329 | for i in range(seq_len): 330 | 331 | input = t.stack( 332 | [b.get_current_state() for b in beam if not b.done] 333 | ).t().contiguous().view(1, -1) 334 | 335 | trg_emb = self.embedding_2.word_embed(Variable(input).transpose(1, 0)) 336 | 337 | # print trg_emb.size() 338 | # print seed.size() 339 | 340 | trg_h, dec_states = self.decoder.only_decoder_beam(trg_emb, seed, drop_prob, dec_states) 341 | 342 | # trg_h, (trg_h_t, trg_c_t) = self.model.decoder(trg_emb, (dec_states[0].squeeze(0), dec_states[1].squeeze(0)), context ) 343 | 344 | # print trg_h.size() 345 | # print trg_h_t.size() 346 | # print trg_c_t.size() 347 | 348 | # dec_states = (trg_h_t, trg_c_t) 349 | 350 | # print 'State dimension ----------->' 351 | # print State[0].size() 352 | # print State[1].size() 353 | # print '=======================================' 354 | # print "dec_states:", dec_states[0].size() 355 | # print "dec_states:", dec_states[1].size() 356 | # print '========== Things successful ===========' 357 | 358 | # exit() 359 | 360 | dec_out = trg_h.squeeze(1) 361 | 362 | # print "dec_out:", dec_out.size() 363 | 364 | out = F.softmax(self.decoder.fc(dec_out)).unsqueeze(0) 365 | 366 | word_lk = out.view( 367 | beam_size, 368 | remaining_sents, 369 | -1 370 | ).transpose(0, 1).contiguous() 371 | 372 | active = [] 373 | for b in range(batch_size): 374 | if beam[b].done: 375 | continue 376 | 377 | idx = batch_idx[b] 378 | if not beam[b].advance(word_lk.data[idx]): 379 | active += [b] 380 | 381 | for dec_state in dec_states: # iterate over h, c 382 | # layers x beam*sent x dim 383 | sent_states = dec_state.view( 384 | -1, beam_size, remaining_sents, dec_state.size(2) 385 | )[:, :, idx] 386 | sent_states.data.copy_( 387 | sent_states.data.index_select( 388 | 1, 389 | beam[b].get_current_origin() 390 | ) 391 | ) 392 | 393 | if not active: 394 | break 395 | 396 | # in this section, the sentences that are still active are 397 | # compacted so that the decoder is not run on completed 
sentences 398 | active_idx = t.cuda.LongTensor([batch_idx[k] for k in active]) 399 | batch_idx = {beam: idx for idx, beam in enumerate(active)} 400 | 401 | def update_active(t): 402 | # select only the remaining active sentences 403 | view = t.data.view( 404 | -1, remaining_sents, 405 | self.params.decoder_rnn_size 406 | ) 407 | new_size = list(t.size()) 408 | new_size[-2] = new_size[-2] * len(active_idx) \ 409 | // remaining_sents 410 | return Variable(view.index_select( 411 | 1, active_idx 412 | ).view(*new_size)) 413 | 414 | dec_states = ( 415 | update_active(dec_states[0]), 416 | update_active(dec_states[1]) 417 | ) 418 | dec_out = update_active(dec_out) 419 | # context = update_active(context) 420 | 421 | remaining_sents = len(active) 422 | 423 | # (4) package everything up 424 | 425 | allHyp, allScores = [], [] 426 | 427 | 428 | for b in range(batch_size): 429 | scores, ks = beam[b].sort_best() 430 | # print scores 431 | # print ks 432 | allScores += [scores[:n_best]] 433 | hyps = zip(*[beam[b].get_hyp(k) for k in ks[:n_best]]) 434 | # print hyps 435 | # print "------------------" 436 | allHyp += [hyps] 437 | 438 | # print '==== Complete =========' 439 | 440 | return allHyp, allScores 441 | --------------------------------------------------------------------------------
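For reference, the sampling step in RVAE.forward above draws z with the reparameterisation trick (std = exp(0.5 * logvar); z = eps * std + mu) and computes the KL term against a standard normal in closed form. The snippet below is a minimal, self-contained sketch of just that step, written against a recent PyTorch API (torch.randn_like, no Variable wrapper); the helper name reparameterize and the toy sizes are illustrative assumptions, not repository code.

import torch as t
import torch.nn as nn

def reparameterize(context, context_to_mu, context_to_logvar):
    # Sample z ~ N(mu, std) with the reparameterisation trick, mirroring RVAE.forward.
    mu = context_to_mu(context)
    logvar = context_to_logvar(context)
    std = t.exp(0.5 * logvar)
    eps = t.randn_like(std)
    z = eps * std + mu
    # KL(q(z|x) || N(0, I)) averaged over the batch -- the same closed form used in rvae.py.
    kld = (-0.5 * t.sum(logvar - t.pow(mu, 2) - t.exp(logvar) + 1, 1)).mean()
    return z, kld

# Toy usage: a fake "context" of size 2 * encoder_rnn_size feeding a small latent space.
context = t.randn(4, 64)
to_mu = nn.Linear(64, 16)
to_logvar = nn.Linear(64, 16)
z, kld = reparameterize(context, to_mu, to_logvar)
print(z.size(), kld.item())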