├── D_pretrain └── onmt │ ├── translate │ ├── __init__.py │ ├── Translation.py │ └── Beam.py │ ├── __init__.py │ ├── io │ ├── __init__.py │ └── DatasetBase.py │ ├── modules │ ├── StructuredAttention.py │ ├── StackedRNN.py │ ├── __init__.py │ ├── AudioEncoder.py │ ├── ConvMultiStepAttention.py │ ├── UtilClass.py │ ├── Gate.py │ ├── ImageEncoder.py │ └── MultiHeadedAttn.py │ ├── Utils.py │ └── Optim.py ├── G_pretrain └── onmt │ ├── translate │ ├── __init__.py │ ├── Translation.py │ └── Beam.py │ ├── __init__.py │ ├── io │ ├── __init__.py │ └── DatasetBase.py │ ├── modules │ ├── StructuredAttention.py │ ├── StackedRNN.py │ ├── __init__.py │ ├── AudioEncoder.py │ ├── ConvMultiStepAttention.py │ ├── UtilClass.py │ ├── Gate.py │ ├── ImageEncoder.py │ └── MultiHeadedAttn.py │ ├── Utils.py │ └── Optim.py ├── NLI_pretrain └── onmt │ ├── translate │ ├── __init__.py │ └── Translation.py │ ├── __init__.py │ ├── io │ ├── __init__.py │ └── DatasetBase.py │ ├── modules │ ├── StructuredAttention.py │ ├── StackedRNN.py │ ├── __init__.py │ ├── AudioEncoder.py │ ├── ConvMultiStepAttention.py │ ├── UtilClass.py │ ├── Gate.py │ ├── ImageEncoder.py │ └── MultiHeadedAttn.py │ ├── Utils.py │ └── Optim.py ├── reinforcement_train ├── onmt │ ├── translate │ │ ├── __init__.py │ │ └── Translation.py │ ├── __init__.py │ ├── io │ │ ├── __init__.py │ │ └── DatasetBase.py │ ├── modules │ │ ├── StructuredAttention.py │ │ ├── StackedRNN.py │ │ ├── __init__.py │ │ ├── AudioEncoder.py │ │ ├── ConvMultiStepAttention.py │ │ ├── UtilClass.py │ │ ├── Gate.py │ │ ├── ImageEncoder.py │ │ └── MultiHeadedAttn.py │ ├── Utils.py │ └── Optim.py └── predict.py ├── LICENSE.md └── .gitignore /D_pretrain/onmt/translate/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.translate.Translator import Translator 2 | from onmt.translate.Translation import Translation, TranslationBuilder 3 | from onmt.translate.Beam import Beam, GNMTGlobalScorer 4 | 5 | __all__ = [Translator, Translation, Beam, GNMTGlobalScorer, TranslationBuilder] 6 | -------------------------------------------------------------------------------- /G_pretrain/onmt/translate/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.translate.Translator import Translator 2 | from onmt.translate.Translation import Translation, TranslationBuilder 3 | from onmt.translate.Beam import Beam, GNMTGlobalScorer 4 | 5 | __all__ = [Translator, Translation, Beam, GNMTGlobalScorer, TranslationBuilder] 6 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/translate/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.translate.Translator import Translator 2 | from onmt.translate.Translation import Translation, TranslationBuilder 3 | from onmt.translate.Beam import Beam, GNMTGlobalScorer 4 | 5 | __all__ = [Translator, Translation, Beam, GNMTGlobalScorer, TranslationBuilder] 6 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/translate/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.translate.Translator import Translator 2 | from onmt.translate.Translation import Translation, TranslationBuilder 3 | from onmt.translate.Beam import Beam, GNMTGlobalScorer 4 | 5 | __all__ = [Translator, Translation, Beam, GNMTGlobalScorer, TranslationBuilder] 6 | 
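Note: as the tree above shows, each training stage (D_pretrain, G_pretrain, NLI_pretrain, reinforcement_train) ships its own copy of the onmt package, so imports resolve against whichever copy is first on the Python path. A minimal sketch of selecting one stage's copy before importing — the path manipulation is an illustrative assumption, not a script from this repository, and it presumes the full package (including modules not listed in this excerpt, such as onmt.translate.Translator) is present on disk:

    # Hypothetical helper: put one stage's onmt package on sys.path first.
    # Run from the repository root; swap "G_pretrain" for any other stage directory.
    import sys
    sys.path.insert(0, "G_pretrain")

    import onmt.translate  # re-exports Translator, Translation, TranslationBuilder, Beam, GNMTGlobalScorer

Each stage's onmt/translate/__init__.py above performs those re-exports, so the single import line makes the beam-search and translation classes available without importing their individual modules.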
-------------------------------------------------------------------------------- /D_pretrain/onmt/__init__.py: -------------------------------------------------------------------------------- 1 | import onmt.io 2 | import onmt.translate 3 | import onmt.Models 4 | import onmt.Loss 5 | from onmt.Trainer import Trainer, Statistics 6 | from onmt.Optim import Optim 7 | 8 | # For flake8 compatibility 9 | __all__ = [onmt.Loss, onmt.Models, 10 | Trainer, Optim, Statistics, onmt.io, onmt.translate] 11 | -------------------------------------------------------------------------------- /G_pretrain/onmt/__init__.py: -------------------------------------------------------------------------------- 1 | import onmt.io 2 | import onmt.translate 3 | import onmt.Models 4 | import onmt.Loss 5 | from onmt.Trainer import Trainer, Statistics 6 | from onmt.Optim import Optim 7 | 8 | # For flake8 compatibility 9 | __all__ = [onmt.Loss, onmt.Models, 10 | Trainer, Optim, Statistics, onmt.io, onmt.translate] 11 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/__init__.py: -------------------------------------------------------------------------------- 1 | import onmt.io 2 | import onmt.translate 3 | import onmt.Models 4 | import onmt.Loss 5 | from onmt.Trainer import Trainer, Statistics 6 | from onmt.Optim import Optim 7 | 8 | # For flake8 compatibility 9 | __all__ = [onmt.Loss, onmt.Models, 10 | Trainer, Optim, Statistics, onmt.io, onmt.translate] 11 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/__init__.py: -------------------------------------------------------------------------------- 1 | import onmt.io 2 | import onmt.translate 3 | import onmt.Models 4 | import onmt.Loss 5 | from onmt.Trainer import Trainer, Statistics 6 | from onmt.Optim import Optim 7 | 8 | # For flake8 compatibility 9 | __all__ = [onmt.Loss, onmt.Models, 10 | Trainer, Optim, Statistics, onmt.io, onmt.translate] 11 | -------------------------------------------------------------------------------- /D_pretrain/onmt/io/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.io.IO import collect_feature_vocabs, make_features, \ 2 | collect_features, get_num_features, \ 3 | load_fields_from_vocab, get_fields, \ 4 | save_fields_to_vocab, build_dataset, \ 5 | build_vocab, merge_vocabs, OrderedIterator 6 | from onmt.io.DatasetBase import ONMTDatasetBase, PAD_WORD, BOS_WORD, \ 7 | EOS_WORD, UNK 8 | from onmt.io.TextDataset import TextDataset, ShardedTextCorpusIterator 9 | from onmt.io.ImageDataset import ImageDataset 10 | from onmt.io.AudioDataset import AudioDataset 11 | 12 | 13 | __all__ = [PAD_WORD, BOS_WORD, EOS_WORD, UNK, ONMTDatasetBase, 14 | collect_feature_vocabs, make_features, 15 | collect_features, get_num_features, 16 | load_fields_from_vocab, get_fields, 17 | save_fields_to_vocab, build_dataset, 18 | build_vocab, merge_vocabs, OrderedIterator, 19 | TextDataset, ImageDataset, AudioDataset, 20 | ShardedTextCorpusIterator] 21 | -------------------------------------------------------------------------------- /G_pretrain/onmt/io/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.io.IO import collect_feature_vocabs, make_features, \ 2 | collect_features, get_num_features, \ 3 | load_fields_from_vocab, get_fields, \ 4 | save_fields_to_vocab, build_dataset, \ 5 | build_vocab, merge_vocabs, OrderedIterator 6 | from 
onmt.io.DatasetBase import ONMTDatasetBase, PAD_WORD, BOS_WORD, \ 7 | EOS_WORD, UNK 8 | from onmt.io.TextDataset import TextDataset, ShardedTextCorpusIterator 9 | from onmt.io.ImageDataset import ImageDataset 10 | from onmt.io.AudioDataset import AudioDataset 11 | 12 | 13 | __all__ = [PAD_WORD, BOS_WORD, EOS_WORD, UNK, ONMTDatasetBase, 14 | collect_feature_vocabs, make_features, 15 | collect_features, get_num_features, 16 | load_fields_from_vocab, get_fields, 17 | save_fields_to_vocab, build_dataset, 18 | build_vocab, merge_vocabs, OrderedIterator, 19 | TextDataset, ImageDataset, AudioDataset, 20 | ShardedTextCorpusIterator] 21 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/io/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.io.IO import collect_feature_vocabs, make_features, \ 2 | collect_features, get_num_features, \ 3 | load_fields_from_vocab, get_fields, \ 4 | save_fields_to_vocab, build_dataset, \ 5 | build_vocab, merge_vocabs, OrderedIterator 6 | from onmt.io.DatasetBase import ONMTDatasetBase, PAD_WORD, BOS_WORD, \ 7 | EOS_WORD, UNK 8 | from onmt.io.TextDataset import TextDataset, ShardedTextCorpusIterator 9 | from onmt.io.ImageDataset import ImageDataset 10 | from onmt.io.AudioDataset import AudioDataset 11 | 12 | 13 | __all__ = [PAD_WORD, BOS_WORD, EOS_WORD, UNK, ONMTDatasetBase, 14 | collect_feature_vocabs, make_features, 15 | collect_features, get_num_features, 16 | load_fields_from_vocab, get_fields, 17 | save_fields_to_vocab, build_dataset, 18 | build_vocab, merge_vocabs, OrderedIterator, 19 | TextDataset, ImageDataset, AudioDataset, 20 | ShardedTextCorpusIterator] 21 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/io/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.io.IO import collect_feature_vocabs, make_features, \ 2 | collect_features, get_num_features, \ 3 | load_fields_from_vocab, get_fields, \ 4 | save_fields_to_vocab, build_dataset, \ 5 | build_vocab, merge_vocabs, OrderedIterator 6 | from onmt.io.DatasetBase import ONMTDatasetBase, PAD_WORD, BOS_WORD, \ 7 | EOS_WORD, UNK 8 | from onmt.io.TextDataset import TextDataset, ShardedTextCorpusIterator 9 | from onmt.io.ImageDataset import ImageDataset 10 | from onmt.io.AudioDataset import AudioDataset 11 | 12 | 13 | __all__ = [PAD_WORD, BOS_WORD, EOS_WORD, UNK, ONMTDatasetBase, 14 | collect_feature_vocabs, make_features, 15 | collect_features, get_num_features, 16 | load_fields_from_vocab, get_fields, 17 | save_fields_to_vocab, build_dataset, 18 | build_vocab, merge_vocabs, OrderedIterator, 19 | TextDataset, ImageDataset, AudioDataset, 20 | ShardedTextCorpusIterator] 21 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 OpenNMT 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this 
permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /D_pretrain/onmt/modules/StructuredAttention.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import torch.cuda 4 | from torch.autograd import Variable 5 | 6 | 7 | class MatrixTree(nn.Module): 8 | """Implementation of the matrix-tree theorem for computing marginals 9 | of non-projective dependency parsing. This attention layer is used 10 | in the paper "Learning Structured Text Representations." 11 | 12 | 13 | :cite:`DBLP:journals/corr/LiuL17d` 14 | """ 15 | def __init__(self, eps=1e-5): 16 | self.eps = eps 17 | super(MatrixTree, self).__init__() 18 | 19 | def forward(self, input): 20 | laplacian = input.exp() + self.eps 21 | output = input.clone() 22 | for b in range(input.size(0)): 23 | lap = laplacian[b].masked_fill( 24 | Variable(torch.eye(input.size(1)).cuda().ne(0)), 0) 25 | lap = -lap + torch.diag(lap.sum(0)) 26 | # store roots on diagonal 27 | lap[0] = input[b].diag().exp() 28 | inv_laplacian = lap.inverse() 29 | 30 | factor = inv_laplacian.diag().unsqueeze(1)\ 31 | .expand_as(input[b]).transpose(0, 1) 32 | term1 = input[b].exp().mul(factor).clone() 33 | term2 = input[b].exp().mul(inv_laplacian.transpose(0, 1)).clone() 34 | term1[:, 0] = 0 35 | term2[0] = 0 36 | output[b] = term1 - term2 37 | roots_output = input[b].diag().exp().mul( 38 | inv_laplacian.transpose(0, 1)[0]) 39 | output[b] = output[b] + torch.diag(roots_output) 40 | return output 41 | 42 | 43 | if __name__ == "__main__": 44 | dtree = MatrixTree() 45 | q = torch.rand(1, 5, 5).cuda() 46 | marg = dtree.forward(Variable(q)) 47 | print(marg.sum(1)) 48 | -------------------------------------------------------------------------------- /G_pretrain/onmt/modules/StructuredAttention.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import torch.cuda 4 | from torch.autograd import Variable 5 | 6 | 7 | class MatrixTree(nn.Module): 8 | """Implementation of the matrix-tree theorem for computing marginals 9 | of non-projective dependency parsing. This attention layer is used 10 | in the paper "Learning Structured Text Representations." 
11 | 12 | 13 | :cite:`DBLP:journals/corr/LiuL17d` 14 | """ 15 | def __init__(self, eps=1e-5): 16 | self.eps = eps 17 | super(MatrixTree, self).__init__() 18 | 19 | def forward(self, input): 20 | laplacian = input.exp() + self.eps 21 | output = input.clone() 22 | for b in range(input.size(0)): 23 | lap = laplacian[b].masked_fill( 24 | Variable(torch.eye(input.size(1)).cuda().ne(0)), 0) 25 | lap = -lap + torch.diag(lap.sum(0)) 26 | # store roots on diagonal 27 | lap[0] = input[b].diag().exp() 28 | inv_laplacian = lap.inverse() 29 | 30 | factor = inv_laplacian.diag().unsqueeze(1)\ 31 | .expand_as(input[b]).transpose(0, 1) 32 | term1 = input[b].exp().mul(factor).clone() 33 | term2 = input[b].exp().mul(inv_laplacian.transpose(0, 1)).clone() 34 | term1[:, 0] = 0 35 | term2[0] = 0 36 | output[b] = term1 - term2 37 | roots_output = input[b].diag().exp().mul( 38 | inv_laplacian.transpose(0, 1)[0]) 39 | output[b] = output[b] + torch.diag(roots_output) 40 | return output 41 | 42 | 43 | if __name__ == "__main__": 44 | dtree = MatrixTree() 45 | q = torch.rand(1, 5, 5).cuda() 46 | marg = dtree.forward(Variable(q)) 47 | print(marg.sum(1)) 48 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/modules/StructuredAttention.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import torch.cuda 4 | from torch.autograd import Variable 5 | 6 | 7 | class MatrixTree(nn.Module): 8 | """Implementation of the matrix-tree theorem for computing marginals 9 | of non-projective dependency parsing. This attention layer is used 10 | in the paper "Learning Structured Text Representations." 11 | 12 | 13 | :cite:`DBLP:journals/corr/LiuL17d` 14 | """ 15 | def __init__(self, eps=1e-5): 16 | self.eps = eps 17 | super(MatrixTree, self).__init__() 18 | 19 | def forward(self, input): 20 | laplacian = input.exp() + self.eps 21 | output = input.clone() 22 | for b in range(input.size(0)): 23 | lap = laplacian[b].masked_fill( 24 | Variable(torch.eye(input.size(1)).cuda().ne(0)), 0) 25 | lap = -lap + torch.diag(lap.sum(0)) 26 | # store roots on diagonal 27 | lap[0] = input[b].diag().exp() 28 | inv_laplacian = lap.inverse() 29 | 30 | factor = inv_laplacian.diag().unsqueeze(1)\ 31 | .expand_as(input[b]).transpose(0, 1) 32 | term1 = input[b].exp().mul(factor).clone() 33 | term2 = input[b].exp().mul(inv_laplacian.transpose(0, 1)).clone() 34 | term1[:, 0] = 0 35 | term2[0] = 0 36 | output[b] = term1 - term2 37 | roots_output = input[b].diag().exp().mul( 38 | inv_laplacian.transpose(0, 1)[0]) 39 | output[b] = output[b] + torch.diag(roots_output) 40 | return output 41 | 42 | 43 | if __name__ == "__main__": 44 | dtree = MatrixTree() 45 | q = torch.rand(1, 5, 5).cuda() 46 | marg = dtree.forward(Variable(q)) 47 | print(marg.sum(1)) 48 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/modules/StructuredAttention.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import torch.cuda 4 | from torch.autograd import Variable 5 | 6 | 7 | class MatrixTree(nn.Module): 8 | """Implementation of the matrix-tree theorem for computing marginals 9 | of non-projective dependency parsing. This attention layer is used 10 | in the paper "Learning Structured Text Representations." 
11 | 12 | 13 | :cite:`DBLP:journals/corr/LiuL17d` 14 | """ 15 | def __init__(self, eps=1e-5): 16 | self.eps = eps 17 | super(MatrixTree, self).__init__() 18 | 19 | def forward(self, input): 20 | laplacian = input.exp() + self.eps 21 | output = input.clone() 22 | for b in range(input.size(0)): 23 | lap = laplacian[b].masked_fill( 24 | Variable(torch.eye(input.size(1)).cuda().ne(0)), 0) 25 | lap = -lap + torch.diag(lap.sum(0)) 26 | # store roots on diagonal 27 | lap[0] = input[b].diag().exp() 28 | inv_laplacian = lap.inverse() 29 | 30 | factor = inv_laplacian.diag().unsqueeze(1)\ 31 | .expand_as(input[b]).transpose(0, 1) 32 | term1 = input[b].exp().mul(factor).clone() 33 | term2 = input[b].exp().mul(inv_laplacian.transpose(0, 1)).clone() 34 | term1[:, 0] = 0 35 | term2[0] = 0 36 | output[b] = term1 - term2 37 | roots_output = input[b].diag().exp().mul( 38 | inv_laplacian.transpose(0, 1)[0]) 39 | output[b] = output[b] + torch.diag(roots_output) 40 | return output 41 | 42 | 43 | if __name__ == "__main__": 44 | dtree = MatrixTree() 45 | q = torch.rand(1, 5, 5).cuda() 46 | marg = dtree.forward(Variable(q)) 47 | print(marg.sum(1)) 48 | -------------------------------------------------------------------------------- /D_pretrain/onmt/modules/StackedRNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class StackedLSTM(nn.Module): 6 | """ 7 | Our own implementation of stacked LSTM. 8 | Needed for the decoder, because we do input feeding. 9 | """ 10 | def __init__(self, num_layers, input_size, rnn_size, dropout): 11 | super(StackedLSTM, self).__init__() 12 | self.dropout = nn.Dropout(dropout) 13 | self.num_layers = num_layers 14 | self.layers = nn.ModuleList() 15 | 16 | for i in range(num_layers): 17 | self.layers.append(nn.LSTMCell(input_size, rnn_size)) 18 | input_size = rnn_size 19 | 20 | def forward(self, input, hidden): 21 | h_0, c_0 = hidden 22 | h_1, c_1 = [], [] 23 | for i, layer in enumerate(self.layers): 24 | h_1_i, c_1_i = layer(input, (h_0[i], c_0[i])) 25 | input = h_1_i 26 | if i + 1 != self.num_layers: 27 | input = self.dropout(input) 28 | h_1 += [h_1_i] 29 | c_1 += [c_1_i] 30 | 31 | h_1 = torch.stack(h_1) 32 | c_1 = torch.stack(c_1) 33 | 34 | return input, (h_1, c_1) 35 | 36 | 37 | class StackedGRU(nn.Module): 38 | 39 | def __init__(self, num_layers, input_size, rnn_size, dropout): 40 | super(StackedGRU, self).__init__() 41 | self.dropout = nn.Dropout(dropout) 42 | self.num_layers = num_layers 43 | self.layers = nn.ModuleList() 44 | 45 | for i in range(num_layers): 46 | self.layers.append(nn.GRUCell(input_size, rnn_size)) 47 | input_size = rnn_size 48 | 49 | def forward(self, input, hidden): 50 | h_1 = [] 51 | for i, layer in enumerate(self.layers): 52 | h_1_i = layer(input, hidden[0][i]) 53 | input = h_1_i 54 | if i + 1 != self.num_layers: 55 | input = self.dropout(input) 56 | h_1 += [h_1_i] 57 | 58 | h_1 = torch.stack(h_1) 59 | return input, (h_1,) 60 | -------------------------------------------------------------------------------- /G_pretrain/onmt/modules/StackedRNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class StackedLSTM(nn.Module): 6 | """ 7 | Our own implementation of stacked LSTM. 8 | Needed for the decoder, because we do input feeding. 
9 | """ 10 | def __init__(self, num_layers, input_size, rnn_size, dropout): 11 | super(StackedLSTM, self).__init__() 12 | self.dropout = nn.Dropout(dropout) 13 | self.num_layers = num_layers 14 | self.layers = nn.ModuleList() 15 | 16 | for i in range(num_layers): 17 | self.layers.append(nn.LSTMCell(input_size, rnn_size)) 18 | input_size = rnn_size 19 | 20 | def forward(self, input, hidden): 21 | h_0, c_0 = hidden 22 | h_1, c_1 = [], [] 23 | for i, layer in enumerate(self.layers): 24 | h_1_i, c_1_i = layer(input, (h_0[i], c_0[i])) 25 | input = h_1_i 26 | if i + 1 != self.num_layers: 27 | input = self.dropout(input) 28 | h_1 += [h_1_i] 29 | c_1 += [c_1_i] 30 | 31 | h_1 = torch.stack(h_1) 32 | c_1 = torch.stack(c_1) 33 | 34 | return input, (h_1, c_1) 35 | 36 | 37 | class StackedGRU(nn.Module): 38 | 39 | def __init__(self, num_layers, input_size, rnn_size, dropout): 40 | super(StackedGRU, self).__init__() 41 | self.dropout = nn.Dropout(dropout) 42 | self.num_layers = num_layers 43 | self.layers = nn.ModuleList() 44 | 45 | for i in range(num_layers): 46 | self.layers.append(nn.GRUCell(input_size, rnn_size)) 47 | input_size = rnn_size 48 | 49 | def forward(self, input, hidden): 50 | h_1 = [] 51 | for i, layer in enumerate(self.layers): 52 | h_1_i = layer(input, hidden[0][i]) 53 | input = h_1_i 54 | if i + 1 != self.num_layers: 55 | input = self.dropout(input) 56 | h_1 += [h_1_i] 57 | 58 | h_1 = torch.stack(h_1) 59 | return input, (h_1,) 60 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/modules/StackedRNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class StackedLSTM(nn.Module): 6 | """ 7 | Our own implementation of stacked LSTM. 8 | Needed for the decoder, because we do input feeding. 
9 | """ 10 | def __init__(self, num_layers, input_size, rnn_size, dropout): 11 | super(StackedLSTM, self).__init__() 12 | self.dropout = nn.Dropout(dropout) 13 | self.num_layers = num_layers 14 | self.layers = nn.ModuleList() 15 | 16 | for i in range(num_layers): 17 | self.layers.append(nn.LSTMCell(input_size, rnn_size)) 18 | input_size = rnn_size 19 | 20 | def forward(self, input, hidden): 21 | h_0, c_0 = hidden 22 | h_1, c_1 = [], [] 23 | for i, layer in enumerate(self.layers): 24 | h_1_i, c_1_i = layer(input, (h_0[i], c_0[i])) 25 | input = h_1_i 26 | if i + 1 != self.num_layers: 27 | input = self.dropout(input) 28 | h_1 += [h_1_i] 29 | c_1 += [c_1_i] 30 | 31 | h_1 = torch.stack(h_1) 32 | c_1 = torch.stack(c_1) 33 | 34 | return input, (h_1, c_1) 35 | 36 | 37 | class StackedGRU(nn.Module): 38 | 39 | def __init__(self, num_layers, input_size, rnn_size, dropout): 40 | super(StackedGRU, self).__init__() 41 | self.dropout = nn.Dropout(dropout) 42 | self.num_layers = num_layers 43 | self.layers = nn.ModuleList() 44 | 45 | for i in range(num_layers): 46 | self.layers.append(nn.GRUCell(input_size, rnn_size)) 47 | input_size = rnn_size 48 | 49 | def forward(self, input, hidden): 50 | h_1 = [] 51 | for i, layer in enumerate(self.layers): 52 | h_1_i = layer(input, hidden[0][i]) 53 | input = h_1_i 54 | if i + 1 != self.num_layers: 55 | input = self.dropout(input) 56 | h_1 += [h_1_i] 57 | 58 | h_1 = torch.stack(h_1) 59 | return input, (h_1,) 60 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/modules/StackedRNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class StackedLSTM(nn.Module): 6 | """ 7 | Our own implementation of stacked LSTM. 8 | Needed for the decoder, because we do input feeding. 
9 | """ 10 | def __init__(self, num_layers, input_size, rnn_size, dropout): 11 | super(StackedLSTM, self).__init__() 12 | self.dropout = nn.Dropout(dropout) 13 | self.num_layers = num_layers 14 | self.layers = nn.ModuleList() 15 | 16 | for i in range(num_layers): 17 | self.layers.append(nn.LSTMCell(input_size, rnn_size)) 18 | input_size = rnn_size 19 | 20 | def forward(self, input, hidden): 21 | h_0, c_0 = hidden 22 | h_1, c_1 = [], [] 23 | for i, layer in enumerate(self.layers): 24 | h_1_i, c_1_i = layer(input, (h_0[i], c_0[i])) 25 | input = h_1_i 26 | if i + 1 != self.num_layers: 27 | input = self.dropout(input) 28 | h_1 += [h_1_i] 29 | c_1 += [c_1_i] 30 | 31 | h_1 = torch.stack(h_1) 32 | c_1 = torch.stack(c_1) 33 | 34 | return input, (h_1, c_1) 35 | 36 | 37 | class StackedGRU(nn.Module): 38 | 39 | def __init__(self, num_layers, input_size, rnn_size, dropout): 40 | super(StackedGRU, self).__init__() 41 | self.dropout = nn.Dropout(dropout) 42 | self.num_layers = num_layers 43 | self.layers = nn.ModuleList() 44 | 45 | for i in range(num_layers): 46 | self.layers.append(nn.GRUCell(input_size, rnn_size)) 47 | input_size = rnn_size 48 | 49 | def forward(self, input, hidden): 50 | h_1 = [] 51 | for i, layer in enumerate(self.layers): 52 | h_1_i = layer(input, hidden[0][i]) 53 | input = h_1_i 54 | if i + 1 != self.num_layers: 55 | input = self.dropout(input) 56 | h_1 += [h_1_i] 57 | 58 | h_1 = torch.stack(h_1) 59 | return input, (h_1,) 60 | -------------------------------------------------------------------------------- /D_pretrain/onmt/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.modules.UtilClass import LayerNorm, Bottle, BottleLinear, \ 2 | BottleLayerNorm, BottleSoftmax, Elementwise 3 | from onmt.modules.Gate import context_gate_factory, ContextGate 4 | from onmt.modules.GlobalAttention import GlobalAttention 5 | from onmt.modules.ConvMultiStepAttention import ConvMultiStepAttention 6 | from onmt.modules.ImageEncoder import ImageEncoder 7 | from onmt.modules.AudioEncoder import AudioEncoder 8 | from onmt.modules.CopyGenerator import CopyGenerator, CopyGeneratorLossCompute 9 | from onmt.modules.StructuredAttention import MatrixTree 10 | from onmt.modules.Transformer import \ 11 | TransformerEncoder, TransformerDecoder, PositionwiseFeedForward 12 | from onmt.modules.Conv2Conv import CNNEncoder, CNNDecoder 13 | from onmt.modules.MultiHeadedAttn import MultiHeadedAttention 14 | from onmt.modules.StackedRNN import StackedLSTM, StackedGRU 15 | from onmt.modules.Embeddings import Embeddings, PositionalEncoding 16 | from onmt.modules.WeightNorm import WeightNormConv2d 17 | from onmt.modules.Distriminitor import Disc, NLI 18 | 19 | from onmt.Models import EncoderBase, MeanEncoder, StdRNNDecoder, \ 20 | RNNDecoderBase, InputFeedRNNDecoder, RNNEncoder, NMTModel 21 | 22 | from onmt.modules.SRU import check_sru_requirement 23 | can_use_sru = check_sru_requirement() 24 | if can_use_sru: 25 | from onmt.modules.SRU import SRU 26 | 27 | 28 | # For flake8 compatibility. 
29 | __all__ = [EncoderBase, MeanEncoder, RNNDecoderBase, InputFeedRNNDecoder, 30 | RNNEncoder, NMTModel, 31 | StdRNNDecoder, ContextGate, GlobalAttention, ImageEncoder, 32 | PositionwiseFeedForward, PositionalEncoding, 33 | CopyGenerator, MultiHeadedAttention, 34 | LayerNorm, Bottle, BottleLinear, BottleLayerNorm, BottleSoftmax, 35 | TransformerEncoder, TransformerDecoder, Embeddings, Elementwise, 36 | MatrixTree, WeightNormConv2d, ConvMultiStepAttention, 37 | CNNEncoder, CNNDecoder, StackedLSTM, StackedGRU, 38 | context_gate_factory, CopyGeneratorLossCompute, AudioEncoder, 39 | Disc, NLI] 40 | 41 | if can_use_sru: 42 | __all__.extend([SRU, check_sru_requirement]) 43 | -------------------------------------------------------------------------------- /G_pretrain/onmt/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.modules.UtilClass import LayerNorm, Bottle, BottleLinear, \ 2 | BottleLayerNorm, BottleSoftmax, Elementwise 3 | from onmt.modules.Gate import context_gate_factory, ContextGate 4 | from onmt.modules.GlobalAttention import GlobalAttention 5 | from onmt.modules.ConvMultiStepAttention import ConvMultiStepAttention 6 | from onmt.modules.ImageEncoder import ImageEncoder 7 | from onmt.modules.AudioEncoder import AudioEncoder 8 | from onmt.modules.CopyGenerator import CopyGenerator, CopyGeneratorLossCompute 9 | from onmt.modules.StructuredAttention import MatrixTree 10 | from onmt.modules.Transformer import \ 11 | TransformerEncoder, TransformerDecoder, PositionwiseFeedForward 12 | from onmt.modules.Conv2Conv import CNNEncoder, CNNDecoder 13 | from onmt.modules.MultiHeadedAttn import MultiHeadedAttention 14 | from onmt.modules.StackedRNN import StackedLSTM, StackedGRU 15 | from onmt.modules.Embeddings import Embeddings, PositionalEncoding 16 | from onmt.modules.WeightNorm import WeightNormConv2d 17 | from onmt.modules.Distriminitor import Disc, NLI 18 | 19 | from onmt.Models import EncoderBase, MeanEncoder, StdRNNDecoder, \ 20 | RNNDecoderBase, InputFeedRNNDecoder, RNNEncoder, NMTModel 21 | 22 | from onmt.modules.SRU import check_sru_requirement 23 | can_use_sru = check_sru_requirement() 24 | if can_use_sru: 25 | from onmt.modules.SRU import SRU 26 | 27 | 28 | # For flake8 compatibility. 
29 | __all__ = [EncoderBase, MeanEncoder, RNNDecoderBase, InputFeedRNNDecoder, 30 | RNNEncoder, NMTModel, 31 | StdRNNDecoder, ContextGate, GlobalAttention, ImageEncoder, 32 | PositionwiseFeedForward, PositionalEncoding, 33 | CopyGenerator, MultiHeadedAttention, 34 | LayerNorm, Bottle, BottleLinear, BottleLayerNorm, BottleSoftmax, 35 | TransformerEncoder, TransformerDecoder, Embeddings, Elementwise, 36 | MatrixTree, WeightNormConv2d, ConvMultiStepAttention, 37 | CNNEncoder, CNNDecoder, StackedLSTM, StackedGRU, 38 | context_gate_factory, CopyGeneratorLossCompute, AudioEncoder, 39 | Disc, NLI] 40 | 41 | if can_use_sru: 42 | __all__.extend([SRU, check_sru_requirement]) 43 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.modules.UtilClass import LayerNorm, Bottle, BottleLinear, \ 2 | BottleLayerNorm, BottleSoftmax, Elementwise 3 | from onmt.modules.Gate import context_gate_factory, ContextGate 4 | from onmt.modules.GlobalAttention import GlobalAttention 5 | from onmt.modules.ConvMultiStepAttention import ConvMultiStepAttention 6 | from onmt.modules.ImageEncoder import ImageEncoder 7 | from onmt.modules.AudioEncoder import AudioEncoder 8 | from onmt.modules.CopyGenerator import CopyGenerator, CopyGeneratorLossCompute 9 | from onmt.modules.StructuredAttention import MatrixTree 10 | from onmt.modules.Transformer import \ 11 | TransformerEncoder, TransformerDecoder, PositionwiseFeedForward 12 | from onmt.modules.Conv2Conv import CNNEncoder, CNNDecoder 13 | from onmt.modules.MultiHeadedAttn import MultiHeadedAttention 14 | from onmt.modules.StackedRNN import StackedLSTM, StackedGRU 15 | from onmt.modules.Embeddings import Embeddings, PositionalEncoding 16 | from onmt.modules.WeightNorm import WeightNormConv2d 17 | from onmt.modules.Distriminitor import Disc, NLI 18 | 19 | from onmt.Models import EncoderBase, MeanEncoder, StdRNNDecoder, \ 20 | RNNDecoderBase, InputFeedRNNDecoder, RNNEncoder, NMTModel 21 | 22 | from onmt.modules.SRU import check_sru_requirement 23 | can_use_sru = check_sru_requirement() 24 | if can_use_sru: 25 | from onmt.modules.SRU import SRU 26 | 27 | 28 | # For flake8 compatibility. 
29 | __all__ = [EncoderBase, MeanEncoder, RNNDecoderBase, InputFeedRNNDecoder, 30 | RNNEncoder, NMTModel, 31 | StdRNNDecoder, ContextGate, GlobalAttention, ImageEncoder, 32 | PositionwiseFeedForward, PositionalEncoding, 33 | CopyGenerator, MultiHeadedAttention, 34 | LayerNorm, Bottle, BottleLinear, BottleLayerNorm, BottleSoftmax, 35 | TransformerEncoder, TransformerDecoder, Embeddings, Elementwise, 36 | MatrixTree, WeightNormConv2d, ConvMultiStepAttention, 37 | CNNEncoder, CNNDecoder, StackedLSTM, StackedGRU, 38 | context_gate_factory, CopyGeneratorLossCompute, AudioEncoder, 39 | Disc, NLI] 40 | 41 | if can_use_sru: 42 | __all__.extend([SRU, check_sru_requirement]) 43 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.modules.UtilClass import LayerNorm, Bottle, BottleLinear, \ 2 | BottleLayerNorm, BottleSoftmax, Elementwise 3 | from onmt.modules.Gate import context_gate_factory, ContextGate 4 | from onmt.modules.GlobalAttention import GlobalAttention 5 | from onmt.modules.ConvMultiStepAttention import ConvMultiStepAttention 6 | from onmt.modules.ImageEncoder import ImageEncoder 7 | from onmt.modules.AudioEncoder import AudioEncoder 8 | from onmt.modules.CopyGenerator import CopyGenerator, CopyGeneratorLossCompute 9 | from onmt.modules.StructuredAttention import MatrixTree 10 | from onmt.modules.Transformer import \ 11 | TransformerEncoder, TransformerDecoder, PositionwiseFeedForward 12 | from onmt.modules.Conv2Conv import CNNEncoder, CNNDecoder 13 | from onmt.modules.MultiHeadedAttn import MultiHeadedAttention 14 | from onmt.modules.StackedRNN import StackedLSTM, StackedGRU 15 | from onmt.modules.Embeddings import Embeddings, PositionalEncoding 16 | from onmt.modules.WeightNorm import WeightNormConv2d 17 | from onmt.modules.Distriminitor import Disc, NLI 18 | 19 | from onmt.Models import EncoderBase, MeanEncoder, StdRNNDecoder, \ 20 | RNNDecoderBase, InputFeedRNNDecoder, RNNEncoder, NMTModel 21 | 22 | from onmt.modules.SRU import check_sru_requirement 23 | can_use_sru = check_sru_requirement() 24 | if can_use_sru: 25 | from onmt.modules.SRU import SRU 26 | 27 | 28 | # For flake8 compatibility. 
29 | __all__ = [EncoderBase, MeanEncoder, RNNDecoderBase, InputFeedRNNDecoder, 30 | RNNEncoder, NMTModel, 31 | StdRNNDecoder, ContextGate, GlobalAttention, ImageEncoder, 32 | PositionwiseFeedForward, PositionalEncoding, 33 | CopyGenerator, MultiHeadedAttention, 34 | LayerNorm, Bottle, BottleLinear, BottleLayerNorm, BottleSoftmax, 35 | TransformerEncoder, TransformerDecoder, Embeddings, Elementwise, 36 | MatrixTree, WeightNormConv2d, ConvMultiStepAttention, 37 | CNNEncoder, CNNDecoder, StackedLSTM, StackedGRU, 38 | context_gate_factory, CopyGeneratorLossCompute, AudioEncoder, 39 | Disc, NLI] 40 | 41 | if can_use_sru: 42 | __all__.extend([SRU, check_sru_requirement]) 43 | -------------------------------------------------------------------------------- /D_pretrain/onmt/Utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | 4 | 5 | def aeq(*args): 6 | """ 7 | Assert all arguments have the same value 8 | """ 9 | arguments = (arg for arg in args) 10 | first = next(arguments) 11 | assert all(arg == first for arg in arguments), \ 12 | "Not all arguments have the same value: " + str(args) 13 | 14 | 15 | def sequence_mask(lengths, max_len=None): 16 | """ 17 | Creates a boolean mask from sequence lengths. 18 | """ 19 | batch_size = lengths.numel() 20 | max_len = max_len or lengths.max() 21 | return (torch.arange(0, max_len) 22 | .type_as(lengths) 23 | .repeat(batch_size, 1) 24 | .lt(lengths.unsqueeze(1))) 25 | 26 | 27 | def use_gpu(opt): 28 | return (hasattr(opt, 'gpuid') and len(opt.gpuid) > 0) or \ 29 | (hasattr(opt, 'gpu') and opt.gpu > -1) 30 | 31 | 32 | def formalize(batch, batch_length, batch_first=False): 33 | """formalize a batch to sort the batch according to its length 34 | 35 | Args: 36 | batch: batch 37 | batch_length: batch length list 38 | Returns: 39 | formalized batch 40 | """ 41 | sorted_lengths, _ = torch.sort(batch_length, descending=True) 42 | batch_length = batch_length.view(-1).tolist() 43 | index_length = [(i, l) for i, l in enumerate(batch_length)] 44 | ordered_index = sorted(index_length, key=lambda e: e[1], reverse=True) 45 | 46 | origin_new = dict([(v[0], k) for k, v in enumerate(ordered_index)]) 47 | 48 | sorted_batch = Variable(batch.data.new(batch.size())) 49 | for k, v in origin_new.items(): 50 | if batch_first: 51 | sorted_batch[v] = batch[k] 52 | else: 53 | sorted_batch[:, v] = batch[:, k] 54 | return sorted_batch, sorted_lengths, origin_new 55 | 56 | 57 | def deformalize(batch, origin_new): 58 | """reform batch in the origin order, batch is the second dimension. 
59 | 60 | Args: 61 | batch: encoded batch, length*batch_size*dim 62 | origin_new: origin->new index dict 63 | Returns: 64 | reformed batch 65 | """ 66 | desorted_batch = Variable(batch.data.new(batch.size())) 67 | for k, v in origin_new.items(): 68 | desorted_batch[:, k] = batch[:, v] 69 | return desorted_batch -------------------------------------------------------------------------------- /G_pretrain/onmt/Utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | 4 | 5 | def aeq(*args): 6 | """ 7 | Assert all arguments have the same value 8 | """ 9 | arguments = (arg for arg in args) 10 | first = next(arguments) 11 | assert all(arg == first for arg in arguments), \ 12 | "Not all arguments have the same value: " + str(args) 13 | 14 | 15 | def sequence_mask(lengths, max_len=None): 16 | """ 17 | Creates a boolean mask from sequence lengths. 18 | """ 19 | batch_size = lengths.numel() 20 | max_len = max_len or lengths.max() 21 | return (torch.arange(0, max_len) 22 | .type_as(lengths) 23 | .repeat(batch_size, 1) 24 | .lt(lengths.unsqueeze(1))) 25 | 26 | 27 | def use_gpu(opt): 28 | return (hasattr(opt, 'gpuid') and len(opt.gpuid) > 0) or \ 29 | (hasattr(opt, 'gpu') and opt.gpu > -1) 30 | 31 | 32 | def formalize(batch, batch_length, batch_first=False): 33 | """formalize a batch to sort the batch according to its length 34 | 35 | Args: 36 | batch: batch 37 | batch_length: batch length list 38 | Returns: 39 | formalized batch 40 | """ 41 | sorted_lengths, _ = torch.sort(batch_length, descending=True) 42 | batch_length = batch_length.view(-1).tolist() 43 | index_length = [(i, l) for i, l in enumerate(batch_length)] 44 | ordered_index = sorted(index_length, key=lambda e: e[1], reverse=True) 45 | 46 | origin_new = dict([(v[0], k) for k, v in enumerate(ordered_index)]) 47 | 48 | sorted_batch = Variable(batch.data.new(batch.size())) 49 | for k, v in origin_new.items(): 50 | if batch_first: 51 | sorted_batch[v] = batch[k] 52 | else: 53 | sorted_batch[:, v] = batch[:, k] 54 | return sorted_batch, sorted_lengths, origin_new 55 | 56 | 57 | def deformalize(batch, origin_new): 58 | """reform batch in the origin order, batch is the second dimension. 59 | 60 | Args: 61 | batch: encoded batch, length*batch_size*dim 62 | origin_new: origin->new index dict 63 | Returns: 64 | reformed batch 65 | """ 66 | desorted_batch = Variable(batch.data.new(batch.size())) 67 | for k, v in origin_new.items(): 68 | desorted_batch[:, k] = batch[:, v] 69 | return desorted_batch -------------------------------------------------------------------------------- /NLI_pretrain/onmt/Utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | 4 | 5 | def aeq(*args): 6 | """ 7 | Assert all arguments have the same value 8 | """ 9 | arguments = (arg for arg in args) 10 | first = next(arguments) 11 | assert all(arg == first for arg in arguments), \ 12 | "Not all arguments have the same value: " + str(args) 13 | 14 | 15 | def sequence_mask(lengths, max_len=None): 16 | """ 17 | Creates a boolean mask from sequence lengths. 
18 | """ 19 | batch_size = lengths.numel() 20 | max_len = max_len or lengths.max() 21 | return (torch.arange(0, max_len) 22 | .type_as(lengths) 23 | .repeat(batch_size, 1) 24 | .lt(lengths.unsqueeze(1))) 25 | 26 | 27 | def use_gpu(opt): 28 | return (hasattr(opt, 'gpuid') and len(opt.gpuid) > 0) or \ 29 | (hasattr(opt, 'gpu') and opt.gpu > -1) 30 | 31 | 32 | def formalize(batch, batch_length, batch_first=False): 33 | """formalize a batch to sort the batch according to its length 34 | 35 | Args: 36 | batch: batch 37 | batch_length: batch length list 38 | Returns: 39 | formalized batch 40 | """ 41 | sorted_lengths, _ = torch.sort(batch_length, descending=True) 42 | batch_length = batch_length.view(-1).tolist() 43 | index_length = [(i, l) for i, l in enumerate(batch_length)] 44 | ordered_index = sorted(index_length, key=lambda e: e[1], reverse=True) 45 | 46 | origin_new = dict([(v[0], k) for k, v in enumerate(ordered_index)]) 47 | 48 | sorted_batch = Variable(batch.data.new(batch.size())) 49 | for k, v in origin_new.items(): 50 | if batch_first: 51 | sorted_batch[v] = batch[k] 52 | else: 53 | sorted_batch[:, v] = batch[:, k] 54 | return sorted_batch, sorted_lengths, origin_new 55 | 56 | 57 | def deformalize(batch, origin_new): 58 | """reform batch in the origin order, batch is the second dimension. 59 | 60 | Args: 61 | batch: encoded batch, length*batch_size*dim 62 | origin_new: origin->new index dict 63 | Returns: 64 | reformed batch 65 | """ 66 | desorted_batch = Variable(batch.data.new(batch.size())) 67 | for k, v in origin_new.items(): 68 | desorted_batch[:, k] = batch[:, v] 69 | return desorted_batch -------------------------------------------------------------------------------- /reinforcement_train/onmt/Utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | 4 | 5 | def aeq(*args): 6 | """ 7 | Assert all arguments have the same value 8 | """ 9 | arguments = (arg for arg in args) 10 | first = next(arguments) 11 | assert all(arg == first for arg in arguments), \ 12 | "Not all arguments have the same value: " + str(args) 13 | 14 | 15 | def sequence_mask(lengths, max_len=None): 16 | """ 17 | Creates a boolean mask from sequence lengths. 
18 | """ 19 | batch_size = lengths.numel() 20 | max_len = max_len or lengths.max() 21 | return (torch.arange(0, max_len) 22 | .type_as(lengths) 23 | .repeat(batch_size, 1) 24 | .lt(lengths.unsqueeze(1))) 25 | 26 | 27 | def use_gpu(opt): 28 | return (hasattr(opt, 'gpuid') and len(opt.gpuid) > 0) or \ 29 | (hasattr(opt, 'gpu') and opt.gpu > -1) 30 | 31 | 32 | def formalize(batch, batch_length, batch_first=False): 33 | """formalize a batch to sort the batch according to its length 34 | 35 | Args: 36 | batch: batch 37 | batch_length: batch length list 38 | Returns: 39 | formalized batch 40 | """ 41 | sorted_lengths, _ = torch.sort(batch_length, descending=True) 42 | batch_length = batch_length.view(-1).tolist() 43 | index_length = [(i, l) for i, l in enumerate(batch_length)] 44 | ordered_index = sorted(index_length, key=lambda e: e[1], reverse=True) 45 | 46 | origin_new = dict([(v[0], k) for k, v in enumerate(ordered_index)]) 47 | 48 | sorted_batch = Variable(batch.data.new(batch.size())) 49 | for k, v in origin_new.items(): 50 | if batch_first: 51 | sorted_batch[v] = batch[k] 52 | else: 53 | sorted_batch[:, v] = batch[:, k] 54 | return sorted_batch, sorted_lengths, origin_new 55 | 56 | 57 | def deformalize(batch, origin_new): 58 | """reform batch in the origin order, batch is the second dimension. 59 | 60 | Args: 61 | batch: encoded batch, length*batch_size*dim 62 | origin_new: origin->new index dict 63 | Returns: 64 | reformed batch 65 | """ 66 | desorted_batch = Variable(batch.data.new(batch.size())) 67 | for k, v in origin_new.items(): 68 | desorted_batch[:, k] = batch[:, v] 69 | return desorted_batch -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /D_pretrain/onmt/modules/AudioEncoder.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class AudioEncoder(nn.Module): 7 | """ 8 | A simple encoder convolutional -> recurrent neural network for 9 | audio input. 10 | 11 | Args: 12 | num_layers (int): number of encoder layers. 13 | bidirectional (bool): bidirectional encoder. 14 | rnn_size (int): size of hidden states of the rnn. 15 | dropout (float): dropout probablity. 16 | sample_rate (float): input spec 17 | window_size (int): input spec 18 | 19 | """ 20 | def __init__(self, num_layers, bidirectional, rnn_size, dropout, 21 | sample_rate, window_size): 22 | super(AudioEncoder, self).__init__() 23 | self.num_layers = num_layers 24 | self.num_directions = 2 if bidirectional else 1 25 | self.hidden_size = rnn_size 26 | 27 | self.layer1 = nn.Conv2d(1, 32, kernel_size=(41, 11), 28 | padding=(0, 10), stride=(2, 2)) 29 | self.batch_norm1 = nn.BatchNorm2d(32) 30 | self.layer2 = nn.Conv2d(32, 32, kernel_size=(21, 11), 31 | padding=(0, 0), stride=(2, 1)) 32 | self.batch_norm2 = nn.BatchNorm2d(32) 33 | 34 | input_size = int(math.floor((sample_rate * window_size) / 2) + 1) 35 | input_size = int(math.floor(input_size - 41) / 2 + 1) 36 | input_size = int(math.floor(input_size - 21) / 2 + 1) 37 | input_size *= 32 38 | self.rnn = nn.LSTM(input_size, rnn_size, 39 | num_layers=num_layers, 40 | dropout=dropout, 41 | bidirectional=bidirectional) 42 | 43 | def load_pretrained_vectors(self, opt): 44 | # Pass in needed options only when modify function definition. 
45 | pass 46 | 47 | def forward(self, input, lengths=None): 48 | "See :obj:`onmt.modules.EncoderBase.forward()`" 49 | # (batch_size, 1, nfft, t) 50 | # layer 1 51 | input = self.batch_norm1(self.layer1(input[:, :, :, :])) 52 | 53 | # (batch_size, 32, nfft/2, t/2) 54 | input = F.hardtanh(input, 0, 20, inplace=True) 55 | 56 | # (batch_size, 32, nfft/2/2, t/2) 57 | # layer 2 58 | input = self.batch_norm2(self.layer2(input)) 59 | 60 | # (batch_size, 32, nfft/2/2, t/2) 61 | input = F.hardtanh(input, 0, 20, inplace=True) 62 | 63 | batch_size = input.size(0) 64 | length = input.size(3) 65 | input = input.view(batch_size, -1, length) 66 | input = input.transpose(0, 2).transpose(1, 2) 67 | 68 | output, hidden = self.rnn(input) 69 | 70 | return hidden, output 71 | -------------------------------------------------------------------------------- /G_pretrain/onmt/modules/AudioEncoder.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class AudioEncoder(nn.Module): 7 | """ 8 | A simple encoder convolutional -> recurrent neural network for 9 | audio input. 10 | 11 | Args: 12 | num_layers (int): number of encoder layers. 13 | bidirectional (bool): bidirectional encoder. 14 | rnn_size (int): size of hidden states of the rnn. 15 | dropout (float): dropout probablity. 16 | sample_rate (float): input spec 17 | window_size (int): input spec 18 | 19 | """ 20 | def __init__(self, num_layers, bidirectional, rnn_size, dropout, 21 | sample_rate, window_size): 22 | super(AudioEncoder, self).__init__() 23 | self.num_layers = num_layers 24 | self.num_directions = 2 if bidirectional else 1 25 | self.hidden_size = rnn_size 26 | 27 | self.layer1 = nn.Conv2d(1, 32, kernel_size=(41, 11), 28 | padding=(0, 10), stride=(2, 2)) 29 | self.batch_norm1 = nn.BatchNorm2d(32) 30 | self.layer2 = nn.Conv2d(32, 32, kernel_size=(21, 11), 31 | padding=(0, 0), stride=(2, 1)) 32 | self.batch_norm2 = nn.BatchNorm2d(32) 33 | 34 | input_size = int(math.floor((sample_rate * window_size) / 2) + 1) 35 | input_size = int(math.floor(input_size - 41) / 2 + 1) 36 | input_size = int(math.floor(input_size - 21) / 2 + 1) 37 | input_size *= 32 38 | self.rnn = nn.LSTM(input_size, rnn_size, 39 | num_layers=num_layers, 40 | dropout=dropout, 41 | bidirectional=bidirectional) 42 | 43 | def load_pretrained_vectors(self, opt): 44 | # Pass in needed options only when modify function definition. 
45 | pass 46 | 47 | def forward(self, input, lengths=None): 48 | "See :obj:`onmt.modules.EncoderBase.forward()`" 49 | # (batch_size, 1, nfft, t) 50 | # layer 1 51 | input = self.batch_norm1(self.layer1(input[:, :, :, :])) 52 | 53 | # (batch_size, 32, nfft/2, t/2) 54 | input = F.hardtanh(input, 0, 20, inplace=True) 55 | 56 | # (batch_size, 32, nfft/2/2, t/2) 57 | # layer 2 58 | input = self.batch_norm2(self.layer2(input)) 59 | 60 | # (batch_size, 32, nfft/2/2, t/2) 61 | input = F.hardtanh(input, 0, 20, inplace=True) 62 | 63 | batch_size = input.size(0) 64 | length = input.size(3) 65 | input = input.view(batch_size, -1, length) 66 | input = input.transpose(0, 2).transpose(1, 2) 67 | 68 | output, hidden = self.rnn(input) 69 | 70 | return hidden, output 71 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/modules/AudioEncoder.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class AudioEncoder(nn.Module): 7 | """ 8 | A simple encoder convolutional -> recurrent neural network for 9 | audio input. 10 | 11 | Args: 12 | num_layers (int): number of encoder layers. 13 | bidirectional (bool): bidirectional encoder. 14 | rnn_size (int): size of hidden states of the rnn. 15 | dropout (float): dropout probablity. 16 | sample_rate (float): input spec 17 | window_size (int): input spec 18 | 19 | """ 20 | def __init__(self, num_layers, bidirectional, rnn_size, dropout, 21 | sample_rate, window_size): 22 | super(AudioEncoder, self).__init__() 23 | self.num_layers = num_layers 24 | self.num_directions = 2 if bidirectional else 1 25 | self.hidden_size = rnn_size 26 | 27 | self.layer1 = nn.Conv2d(1, 32, kernel_size=(41, 11), 28 | padding=(0, 10), stride=(2, 2)) 29 | self.batch_norm1 = nn.BatchNorm2d(32) 30 | self.layer2 = nn.Conv2d(32, 32, kernel_size=(21, 11), 31 | padding=(0, 0), stride=(2, 1)) 32 | self.batch_norm2 = nn.BatchNorm2d(32) 33 | 34 | input_size = int(math.floor((sample_rate * window_size) / 2) + 1) 35 | input_size = int(math.floor(input_size - 41) / 2 + 1) 36 | input_size = int(math.floor(input_size - 21) / 2 + 1) 37 | input_size *= 32 38 | self.rnn = nn.LSTM(input_size, rnn_size, 39 | num_layers=num_layers, 40 | dropout=dropout, 41 | bidirectional=bidirectional) 42 | 43 | def load_pretrained_vectors(self, opt): 44 | # Pass in needed options only when modify function definition. 
45 | pass 46 | 47 | def forward(self, input, lengths=None): 48 | "See :obj:`onmt.modules.EncoderBase.forward()`" 49 | # (batch_size, 1, nfft, t) 50 | # layer 1 51 | input = self.batch_norm1(self.layer1(input[:, :, :, :])) 52 | 53 | # (batch_size, 32, nfft/2, t/2) 54 | input = F.hardtanh(input, 0, 20, inplace=True) 55 | 56 | # (batch_size, 32, nfft/2/2, t/2) 57 | # layer 2 58 | input = self.batch_norm2(self.layer2(input)) 59 | 60 | # (batch_size, 32, nfft/2/2, t/2) 61 | input = F.hardtanh(input, 0, 20, inplace=True) 62 | 63 | batch_size = input.size(0) 64 | length = input.size(3) 65 | input = input.view(batch_size, -1, length) 66 | input = input.transpose(0, 2).transpose(1, 2) 67 | 68 | output, hidden = self.rnn(input) 69 | 70 | return hidden, output 71 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/modules/AudioEncoder.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class AudioEncoder(nn.Module): 7 | """ 8 | A simple encoder convolutional -> recurrent neural network for 9 | audio input. 10 | 11 | Args: 12 | num_layers (int): number of encoder layers. 13 | bidirectional (bool): bidirectional encoder. 14 | rnn_size (int): size of hidden states of the rnn. 15 | dropout (float): dropout probablity. 16 | sample_rate (float): input spec 17 | window_size (int): input spec 18 | 19 | """ 20 | def __init__(self, num_layers, bidirectional, rnn_size, dropout, 21 | sample_rate, window_size): 22 | super(AudioEncoder, self).__init__() 23 | self.num_layers = num_layers 24 | self.num_directions = 2 if bidirectional else 1 25 | self.hidden_size = rnn_size 26 | 27 | self.layer1 = nn.Conv2d(1, 32, kernel_size=(41, 11), 28 | padding=(0, 10), stride=(2, 2)) 29 | self.batch_norm1 = nn.BatchNorm2d(32) 30 | self.layer2 = nn.Conv2d(32, 32, kernel_size=(21, 11), 31 | padding=(0, 0), stride=(2, 1)) 32 | self.batch_norm2 = nn.BatchNorm2d(32) 33 | 34 | input_size = int(math.floor((sample_rate * window_size) / 2) + 1) 35 | input_size = int(math.floor(input_size - 41) / 2 + 1) 36 | input_size = int(math.floor(input_size - 21) / 2 + 1) 37 | input_size *= 32 38 | self.rnn = nn.LSTM(input_size, rnn_size, 39 | num_layers=num_layers, 40 | dropout=dropout, 41 | bidirectional=bidirectional) 42 | 43 | def load_pretrained_vectors(self, opt): 44 | # Pass in needed options only when modify function definition. 
45 | pass 46 | 47 | def forward(self, input, lengths=None): 48 | "See :obj:`onmt.modules.EncoderBase.forward()`" 49 | # (batch_size, 1, nfft, t) 50 | # layer 1 51 | input = self.batch_norm1(self.layer1(input[:, :, :, :])) 52 | 53 | # (batch_size, 32, nfft/2, t/2) 54 | input = F.hardtanh(input, 0, 20, inplace=True) 55 | 56 | # (batch_size, 32, nfft/2/2, t/2) 57 | # layer 2 58 | input = self.batch_norm2(self.layer2(input)) 59 | 60 | # (batch_size, 32, nfft/2/2, t/2) 61 | input = F.hardtanh(input, 0, 20, inplace=True) 62 | 63 | batch_size = input.size(0) 64 | length = input.size(3) 65 | input = input.view(batch_size, -1, length) 66 | input = input.transpose(0, 2).transpose(1, 2) 67 | 68 | output, hidden = self.rnn(input) 69 | 70 | return hidden, output 71 | -------------------------------------------------------------------------------- /D_pretrain/onmt/modules/ConvMultiStepAttention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from onmt.Utils import aeq 5 | 6 | 7 | SCALE_WEIGHT = 0.5 ** 0.5 8 | 9 | 10 | def seq_linear(linear, x): 11 | # linear transform for 3-d tensor 12 | batch, hidden_size, length, _ = x.size() 13 | h = linear(torch.transpose(x, 1, 2).contiguous().view( 14 | batch * length, hidden_size)) 15 | return torch.transpose(h.view(batch, length, hidden_size, 1), 1, 2) 16 | 17 | 18 | class ConvMultiStepAttention(nn.Module): 19 | """ 20 | 21 | Conv attention takes a key matrix, a value matrix and a query vector. 22 | Attention weight is calculated by key matrix with the query vector 23 | and sum on the value matrix. And the same operation is applied 24 | in each decode conv layer. 25 | 26 | """ 27 | 28 | def __init__(self, input_size): 29 | super(ConvMultiStepAttention, self).__init__() 30 | self.linear_in = nn.Linear(input_size, input_size) 31 | self.mask = None 32 | 33 | def apply_mask(self, mask): 34 | self.mask = mask 35 | 36 | def forward(self, base_target_emb, input, encoder_out_top, 37 | encoder_out_combine): 38 | """ 39 | Args: 40 | base_target_emb: target emb tensor 41 | input: output of decode conv 42 | encoder_out_t: the key matrix for calculation of attetion weight, 43 | which is the top output of encode conv 44 | encoder_out_combine: 45 | the value matrix for the attention-weighted sum, 46 | which is the combination of base emb and top output of encode 47 | 48 | """ 49 | # checks 50 | batch, channel, height, width = base_target_emb.size() 51 | batch_, channel_, height_, width_ = input.size() 52 | aeq(batch, batch_) 53 | aeq(height, height_) 54 | 55 | enc_batch, enc_channel, enc_height = encoder_out_top.size() 56 | enc_batch_, enc_channel_, enc_height_ = encoder_out_combine.size() 57 | 58 | aeq(enc_batch, enc_batch_) 59 | aeq(enc_height, enc_height_) 60 | 61 | preatt = seq_linear(self.linear_in, input) 62 | target = (base_target_emb + preatt) * SCALE_WEIGHT 63 | target = torch.squeeze(target, 3) 64 | target = torch.transpose(target, 1, 2) 65 | pre_attn = torch.bmm(target, encoder_out_top) 66 | 67 | if self.mask is not None: 68 | pre_attn.data.masked_fill_(self.mask, -float('inf')) 69 | 70 | pre_attn = pre_attn.transpose(0, 2) 71 | attn = F.softmax(pre_attn) 72 | attn = attn.transpose(0, 2).contiguous() 73 | context_output = torch.bmm( 74 | attn, torch.transpose(encoder_out_combine, 1, 2)) 75 | context_output = torch.transpose( 76 | torch.unsqueeze(context_output, 3), 1, 2) 77 | return context_output, attn 78 | 
-------------------------------------------------------------------------------- /G_pretrain/onmt/modules/ConvMultiStepAttention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from onmt.Utils import aeq 5 | 6 | 7 | SCALE_WEIGHT = 0.5 ** 0.5 8 | 9 | 10 | def seq_linear(linear, x): 11 | # linear transform for 3-d tensor 12 | batch, hidden_size, length, _ = x.size() 13 | h = linear(torch.transpose(x, 1, 2).contiguous().view( 14 | batch * length, hidden_size)) 15 | return torch.transpose(h.view(batch, length, hidden_size, 1), 1, 2) 16 | 17 | 18 | class ConvMultiStepAttention(nn.Module): 19 | """ 20 | 21 | Conv attention takes a key matrix, a value matrix and a query vector. 22 | Attention weight is calculated by key matrix with the query vector 23 | and sum on the value matrix. And the same operation is applied 24 | in each decode conv layer. 25 | 26 | """ 27 | 28 | def __init__(self, input_size): 29 | super(ConvMultiStepAttention, self).__init__() 30 | self.linear_in = nn.Linear(input_size, input_size) 31 | self.mask = None 32 | 33 | def apply_mask(self, mask): 34 | self.mask = mask 35 | 36 | def forward(self, base_target_emb, input, encoder_out_top, 37 | encoder_out_combine): 38 | """ 39 | Args: 40 | base_target_emb: target emb tensor 41 | input: output of decode conv 42 | encoder_out_t: the key matrix for calculation of attetion weight, 43 | which is the top output of encode conv 44 | encoder_out_combine: 45 | the value matrix for the attention-weighted sum, 46 | which is the combination of base emb and top output of encode 47 | 48 | """ 49 | # checks 50 | batch, channel, height, width = base_target_emb.size() 51 | batch_, channel_, height_, width_ = input.size() 52 | aeq(batch, batch_) 53 | aeq(height, height_) 54 | 55 | enc_batch, enc_channel, enc_height = encoder_out_top.size() 56 | enc_batch_, enc_channel_, enc_height_ = encoder_out_combine.size() 57 | 58 | aeq(enc_batch, enc_batch_) 59 | aeq(enc_height, enc_height_) 60 | 61 | preatt = seq_linear(self.linear_in, input) 62 | target = (base_target_emb + preatt) * SCALE_WEIGHT 63 | target = torch.squeeze(target, 3) 64 | target = torch.transpose(target, 1, 2) 65 | pre_attn = torch.bmm(target, encoder_out_top) 66 | 67 | if self.mask is not None: 68 | pre_attn.data.masked_fill_(self.mask, -float('inf')) 69 | 70 | pre_attn = pre_attn.transpose(0, 2) 71 | attn = F.softmax(pre_attn) 72 | attn = attn.transpose(0, 2).contiguous() 73 | context_output = torch.bmm( 74 | attn, torch.transpose(encoder_out_combine, 1, 2)) 75 | context_output = torch.transpose( 76 | torch.unsqueeze(context_output, 3), 1, 2) 77 | return context_output, attn 78 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/modules/ConvMultiStepAttention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from onmt.Utils import aeq 5 | 6 | 7 | SCALE_WEIGHT = 0.5 ** 0.5 8 | 9 | 10 | def seq_linear(linear, x): 11 | # linear transform for 3-d tensor 12 | batch, hidden_size, length, _ = x.size() 13 | h = linear(torch.transpose(x, 1, 2).contiguous().view( 14 | batch * length, hidden_size)) 15 | return torch.transpose(h.view(batch, length, hidden_size, 1), 1, 2) 16 | 17 | 18 | class ConvMultiStepAttention(nn.Module): 19 | """ 20 | 21 | Conv attention takes a key matrix, a value matrix and a query vector. 
22 | Attention weight is calculated by key matrix with the query vector 23 | and sum on the value matrix. And the same operation is applied 24 | in each decode conv layer. 25 | 26 | """ 27 | 28 | def __init__(self, input_size): 29 | super(ConvMultiStepAttention, self).__init__() 30 | self.linear_in = nn.Linear(input_size, input_size) 31 | self.mask = None 32 | 33 | def apply_mask(self, mask): 34 | self.mask = mask 35 | 36 | def forward(self, base_target_emb, input, encoder_out_top, 37 | encoder_out_combine): 38 | """ 39 | Args: 40 | base_target_emb: target emb tensor 41 | input: output of decode conv 42 | encoder_out_t: the key matrix for calculation of attetion weight, 43 | which is the top output of encode conv 44 | encoder_out_combine: 45 | the value matrix for the attention-weighted sum, 46 | which is the combination of base emb and top output of encode 47 | 48 | """ 49 | # checks 50 | batch, channel, height, width = base_target_emb.size() 51 | batch_, channel_, height_, width_ = input.size() 52 | aeq(batch, batch_) 53 | aeq(height, height_) 54 | 55 | enc_batch, enc_channel, enc_height = encoder_out_top.size() 56 | enc_batch_, enc_channel_, enc_height_ = encoder_out_combine.size() 57 | 58 | aeq(enc_batch, enc_batch_) 59 | aeq(enc_height, enc_height_) 60 | 61 | preatt = seq_linear(self.linear_in, input) 62 | target = (base_target_emb + preatt) * SCALE_WEIGHT 63 | target = torch.squeeze(target, 3) 64 | target = torch.transpose(target, 1, 2) 65 | pre_attn = torch.bmm(target, encoder_out_top) 66 | 67 | if self.mask is not None: 68 | pre_attn.data.masked_fill_(self.mask, -float('inf')) 69 | 70 | pre_attn = pre_attn.transpose(0, 2) 71 | attn = F.softmax(pre_attn) 72 | attn = attn.transpose(0, 2).contiguous() 73 | context_output = torch.bmm( 74 | attn, torch.transpose(encoder_out_combine, 1, 2)) 75 | context_output = torch.transpose( 76 | torch.unsqueeze(context_output, 3), 1, 2) 77 | return context_output, attn 78 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/modules/ConvMultiStepAttention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from onmt.Utils import aeq 5 | 6 | 7 | SCALE_WEIGHT = 0.5 ** 0.5 8 | 9 | 10 | def seq_linear(linear, x): 11 | # linear transform for 3-d tensor 12 | batch, hidden_size, length, _ = x.size() 13 | h = linear(torch.transpose(x, 1, 2).contiguous().view( 14 | batch * length, hidden_size)) 15 | return torch.transpose(h.view(batch, length, hidden_size, 1), 1, 2) 16 | 17 | 18 | class ConvMultiStepAttention(nn.Module): 19 | """ 20 | 21 | Conv attention takes a key matrix, a value matrix and a query vector. 22 | Attention weight is calculated by key matrix with the query vector 23 | and sum on the value matrix. And the same operation is applied 24 | in each decode conv layer. 
25 | 26 | """ 27 | 28 | def __init__(self, input_size): 29 | super(ConvMultiStepAttention, self).__init__() 30 | self.linear_in = nn.Linear(input_size, input_size) 31 | self.mask = None 32 | 33 | def apply_mask(self, mask): 34 | self.mask = mask 35 | 36 | def forward(self, base_target_emb, input, encoder_out_top, 37 | encoder_out_combine): 38 | """ 39 | Args: 40 | base_target_emb: target emb tensor 41 | input: output of decode conv 42 | encoder_out_t: the key matrix for calculation of attetion weight, 43 | which is the top output of encode conv 44 | encoder_out_combine: 45 | the value matrix for the attention-weighted sum, 46 | which is the combination of base emb and top output of encode 47 | 48 | """ 49 | # checks 50 | batch, channel, height, width = base_target_emb.size() 51 | batch_, channel_, height_, width_ = input.size() 52 | aeq(batch, batch_) 53 | aeq(height, height_) 54 | 55 | enc_batch, enc_channel, enc_height = encoder_out_top.size() 56 | enc_batch_, enc_channel_, enc_height_ = encoder_out_combine.size() 57 | 58 | aeq(enc_batch, enc_batch_) 59 | aeq(enc_height, enc_height_) 60 | 61 | preatt = seq_linear(self.linear_in, input) 62 | target = (base_target_emb + preatt) * SCALE_WEIGHT 63 | target = torch.squeeze(target, 3) 64 | target = torch.transpose(target, 1, 2) 65 | pre_attn = torch.bmm(target, encoder_out_top) 66 | 67 | if self.mask is not None: 68 | pre_attn.data.masked_fill_(self.mask, -float('inf')) 69 | 70 | pre_attn = pre_attn.transpose(0, 2) 71 | attn = F.softmax(pre_attn) 72 | attn = attn.transpose(0, 2).contiguous() 73 | context_output = torch.bmm( 74 | attn, torch.transpose(encoder_out_combine, 1, 2)) 75 | context_output = torch.transpose( 76 | torch.unsqueeze(context_output, 3), 1, 2) 77 | return context_output, attn 78 | -------------------------------------------------------------------------------- /D_pretrain/onmt/modules/UtilClass.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class Bottle(nn.Module): 6 | def forward(self, input): 7 | if len(input.size()) <= 2: 8 | return super(Bottle, self).forward(input) 9 | size = input.size()[:2] 10 | out = super(Bottle, self).forward(input.view(size[0]*size[1], -1)) 11 | return out.contiguous().view(size[0], size[1], -1) 12 | 13 | 14 | class Bottle2(nn.Module): 15 | def forward(self, input): 16 | if len(input.size()) <= 3: 17 | return super(Bottle2, self).forward(input) 18 | size = input.size() 19 | out = super(Bottle2, self).forward(input.view(size[0]*size[1], 20 | size[2], size[3])) 21 | return out.contiguous().view(size[0], size[1], size[2], size[3]) 22 | 23 | 24 | class LayerNorm(nn.Module): 25 | ''' Layer normalization module ''' 26 | 27 | def __init__(self, d_hid, eps=1e-3): 28 | super(LayerNorm, self).__init__() 29 | 30 | self.eps = eps 31 | self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True) 32 | self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True) 33 | 34 | def forward(self, z): 35 | if z.size(1) == 1: 36 | return z 37 | mu = torch.mean(z, dim=1) 38 | sigma = torch.std(z, dim=1) 39 | # HACK. 
PyTorch is changing behavior 40 | if mu.dim() == 1: 41 | mu = mu.unsqueeze(1) 42 | sigma = sigma.unsqueeze(1) 43 | ln_out = (z - mu.expand_as(z)) / (sigma.expand_as(z) + self.eps) 44 | ln_out = ln_out.mul(self.a_2.expand_as(ln_out)) \ 45 | + self.b_2.expand_as(ln_out) 46 | return ln_out 47 | 48 | 49 | class BottleLinear(Bottle, nn.Linear): 50 | pass 51 | 52 | 53 | class BottleLayerNorm(Bottle, LayerNorm): 54 | pass 55 | 56 | 57 | class BottleSoftmax(Bottle, nn.Softmax): 58 | pass 59 | 60 | 61 | class Elementwise(nn.ModuleList): 62 | """ 63 | A simple network container. 64 | Parameters are a list of modules. 65 | Inputs are a 3d Variable whose last dimension is the same length 66 | as the list. 67 | Outputs are the result of applying modules to inputs elementwise. 68 | An optional merge parameter allows the outputs to be reduced to a 69 | single Variable. 70 | """ 71 | 72 | def __init__(self, merge=None, *args): 73 | assert merge in [None, 'first', 'concat', 'sum', 'mlp'] 74 | self.merge = merge 75 | super(Elementwise, self).__init__(*args) 76 | 77 | def forward(self, input): 78 | inputs = [feat.squeeze(2) for feat in input.split(1, dim=2)] 79 | assert len(self) == len(inputs) 80 | outputs = [f(x) for f, x in zip(self, inputs)] 81 | if self.merge == 'first': 82 | return outputs[0] 83 | elif self.merge == 'concat' or self.merge == 'mlp': 84 | return torch.cat(outputs, 2) 85 | elif self.merge == 'sum': 86 | return sum(outputs) 87 | else: 88 | return outputs 89 | -------------------------------------------------------------------------------- /G_pretrain/onmt/modules/UtilClass.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class Bottle(nn.Module): 6 | def forward(self, input): 7 | if len(input.size()) <= 2: 8 | return super(Bottle, self).forward(input) 9 | size = input.size()[:2] 10 | out = super(Bottle, self).forward(input.view(size[0]*size[1], -1)) 11 | return out.contiguous().view(size[0], size[1], -1) 12 | 13 | 14 | class Bottle2(nn.Module): 15 | def forward(self, input): 16 | if len(input.size()) <= 3: 17 | return super(Bottle2, self).forward(input) 18 | size = input.size() 19 | out = super(Bottle2, self).forward(input.view(size[0]*size[1], 20 | size[2], size[3])) 21 | return out.contiguous().view(size[0], size[1], size[2], size[3]) 22 | 23 | 24 | class LayerNorm(nn.Module): 25 | ''' Layer normalization module ''' 26 | 27 | def __init__(self, d_hid, eps=1e-3): 28 | super(LayerNorm, self).__init__() 29 | 30 | self.eps = eps 31 | self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True) 32 | self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True) 33 | 34 | def forward(self, z): 35 | if z.size(1) == 1: 36 | return z 37 | mu = torch.mean(z, dim=1) 38 | sigma = torch.std(z, dim=1) 39 | # HACK. PyTorch is changing behavior 40 | if mu.dim() == 1: 41 | mu = mu.unsqueeze(1) 42 | sigma = sigma.unsqueeze(1) 43 | ln_out = (z - mu.expand_as(z)) / (sigma.expand_as(z) + self.eps) 44 | ln_out = ln_out.mul(self.a_2.expand_as(ln_out)) \ 45 | + self.b_2.expand_as(ln_out) 46 | return ln_out 47 | 48 | 49 | class BottleLinear(Bottle, nn.Linear): 50 | pass 51 | 52 | 53 | class BottleLayerNorm(Bottle, LayerNorm): 54 | pass 55 | 56 | 57 | class BottleSoftmax(Bottle, nn.Softmax): 58 | pass 59 | 60 | 61 | class Elementwise(nn.ModuleList): 62 | """ 63 | A simple network container. 64 | Parameters are a list of modules. 65 | Inputs are a 3d Variable whose last dimension is the same length 66 | as the list. 
67 | Outputs are the result of applying modules to inputs elementwise. 68 | An optional merge parameter allows the outputs to be reduced to a 69 | single Variable. 70 | """ 71 | 72 | def __init__(self, merge=None, *args): 73 | assert merge in [None, 'first', 'concat', 'sum', 'mlp'] 74 | self.merge = merge 75 | super(Elementwise, self).__init__(*args) 76 | 77 | def forward(self, input): 78 | inputs = [feat.squeeze(2) for feat in input.split(1, dim=2)] 79 | assert len(self) == len(inputs) 80 | outputs = [f(x) for f, x in zip(self, inputs)] 81 | if self.merge == 'first': 82 | return outputs[0] 83 | elif self.merge == 'concat' or self.merge == 'mlp': 84 | return torch.cat(outputs, 2) 85 | elif self.merge == 'sum': 86 | return sum(outputs) 87 | else: 88 | return outputs 89 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/modules/UtilClass.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class Bottle(nn.Module): 6 | def forward(self, input): 7 | if len(input.size()) <= 2: 8 | return super(Bottle, self).forward(input) 9 | size = input.size()[:2] 10 | out = super(Bottle, self).forward(input.view(size[0]*size[1], -1)) 11 | return out.contiguous().view(size[0], size[1], -1) 12 | 13 | 14 | class Bottle2(nn.Module): 15 | def forward(self, input): 16 | if len(input.size()) <= 3: 17 | return super(Bottle2, self).forward(input) 18 | size = input.size() 19 | out = super(Bottle2, self).forward(input.view(size[0]*size[1], 20 | size[2], size[3])) 21 | return out.contiguous().view(size[0], size[1], size[2], size[3]) 22 | 23 | 24 | class LayerNorm(nn.Module): 25 | ''' Layer normalization module ''' 26 | 27 | def __init__(self, d_hid, eps=1e-3): 28 | super(LayerNorm, self).__init__() 29 | 30 | self.eps = eps 31 | self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True) 32 | self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True) 33 | 34 | def forward(self, z): 35 | if z.size(1) == 1: 36 | return z 37 | mu = torch.mean(z, dim=1) 38 | sigma = torch.std(z, dim=1) 39 | # HACK. PyTorch is changing behavior 40 | if mu.dim() == 1: 41 | mu = mu.unsqueeze(1) 42 | sigma = sigma.unsqueeze(1) 43 | ln_out = (z - mu.expand_as(z)) / (sigma.expand_as(z) + self.eps) 44 | ln_out = ln_out.mul(self.a_2.expand_as(ln_out)) \ 45 | + self.b_2.expand_as(ln_out) 46 | return ln_out 47 | 48 | 49 | class BottleLinear(Bottle, nn.Linear): 50 | pass 51 | 52 | 53 | class BottleLayerNorm(Bottle, LayerNorm): 54 | pass 55 | 56 | 57 | class BottleSoftmax(Bottle, nn.Softmax): 58 | pass 59 | 60 | 61 | class Elementwise(nn.ModuleList): 62 | """ 63 | A simple network container. 64 | Parameters are a list of modules. 65 | Inputs are a 3d Variable whose last dimension is the same length 66 | as the list. 67 | Outputs are the result of applying modules to inputs elementwise. 68 | An optional merge parameter allows the outputs to be reduced to a 69 | single Variable. 
70 | """ 71 | 72 | def __init__(self, merge=None, *args): 73 | assert merge in [None, 'first', 'concat', 'sum', 'mlp'] 74 | self.merge = merge 75 | super(Elementwise, self).__init__(*args) 76 | 77 | def forward(self, input): 78 | inputs = [feat.squeeze(2) for feat in input.split(1, dim=2)] 79 | assert len(self) == len(inputs) 80 | outputs = [f(x) for f, x in zip(self, inputs)] 81 | if self.merge == 'first': 82 | return outputs[0] 83 | elif self.merge == 'concat' or self.merge == 'mlp': 84 | return torch.cat(outputs, 2) 85 | elif self.merge == 'sum': 86 | return sum(outputs) 87 | else: 88 | return outputs 89 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/modules/UtilClass.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class Bottle(nn.Module): 6 | def forward(self, input): 7 | if len(input.size()) <= 2: 8 | return super(Bottle, self).forward(input) 9 | size = input.size()[:2] 10 | out = super(Bottle, self).forward(input.view(size[0]*size[1], -1)) 11 | return out.contiguous().view(size[0], size[1], -1) 12 | 13 | 14 | class Bottle2(nn.Module): 15 | def forward(self, input): 16 | if len(input.size()) <= 3: 17 | return super(Bottle2, self).forward(input) 18 | size = input.size() 19 | out = super(Bottle2, self).forward(input.view(size[0]*size[1], 20 | size[2], size[3])) 21 | return out.contiguous().view(size[0], size[1], size[2], size[3]) 22 | 23 | 24 | class LayerNorm(nn.Module): 25 | ''' Layer normalization module ''' 26 | 27 | def __init__(self, d_hid, eps=1e-3): 28 | super(LayerNorm, self).__init__() 29 | 30 | self.eps = eps 31 | self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True) 32 | self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True) 33 | 34 | def forward(self, z): 35 | if z.size(1) == 1: 36 | return z 37 | mu = torch.mean(z, dim=1) 38 | sigma = torch.std(z, dim=1) 39 | # HACK. PyTorch is changing behavior 40 | if mu.dim() == 1: 41 | mu = mu.unsqueeze(1) 42 | sigma = sigma.unsqueeze(1) 43 | ln_out = (z - mu.expand_as(z)) / (sigma.expand_as(z) + self.eps) 44 | ln_out = ln_out.mul(self.a_2.expand_as(ln_out)) \ 45 | + self.b_2.expand_as(ln_out) 46 | return ln_out 47 | 48 | 49 | class BottleLinear(Bottle, nn.Linear): 50 | pass 51 | 52 | 53 | class BottleLayerNorm(Bottle, LayerNorm): 54 | pass 55 | 56 | 57 | class BottleSoftmax(Bottle, nn.Softmax): 58 | pass 59 | 60 | 61 | class Elementwise(nn.ModuleList): 62 | """ 63 | A simple network container. 64 | Parameters are a list of modules. 65 | Inputs are a 3d Variable whose last dimension is the same length 66 | as the list. 67 | Outputs are the result of applying modules to inputs elementwise. 68 | An optional merge parameter allows the outputs to be reduced to a 69 | single Variable. 
70 | """ 71 | 72 | def __init__(self, merge=None, *args): 73 | assert merge in [None, 'first', 'concat', 'sum', 'mlp'] 74 | self.merge = merge 75 | super(Elementwise, self).__init__(*args) 76 | 77 | def forward(self, input): 78 | inputs = [feat.squeeze(2) for feat in input.split(1, dim=2)] 79 | assert len(self) == len(inputs) 80 | outputs = [f(x) for f, x in zip(self, inputs)] 81 | if self.merge == 'first': 82 | return outputs[0] 83 | elif self.merge == 'concat' or self.merge == 'mlp': 84 | return torch.cat(outputs, 2) 85 | elif self.merge == 'sum': 86 | return sum(outputs) 87 | else: 88 | return outputs 89 | -------------------------------------------------------------------------------- /D_pretrain/onmt/modules/Gate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def context_gate_factory(type, embeddings_size, decoder_size, 6 | attention_size, output_size): 7 | """Returns the correct ContextGate class""" 8 | 9 | gate_types = {'source': SourceContextGate, 10 | 'target': TargetContextGate, 11 | 'both': BothContextGate} 12 | 13 | assert type in gate_types, "Not valid ContextGate type: {0}".format(type) 14 | return gate_types[type](embeddings_size, decoder_size, attention_size, 15 | output_size) 16 | 17 | 18 | class ContextGate(nn.Module): 19 | """ 20 | Context gate is a decoder module that takes as input the previous word 21 | embedding, the current decoder state and the attention state, and 22 | produces a gate. 23 | The gate can be used to select the input from the target side context 24 | (decoder state), from the source context (attention state) or both. 25 | """ 26 | def __init__(self, embeddings_size, decoder_size, 27 | attention_size, output_size): 28 | super(ContextGate, self).__init__() 29 | input_size = embeddings_size + decoder_size + attention_size 30 | self.gate = nn.Linear(input_size, output_size, bias=True) 31 | self.sig = nn.Sigmoid() 32 | self.source_proj = nn.Linear(attention_size, output_size) 33 | self.target_proj = nn.Linear(embeddings_size + decoder_size, 34 | output_size) 35 | 36 | def forward(self, prev_emb, dec_state, attn_state): 37 | input_tensor = torch.cat((prev_emb, dec_state, attn_state), dim=1) 38 | z = self.sig(self.gate(input_tensor)) 39 | proj_source = self.source_proj(attn_state) 40 | proj_target = self.target_proj( 41 | torch.cat((prev_emb, dec_state), dim=1)) 42 | return z, proj_source, proj_target 43 | 44 | 45 | class SourceContextGate(nn.Module): 46 | """Apply the context gate only to the source context""" 47 | 48 | def __init__(self, embeddings_size, decoder_size, 49 | attention_size, output_size): 50 | super(SourceContextGate, self).__init__() 51 | self.context_gate = ContextGate(embeddings_size, decoder_size, 52 | attention_size, output_size) 53 | self.tanh = nn.Tanh() 54 | 55 | def forward(self, prev_emb, dec_state, attn_state): 56 | z, source, target = self.context_gate( 57 | prev_emb, dec_state, attn_state) 58 | return self.tanh(target + z * source) 59 | 60 | 61 | class TargetContextGate(nn.Module): 62 | """Apply the context gate only to the target context""" 63 | 64 | def __init__(self, embeddings_size, decoder_size, 65 | attention_size, output_size): 66 | super(TargetContextGate, self).__init__() 67 | self.context_gate = ContextGate(embeddings_size, decoder_size, 68 | attention_size, output_size) 69 | self.tanh = nn.Tanh() 70 | 71 | def forward(self, prev_emb, dec_state, attn_state): 72 | z, source, target = self.context_gate(prev_emb, dec_state, 
attn_state) 73 | return self.tanh(z * target + source) 74 | 75 | 76 | class BothContextGate(nn.Module): 77 | """Apply the context gate to both contexts""" 78 | 79 | def __init__(self, embeddings_size, decoder_size, 80 | attention_size, output_size): 81 | super(BothContextGate, self).__init__() 82 | self.context_gate = ContextGate(embeddings_size, decoder_size, 83 | attention_size, output_size) 84 | self.tanh = nn.Tanh() 85 | 86 | def forward(self, prev_emb, dec_state, attn_state): 87 | z, source, target = self.context_gate(prev_emb, dec_state, attn_state) 88 | return self.tanh((1. - z) * target + z * source) 89 | -------------------------------------------------------------------------------- /G_pretrain/onmt/modules/Gate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def context_gate_factory(type, embeddings_size, decoder_size, 6 | attention_size, output_size): 7 | """Returns the correct ContextGate class""" 8 | 9 | gate_types = {'source': SourceContextGate, 10 | 'target': TargetContextGate, 11 | 'both': BothContextGate} 12 | 13 | assert type in gate_types, "Not valid ContextGate type: {0}".format(type) 14 | return gate_types[type](embeddings_size, decoder_size, attention_size, 15 | output_size) 16 | 17 | 18 | class ContextGate(nn.Module): 19 | """ 20 | Context gate is a decoder module that takes as input the previous word 21 | embedding, the current decoder state and the attention state, and 22 | produces a gate. 23 | The gate can be used to select the input from the target side context 24 | (decoder state), from the source context (attention state) or both. 25 | """ 26 | def __init__(self, embeddings_size, decoder_size, 27 | attention_size, output_size): 28 | super(ContextGate, self).__init__() 29 | input_size = embeddings_size + decoder_size + attention_size 30 | self.gate = nn.Linear(input_size, output_size, bias=True) 31 | self.sig = nn.Sigmoid() 32 | self.source_proj = nn.Linear(attention_size, output_size) 33 | self.target_proj = nn.Linear(embeddings_size + decoder_size, 34 | output_size) 35 | 36 | def forward(self, prev_emb, dec_state, attn_state): 37 | input_tensor = torch.cat((prev_emb, dec_state, attn_state), dim=1) 38 | z = self.sig(self.gate(input_tensor)) 39 | proj_source = self.source_proj(attn_state) 40 | proj_target = self.target_proj( 41 | torch.cat((prev_emb, dec_state), dim=1)) 42 | return z, proj_source, proj_target 43 | 44 | 45 | class SourceContextGate(nn.Module): 46 | """Apply the context gate only to the source context""" 47 | 48 | def __init__(self, embeddings_size, decoder_size, 49 | attention_size, output_size): 50 | super(SourceContextGate, self).__init__() 51 | self.context_gate = ContextGate(embeddings_size, decoder_size, 52 | attention_size, output_size) 53 | self.tanh = nn.Tanh() 54 | 55 | def forward(self, prev_emb, dec_state, attn_state): 56 | z, source, target = self.context_gate( 57 | prev_emb, dec_state, attn_state) 58 | return self.tanh(target + z * source) 59 | 60 | 61 | class TargetContextGate(nn.Module): 62 | """Apply the context gate only to the target context""" 63 | 64 | def __init__(self, embeddings_size, decoder_size, 65 | attention_size, output_size): 66 | super(TargetContextGate, self).__init__() 67 | self.context_gate = ContextGate(embeddings_size, decoder_size, 68 | attention_size, output_size) 69 | self.tanh = nn.Tanh() 70 | 71 | def forward(self, prev_emb, dec_state, attn_state): 72 | z, source, target = self.context_gate(prev_emb, dec_state, 
attn_state) 73 | return self.tanh(z * target + source) 74 | 75 | 76 | class BothContextGate(nn.Module): 77 | """Apply the context gate to both contexts""" 78 | 79 | def __init__(self, embeddings_size, decoder_size, 80 | attention_size, output_size): 81 | super(BothContextGate, self).__init__() 82 | self.context_gate = ContextGate(embeddings_size, decoder_size, 83 | attention_size, output_size) 84 | self.tanh = nn.Tanh() 85 | 86 | def forward(self, prev_emb, dec_state, attn_state): 87 | z, source, target = self.context_gate(prev_emb, dec_state, attn_state) 88 | return self.tanh((1. - z) * target + z * source) 89 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/modules/Gate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def context_gate_factory(type, embeddings_size, decoder_size, 6 | attention_size, output_size): 7 | """Returns the correct ContextGate class""" 8 | 9 | gate_types = {'source': SourceContextGate, 10 | 'target': TargetContextGate, 11 | 'both': BothContextGate} 12 | 13 | assert type in gate_types, "Not valid ContextGate type: {0}".format(type) 14 | return gate_types[type](embeddings_size, decoder_size, attention_size, 15 | output_size) 16 | 17 | 18 | class ContextGate(nn.Module): 19 | """ 20 | Context gate is a decoder module that takes as input the previous word 21 | embedding, the current decoder state and the attention state, and 22 | produces a gate. 23 | The gate can be used to select the input from the target side context 24 | (decoder state), from the source context (attention state) or both. 25 | """ 26 | def __init__(self, embeddings_size, decoder_size, 27 | attention_size, output_size): 28 | super(ContextGate, self).__init__() 29 | input_size = embeddings_size + decoder_size + attention_size 30 | self.gate = nn.Linear(input_size, output_size, bias=True) 31 | self.sig = nn.Sigmoid() 32 | self.source_proj = nn.Linear(attention_size, output_size) 33 | self.target_proj = nn.Linear(embeddings_size + decoder_size, 34 | output_size) 35 | 36 | def forward(self, prev_emb, dec_state, attn_state): 37 | input_tensor = torch.cat((prev_emb, dec_state, attn_state), dim=1) 38 | z = self.sig(self.gate(input_tensor)) 39 | proj_source = self.source_proj(attn_state) 40 | proj_target = self.target_proj( 41 | torch.cat((prev_emb, dec_state), dim=1)) 42 | return z, proj_source, proj_target 43 | 44 | 45 | class SourceContextGate(nn.Module): 46 | """Apply the context gate only to the source context""" 47 | 48 | def __init__(self, embeddings_size, decoder_size, 49 | attention_size, output_size): 50 | super(SourceContextGate, self).__init__() 51 | self.context_gate = ContextGate(embeddings_size, decoder_size, 52 | attention_size, output_size) 53 | self.tanh = nn.Tanh() 54 | 55 | def forward(self, prev_emb, dec_state, attn_state): 56 | z, source, target = self.context_gate( 57 | prev_emb, dec_state, attn_state) 58 | return self.tanh(target + z * source) 59 | 60 | 61 | class TargetContextGate(nn.Module): 62 | """Apply the context gate only to the target context""" 63 | 64 | def __init__(self, embeddings_size, decoder_size, 65 | attention_size, output_size): 66 | super(TargetContextGate, self).__init__() 67 | self.context_gate = ContextGate(embeddings_size, decoder_size, 68 | attention_size, output_size) 69 | self.tanh = nn.Tanh() 70 | 71 | def forward(self, prev_emb, dec_state, attn_state): 72 | z, source, target = self.context_gate(prev_emb, dec_state, 
attn_state) 73 | return self.tanh(z * target + source) 74 | 75 | 76 | class BothContextGate(nn.Module): 77 | """Apply the context gate to both contexts""" 78 | 79 | def __init__(self, embeddings_size, decoder_size, 80 | attention_size, output_size): 81 | super(BothContextGate, self).__init__() 82 | self.context_gate = ContextGate(embeddings_size, decoder_size, 83 | attention_size, output_size) 84 | self.tanh = nn.Tanh() 85 | 86 | def forward(self, prev_emb, dec_state, attn_state): 87 | z, source, target = self.context_gate(prev_emb, dec_state, attn_state) 88 | return self.tanh((1. - z) * target + z * source) 89 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/modules/Gate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def context_gate_factory(type, embeddings_size, decoder_size, 6 | attention_size, output_size): 7 | """Returns the correct ContextGate class""" 8 | 9 | gate_types = {'source': SourceContextGate, 10 | 'target': TargetContextGate, 11 | 'both': BothContextGate} 12 | 13 | assert type in gate_types, "Not valid ContextGate type: {0}".format(type) 14 | return gate_types[type](embeddings_size, decoder_size, attention_size, 15 | output_size) 16 | 17 | 18 | class ContextGate(nn.Module): 19 | """ 20 | Context gate is a decoder module that takes as input the previous word 21 | embedding, the current decoder state and the attention state, and 22 | produces a gate. 23 | The gate can be used to select the input from the target side context 24 | (decoder state), from the source context (attention state) or both. 25 | """ 26 | def __init__(self, embeddings_size, decoder_size, 27 | attention_size, output_size): 28 | super(ContextGate, self).__init__() 29 | input_size = embeddings_size + decoder_size + attention_size 30 | self.gate = nn.Linear(input_size, output_size, bias=True) 31 | self.sig = nn.Sigmoid() 32 | self.source_proj = nn.Linear(attention_size, output_size) 33 | self.target_proj = nn.Linear(embeddings_size + decoder_size, 34 | output_size) 35 | 36 | def forward(self, prev_emb, dec_state, attn_state): 37 | input_tensor = torch.cat((prev_emb, dec_state, attn_state), dim=1) 38 | z = self.sig(self.gate(input_tensor)) 39 | proj_source = self.source_proj(attn_state) 40 | proj_target = self.target_proj( 41 | torch.cat((prev_emb, dec_state), dim=1)) 42 | return z, proj_source, proj_target 43 | 44 | 45 | class SourceContextGate(nn.Module): 46 | """Apply the context gate only to the source context""" 47 | 48 | def __init__(self, embeddings_size, decoder_size, 49 | attention_size, output_size): 50 | super(SourceContextGate, self).__init__() 51 | self.context_gate = ContextGate(embeddings_size, decoder_size, 52 | attention_size, output_size) 53 | self.tanh = nn.Tanh() 54 | 55 | def forward(self, prev_emb, dec_state, attn_state): 56 | z, source, target = self.context_gate( 57 | prev_emb, dec_state, attn_state) 58 | return self.tanh(target + z * source) 59 | 60 | 61 | class TargetContextGate(nn.Module): 62 | """Apply the context gate only to the target context""" 63 | 64 | def __init__(self, embeddings_size, decoder_size, 65 | attention_size, output_size): 66 | super(TargetContextGate, self).__init__() 67 | self.context_gate = ContextGate(embeddings_size, decoder_size, 68 | attention_size, output_size) 69 | self.tanh = nn.Tanh() 70 | 71 | def forward(self, prev_emb, dec_state, attn_state): 72 | z, source, target = self.context_gate(prev_emb, 
dec_state, attn_state) 73 | return self.tanh(z * target + source) 74 | 75 | 76 | class BothContextGate(nn.Module): 77 | """Apply the context gate to both contexts""" 78 | 79 | def __init__(self, embeddings_size, decoder_size, 80 | attention_size, output_size): 81 | super(BothContextGate, self).__init__() 82 | self.context_gate = ContextGate(embeddings_size, decoder_size, 83 | attention_size, output_size) 84 | self.tanh = nn.Tanh() 85 | 86 | def forward(self, prev_emb, dec_state, attn_state): 87 | z, source, target = self.context_gate(prev_emb, dec_state, attn_state) 88 | return self.tanh((1. - z) * target + z * source) 89 | -------------------------------------------------------------------------------- /D_pretrain/onmt/modules/ImageEncoder.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import torch 4 | from torch.autograd import Variable 5 | 6 | 7 | class ImageEncoder(nn.Module): 8 | """ 9 | A simple encoder convolutional -> recurrent neural network for 10 | image input. 11 | 12 | Args: 13 | num_layers (int): number of encoder layers. 14 | bidirectional (bool): bidirectional encoder. 15 | rnn_size (int): size of hidden states of the rnn. 16 | dropout (float): dropout probablity. 17 | """ 18 | def __init__(self, num_layers, bidirectional, rnn_size, dropout): 19 | super(ImageEncoder, self).__init__() 20 | self.num_layers = num_layers 21 | self.num_directions = 2 if bidirectional else 1 22 | self.hidden_size = rnn_size 23 | 24 | self.layer1 = nn.Conv2d(3, 64, kernel_size=(3, 3), 25 | padding=(1, 1), stride=(1, 1)) 26 | self.layer2 = nn.Conv2d(64, 128, kernel_size=(3, 3), 27 | padding=(1, 1), stride=(1, 1)) 28 | self.layer3 = nn.Conv2d(128, 256, kernel_size=(3, 3), 29 | padding=(1, 1), stride=(1, 1)) 30 | self.layer4 = nn.Conv2d(256, 256, kernel_size=(3, 3), 31 | padding=(1, 1), stride=(1, 1)) 32 | self.layer5 = nn.Conv2d(256, 512, kernel_size=(3, 3), 33 | padding=(1, 1), stride=(1, 1)) 34 | self.layer6 = nn.Conv2d(512, 512, kernel_size=(3, 3), 35 | padding=(1, 1), stride=(1, 1)) 36 | 37 | self.batch_norm1 = nn.BatchNorm2d(256) 38 | self.batch_norm2 = nn.BatchNorm2d(512) 39 | self.batch_norm3 = nn.BatchNorm2d(512) 40 | 41 | input_size = 512 42 | self.rnn = nn.LSTM(input_size, rnn_size, 43 | num_layers=num_layers, 44 | dropout=dropout, 45 | bidirectional=bidirectional) 46 | self.pos_lut = nn.Embedding(1000, input_size) 47 | 48 | def load_pretrained_vectors(self, opt): 49 | # Pass in needed options only when modify function definition. 
50 | pass 51 | 52 | def forward(self, input, lengths=None): 53 | "See :obj:`onmt.modules.EncoderBase.forward()`" 54 | 55 | batch_size = input.size(0) 56 | # (batch_size, 64, imgH, imgW) 57 | # layer 1 58 | input = F.relu(self.layer1(input[:, :, :, :]-0.5), True) 59 | 60 | # (batch_size, 64, imgH/2, imgW/2) 61 | input = F.max_pool2d(input, kernel_size=(2, 2), stride=(2, 2)) 62 | 63 | # (batch_size, 128, imgH/2, imgW/2) 64 | # layer 2 65 | input = F.relu(self.layer2(input), True) 66 | 67 | # (batch_size, 128, imgH/2/2, imgW/2/2) 68 | input = F.max_pool2d(input, kernel_size=(2, 2), stride=(2, 2)) 69 | 70 | # (batch_size, 256, imgH/2/2, imgW/2/2) 71 | # layer 3 72 | # batch norm 1 73 | input = F.relu(self.batch_norm1(self.layer3(input)), True) 74 | 75 | # (batch_size, 256, imgH/2/2, imgW/2/2) 76 | # layer4 77 | input = F.relu(self.layer4(input), True) 78 | 79 | # (batch_size, 256, imgH/2/2/2, imgW/2/2) 80 | input = F.max_pool2d(input, kernel_size=(1, 2), stride=(1, 2)) 81 | 82 | # (batch_size, 512, imgH/2/2/2, imgW/2/2) 83 | # layer 5 84 | # batch norm 2 85 | input = F.relu(self.batch_norm2(self.layer5(input)), True) 86 | 87 | # (batch_size, 512, imgH/2/2/2, imgW/2/2/2) 88 | input = F.max_pool2d(input, kernel_size=(2, 1), stride=(2, 1)) 89 | 90 | # (batch_size, 512, imgH/2/2/2, imgW/2/2/2) 91 | input = F.relu(self.batch_norm3(self.layer6(input)), True) 92 | 93 | # # (batch_size, 512, H, W) 94 | all_outputs = [] 95 | for row in range(input.size(2)): 96 | inp = input[:, :, row, :].transpose(0, 2)\ 97 | .transpose(1, 2) 98 | row_vec = torch.Tensor(batch_size).type_as(inp.data)\ 99 | .long().fill_(row) 100 | pos_emb = self.pos_lut(Variable(row_vec)) 101 | with_pos = torch.cat( 102 | (pos_emb.view(1, pos_emb.size(0), pos_emb.size(1)), inp), 0) 103 | outputs, hidden_t = self.rnn(with_pos) 104 | all_outputs.append(outputs) 105 | out = torch.cat(all_outputs, 0) 106 | 107 | return hidden_t, out 108 | -------------------------------------------------------------------------------- /G_pretrain/onmt/modules/ImageEncoder.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import torch 4 | from torch.autograd import Variable 5 | 6 | 7 | class ImageEncoder(nn.Module): 8 | """ 9 | A simple encoder convolutional -> recurrent neural network for 10 | image input. 11 | 12 | Args: 13 | num_layers (int): number of encoder layers. 14 | bidirectional (bool): bidirectional encoder. 15 | rnn_size (int): size of hidden states of the rnn. 16 | dropout (float): dropout probablity. 
17 | """ 18 | def __init__(self, num_layers, bidirectional, rnn_size, dropout): 19 | super(ImageEncoder, self).__init__() 20 | self.num_layers = num_layers 21 | self.num_directions = 2 if bidirectional else 1 22 | self.hidden_size = rnn_size 23 | 24 | self.layer1 = nn.Conv2d(3, 64, kernel_size=(3, 3), 25 | padding=(1, 1), stride=(1, 1)) 26 | self.layer2 = nn.Conv2d(64, 128, kernel_size=(3, 3), 27 | padding=(1, 1), stride=(1, 1)) 28 | self.layer3 = nn.Conv2d(128, 256, kernel_size=(3, 3), 29 | padding=(1, 1), stride=(1, 1)) 30 | self.layer4 = nn.Conv2d(256, 256, kernel_size=(3, 3), 31 | padding=(1, 1), stride=(1, 1)) 32 | self.layer5 = nn.Conv2d(256, 512, kernel_size=(3, 3), 33 | padding=(1, 1), stride=(1, 1)) 34 | self.layer6 = nn.Conv2d(512, 512, kernel_size=(3, 3), 35 | padding=(1, 1), stride=(1, 1)) 36 | 37 | self.batch_norm1 = nn.BatchNorm2d(256) 38 | self.batch_norm2 = nn.BatchNorm2d(512) 39 | self.batch_norm3 = nn.BatchNorm2d(512) 40 | 41 | input_size = 512 42 | self.rnn = nn.LSTM(input_size, rnn_size, 43 | num_layers=num_layers, 44 | dropout=dropout, 45 | bidirectional=bidirectional) 46 | self.pos_lut = nn.Embedding(1000, input_size) 47 | 48 | def load_pretrained_vectors(self, opt): 49 | # Pass in needed options only when modify function definition. 50 | pass 51 | 52 | def forward(self, input, lengths=None): 53 | "See :obj:`onmt.modules.EncoderBase.forward()`" 54 | 55 | batch_size = input.size(0) 56 | # (batch_size, 64, imgH, imgW) 57 | # layer 1 58 | input = F.relu(self.layer1(input[:, :, :, :]-0.5), True) 59 | 60 | # (batch_size, 64, imgH/2, imgW/2) 61 | input = F.max_pool2d(input, kernel_size=(2, 2), stride=(2, 2)) 62 | 63 | # (batch_size, 128, imgH/2, imgW/2) 64 | # layer 2 65 | input = F.relu(self.layer2(input), True) 66 | 67 | # (batch_size, 128, imgH/2/2, imgW/2/2) 68 | input = F.max_pool2d(input, kernel_size=(2, 2), stride=(2, 2)) 69 | 70 | # (batch_size, 256, imgH/2/2, imgW/2/2) 71 | # layer 3 72 | # batch norm 1 73 | input = F.relu(self.batch_norm1(self.layer3(input)), True) 74 | 75 | # (batch_size, 256, imgH/2/2, imgW/2/2) 76 | # layer4 77 | input = F.relu(self.layer4(input), True) 78 | 79 | # (batch_size, 256, imgH/2/2/2, imgW/2/2) 80 | input = F.max_pool2d(input, kernel_size=(1, 2), stride=(1, 2)) 81 | 82 | # (batch_size, 512, imgH/2/2/2, imgW/2/2) 83 | # layer 5 84 | # batch norm 2 85 | input = F.relu(self.batch_norm2(self.layer5(input)), True) 86 | 87 | # (batch_size, 512, imgH/2/2/2, imgW/2/2/2) 88 | input = F.max_pool2d(input, kernel_size=(2, 1), stride=(2, 1)) 89 | 90 | # (batch_size, 512, imgH/2/2/2, imgW/2/2/2) 91 | input = F.relu(self.batch_norm3(self.layer6(input)), True) 92 | 93 | # # (batch_size, 512, H, W) 94 | all_outputs = [] 95 | for row in range(input.size(2)): 96 | inp = input[:, :, row, :].transpose(0, 2)\ 97 | .transpose(1, 2) 98 | row_vec = torch.Tensor(batch_size).type_as(inp.data)\ 99 | .long().fill_(row) 100 | pos_emb = self.pos_lut(Variable(row_vec)) 101 | with_pos = torch.cat( 102 | (pos_emb.view(1, pos_emb.size(0), pos_emb.size(1)), inp), 0) 103 | outputs, hidden_t = self.rnn(with_pos) 104 | all_outputs.append(outputs) 105 | out = torch.cat(all_outputs, 0) 106 | 107 | return hidden_t, out 108 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/modules/ImageEncoder.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import torch 4 | from torch.autograd import Variable 5 | 6 | 7 | class 
ImageEncoder(nn.Module): 8 | """ 9 | A simple encoder convolutional -> recurrent neural network for 10 | image input. 11 | 12 | Args: 13 | num_layers (int): number of encoder layers. 14 | bidirectional (bool): bidirectional encoder. 15 | rnn_size (int): size of hidden states of the rnn. 16 | dropout (float): dropout probablity. 17 | """ 18 | def __init__(self, num_layers, bidirectional, rnn_size, dropout): 19 | super(ImageEncoder, self).__init__() 20 | self.num_layers = num_layers 21 | self.num_directions = 2 if bidirectional else 1 22 | self.hidden_size = rnn_size 23 | 24 | self.layer1 = nn.Conv2d(3, 64, kernel_size=(3, 3), 25 | padding=(1, 1), stride=(1, 1)) 26 | self.layer2 = nn.Conv2d(64, 128, kernel_size=(3, 3), 27 | padding=(1, 1), stride=(1, 1)) 28 | self.layer3 = nn.Conv2d(128, 256, kernel_size=(3, 3), 29 | padding=(1, 1), stride=(1, 1)) 30 | self.layer4 = nn.Conv2d(256, 256, kernel_size=(3, 3), 31 | padding=(1, 1), stride=(1, 1)) 32 | self.layer5 = nn.Conv2d(256, 512, kernel_size=(3, 3), 33 | padding=(1, 1), stride=(1, 1)) 34 | self.layer6 = nn.Conv2d(512, 512, kernel_size=(3, 3), 35 | padding=(1, 1), stride=(1, 1)) 36 | 37 | self.batch_norm1 = nn.BatchNorm2d(256) 38 | self.batch_norm2 = nn.BatchNorm2d(512) 39 | self.batch_norm3 = nn.BatchNorm2d(512) 40 | 41 | input_size = 512 42 | self.rnn = nn.LSTM(input_size, rnn_size, 43 | num_layers=num_layers, 44 | dropout=dropout, 45 | bidirectional=bidirectional) 46 | self.pos_lut = nn.Embedding(1000, input_size) 47 | 48 | def load_pretrained_vectors(self, opt): 49 | # Pass in needed options only when modify function definition. 50 | pass 51 | 52 | def forward(self, input, lengths=None): 53 | "See :obj:`onmt.modules.EncoderBase.forward()`" 54 | 55 | batch_size = input.size(0) 56 | # (batch_size, 64, imgH, imgW) 57 | # layer 1 58 | input = F.relu(self.layer1(input[:, :, :, :]-0.5), True) 59 | 60 | # (batch_size, 64, imgH/2, imgW/2) 61 | input = F.max_pool2d(input, kernel_size=(2, 2), stride=(2, 2)) 62 | 63 | # (batch_size, 128, imgH/2, imgW/2) 64 | # layer 2 65 | input = F.relu(self.layer2(input), True) 66 | 67 | # (batch_size, 128, imgH/2/2, imgW/2/2) 68 | input = F.max_pool2d(input, kernel_size=(2, 2), stride=(2, 2)) 69 | 70 | # (batch_size, 256, imgH/2/2, imgW/2/2) 71 | # layer 3 72 | # batch norm 1 73 | input = F.relu(self.batch_norm1(self.layer3(input)), True) 74 | 75 | # (batch_size, 256, imgH/2/2, imgW/2/2) 76 | # layer4 77 | input = F.relu(self.layer4(input), True) 78 | 79 | # (batch_size, 256, imgH/2/2/2, imgW/2/2) 80 | input = F.max_pool2d(input, kernel_size=(1, 2), stride=(1, 2)) 81 | 82 | # (batch_size, 512, imgH/2/2/2, imgW/2/2) 83 | # layer 5 84 | # batch norm 2 85 | input = F.relu(self.batch_norm2(self.layer5(input)), True) 86 | 87 | # (batch_size, 512, imgH/2/2/2, imgW/2/2/2) 88 | input = F.max_pool2d(input, kernel_size=(2, 1), stride=(2, 1)) 89 | 90 | # (batch_size, 512, imgH/2/2/2, imgW/2/2/2) 91 | input = F.relu(self.batch_norm3(self.layer6(input)), True) 92 | 93 | # # (batch_size, 512, H, W) 94 | all_outputs = [] 95 | for row in range(input.size(2)): 96 | inp = input[:, :, row, :].transpose(0, 2)\ 97 | .transpose(1, 2) 98 | row_vec = torch.Tensor(batch_size).type_as(inp.data)\ 99 | .long().fill_(row) 100 | pos_emb = self.pos_lut(Variable(row_vec)) 101 | with_pos = torch.cat( 102 | (pos_emb.view(1, pos_emb.size(0), pos_emb.size(1)), inp), 0) 103 | outputs, hidden_t = self.rnn(with_pos) 104 | all_outputs.append(outputs) 105 | out = torch.cat(all_outputs, 0) 106 | 107 | return hidden_t, out 108 | 
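As a quick sanity check on the pooling arithmetic in ImageEncoder: the two 2x2 max-pools followed by the (1, 2) and (2, 1) pools reduce an H x W image to H/8 rows of W/8 columns before the row-wise RNN. The sketch below is a minimal, hypothetical smoke test; the image size and hyperparameters are arbitrary assumptions, not values used by this project.

import torch
from torch.autograd import Variable

from onmt.modules.ImageEncoder import ImageEncoder

encoder = ImageEncoder(num_layers=2, bidirectional=True,
                       rnn_size=128, dropout=0.3)

# A fake batch of 4 RGB images, 32 pixels high and 96 pixels wide
# (both divisible by 8, so every pooling stage divides evenly).
images = Variable(torch.randn(4, 3, 32, 96))

hidden_t, out = encoder(images)
# One RNN output sequence per remaining image row, each prefixed with a
# positional embedding: rows * (cols + 1) = 4 * 13 = 52 time steps.
print(out.size())          # (52, 4, 256) for a bidirectional rnn_size of 128
print(hidden_t[0].size())  # (num_layers * num_directions, batch, rnn_size)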
-------------------------------------------------------------------------------- /reinforcement_train/onmt/modules/ImageEncoder.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import torch 4 | from torch.autograd import Variable 5 | 6 | 7 | class ImageEncoder(nn.Module): 8 | """ 9 | A simple encoder convolutional -> recurrent neural network for 10 | image input. 11 | 12 | Args: 13 | num_layers (int): number of encoder layers. 14 | bidirectional (bool): bidirectional encoder. 15 | rnn_size (int): size of hidden states of the rnn. 16 | dropout (float): dropout probablity. 17 | """ 18 | def __init__(self, num_layers, bidirectional, rnn_size, dropout): 19 | super(ImageEncoder, self).__init__() 20 | self.num_layers = num_layers 21 | self.num_directions = 2 if bidirectional else 1 22 | self.hidden_size = rnn_size 23 | 24 | self.layer1 = nn.Conv2d(3, 64, kernel_size=(3, 3), 25 | padding=(1, 1), stride=(1, 1)) 26 | self.layer2 = nn.Conv2d(64, 128, kernel_size=(3, 3), 27 | padding=(1, 1), stride=(1, 1)) 28 | self.layer3 = nn.Conv2d(128, 256, kernel_size=(3, 3), 29 | padding=(1, 1), stride=(1, 1)) 30 | self.layer4 = nn.Conv2d(256, 256, kernel_size=(3, 3), 31 | padding=(1, 1), stride=(1, 1)) 32 | self.layer5 = nn.Conv2d(256, 512, kernel_size=(3, 3), 33 | padding=(1, 1), stride=(1, 1)) 34 | self.layer6 = nn.Conv2d(512, 512, kernel_size=(3, 3), 35 | padding=(1, 1), stride=(1, 1)) 36 | 37 | self.batch_norm1 = nn.BatchNorm2d(256) 38 | self.batch_norm2 = nn.BatchNorm2d(512) 39 | self.batch_norm3 = nn.BatchNorm2d(512) 40 | 41 | input_size = 512 42 | self.rnn = nn.LSTM(input_size, rnn_size, 43 | num_layers=num_layers, 44 | dropout=dropout, 45 | bidirectional=bidirectional) 46 | self.pos_lut = nn.Embedding(1000, input_size) 47 | 48 | def load_pretrained_vectors(self, opt): 49 | # Pass in needed options only when modify function definition. 
50 | pass 51 | 52 | def forward(self, input, lengths=None): 53 | "See :obj:`onmt.modules.EncoderBase.forward()`" 54 | 55 | batch_size = input.size(0) 56 | # (batch_size, 64, imgH, imgW) 57 | # layer 1 58 | input = F.relu(self.layer1(input[:, :, :, :]-0.5), True) 59 | 60 | # (batch_size, 64, imgH/2, imgW/2) 61 | input = F.max_pool2d(input, kernel_size=(2, 2), stride=(2, 2)) 62 | 63 | # (batch_size, 128, imgH/2, imgW/2) 64 | # layer 2 65 | input = F.relu(self.layer2(input), True) 66 | 67 | # (batch_size, 128, imgH/2/2, imgW/2/2) 68 | input = F.max_pool2d(input, kernel_size=(2, 2), stride=(2, 2)) 69 | 70 | # (batch_size, 256, imgH/2/2, imgW/2/2) 71 | # layer 3 72 | # batch norm 1 73 | input = F.relu(self.batch_norm1(self.layer3(input)), True) 74 | 75 | # (batch_size, 256, imgH/2/2, imgW/2/2) 76 | # layer4 77 | input = F.relu(self.layer4(input), True) 78 | 79 | # (batch_size, 256, imgH/2/2/2, imgW/2/2) 80 | input = F.max_pool2d(input, kernel_size=(1, 2), stride=(1, 2)) 81 | 82 | # (batch_size, 512, imgH/2/2/2, imgW/2/2) 83 | # layer 5 84 | # batch norm 2 85 | input = F.relu(self.batch_norm2(self.layer5(input)), True) 86 | 87 | # (batch_size, 512, imgH/2/2/2, imgW/2/2/2) 88 | input = F.max_pool2d(input, kernel_size=(2, 1), stride=(2, 1)) 89 | 90 | # (batch_size, 512, imgH/2/2/2, imgW/2/2/2) 91 | input = F.relu(self.batch_norm3(self.layer6(input)), True) 92 | 93 | # # (batch_size, 512, H, W) 94 | all_outputs = [] 95 | for row in range(input.size(2)): 96 | inp = input[:, :, row, :].transpose(0, 2)\ 97 | .transpose(1, 2) 98 | row_vec = torch.Tensor(batch_size).type_as(inp.data)\ 99 | .long().fill_(row) 100 | pos_emb = self.pos_lut(Variable(row_vec)) 101 | with_pos = torch.cat( 102 | (pos_emb.view(1, pos_emb.size(0), pos_emb.size(1)), inp), 0) 103 | outputs, hidden_t = self.rnn(with_pos) 104 | all_outputs.append(outputs) 105 | out = torch.cat(all_outputs, 0) 106 | 107 | return hidden_t, out 108 | -------------------------------------------------------------------------------- /D_pretrain/onmt/Optim.py: -------------------------------------------------------------------------------- 1 | import torch.optim as optim 2 | from torch.nn.utils import clip_grad_norm 3 | 4 | 5 | class Optim(object): 6 | """ 7 | Controller class for optimization. Mostly a thin 8 | wrapper for `optim`, but also useful for implementing 9 | rate scheduling beyond what is currently available. 10 | Also implements necessary methods for training RNNs such 11 | as grad manipulations. 12 | 13 | Args: 14 | method (:obj:`str`): one of [sgd, adagrad, adadelta, adam] 15 | lr (float): learning rate 16 | lr_decay (float, optional): learning rate decay multiplier 17 | start_decay_at (int, optional): epoch to start learning rate decay 18 | beta1, beta2 (float, optional): parameters for adam 19 | adagrad_accum (float, optional): initialization parameter for adagrad 20 | decay_method (str, option): custom decay options 21 | warmup_steps (int, option): parameter for `noam` decay 22 | model_size (int, option): parameter for `noam` decay 23 | """ 24 | # We use the default parameters for Adam that are suggested by 25 | # the original paper https://arxiv.org/pdf/1412.6980.pdf 26 | # These values are also used by other established implementations, 27 | # e.g. 
https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer 28 | # https://keras.io/optimizers/ 29 | # Recently there are slightly different values used in the paper 30 | # "Attention is all you need" 31 | # https://arxiv.org/pdf/1706.03762.pdf, particularly the value beta2=0.98 32 | # was used there however, beta2=0.999 is still arguably the more 33 | # established value, so we use that here as well 34 | def __init__(self, method, lr, max_grad_norm, 35 | lr_decay=1, start_decay_at=None, 36 | beta1=0.9, beta2=0.999, 37 | adagrad_accum=0.0, 38 | decay_method=None, 39 | warmup_steps=4000, 40 | model_size=None): 41 | self.last_ppl = None 42 | self.lr = lr 43 | self.original_lr = lr 44 | self.max_grad_norm = max_grad_norm 45 | self.method = method 46 | self.lr_decay = lr_decay 47 | self.start_decay_at = start_decay_at 48 | self.start_decay = False 49 | self._step = 0 50 | self.betas = [beta1, beta2] 51 | self.adagrad_accum = adagrad_accum 52 | self.decay_method = decay_method 53 | self.warmup_steps = warmup_steps 54 | self.model_size = model_size 55 | 56 | def set_parameters(self, params): 57 | self.params = [p for p in params if p.requires_grad] 58 | if self.method == 'sgd': 59 | self.optimizer = optim.SGD(self.params, lr=self.lr) 60 | elif self.method == 'adagrad': 61 | self.optimizer = optim.Adagrad(self.params, lr=self.lr) 62 | for group in self.optimizer.param_groups: 63 | for p in group['params']: 64 | self.optimizer.state[p]['sum'] = self.optimizer\ 65 | .state[p]['sum'].fill_(self.adagrad_accum) 66 | elif self.method == 'adadelta': 67 | self.optimizer = optim.Adadelta(self.params, lr=self.lr) 68 | elif self.method == 'adam': 69 | self.optimizer = optim.Adam(self.params, lr=self.lr, 70 | betas=self.betas, eps=1e-9) 71 | else: 72 | raise RuntimeError("Invalid optim method: " + self.method) 73 | 74 | def _set_rate(self, lr): 75 | self.lr = lr 76 | self.optimizer.param_groups[0]['lr'] = self.lr 77 | 78 | def step(self): 79 | """Update the model parameters based on current gradients. 80 | 81 | Optionally, will employ gradient modification or update learning 82 | rate. 83 | """ 84 | self._step += 1 85 | 86 | # Decay method used in tensor2tensor. 87 | if self.decay_method == "noam": 88 | self._set_rate( 89 | self.original_lr * 90 | (self.model_size ** (-0.5) * 91 | min(self._step ** (-0.5), 92 | self._step * self.warmup_steps**(-1.5)))) 93 | 94 | if self.max_grad_norm: 95 | clip_grad_norm(self.params, self.max_grad_norm) 96 | self.optimizer.step() 97 | 98 | def update_learning_rate(self, ppl, epoch): 99 | """ 100 | Decay learning rate if val perf does not improve 101 | or we hit the start_decay_at limit. 102 | """ 103 | 104 | if self.start_decay_at is not None and epoch >= self.start_decay_at: 105 | self.start_decay = True 106 | if self.last_ppl is not None and ppl > self.last_ppl: 107 | self.start_decay = True 108 | 109 | if self.start_decay: 110 | self.lr = self.lr * self.lr_decay 111 | print("Decaying learning rate to %g" % self.lr) 112 | 113 | self.last_ppl = ppl 114 | self.optimizer.param_groups[0]['lr'] = self.lr 115 | -------------------------------------------------------------------------------- /G_pretrain/onmt/Optim.py: -------------------------------------------------------------------------------- 1 | import torch.optim as optim 2 | from torch.nn.utils import clip_grad_norm 3 | 4 | 5 | class Optim(object): 6 | """ 7 | Controller class for optimization. 
Mostly a thin 8 | wrapper for `optim`, but also useful for implementing 9 | rate scheduling beyond what is currently available. 10 | Also implements necessary methods for training RNNs such 11 | as grad manipulations. 12 | 13 | Args: 14 | method (:obj:`str`): one of [sgd, adagrad, adadelta, adam] 15 | lr (float): learning rate 16 | lr_decay (float, optional): learning rate decay multiplier 17 | start_decay_at (int, optional): epoch to start learning rate decay 18 | beta1, beta2 (float, optional): parameters for adam 19 | adagrad_accum (float, optional): initialization parameter for adagrad 20 | decay_method (str, option): custom decay options 21 | warmup_steps (int, option): parameter for `noam` decay 22 | model_size (int, option): parameter for `noam` decay 23 | """ 24 | # We use the default parameters for Adam that are suggested by 25 | # the original paper https://arxiv.org/pdf/1412.6980.pdf 26 | # These values are also used by other established implementations, 27 | # e.g. https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer 28 | # https://keras.io/optimizers/ 29 | # Recently there are slightly different values used in the paper 30 | # "Attention is all you need" 31 | # https://arxiv.org/pdf/1706.03762.pdf, particularly the value beta2=0.98 32 | # was used there however, beta2=0.999 is still arguably the more 33 | # established value, so we use that here as well 34 | def __init__(self, method, lr, max_grad_norm, 35 | lr_decay=1, start_decay_at=None, 36 | beta1=0.9, beta2=0.999, 37 | adagrad_accum=0.0, 38 | decay_method=None, 39 | warmup_steps=4000, 40 | model_size=None): 41 | self.last_ppl = None 42 | self.lr = lr 43 | self.original_lr = lr 44 | self.max_grad_norm = max_grad_norm 45 | self.method = method 46 | self.lr_decay = lr_decay 47 | self.start_decay_at = start_decay_at 48 | self.start_decay = False 49 | self._step = 0 50 | self.betas = [beta1, beta2] 51 | self.adagrad_accum = adagrad_accum 52 | self.decay_method = decay_method 53 | self.warmup_steps = warmup_steps 54 | self.model_size = model_size 55 | 56 | def set_parameters(self, params): 57 | self.params = [p for p in params if p.requires_grad] 58 | if self.method == 'sgd': 59 | self.optimizer = optim.SGD(self.params, lr=self.lr) 60 | elif self.method == 'adagrad': 61 | self.optimizer = optim.Adagrad(self.params, lr=self.lr) 62 | for group in self.optimizer.param_groups: 63 | for p in group['params']: 64 | self.optimizer.state[p]['sum'] = self.optimizer\ 65 | .state[p]['sum'].fill_(self.adagrad_accum) 66 | elif self.method == 'adadelta': 67 | self.optimizer = optim.Adadelta(self.params, lr=self.lr) 68 | elif self.method == 'adam': 69 | self.optimizer = optim.Adam(self.params, lr=self.lr, 70 | betas=self.betas, eps=1e-9) 71 | else: 72 | raise RuntimeError("Invalid optim method: " + self.method) 73 | 74 | def _set_rate(self, lr): 75 | self.lr = lr 76 | self.optimizer.param_groups[0]['lr'] = self.lr 77 | 78 | def step(self): 79 | """Update the model parameters based on current gradients. 80 | 81 | Optionally, will employ gradient modification or update learning 82 | rate. 83 | """ 84 | self._step += 1 85 | 86 | # Decay method used in tensor2tensor. 
87 | if self.decay_method == "noam": 88 | self._set_rate( 89 | self.original_lr * 90 | (self.model_size ** (-0.5) * 91 | min(self._step ** (-0.5), 92 | self._step * self.warmup_steps**(-1.5)))) 93 | 94 | if self.max_grad_norm: 95 | clip_grad_norm(self.params, self.max_grad_norm) 96 | self.optimizer.step() 97 | 98 | def update_learning_rate(self, ppl, epoch): 99 | """ 100 | Decay learning rate if val perf does not improve 101 | or we hit the start_decay_at limit. 102 | """ 103 | 104 | if self.start_decay_at is not None and epoch >= self.start_decay_at: 105 | self.start_decay = True 106 | if self.last_ppl is not None and ppl > self.last_ppl: 107 | self.start_decay = True 108 | 109 | if self.start_decay: 110 | self.lr = self.lr * self.lr_decay 111 | print("Decaying learning rate to %g" % self.lr) 112 | 113 | self.last_ppl = ppl 114 | self.optimizer.param_groups[0]['lr'] = self.lr 115 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/Optim.py: -------------------------------------------------------------------------------- 1 | import torch.optim as optim 2 | from torch.nn.utils import clip_grad_norm 3 | 4 | 5 | class Optim(object): 6 | """ 7 | Controller class for optimization. Mostly a thin 8 | wrapper for `optim`, but also useful for implementing 9 | rate scheduling beyond what is currently available. 10 | Also implements necessary methods for training RNNs such 11 | as grad manipulations. 12 | 13 | Args: 14 | method (:obj:`str`): one of [sgd, adagrad, adadelta, adam] 15 | lr (float): learning rate 16 | lr_decay (float, optional): learning rate decay multiplier 17 | start_decay_at (int, optional): epoch to start learning rate decay 18 | beta1, beta2 (float, optional): parameters for adam 19 | adagrad_accum (float, optional): initialization parameter for adagrad 20 | decay_method (str, option): custom decay options 21 | warmup_steps (int, option): parameter for `noam` decay 22 | model_size (int, option): parameter for `noam` decay 23 | """ 24 | # We use the default parameters for Adam that are suggested by 25 | # the original paper https://arxiv.org/pdf/1412.6980.pdf 26 | # These values are also used by other established implementations, 27 | # e.g. 
https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer 28 | # https://keras.io/optimizers/ 29 | # Recently there are slightly different values used in the paper 30 | # "Attention is all you need" 31 | # https://arxiv.org/pdf/1706.03762.pdf, particularly the value beta2=0.98 32 | # was used there however, beta2=0.999 is still arguably the more 33 | # established value, so we use that here as well 34 | def __init__(self, method, lr, max_grad_norm, 35 | lr_decay=1, start_decay_at=None, 36 | beta1=0.9, beta2=0.999, 37 | adagrad_accum=0.0, 38 | decay_method=None, 39 | warmup_steps=4000, 40 | model_size=None): 41 | self.last_ppl = None 42 | self.lr = lr 43 | self.original_lr = lr 44 | self.max_grad_norm = max_grad_norm 45 | self.method = method 46 | self.lr_decay = lr_decay 47 | self.start_decay_at = start_decay_at 48 | self.start_decay = False 49 | self._step = 0 50 | self.betas = [beta1, beta2] 51 | self.adagrad_accum = adagrad_accum 52 | self.decay_method = decay_method 53 | self.warmup_steps = warmup_steps 54 | self.model_size = model_size 55 | 56 | def set_parameters(self, params): 57 | self.params = [p for p in params if p.requires_grad] 58 | if self.method == 'sgd': 59 | self.optimizer = optim.SGD(self.params, lr=self.lr) 60 | elif self.method == 'adagrad': 61 | self.optimizer = optim.Adagrad(self.params, lr=self.lr) 62 | for group in self.optimizer.param_groups: 63 | for p in group['params']: 64 | self.optimizer.state[p]['sum'] = self.optimizer\ 65 | .state[p]['sum'].fill_(self.adagrad_accum) 66 | elif self.method == 'adadelta': 67 | self.optimizer = optim.Adadelta(self.params, lr=self.lr) 68 | elif self.method == 'adam': 69 | self.optimizer = optim.Adam(self.params, lr=self.lr, 70 | betas=self.betas, eps=1e-9) 71 | else: 72 | raise RuntimeError("Invalid optim method: " + self.method) 73 | 74 | def _set_rate(self, lr): 75 | self.lr = lr 76 | self.optimizer.param_groups[0]['lr'] = self.lr 77 | 78 | def step(self): 79 | """Update the model parameters based on current gradients. 80 | 81 | Optionally, will employ gradient modification or update learning 82 | rate. 83 | """ 84 | self._step += 1 85 | 86 | # Decay method used in tensor2tensor. 87 | if self.decay_method == "noam": 88 | self._set_rate( 89 | self.original_lr * 90 | (self.model_size ** (-0.5) * 91 | min(self._step ** (-0.5), 92 | self._step * self.warmup_steps**(-1.5)))) 93 | 94 | if self.max_grad_norm: 95 | clip_grad_norm(self.params, self.max_grad_norm) 96 | self.optimizer.step() 97 | 98 | def update_learning_rate(self, ppl, epoch): 99 | """ 100 | Decay learning rate if val perf does not improve 101 | or we hit the start_decay_at limit. 102 | """ 103 | 104 | if self.start_decay_at is not None and epoch >= self.start_decay_at: 105 | self.start_decay = True 106 | if self.last_ppl is not None and ppl > self.last_ppl: 107 | self.start_decay = True 108 | 109 | if self.start_decay: 110 | self.lr = self.lr * self.lr_decay 111 | print("Decaying learning rate to %g" % self.lr) 112 | 113 | self.last_ppl = ppl 114 | self.optimizer.param_groups[0]['lr'] = self.lr 115 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/Optim.py: -------------------------------------------------------------------------------- 1 | import torch.optim as optim 2 | from torch.nn.utils import clip_grad_norm 3 | 4 | 5 | class Optim(object): 6 | """ 7 | Controller class for optimization. 
Mostly a thin 8 | wrapper for `optim`, but also useful for implementing 9 | rate scheduling beyond what is currently available. 10 | Also implements necessary methods for training RNNs such 11 | as grad manipulations. 12 | 13 | Args: 14 | method (:obj:`str`): one of [sgd, adagrad, adadelta, adam] 15 | lr (float): learning rate 16 | lr_decay (float, optional): learning rate decay multiplier 17 | start_decay_at (int, optional): epoch to start learning rate decay 18 | beta1, beta2 (float, optional): parameters for adam 19 | adagrad_accum (float, optional): initialization parameter for adagrad 20 | decay_method (str, option): custom decay options 21 | warmup_steps (int, option): parameter for `noam` decay 22 | model_size (int, option): parameter for `noam` decay 23 | """ 24 | # We use the default parameters for Adam that are suggested by 25 | # the original paper https://arxiv.org/pdf/1412.6980.pdf 26 | # These values are also used by other established implementations, 27 | # e.g. https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer 28 | # https://keras.io/optimizers/ 29 | # Recently there are slightly different values used in the paper 30 | # "Attention is all you need" 31 | # https://arxiv.org/pdf/1706.03762.pdf, particularly the value beta2=0.98 32 | # was used there however, beta2=0.999 is still arguably the more 33 | # established value, so we use that here as well 34 | def __init__(self, method, lr, max_grad_norm, 35 | lr_decay=1, start_decay_at=None, 36 | beta1=0.9, beta2=0.999, 37 | adagrad_accum=0.0, 38 | decay_method=None, 39 | warmup_steps=4000, 40 | model_size=None): 41 | self.last_ppl = None 42 | self.lr = lr 43 | self.original_lr = lr 44 | self.max_grad_norm = max_grad_norm 45 | self.method = method 46 | self.lr_decay = lr_decay 47 | self.start_decay_at = start_decay_at 48 | self.start_decay = False 49 | self._step = 0 50 | self.betas = [beta1, beta2] 51 | self.adagrad_accum = adagrad_accum 52 | self.decay_method = decay_method 53 | self.warmup_steps = warmup_steps 54 | self.model_size = model_size 55 | 56 | def set_parameters(self, params): 57 | self.params = [p for p in params if p.requires_grad] 58 | if self.method == 'sgd': 59 | self.optimizer = optim.SGD(self.params, lr=self.lr) 60 | elif self.method == 'adagrad': 61 | self.optimizer = optim.Adagrad(self.params, lr=self.lr) 62 | for group in self.optimizer.param_groups: 63 | for p in group['params']: 64 | self.optimizer.state[p]['sum'] = self.optimizer\ 65 | .state[p]['sum'].fill_(self.adagrad_accum) 66 | elif self.method == 'adadelta': 67 | self.optimizer = optim.Adadelta(self.params, lr=self.lr) 68 | elif self.method == 'adam': 69 | self.optimizer = optim.Adam(self.params, lr=self.lr, 70 | betas=self.betas, eps=1e-9) 71 | else: 72 | raise RuntimeError("Invalid optim method: " + self.method) 73 | 74 | def _set_rate(self, lr): 75 | self.lr = lr 76 | self.optimizer.param_groups[0]['lr'] = self.lr 77 | 78 | def step(self): 79 | """Update the model parameters based on current gradients. 80 | 81 | Optionally, will employ gradient modification or update learning 82 | rate. 83 | """ 84 | self._step += 1 85 | 86 | # Decay method used in tensor2tensor. 
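        # At step == warmup_steps the two terms inside min() coincide (both
        # equal warmup_steps**(-0.5)); that is where the warmup ramp hands
        # over to the 1/sqrt(step) decay. For example, with warmup_steps=4000
        # and a (hypothetical) model_size=512, the peak rate is
        # original_lr * 512**(-0.5) * 4000**(-0.5), roughly original_lr * 7e-4.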
87 | if self.decay_method == "noam": 88 | self._set_rate( 89 | self.original_lr * 90 | (self.model_size ** (-0.5) * 91 | min(self._step ** (-0.5), 92 | self._step * self.warmup_steps**(-1.5)))) 93 | 94 | if self.max_grad_norm: 95 | clip_grad_norm(self.params, self.max_grad_norm) 96 | self.optimizer.step() 97 | 98 | def update_learning_rate(self, ppl, epoch): 99 | """ 100 | Decay learning rate if val perf does not improve 101 | or we hit the start_decay_at limit. 102 | """ 103 | 104 | if self.start_decay_at is not None and epoch >= self.start_decay_at: 105 | self.start_decay = True 106 | if self.last_ppl is not None and ppl > self.last_ppl: 107 | self.start_decay = True 108 | 109 | if self.start_decay: 110 | self.lr = self.lr * self.lr_decay 111 | print("Decaying learning rate to %g" % self.lr) 112 | 113 | self.last_ppl = ppl 114 | self.optimizer.param_groups[0]['lr'] = self.lr 115 | -------------------------------------------------------------------------------- /D_pretrain/onmt/io/DatasetBase.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from itertools import chain 4 | import torchtext 5 | from onmt.Utils import aeq 6 | 7 | 8 | PAD_WORD = '' 9 | UNK = 0 10 | BOS_WORD = '' 11 | EOS_WORD = '' 12 | 13 | 14 | class ONMTDatasetBase(torchtext.data.Dataset): 15 | """ 16 | A dataset basically supports iteration over all the examples 17 | it contains. We currently have 3 datasets inheriting this base 18 | for 3 types of corpus respectively: "text", "img", "audio". 19 | 20 | Internally it initializes an `torchtext.data.Dataset` object with 21 | the following attributes: 22 | 23 | `examples`: a sequence of `torchtext.data.Example` objects. 24 | `fields`: a dictionary associating str keys with `torchtext.data.Field` 25 | objects, and not necessarily having the same keys as the input fields. 26 | """ 27 | def __getstate__(self): 28 | return self.__dict__ 29 | 30 | def __setstate__(self, d): 31 | self.__dict__.update(d) 32 | 33 | def __reduce_ex__(self, proto): 34 | "This is a hack. Something is broken with torch pickle." 35 | return super(ONMTDatasetBase, self).__reduce_ex__() 36 | 37 | def load_fields(self, vocab_dict): 38 | """ Load fields from vocab.pt, and set the `fields` attribute. 39 | 40 | Args: 41 | vocab_dict (dict): a dict of loaded vocab from vocab.pt file. 42 | """ 43 | from onmt.io.IO import load_fields_from_vocab 44 | 45 | fields = load_fields_from_vocab(vocab_dict.items(), self.data_type) 46 | self.fields = dict([(k, f) for (k, f) in fields.items() 47 | if k in self.examples[0].__dict__]) 48 | 49 | @staticmethod 50 | def coalesce_datasets(datasets): 51 | """Coalesce all dataset instances. """ 52 | final = datasets[0] 53 | for d in datasets[1:]: 54 | # `src_vocabs` is a list of `torchtext.vocab.Vocab`. 55 | # Each sentence transforms into on Vocab. 56 | # Coalesce them into one big list. 57 | final.src_vocabs += d.src_vocabs 58 | 59 | # All datasets have same number of features. 60 | aeq(final.n_src_feats, d.n_src_feats) 61 | aeq(final.n_tgt_feats, d.n_tgt_feats) 62 | 63 | # `examples` is a list of `torchtext.data.Example`. 64 | # Coalesce them into one big list. 65 | final.examples += d.examples 66 | 67 | # All datasets have same fields, no need to update. 68 | 69 | return final 70 | 71 | @staticmethod 72 | def extract_text_features(tokens): 73 | """ 74 | Args: 75 | tokens: A list of tokens, where each token consists of a word, 76 | optionally followed by u"│"-delimited features. 
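                For example (illustrative values), the token u"dog│NN│sg"
                would yield the word "dog" and the features ("NN", "sg");
                the delimiter is the character u"│".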
77 | Returns: 78 | A sequence of words, a sequence of features, and num of features. 79 | """ 80 | if not tokens: 81 | return [], [], -1 82 | 83 | split_tokens = [token.split(u"│") for token in tokens] 84 | split_tokens = [token for token in split_tokens if token[0]] 85 | token_size = len(split_tokens[0]) 86 | 87 | assert all(len(token) == token_size for token in split_tokens), \ 88 | "all words must have the same number of features" 89 | words_and_features = list(zip(*split_tokens)) 90 | words = words_and_features[0] 91 | features = words_and_features[1:] 92 | 93 | return words, features, token_size - 1 94 | 95 | # Below are helper functions for intra-class use only. 96 | 97 | def _join_dicts(self, *args): 98 | """ 99 | Args: 100 | dictionaries with disjoint keys. 101 | 102 | Returns: 103 | a single dictionary that has the union of these keys. 104 | """ 105 | return dict(chain(*[d.items() for d in args])) 106 | 107 | def _peek(self, seq): 108 | """ 109 | Args: 110 | seq: an iterator. 111 | 112 | Returns: 113 | the first thing returned by calling next() on the iterator 114 | and an iterator created by re-chaining that value to the beginning 115 | of the iterator. 116 | """ 117 | first = next(seq) 118 | return first, chain([first], seq) 119 | 120 | def _construct_example_fromlist(self, data, fields): 121 | """ 122 | Args: 123 | data: the data to be set as the value of the attributes of 124 | the to-be-created `Example`, associating with respective 125 | `Field` objects with same key. 126 | fields: a dict of `torchtext.data.Field` objects. The keys 127 | are attributes of the to-be-created `Example`. 128 | 129 | Returns: 130 | the created `Example` object. 131 | """ 132 | ex = torchtext.data.Example() 133 | for (name, field), val in zip(fields, data): 134 | if field is not None: 135 | setattr(ex, name, field.preprocess(val)) 136 | else: 137 | setattr(ex, name, val) 138 | return ex 139 | -------------------------------------------------------------------------------- /G_pretrain/onmt/io/DatasetBase.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from itertools import chain 4 | import torchtext 5 | from onmt.Utils import aeq 6 | 7 | 8 | PAD_WORD = '' 9 | UNK = 0 10 | BOS_WORD = '' 11 | EOS_WORD = '' 12 | 13 | 14 | class ONMTDatasetBase(torchtext.data.Dataset): 15 | """ 16 | A dataset basically supports iteration over all the examples 17 | it contains. We currently have 3 datasets inheriting this base 18 | for 3 types of corpus respectively: "text", "img", "audio". 19 | 20 | Internally it initializes an `torchtext.data.Dataset` object with 21 | the following attributes: 22 | 23 | `examples`: a sequence of `torchtext.data.Example` objects. 24 | `fields`: a dictionary associating str keys with `torchtext.data.Field` 25 | objects, and not necessarily having the same keys as the input fields. 26 | """ 27 | def __getstate__(self): 28 | return self.__dict__ 29 | 30 | def __setstate__(self, d): 31 | self.__dict__.update(d) 32 | 33 | def __reduce_ex__(self, proto): 34 | "This is a hack. Something is broken with torch pickle." 35 | return super(ONMTDatasetBase, self).__reduce_ex__() 36 | 37 | def load_fields(self, vocab_dict): 38 | """ Load fields from vocab.pt, and set the `fields` attribute. 39 | 40 | Args: 41 | vocab_dict (dict): a dict of loaded vocab from vocab.pt file. 
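        Note:
            Only fields whose keys actually appear on the loaded examples
            are kept; other entries from vocab.pt are dropped.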
42 | """ 43 | from onmt.io.IO import load_fields_from_vocab 44 | 45 | fields = load_fields_from_vocab(vocab_dict.items(), self.data_type) 46 | self.fields = dict([(k, f) for (k, f) in fields.items() 47 | if k in self.examples[0].__dict__]) 48 | 49 | @staticmethod 50 | def coalesce_datasets(datasets): 51 | """Coalesce all dataset instances. """ 52 | final = datasets[0] 53 | for d in datasets[1:]: 54 | # `src_vocabs` is a list of `torchtext.vocab.Vocab`. 55 | # Each sentence transforms into on Vocab. 56 | # Coalesce them into one big list. 57 | final.src_vocabs += d.src_vocabs 58 | 59 | # All datasets have same number of features. 60 | aeq(final.n_src_feats, d.n_src_feats) 61 | aeq(final.n_tgt_feats, d.n_tgt_feats) 62 | 63 | # `examples` is a list of `torchtext.data.Example`. 64 | # Coalesce them into one big list. 65 | final.examples += d.examples 66 | 67 | # All datasets have same fields, no need to update. 68 | 69 | return final 70 | 71 | @staticmethod 72 | def extract_text_features(tokens): 73 | """ 74 | Args: 75 | tokens: A list of tokens, where each token consists of a word, 76 | optionally followed by u"│"-delimited features. 77 | Returns: 78 | A sequence of words, a sequence of features, and num of features. 79 | """ 80 | if not tokens: 81 | return [], [], -1 82 | 83 | split_tokens = [token.split(u"│") for token in tokens] 84 | split_tokens = [token for token in split_tokens if token[0]] 85 | token_size = len(split_tokens[0]) 86 | 87 | assert all(len(token) == token_size for token in split_tokens), \ 88 | "all words must have the same number of features" 89 | words_and_features = list(zip(*split_tokens)) 90 | words = words_and_features[0] 91 | features = words_and_features[1:] 92 | 93 | return words, features, token_size - 1 94 | 95 | # Below are helper functions for intra-class use only. 96 | 97 | def _join_dicts(self, *args): 98 | """ 99 | Args: 100 | dictionaries with disjoint keys. 101 | 102 | Returns: 103 | a single dictionary that has the union of these keys. 104 | """ 105 | return dict(chain(*[d.items() for d in args])) 106 | 107 | def _peek(self, seq): 108 | """ 109 | Args: 110 | seq: an iterator. 111 | 112 | Returns: 113 | the first thing returned by calling next() on the iterator 114 | and an iterator created by re-chaining that value to the beginning 115 | of the iterator. 116 | """ 117 | first = next(seq) 118 | return first, chain([first], seq) 119 | 120 | def _construct_example_fromlist(self, data, fields): 121 | """ 122 | Args: 123 | data: the data to be set as the value of the attributes of 124 | the to-be-created `Example`, associating with respective 125 | `Field` objects with same key. 126 | fields: a dict of `torchtext.data.Field` objects. The keys 127 | are attributes of the to-be-created `Example`. 128 | 129 | Returns: 130 | the created `Example` object. 
131 | """ 132 | ex = torchtext.data.Example() 133 | for (name, field), val in zip(fields, data): 134 | if field is not None: 135 | setattr(ex, name, field.preprocess(val)) 136 | else: 137 | setattr(ex, name, val) 138 | return ex 139 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/io/DatasetBase.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from itertools import chain 4 | import torchtext 5 | from onmt.Utils import aeq 6 | 7 | 8 | PAD_WORD = '' 9 | UNK = 0 10 | BOS_WORD = '' 11 | EOS_WORD = '' 12 | 13 | 14 | class ONMTDatasetBase(torchtext.data.Dataset): 15 | """ 16 | A dataset basically supports iteration over all the examples 17 | it contains. We currently have 3 datasets inheriting this base 18 | for 3 types of corpus respectively: "text", "img", "audio". 19 | 20 | Internally it initializes an `torchtext.data.Dataset` object with 21 | the following attributes: 22 | 23 | `examples`: a sequence of `torchtext.data.Example` objects. 24 | `fields`: a dictionary associating str keys with `torchtext.data.Field` 25 | objects, and not necessarily having the same keys as the input fields. 26 | """ 27 | def __getstate__(self): 28 | return self.__dict__ 29 | 30 | def __setstate__(self, d): 31 | self.__dict__.update(d) 32 | 33 | def __reduce_ex__(self, proto): 34 | "This is a hack. Something is broken with torch pickle." 35 | return super(ONMTDatasetBase, self).__reduce_ex__() 36 | 37 | def load_fields(self, vocab_dict): 38 | """ Load fields from vocab.pt, and set the `fields` attribute. 39 | 40 | Args: 41 | vocab_dict (dict): a dict of loaded vocab from vocab.pt file. 42 | """ 43 | from onmt.io.IO import load_fields_from_vocab 44 | 45 | fields = load_fields_from_vocab(vocab_dict.items(), self.data_type) 46 | self.fields = dict([(k, f) for (k, f) in fields.items() 47 | if k in self.examples[0].__dict__]) 48 | 49 | @staticmethod 50 | def coalesce_datasets(datasets): 51 | """Coalesce all dataset instances. """ 52 | final = datasets[0] 53 | for d in datasets[1:]: 54 | # `src_vocabs` is a list of `torchtext.vocab.Vocab`. 55 | # Each sentence transforms into on Vocab. 56 | # Coalesce them into one big list. 57 | final.src_vocabs += d.src_vocabs 58 | 59 | # All datasets have same number of features. 60 | aeq(final.n_src_feats, d.n_src_feats) 61 | aeq(final.n_tgt_feats, d.n_tgt_feats) 62 | 63 | # `examples` is a list of `torchtext.data.Example`. 64 | # Coalesce them into one big list. 65 | final.examples += d.examples 66 | 67 | # All datasets have same fields, no need to update. 68 | 69 | return final 70 | 71 | @staticmethod 72 | def extract_text_features(tokens): 73 | """ 74 | Args: 75 | tokens: A list of tokens, where each token consists of a word, 76 | optionally followed by u"│"-delimited features. 77 | Returns: 78 | A sequence of words, a sequence of features, and num of features. 79 | """ 80 | if not tokens: 81 | return [], [], -1 82 | 83 | split_tokens = [token.split(u"│") for token in tokens] 84 | split_tokens = [token for token in split_tokens if token[0]] 85 | token_size = len(split_tokens[0]) 86 | 87 | assert all(len(token) == token_size for token in split_tokens), \ 88 | "all words must have the same number of features" 89 | words_and_features = list(zip(*split_tokens)) 90 | words = words_and_features[0] 91 | features = words_and_features[1:] 92 | 93 | return words, features, token_size - 1 94 | 95 | # Below are helper functions for intra-class use only. 
96 | 97 | def _join_dicts(self, *args): 98 | """ 99 | Args: 100 | dictionaries with disjoint keys. 101 | 102 | Returns: 103 | a single dictionary that has the union of these keys. 104 | """ 105 | return dict(chain(*[d.items() for d in args])) 106 | 107 | def _peek(self, seq): 108 | """ 109 | Args: 110 | seq: an iterator. 111 | 112 | Returns: 113 | the first thing returned by calling next() on the iterator 114 | and an iterator created by re-chaining that value to the beginning 115 | of the iterator. 116 | """ 117 | first = next(seq) 118 | return first, chain([first], seq) 119 | 120 | def _construct_example_fromlist(self, data, fields): 121 | """ 122 | Args: 123 | data: the data to be set as the value of the attributes of 124 | the to-be-created `Example`, associating with respective 125 | `Field` objects with same key. 126 | fields: a dict of `torchtext.data.Field` objects. The keys 127 | are attributes of the to-be-created `Example`. 128 | 129 | Returns: 130 | the created `Example` object. 131 | """ 132 | ex = torchtext.data.Example() 133 | for (name, field), val in zip(fields, data): 134 | if field is not None: 135 | setattr(ex, name, field.preprocess(val)) 136 | else: 137 | setattr(ex, name, val) 138 | return ex 139 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/io/DatasetBase.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from itertools import chain 4 | import torchtext 5 | from onmt.Utils import aeq 6 | 7 | 8 | PAD_WORD = '' 9 | UNK = 0 10 | BOS_WORD = '' 11 | EOS_WORD = '' 12 | 13 | 14 | class ONMTDatasetBase(torchtext.data.Dataset): 15 | """ 16 | A dataset basically supports iteration over all the examples 17 | it contains. We currently have 3 datasets inheriting this base 18 | for 3 types of corpus respectively: "text", "img", "audio". 19 | 20 | Internally it initializes an `torchtext.data.Dataset` object with 21 | the following attributes: 22 | 23 | `examples`: a sequence of `torchtext.data.Example` objects. 24 | `fields`: a dictionary associating str keys with `torchtext.data.Field` 25 | objects, and not necessarily having the same keys as the input fields. 26 | """ 27 | def __getstate__(self): 28 | return self.__dict__ 29 | 30 | def __setstate__(self, d): 31 | self.__dict__.update(d) 32 | 33 | def __reduce_ex__(self, proto): 34 | "This is a hack. Something is broken with torch pickle." 35 | return super(ONMTDatasetBase, self).__reduce_ex__() 36 | 37 | def load_fields(self, vocab_dict): 38 | """ Load fields from vocab.pt, and set the `fields` attribute. 39 | 40 | Args: 41 | vocab_dict (dict): a dict of loaded vocab from vocab.pt file. 42 | """ 43 | from onmt.io.IO import load_fields_from_vocab 44 | 45 | fields = load_fields_from_vocab(vocab_dict.items(), self.data_type) 46 | self.fields = dict([(k, f) for (k, f) in fields.items() 47 | if k in self.examples[0].__dict__]) 48 | 49 | @staticmethod 50 | def coalesce_datasets(datasets): 51 | """Coalesce all dataset instances. """ 52 | final = datasets[0] 53 | for d in datasets[1:]: 54 | # `src_vocabs` is a list of `torchtext.vocab.Vocab`. 55 | # Each sentence transforms into on Vocab. 56 | # Coalesce them into one big list. 57 | final.src_vocabs += d.src_vocabs 58 | 59 | # All datasets have same number of features. 60 | aeq(final.n_src_feats, d.n_src_feats) 61 | aeq(final.n_tgt_feats, d.n_tgt_feats) 62 | 63 | # `examples` is a list of `torchtext.data.Example`. 64 | # Coalesce them into one big list. 
65 | final.examples += d.examples 66 | 67 | # All datasets have same fields, no need to update. 68 | 69 | return final 70 | 71 | @staticmethod 72 | def extract_text_features(tokens): 73 | """ 74 | Args: 75 | tokens: A list of tokens, where each token consists of a word, 76 | optionally followed by u"│"-delimited features. 77 | Returns: 78 | A sequence of words, a sequence of features, and num of features. 79 | """ 80 | if not tokens: 81 | return [], [], -1 82 | 83 | split_tokens = [token.split(u"│") for token in tokens] 84 | split_tokens = [token for token in split_tokens if token[0]] 85 | token_size = len(split_tokens[0]) 86 | 87 | assert all(len(token) == token_size for token in split_tokens), \ 88 | "all words must have the same number of features" 89 | words_and_features = list(zip(*split_tokens)) 90 | words = words_and_features[0] 91 | features = words_and_features[1:] 92 | 93 | return words, features, token_size - 1 94 | 95 | # Below are helper functions for intra-class use only. 96 | 97 | def _join_dicts(self, *args): 98 | """ 99 | Args: 100 | dictionaries with disjoint keys. 101 | 102 | Returns: 103 | a single dictionary that has the union of these keys. 104 | """ 105 | return dict(chain(*[d.items() for d in args])) 106 | 107 | def _peek(self, seq): 108 | """ 109 | Args: 110 | seq: an iterator. 111 | 112 | Returns: 113 | the first thing returned by calling next() on the iterator 114 | and an iterator created by re-chaining that value to the beginning 115 | of the iterator. 116 | """ 117 | first = next(seq) 118 | return first, chain([first], seq) 119 | 120 | def _construct_example_fromlist(self, data, fields): 121 | """ 122 | Args: 123 | data: the data to be set as the value of the attributes of 124 | the to-be-created `Example`, associating with respective 125 | `Field` objects with same key. 126 | fields: a dict of `torchtext.data.Field` objects. The keys 127 | are attributes of the to-be-created `Example`. 128 | 129 | Returns: 130 | the created `Example` object. 
131 | """ 132 | ex = torchtext.data.Example() 133 | for (name, field), val in zip(fields, data): 134 | if field is not None: 135 | setattr(ex, name, field.preprocess(val)) 136 | else: 137 | setattr(ex, name, val) 138 | return ex 139 | -------------------------------------------------------------------------------- /reinforcement_train/predict.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import division, unicode_literals 4 | import os 5 | import argparse 6 | import math 7 | import codecs 8 | import torch 9 | 10 | from itertools import count 11 | 12 | import onmt.io 13 | import onmt.translate 14 | import onmt 15 | import onmt.ModelConstructor 16 | import onmt.modules 17 | import opts 18 | 19 | parser = argparse.ArgumentParser( 20 | description='translate.py', 21 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 22 | opts.add_md_help_argument(parser) 23 | opts.translate_opts(parser) 24 | 25 | opt = parser.parse_args() 26 | 27 | 28 | def _report_score(name, score_total, words_total): 29 | print("%s AVG SCORE: %.4f, %s PPL: %.4f" % ( 30 | name, score_total / words_total, 31 | name, math.exp(-score_total / words_total))) 32 | 33 | 34 | def _report_bleu(): 35 | import subprocess 36 | print() 37 | res = subprocess.check_output( 38 | "perl tools/multi-bleu.perl %s < %s" % (opt.tgt, opt.output), 39 | shell=True).decode("utf-8") 40 | print(">> " + res.strip()) 41 | 42 | 43 | def _report_rouge(): 44 | import subprocess 45 | res = subprocess.check_output( 46 | "python tools/test_rouge.py -r %s -c %s" % (opt.tgt, opt.output), 47 | shell=True).decode("utf-8") 48 | print(res.strip()) 49 | 50 | 51 | def main(): 52 | dummy_parser = argparse.ArgumentParser(description='train.py') 53 | opts.model_opts(dummy_parser) 54 | dummy_opt = dummy_parser.parse_known_args([])[0] 55 | 56 | opt.cuda = opt.gpu > -1 57 | if opt.cuda: 58 | torch.cuda.set_device(opt.gpu) 59 | 60 | # Load the model. 61 | fields, model, model_opt = \ 62 | onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__) 63 | 64 | # File to write sentences to. 65 | out_file = codecs.open(opt.output, 'w', 'utf-8') 66 | 67 | # Test data 68 | data = onmt.io.build_dataset(fields, opt.data_type, 69 | opt.src, opt.tgt, opt.per, opt.nli, 70 | src_dir=opt.src_dir, 71 | sample_rate=opt.sample_rate, 72 | window_size=opt.window_size, 73 | window_stride=opt.window_stride, 74 | window=opt.window, 75 | use_filter_pred=False) 76 | 77 | # Sort batch by decreasing lengths of sentence required by pytorch. 78 | # sort=False means "Use dataset's sortkey instead of iterator's". 
79 | data_iter = onmt.io.OrderedIterator( 80 | dataset=data, device=opt.gpu, 81 | batch_size=opt.batch_size, train=False, sort=False, 82 | sort_within_batch=True, shuffle=False) 83 | 84 | # Translator 85 | scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta) 86 | translator = onmt.translate.Translator(model, fields, 87 | beam_size=opt.beam_size, 88 | n_best=opt.n_best, 89 | global_scorer=scorer, 90 | max_length=opt.max_length, 91 | copy_attn=model_opt.copy_attn, 92 | cuda=opt.cuda, 93 | beam_trace=opt.dump_beam != "", 94 | min_length=opt.min_length) 95 | builder = onmt.translate.TranslationBuilder( 96 | data, translator.fields, 97 | opt.n_best, opt.replace_unk, opt.tgt) 98 | 99 | # Statistics 100 | counter = count(1) 101 | pred_score_total, pred_words_total = 0, 0 102 | gold_score_total, gold_words_total = 0, 0 103 | 104 | for batch in data_iter: 105 | batch_data = translator.translate_batch(batch, data) 106 | translations = builder.from_batch(batch_data) 107 | 108 | for trans in translations: 109 | pred_score_total += trans.pred_scores[0] 110 | pred_words_total += len(trans.pred_sents[0]) 111 | if opt.tgt: 112 | gold_score_total += trans.gold_score 113 | gold_words_total += len(trans.gold_sent) 114 | 115 | n_best_preds = [" ".join(pred) 116 | for pred in trans.pred_sents[:opt.n_best]] 117 | out_file.write('\n'.join(n_best_preds)) 118 | out_file.write('\n') 119 | out_file.flush() 120 | 121 | if opt.verbose: 122 | sent_number = next(counter) 123 | output = trans.log(sent_number) 124 | os.write(1, output.encode('utf-8')) 125 | 126 | _report_score('PRED', pred_score_total, pred_words_total) 127 | if opt.tgt: 128 | _report_score('GOLD', gold_score_total, gold_words_total) 129 | if opt.report_bleu: 130 | _report_bleu() 131 | if opt.report_rouge: 132 | _report_rouge() 133 | 134 | if opt.dump_beam: 135 | import json 136 | json.dump(translator.beam_accum, 137 | codecs.open(opt.dump_beam, 'w', 'utf-8')) 138 | 139 | 140 | if __name__ == "__main__": 141 | main() 142 | -------------------------------------------------------------------------------- /D_pretrain/onmt/modules/MultiHeadedAttn.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | from torch.autograd import Variable 5 | 6 | from onmt.Utils import aeq 7 | from onmt.modules.UtilClass import BottleLinear, BottleSoftmax 8 | 9 | 10 | class MultiHeadedAttention(nn.Module): 11 | """ 12 | Multi-Head Attention module from 13 | "Attention is All You Need" 14 | :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`. 15 | 16 | Similar to standard `dot` attention but uses 17 | multiple attention distributions simulataneously 18 | to select relevant items. 19 | 20 | .. mermaid:: 21 | 22 | graph BT 23 | A[key] 24 | B[value] 25 | C[query] 26 | O[output] 27 | subgraph Attn 28 | D[Attn 1] 29 | E[Attn 2] 30 | F[Attn N] 31 | end 32 | A --> D 33 | C --> D 34 | A --> E 35 | C --> E 36 | A --> F 37 | C --> F 38 | D --> O 39 | E --> O 40 | F --> O 41 | B --> O 42 | 43 | Also includes several additional tricks. 
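    Concretely, each of the `head_count` heads computes
    softmax(Q K^T / sqrt(dim_per_head)) V on its own `dim_per_head`-sized
    projections of the query, key and value inputs, and the per-head
    outputs are concatenated back to `model_dim`.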
44 | 45 | Args: 46 | head_count (int): number of parallel heads 47 | model_dim (int): the dimension of keys/values/queries, 48 | must be divisible by head_count 49 | dropout (float): dropout parameter 50 | """ 51 | def __init__(self, head_count, model_dim, dropout=0.1): 52 | assert model_dim % head_count == 0 53 | self.dim_per_head = model_dim // head_count 54 | self.model_dim = model_dim 55 | 56 | super(MultiHeadedAttention, self).__init__() 57 | self.head_count = head_count 58 | 59 | self.linear_keys = BottleLinear(model_dim, 60 | head_count * self.dim_per_head, 61 | bias=False) 62 | self.linear_values = BottleLinear(model_dim, 63 | head_count * self.dim_per_head, 64 | bias=False) 65 | self.linear_query = BottleLinear(model_dim, 66 | head_count * self.dim_per_head, 67 | bias=False) 68 | self.sm = BottleSoftmax() 69 | self.activation = nn.ReLU() 70 | self.dropout = nn.Dropout(dropout) 71 | self.res_dropout = nn.Dropout(dropout) 72 | 73 | def forward(self, key, value, query, mask=None): 74 | """ 75 | Compute the context vector and the attention vectors. 76 | 77 | Args: 78 | key (`FloatTensor`): set of `key_len` 79 | key vectors `[batch, key_len, dim]` 80 | value (`FloatTensor`): set of `key_len` 81 | value vectors `[batch, key_len, dim]` 82 | query (`FloatTensor`): set of `query_len` 83 | query vectors `[batch, query_len, dim]` 84 | mask: binary mask indicating which keys have 85 | non-zero attention `[batch, query_len, key_len]` 86 | Returns: 87 | (`FloatTensor`, `FloatTensor`) : 88 | 89 | * output context vectors `[batch, query_len, dim]` 90 | * one of the attention vectors `[batch, query_len, key_len]` 91 | """ 92 | 93 | # CHECKS 94 | batch, k_len, d = key.size() 95 | batch_, k_len_, d_ = value.size() 96 | aeq(batch, batch_) 97 | aeq(k_len, k_len_) 98 | aeq(d, d_) 99 | batch_, q_len, d_ = query.size() 100 | aeq(batch, batch_) 101 | aeq(d, d_) 102 | aeq(self.model_dim % 8, 0) 103 | if mask is not None: 104 | batch_, q_len_, k_len_ = mask.size() 105 | aeq(batch_, batch) 106 | aeq(k_len_, k_len) 107 | aeq(q_len_ == q_len) 108 | # END CHECKS 109 | 110 | def shape_projection(x): 111 | b, l, d = x.size() 112 | return x.view(b, l, self.head_count, self.dim_per_head) \ 113 | .transpose(1, 2).contiguous() \ 114 | .view(b * self.head_count, l, self.dim_per_head) 115 | 116 | def unshape_projection(x, q): 117 | b, l, d = q.size() 118 | return x.view(b, self.head_count, l, self.dim_per_head) \ 119 | .transpose(1, 2).contiguous() \ 120 | .view(b, l, self.head_count * self.dim_per_head) 121 | 122 | residual = query 123 | key_up = shape_projection(self.linear_keys(key)) 124 | value_up = shape_projection(self.linear_values(value)) 125 | query_up = shape_projection(self.linear_query(query)) 126 | 127 | scaled = torch.bmm(query_up, key_up.transpose(1, 2)) 128 | scaled = scaled / math.sqrt(self.dim_per_head) 129 | bh, l, dim_per_head = scaled.size() 130 | b = bh // self.head_count 131 | if mask is not None: 132 | 133 | scaled = scaled.view(b, self.head_count, l, dim_per_head) 134 | mask = mask.unsqueeze(1).expand_as(scaled) 135 | scaled = scaled.masked_fill(Variable(mask), -1e18) \ 136 | .view(bh, l, dim_per_head) 137 | attn = self.sm(scaled) 138 | # Return one attn 139 | top_attn = attn \ 140 | .view(b, self.head_count, l, dim_per_head)[:, 0, :, :] \ 141 | .contiguous() 142 | 143 | drop_attn = self.dropout(self.sm(scaled)) 144 | 145 | # values : (batch * 8) x qlen x dim 146 | out = unshape_projection(torch.bmm(drop_attn, value_up), residual) 147 | 148 | # Residual and layer norm 149 | ret = 
self.res_dropout(out) 150 | 151 | # CHECK 152 | batch_, q_len_, d_ = ret.size() 153 | aeq(q_len, q_len_) 154 | aeq(batch, batch_) 155 | aeq(d, d_) 156 | # END CHECK 157 | return ret, top_attn 158 | -------------------------------------------------------------------------------- /G_pretrain/onmt/modules/MultiHeadedAttn.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | from torch.autograd import Variable 5 | 6 | from onmt.Utils import aeq 7 | from onmt.modules.UtilClass import BottleLinear, BottleSoftmax 8 | 9 | 10 | class MultiHeadedAttention(nn.Module): 11 | """ 12 | Multi-Head Attention module from 13 | "Attention is All You Need" 14 | :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`. 15 | 16 | Similar to standard `dot` attention but uses 17 | multiple attention distributions simulataneously 18 | to select relevant items. 19 | 20 | .. mermaid:: 21 | 22 | graph BT 23 | A[key] 24 | B[value] 25 | C[query] 26 | O[output] 27 | subgraph Attn 28 | D[Attn 1] 29 | E[Attn 2] 30 | F[Attn N] 31 | end 32 | A --> D 33 | C --> D 34 | A --> E 35 | C --> E 36 | A --> F 37 | C --> F 38 | D --> O 39 | E --> O 40 | F --> O 41 | B --> O 42 | 43 | Also includes several additional tricks. 44 | 45 | Args: 46 | head_count (int): number of parallel heads 47 | model_dim (int): the dimension of keys/values/queries, 48 | must be divisible by head_count 49 | dropout (float): dropout parameter 50 | """ 51 | def __init__(self, head_count, model_dim, dropout=0.1): 52 | assert model_dim % head_count == 0 53 | self.dim_per_head = model_dim // head_count 54 | self.model_dim = model_dim 55 | 56 | super(MultiHeadedAttention, self).__init__() 57 | self.head_count = head_count 58 | 59 | self.linear_keys = BottleLinear(model_dim, 60 | head_count * self.dim_per_head, 61 | bias=False) 62 | self.linear_values = BottleLinear(model_dim, 63 | head_count * self.dim_per_head, 64 | bias=False) 65 | self.linear_query = BottleLinear(model_dim, 66 | head_count * self.dim_per_head, 67 | bias=False) 68 | self.sm = BottleSoftmax() 69 | self.activation = nn.ReLU() 70 | self.dropout = nn.Dropout(dropout) 71 | self.res_dropout = nn.Dropout(dropout) 72 | 73 | def forward(self, key, value, query, mask=None): 74 | """ 75 | Compute the context vector and the attention vectors. 
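        Key positions selected by `mask` are filled with -1e18 before the
        softmax, so they end up with (effectively) zero attention weight.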
76 | 77 | Args: 78 | key (`FloatTensor`): set of `key_len` 79 | key vectors `[batch, key_len, dim]` 80 | value (`FloatTensor`): set of `key_len` 81 | value vectors `[batch, key_len, dim]` 82 | query (`FloatTensor`): set of `query_len` 83 | query vectors `[batch, query_len, dim]` 84 | mask: binary mask indicating which keys have 85 | non-zero attention `[batch, query_len, key_len]` 86 | Returns: 87 | (`FloatTensor`, `FloatTensor`) : 88 | 89 | * output context vectors `[batch, query_len, dim]` 90 | * one of the attention vectors `[batch, query_len, key_len]` 91 | """ 92 | 93 | # CHECKS 94 | batch, k_len, d = key.size() 95 | batch_, k_len_, d_ = value.size() 96 | aeq(batch, batch_) 97 | aeq(k_len, k_len_) 98 | aeq(d, d_) 99 | batch_, q_len, d_ = query.size() 100 | aeq(batch, batch_) 101 | aeq(d, d_) 102 | aeq(self.model_dim % 8, 0) 103 | if mask is not None: 104 | batch_, q_len_, k_len_ = mask.size() 105 | aeq(batch_, batch) 106 | aeq(k_len_, k_len) 107 | aeq(q_len_ == q_len) 108 | # END CHECKS 109 | 110 | def shape_projection(x): 111 | b, l, d = x.size() 112 | return x.view(b, l, self.head_count, self.dim_per_head) \ 113 | .transpose(1, 2).contiguous() \ 114 | .view(b * self.head_count, l, self.dim_per_head) 115 | 116 | def unshape_projection(x, q): 117 | b, l, d = q.size() 118 | return x.view(b, self.head_count, l, self.dim_per_head) \ 119 | .transpose(1, 2).contiguous() \ 120 | .view(b, l, self.head_count * self.dim_per_head) 121 | 122 | residual = query 123 | key_up = shape_projection(self.linear_keys(key)) 124 | value_up = shape_projection(self.linear_values(value)) 125 | query_up = shape_projection(self.linear_query(query)) 126 | 127 | scaled = torch.bmm(query_up, key_up.transpose(1, 2)) 128 | scaled = scaled / math.sqrt(self.dim_per_head) 129 | bh, l, dim_per_head = scaled.size() 130 | b = bh // self.head_count 131 | if mask is not None: 132 | 133 | scaled = scaled.view(b, self.head_count, l, dim_per_head) 134 | mask = mask.unsqueeze(1).expand_as(scaled) 135 | scaled = scaled.masked_fill(Variable(mask), -1e18) \ 136 | .view(bh, l, dim_per_head) 137 | attn = self.sm(scaled) 138 | # Return one attn 139 | top_attn = attn \ 140 | .view(b, self.head_count, l, dim_per_head)[:, 0, :, :] \ 141 | .contiguous() 142 | 143 | drop_attn = self.dropout(self.sm(scaled)) 144 | 145 | # values : (batch * 8) x qlen x dim 146 | out = unshape_projection(torch.bmm(drop_attn, value_up), residual) 147 | 148 | # Residual and layer norm 149 | ret = self.res_dropout(out) 150 | 151 | # CHECK 152 | batch_, q_len_, d_ = ret.size() 153 | aeq(q_len, q_len_) 154 | aeq(batch, batch_) 155 | aeq(d, d_) 156 | # END CHECK 157 | return ret, top_attn 158 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/modules/MultiHeadedAttn.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | from torch.autograd import Variable 5 | 6 | from onmt.Utils import aeq 7 | from onmt.modules.UtilClass import BottleLinear, BottleSoftmax 8 | 9 | 10 | class MultiHeadedAttention(nn.Module): 11 | """ 12 | Multi-Head Attention module from 13 | "Attention is All You Need" 14 | :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`. 15 | 16 | Similar to standard `dot` attention but uses 17 | multiple attention distributions simulataneously 18 | to select relevant items. 19 | 20 | .. 
mermaid:: 21 | 22 | graph BT 23 | A[key] 24 | B[value] 25 | C[query] 26 | O[output] 27 | subgraph Attn 28 | D[Attn 1] 29 | E[Attn 2] 30 | F[Attn N] 31 | end 32 | A --> D 33 | C --> D 34 | A --> E 35 | C --> E 36 | A --> F 37 | C --> F 38 | D --> O 39 | E --> O 40 | F --> O 41 | B --> O 42 | 43 | Also includes several additional tricks. 44 | 45 | Args: 46 | head_count (int): number of parallel heads 47 | model_dim (int): the dimension of keys/values/queries, 48 | must be divisible by head_count 49 | dropout (float): dropout parameter 50 | """ 51 | def __init__(self, head_count, model_dim, dropout=0.1): 52 | assert model_dim % head_count == 0 53 | self.dim_per_head = model_dim // head_count 54 | self.model_dim = model_dim 55 | 56 | super(MultiHeadedAttention, self).__init__() 57 | self.head_count = head_count 58 | 59 | self.linear_keys = BottleLinear(model_dim, 60 | head_count * self.dim_per_head, 61 | bias=False) 62 | self.linear_values = BottleLinear(model_dim, 63 | head_count * self.dim_per_head, 64 | bias=False) 65 | self.linear_query = BottleLinear(model_dim, 66 | head_count * self.dim_per_head, 67 | bias=False) 68 | self.sm = BottleSoftmax() 69 | self.activation = nn.ReLU() 70 | self.dropout = nn.Dropout(dropout) 71 | self.res_dropout = nn.Dropout(dropout) 72 | 73 | def forward(self, key, value, query, mask=None): 74 | """ 75 | Compute the context vector and the attention vectors. 76 | 77 | Args: 78 | key (`FloatTensor`): set of `key_len` 79 | key vectors `[batch, key_len, dim]` 80 | value (`FloatTensor`): set of `key_len` 81 | value vectors `[batch, key_len, dim]` 82 | query (`FloatTensor`): set of `query_len` 83 | query vectors `[batch, query_len, dim]` 84 | mask: binary mask indicating which keys have 85 | non-zero attention `[batch, query_len, key_len]` 86 | Returns: 87 | (`FloatTensor`, `FloatTensor`) : 88 | 89 | * output context vectors `[batch, query_len, dim]` 90 | * one of the attention vectors `[batch, query_len, key_len]` 91 | """ 92 | 93 | # CHECKS 94 | batch, k_len, d = key.size() 95 | batch_, k_len_, d_ = value.size() 96 | aeq(batch, batch_) 97 | aeq(k_len, k_len_) 98 | aeq(d, d_) 99 | batch_, q_len, d_ = query.size() 100 | aeq(batch, batch_) 101 | aeq(d, d_) 102 | aeq(self.model_dim % 8, 0) 103 | if mask is not None: 104 | batch_, q_len_, k_len_ = mask.size() 105 | aeq(batch_, batch) 106 | aeq(k_len_, k_len) 107 | aeq(q_len_ == q_len) 108 | # END CHECKS 109 | 110 | def shape_projection(x): 111 | b, l, d = x.size() 112 | return x.view(b, l, self.head_count, self.dim_per_head) \ 113 | .transpose(1, 2).contiguous() \ 114 | .view(b * self.head_count, l, self.dim_per_head) 115 | 116 | def unshape_projection(x, q): 117 | b, l, d = q.size() 118 | return x.view(b, self.head_count, l, self.dim_per_head) \ 119 | .transpose(1, 2).contiguous() \ 120 | .view(b, l, self.head_count * self.dim_per_head) 121 | 122 | residual = query 123 | key_up = shape_projection(self.linear_keys(key)) 124 | value_up = shape_projection(self.linear_values(value)) 125 | query_up = shape_projection(self.linear_query(query)) 126 | 127 | scaled = torch.bmm(query_up, key_up.transpose(1, 2)) 128 | scaled = scaled / math.sqrt(self.dim_per_head) 129 | bh, l, dim_per_head = scaled.size() 130 | b = bh // self.head_count 131 | if mask is not None: 132 | 133 | scaled = scaled.view(b, self.head_count, l, dim_per_head) 134 | mask = mask.unsqueeze(1).expand_as(scaled) 135 | scaled = scaled.masked_fill(Variable(mask), -1e18) \ 136 | .view(bh, l, dim_per_head) 137 | attn = self.sm(scaled) 138 | # Return one attn 139 | 
top_attn = attn \ 140 | .view(b, self.head_count, l, dim_per_head)[:, 0, :, :] \ 141 | .contiguous() 142 | 143 | drop_attn = self.dropout(self.sm(scaled)) 144 | 145 | # values : (batch * 8) x qlen x dim 146 | out = unshape_projection(torch.bmm(drop_attn, value_up), residual) 147 | 148 | # Residual and layer norm 149 | ret = self.res_dropout(out) 150 | 151 | # CHECK 152 | batch_, q_len_, d_ = ret.size() 153 | aeq(q_len, q_len_) 154 | aeq(batch, batch_) 155 | aeq(d, d_) 156 | # END CHECK 157 | return ret, top_attn 158 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/modules/MultiHeadedAttn.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | from torch.autograd import Variable 5 | 6 | from onmt.Utils import aeq 7 | from onmt.modules.UtilClass import BottleLinear, BottleSoftmax 8 | 9 | 10 | class MultiHeadedAttention(nn.Module): 11 | """ 12 | Multi-Head Attention module from 13 | "Attention is All You Need" 14 | :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`. 15 | 16 | Similar to standard `dot` attention but uses 17 | multiple attention distributions simulataneously 18 | to select relevant items. 19 | 20 | .. mermaid:: 21 | 22 | graph BT 23 | A[key] 24 | B[value] 25 | C[query] 26 | O[output] 27 | subgraph Attn 28 | D[Attn 1] 29 | E[Attn 2] 30 | F[Attn N] 31 | end 32 | A --> D 33 | C --> D 34 | A --> E 35 | C --> E 36 | A --> F 37 | C --> F 38 | D --> O 39 | E --> O 40 | F --> O 41 | B --> O 42 | 43 | Also includes several additional tricks. 44 | 45 | Args: 46 | head_count (int): number of parallel heads 47 | model_dim (int): the dimension of keys/values/queries, 48 | must be divisible by head_count 49 | dropout (float): dropout parameter 50 | """ 51 | def __init__(self, head_count, model_dim, dropout=0.1): 52 | assert model_dim % head_count == 0 53 | self.dim_per_head = model_dim // head_count 54 | self.model_dim = model_dim 55 | 56 | super(MultiHeadedAttention, self).__init__() 57 | self.head_count = head_count 58 | 59 | self.linear_keys = BottleLinear(model_dim, 60 | head_count * self.dim_per_head, 61 | bias=False) 62 | self.linear_values = BottleLinear(model_dim, 63 | head_count * self.dim_per_head, 64 | bias=False) 65 | self.linear_query = BottleLinear(model_dim, 66 | head_count * self.dim_per_head, 67 | bias=False) 68 | self.sm = BottleSoftmax() 69 | self.activation = nn.ReLU() 70 | self.dropout = nn.Dropout(dropout) 71 | self.res_dropout = nn.Dropout(dropout) 72 | 73 | def forward(self, key, value, query, mask=None): 74 | """ 75 | Compute the context vector and the attention vectors. 
76 | 77 | Args: 78 | key (`FloatTensor`): set of `key_len` 79 | key vectors `[batch, key_len, dim]` 80 | value (`FloatTensor`): set of `key_len` 81 | value vectors `[batch, key_len, dim]` 82 | query (`FloatTensor`): set of `query_len` 83 | query vectors `[batch, query_len, dim]` 84 | mask: binary mask indicating which keys have 85 | non-zero attention `[batch, query_len, key_len]` 86 | Returns: 87 | (`FloatTensor`, `FloatTensor`) : 88 | 89 | * output context vectors `[batch, query_len, dim]` 90 | * one of the attention vectors `[batch, query_len, key_len]` 91 | """ 92 | 93 | # CHECKS 94 | batch, k_len, d = key.size() 95 | batch_, k_len_, d_ = value.size() 96 | aeq(batch, batch_) 97 | aeq(k_len, k_len_) 98 | aeq(d, d_) 99 | batch_, q_len, d_ = query.size() 100 | aeq(batch, batch_) 101 | aeq(d, d_) 102 | aeq(self.model_dim % 8, 0) 103 | if mask is not None: 104 | batch_, q_len_, k_len_ = mask.size() 105 | aeq(batch_, batch) 106 | aeq(k_len_, k_len) 107 | aeq(q_len_ == q_len) 108 | # END CHECKS 109 | 110 | def shape_projection(x): 111 | b, l, d = x.size() 112 | return x.view(b, l, self.head_count, self.dim_per_head) \ 113 | .transpose(1, 2).contiguous() \ 114 | .view(b * self.head_count, l, self.dim_per_head) 115 | 116 | def unshape_projection(x, q): 117 | b, l, d = q.size() 118 | return x.view(b, self.head_count, l, self.dim_per_head) \ 119 | .transpose(1, 2).contiguous() \ 120 | .view(b, l, self.head_count * self.dim_per_head) 121 | 122 | residual = query 123 | key_up = shape_projection(self.linear_keys(key)) 124 | value_up = shape_projection(self.linear_values(value)) 125 | query_up = shape_projection(self.linear_query(query)) 126 | 127 | scaled = torch.bmm(query_up, key_up.transpose(1, 2)) 128 | scaled = scaled / math.sqrt(self.dim_per_head) 129 | bh, l, dim_per_head = scaled.size() 130 | b = bh // self.head_count 131 | if mask is not None: 132 | 133 | scaled = scaled.view(b, self.head_count, l, dim_per_head) 134 | mask = mask.unsqueeze(1).expand_as(scaled) 135 | scaled = scaled.masked_fill(Variable(mask), -1e18) \ 136 | .view(bh, l, dim_per_head) 137 | attn = self.sm(scaled) 138 | # Return one attn 139 | top_attn = attn \ 140 | .view(b, self.head_count, l, dim_per_head)[:, 0, :, :] \ 141 | .contiguous() 142 | 143 | drop_attn = self.dropout(self.sm(scaled)) 144 | 145 | # values : (batch * 8) x qlen x dim 146 | out = unshape_projection(torch.bmm(drop_attn, value_up), residual) 147 | 148 | # Residual and layer norm 149 | ret = self.res_dropout(out) 150 | 151 | # CHECK 152 | batch_, q_len_, d_ = ret.size() 153 | aeq(q_len, q_len_) 154 | aeq(batch, batch_) 155 | aeq(d, d_) 156 | # END CHECK 157 | return ret, top_attn 158 | -------------------------------------------------------------------------------- /D_pretrain/onmt/translate/Translation.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, unicode_literals 2 | 3 | import torch 4 | import onmt.io 5 | 6 | 7 | class TranslationBuilder(object): 8 | """ 9 | Build a word-based translation from the batch output 10 | of translator and the underlying dictionaries. 
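    Predicted token ids that fall outside the shared target vocabulary are
    looked up in the per-example source vocabulary, i.e. they correspond to
    source words copied by the model.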
11 | 12 | Replacement based on "Addressing the Rare Word 13 | Problem in Neural Machine Translation" :cite:`Luong2015b` 14 | 15 | Args: 16 | data (DataSet): 17 | fields (dict of Fields): data fields 18 | n_best (int): number of translations produced 19 | replace_unk (bool): replace unknown words using attention 20 | has_tgt (bool): will the batch have gold targets 21 | """ 22 | def __init__(self, data, fields, n_best=1, replace_unk=False, 23 | has_tgt=False): 24 | self.data = data 25 | self.fields = fields 26 | self.n_best = n_best 27 | self.replace_unk = replace_unk 28 | self.has_tgt = has_tgt 29 | 30 | def _build_target_tokens(self, src, src_vocab, src_raw, pred, attn): 31 | vocab = self.fields["tgt"].vocab 32 | tokens = [] 33 | for tok in pred: 34 | if tok < len(vocab): 35 | tokens.append(vocab.itos[tok]) 36 | else: 37 | tokens.append(src_vocab.itos[tok - len(vocab)]) 38 | if tokens[-1] == onmt.io.EOS_WORD: 39 | tokens = tokens[:-1] 40 | break 41 | if self.replace_unk and (attn is not None) and (src is not None): 42 | for i in range(len(tokens)): 43 | if tokens[i] == vocab.itos[onmt.io.UNK]: 44 | _, maxIndex = attn[i].max(0) 45 | tokens[i] = src_raw[maxIndex[0]] 46 | return tokens 47 | 48 | def from_batch(self, translation_batch): 49 | batch = translation_batch["batch"] 50 | assert(len(translation_batch["gold_score"]) == 51 | len(translation_batch["predictions"])) 52 | batch_size = batch.batch_size 53 | 54 | preds, pred_score, attn, gold_score, indices = list(zip( 55 | *sorted(zip(translation_batch["predictions"], 56 | translation_batch["scores"], 57 | translation_batch["attention"], 58 | translation_batch["gold_score"], 59 | batch.indices.data), 60 | key=lambda x: x[-1]))) 61 | 62 | # Sorting 63 | inds, perm = torch.sort(batch.indices.data) 64 | data_type = self.data.data_type 65 | if data_type == 'text': 66 | src = batch.src[0].data.index_select(1, perm) 67 | else: 68 | src = None 69 | 70 | if self.has_tgt: 71 | tgt = batch.tgt.data.index_select(1, perm) 72 | else: 73 | tgt = None 74 | 75 | translations = [] 76 | for b in range(batch_size): 77 | if data_type == 'text': 78 | src_vocab = self.data.src_vocabs[inds[b]] \ 79 | if self.data.src_vocabs else None 80 | src_raw = self.data.examples[inds[b]].src 81 | else: 82 | src_vocab = None 83 | src_raw = None 84 | pred_sents = [self._build_target_tokens( 85 | src[:, b] if src is not None else None, 86 | src_vocab, src_raw, 87 | preds[b][n], attn[b][n]) 88 | for n in range(self.n_best)] 89 | gold_sent = None 90 | if tgt is not None: 91 | gold_sent = self._build_target_tokens( 92 | src[:, b] if src is not None else None, 93 | src_vocab, src_raw, 94 | tgt[1:, b] if tgt is not None else None, None) 95 | 96 | translation = Translation(src[:, b] if src is not None else None, 97 | src_raw, pred_sents, 98 | attn[b], pred_score[b], gold_sent, 99 | gold_score[b]) 100 | translations.append(translation) 101 | 102 | return translations 103 | 104 | 105 | class Translation(object): 106 | """ 107 | Container for a translated sentence. 
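    Scores are accumulated log-probabilities, so a corpus-level perplexity
    can be recovered as exp(-total_score / total_words), which is what
    predict.py's _report_score does.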
108 | 109 | Attributes: 110 | src (`LongTensor`): src word ids 111 | src_raw ([str]): raw src words 112 | 113 | pred_sents ([[str]]): words from the n-best translations 114 | pred_scores ([[float]]): log-probs of n-best translations 115 | attns ([`FloatTensor`]) : attention dist for each translation 116 | gold_sent ([str]): words from gold translation 117 | gold_score ([float]): log-prob of gold translation 118 | 119 | """ 120 | def __init__(self, src, src_raw, pred_sents, 121 | attn, pred_scores, tgt_sent, gold_score): 122 | self.src = src 123 | self.src_raw = src_raw 124 | self.pred_sents = pred_sents 125 | self.attns = attn 126 | self.pred_scores = pred_scores 127 | self.gold_sent = tgt_sent 128 | self.gold_score = gold_score 129 | 130 | def log(self, sent_number): 131 | """ 132 | Log translation to stdout. 133 | """ 134 | output = '\nSENT {}: {}\n'.format(sent_number, self.src_raw) 135 | 136 | best_pred = self.pred_sents[0] 137 | best_score = self.pred_scores[0] 138 | pred_sent = ' '.join(best_pred) 139 | output += 'PRED {}: {}\n'.format(sent_number, pred_sent) 140 | print("PRED SCORE: {:.4f}".format(best_score)) 141 | 142 | if self.gold_sent is not None: 143 | tgt_sent = ' '.join(self.gold_sent) 144 | output += 'GOLD {}: {}\n'.format(sent_number, tgt_sent) 145 | output += ("GOLD SCORE: {:.4f}".format(self.gold_score)) 146 | 147 | if len(self.pred_sents) > 1: 148 | print('\nBEST HYP:') 149 | for score, sent in zip(self.pred_scores, self.pred_sents): 150 | output += "[{:.4f}] {}\n".format(score, sent) 151 | 152 | return output 153 | -------------------------------------------------------------------------------- /G_pretrain/onmt/translate/Translation.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, unicode_literals 2 | 3 | import torch 4 | import onmt.io 5 | 6 | 7 | class TranslationBuilder(object): 8 | """ 9 | Build a word-based translation from the batch output 10 | of translator and the underlying dictionaries. 
11 | 12 | Replacement based on "Addressing the Rare Word 13 | Problem in Neural Machine Translation" :cite:`Luong2015b` 14 | 15 | Args: 16 | data (DataSet): 17 | fields (dict of Fields): data fields 18 | n_best (int): number of translations produced 19 | replace_unk (bool): replace unknown words using attention 20 | has_tgt (bool): will the batch have gold targets 21 | """ 22 | def __init__(self, data, fields, n_best=1, replace_unk=False, 23 | has_tgt=False): 24 | self.data = data 25 | self.fields = fields 26 | self.n_best = n_best 27 | self.replace_unk = replace_unk 28 | self.has_tgt = has_tgt 29 | 30 | def _build_target_tokens(self, src, src_vocab, src_raw, pred, attn): 31 | vocab = self.fields["tgt"].vocab 32 | tokens = [] 33 | for tok in pred: 34 | if tok < len(vocab): 35 | tokens.append(vocab.itos[tok]) 36 | else: 37 | tokens.append(src_vocab.itos[tok - len(vocab)]) 38 | if tokens[-1] == onmt.io.EOS_WORD: 39 | tokens = tokens[:-1] 40 | break 41 | if self.replace_unk and (attn is not None) and (src is not None): 42 | for i in range(len(tokens)): 43 | if tokens[i] == vocab.itos[onmt.io.UNK]: 44 | _, maxIndex = attn[i].max(0) 45 | tokens[i] = src_raw[maxIndex[0]] 46 | return tokens 47 | 48 | def from_batch(self, translation_batch): 49 | batch = translation_batch["batch"] 50 | assert(len(translation_batch["gold_score"]) == 51 | len(translation_batch["predictions"])) 52 | batch_size = batch.batch_size 53 | 54 | preds, pred_score, attn, gold_score, indices = list(zip( 55 | *sorted(zip(translation_batch["predictions"], 56 | translation_batch["scores"], 57 | translation_batch["attention"], 58 | translation_batch["gold_score"], 59 | batch.indices.data), 60 | key=lambda x: x[-1]))) 61 | 62 | # Sorting 63 | inds, perm = torch.sort(batch.indices.data) 64 | data_type = self.data.data_type 65 | if data_type == 'text': 66 | src = batch.src[0].data.index_select(1, perm) 67 | else: 68 | src = None 69 | 70 | if self.has_tgt: 71 | tgt = batch.tgt.data.index_select(1, perm) 72 | else: 73 | tgt = None 74 | 75 | translations = [] 76 | for b in range(batch_size): 77 | if data_type == 'text': 78 | src_vocab = self.data.src_vocabs[inds[b]] \ 79 | if self.data.src_vocabs else None 80 | src_raw = self.data.examples[inds[b]].src 81 | else: 82 | src_vocab = None 83 | src_raw = None 84 | pred_sents = [self._build_target_tokens( 85 | src[:, b] if src is not None else None, 86 | src_vocab, src_raw, 87 | preds[b][n], attn[b][n]) 88 | for n in range(self.n_best)] 89 | gold_sent = None 90 | if tgt is not None: 91 | gold_sent = self._build_target_tokens( 92 | src[:, b] if src is not None else None, 93 | src_vocab, src_raw, 94 | tgt[1:, b] if tgt is not None else None, None) 95 | 96 | translation = Translation(src[:, b] if src is not None else None, 97 | src_raw, pred_sents, 98 | attn[b], pred_score[b], gold_sent, 99 | gold_score[b]) 100 | translations.append(translation) 101 | 102 | return translations 103 | 104 | 105 | class Translation(object): 106 | """ 107 | Container for a translated sentence. 
108 | 109 | Attributes: 110 | src (`LongTensor`): src word ids 111 | src_raw ([str]): raw src words 112 | 113 | pred_sents ([[str]]): words from the n-best translations 114 | pred_scores ([[float]]): log-probs of n-best translations 115 | attns ([`FloatTensor`]) : attention dist for each translation 116 | gold_sent ([str]): words from gold translation 117 | gold_score ([float]): log-prob of gold translation 118 | 119 | """ 120 | def __init__(self, src, src_raw, pred_sents, 121 | attn, pred_scores, tgt_sent, gold_score): 122 | self.src = src 123 | self.src_raw = src_raw 124 | self.pred_sents = pred_sents 125 | self.attns = attn 126 | self.pred_scores = pred_scores 127 | self.gold_sent = tgt_sent 128 | self.gold_score = gold_score 129 | 130 | def log(self, sent_number): 131 | """ 132 | Log translation to stdout. 133 | """ 134 | output = '\nSENT {}: {}\n'.format(sent_number, self.src_raw) 135 | 136 | best_pred = self.pred_sents[0] 137 | best_score = self.pred_scores[0] 138 | pred_sent = ' '.join(best_pred) 139 | output += 'PRED {}: {}\n'.format(sent_number, pred_sent) 140 | print("PRED SCORE: {:.4f}".format(best_score)) 141 | 142 | if self.gold_sent is not None: 143 | tgt_sent = ' '.join(self.gold_sent) 144 | output += 'GOLD {}: {}\n'.format(sent_number, tgt_sent) 145 | output += ("GOLD SCORE: {:.4f}".format(self.gold_score)) 146 | 147 | if len(self.pred_sents) > 1: 148 | print('\nBEST HYP:') 149 | for score, sent in zip(self.pred_scores, self.pred_sents): 150 | output += "[{:.4f}] {}\n".format(score, sent) 151 | 152 | return output 153 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/translate/Translation.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, unicode_literals 2 | 3 | import torch 4 | import onmt.io 5 | 6 | 7 | class TranslationBuilder(object): 8 | """ 9 | Build a word-based translation from the batch output 10 | of translator and the underlying dictionaries. 
11 | 12 | Replacement based on "Addressing the Rare Word 13 | Problem in Neural Machine Translation" :cite:`Luong2015b` 14 | 15 | Args: 16 | data (DataSet): 17 | fields (dict of Fields): data fields 18 | n_best (int): number of translations produced 19 | replace_unk (bool): replace unknown words using attention 20 | has_tgt (bool): will the batch have gold targets 21 | """ 22 | def __init__(self, data, fields, n_best=1, replace_unk=False, 23 | has_tgt=False): 24 | self.data = data 25 | self.fields = fields 26 | self.n_best = n_best 27 | self.replace_unk = replace_unk 28 | self.has_tgt = has_tgt 29 | 30 | def _build_target_tokens(self, src, src_vocab, src_raw, pred, attn): 31 | vocab = self.fields["tgt"].vocab 32 | tokens = [] 33 | for tok in pred: 34 | if tok < len(vocab): 35 | tokens.append(vocab.itos[tok]) 36 | else: 37 | tokens.append(src_vocab.itos[tok - len(vocab)]) 38 | if tokens[-1] == onmt.io.EOS_WORD: 39 | tokens = tokens[:-1] 40 | break 41 | if self.replace_unk and (attn is not None) and (src is not None): 42 | for i in range(len(tokens)): 43 | if tokens[i] == vocab.itos[onmt.io.UNK]: 44 | _, maxIndex = attn[i].max(0) 45 | tokens[i] = src_raw[maxIndex[0]] 46 | return tokens 47 | 48 | def from_batch(self, translation_batch): 49 | batch = translation_batch["batch"] 50 | assert(len(translation_batch["gold_score"]) == 51 | len(translation_batch["predictions"])) 52 | batch_size = batch.batch_size 53 | 54 | preds, pred_score, attn, gold_score, indices = list(zip( 55 | *sorted(zip(translation_batch["predictions"], 56 | translation_batch["scores"], 57 | translation_batch["attention"], 58 | translation_batch["gold_score"], 59 | batch.indices.data), 60 | key=lambda x: x[-1]))) 61 | 62 | # Sorting 63 | inds, perm = torch.sort(batch.indices.data) 64 | data_type = self.data.data_type 65 | if data_type == 'text': 66 | src = batch.src[0].data.index_select(1, perm) 67 | else: 68 | src = None 69 | 70 | if self.has_tgt: 71 | tgt = batch.tgt.data.index_select(1, perm) 72 | else: 73 | tgt = None 74 | 75 | translations = [] 76 | for b in range(batch_size): 77 | if data_type == 'text': 78 | src_vocab = self.data.src_vocabs[inds[b]] \ 79 | if self.data.src_vocabs else None 80 | src_raw = self.data.examples[inds[b]].src 81 | else: 82 | src_vocab = None 83 | src_raw = None 84 | pred_sents = [self._build_target_tokens( 85 | src[:, b] if src is not None else None, 86 | src_vocab, src_raw, 87 | preds[b][n], attn[b][n]) 88 | for n in range(self.n_best)] 89 | gold_sent = None 90 | if tgt is not None: 91 | gold_sent = self._build_target_tokens( 92 | src[:, b] if src is not None else None, 93 | src_vocab, src_raw, 94 | tgt[1:, b] if tgt is not None else None, None) 95 | 96 | translation = Translation(src[:, b] if src is not None else None, 97 | src_raw, pred_sents, 98 | attn[b], pred_score[b], gold_sent, 99 | gold_score[b]) 100 | translations.append(translation) 101 | 102 | return translations 103 | 104 | 105 | class Translation(object): 106 | """ 107 | Container for a translated sentence. 
108 | 109 | Attributes: 110 | src (`LongTensor`): src word ids 111 | src_raw ([str]): raw src words 112 | 113 | pred_sents ([[str]]): words from the n-best translations 114 | pred_scores ([[float]]): log-probs of n-best translations 115 | attns ([`FloatTensor`]) : attention dist for each translation 116 | gold_sent ([str]): words from gold translation 117 | gold_score ([float]): log-prob of gold translation 118 | 119 | """ 120 | def __init__(self, src, src_raw, pred_sents, 121 | attn, pred_scores, tgt_sent, gold_score): 122 | self.src = src 123 | self.src_raw = src_raw 124 | self.pred_sents = pred_sents 125 | self.attns = attn 126 | self.pred_scores = pred_scores 127 | self.gold_sent = tgt_sent 128 | self.gold_score = gold_score 129 | 130 | def log(self, sent_number): 131 | """ 132 | Log translation to stdout. 133 | """ 134 | output = '\nSENT {}: {}\n'.format(sent_number, self.src_raw) 135 | 136 | best_pred = self.pred_sents[0] 137 | best_score = self.pred_scores[0] 138 | pred_sent = ' '.join(best_pred) 139 | output += 'PRED {}: {}\n'.format(sent_number, pred_sent) 140 | print("PRED SCORE: {:.4f}".format(best_score)) 141 | 142 | if self.gold_sent is not None: 143 | tgt_sent = ' '.join(self.gold_sent) 144 | output += 'GOLD {}: {}\n'.format(sent_number, tgt_sent) 145 | output += ("GOLD SCORE: {:.4f}".format(self.gold_score)) 146 | 147 | if len(self.pred_sents) > 1: 148 | print('\nBEST HYP:') 149 | for score, sent in zip(self.pred_scores, self.pred_sents): 150 | output += "[{:.4f}] {}\n".format(score, sent) 151 | 152 | return output 153 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/translate/Translation.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, unicode_literals 2 | 3 | import torch 4 | import onmt.io 5 | 6 | 7 | class TranslationBuilder(object): 8 | """ 9 | Build a word-based translation from the batch output 10 | of translator and the underlying dictionaries. 
11 | 12 | Replacement based on "Addressing the Rare Word 13 | Problem in Neural Machine Translation" :cite:`Luong2015b` 14 | 15 | Args: 16 | data (DataSet): 17 | fields (dict of Fields): data fields 18 | n_best (int): number of translations produced 19 | replace_unk (bool): replace unknown words using attention 20 | has_tgt (bool): will the batch have gold targets 21 | """ 22 | def __init__(self, data, fields, n_best=1, replace_unk=False, 23 | has_tgt=False): 24 | self.data = data 25 | self.fields = fields 26 | self.n_best = n_best 27 | self.replace_unk = replace_unk 28 | self.has_tgt = has_tgt 29 | 30 | def _build_target_tokens(self, src, src_vocab, src_raw, pred, attn): 31 | vocab = self.fields["tgt"].vocab 32 | tokens = [] 33 | for tok in pred: 34 | if tok < len(vocab): 35 | tokens.append(vocab.itos[tok]) 36 | else: 37 | tokens.append(src_vocab.itos[tok - len(vocab)]) 38 | if tokens[-1] == onmt.io.EOS_WORD: 39 | tokens = tokens[:-1] 40 | break 41 | if self.replace_unk and (attn is not None) and (src is not None): 42 | for i in range(len(tokens)): 43 | if tokens[i] == vocab.itos[onmt.io.UNK]: 44 | _, maxIndex = attn[i].max(0) 45 | tokens[i] = src_raw[maxIndex[0]] 46 | return tokens 47 | 48 | def from_batch(self, translation_batch): 49 | batch = translation_batch["batch"] 50 | assert(len(translation_batch["gold_score"]) == 51 | len(translation_batch["predictions"])) 52 | batch_size = batch.batch_size 53 | 54 | preds, pred_score, attn, gold_score, indices = list(zip( 55 | *sorted(zip(translation_batch["predictions"], 56 | translation_batch["scores"], 57 | translation_batch["attention"], 58 | translation_batch["gold_score"], 59 | batch.indices.data), 60 | key=lambda x: x[-1]))) 61 | 62 | # Sorting 63 | inds, perm = torch.sort(batch.indices.data) 64 | data_type = self.data.data_type 65 | if data_type == 'text': 66 | src = batch.src[0].data.index_select(1, perm) 67 | else: 68 | src = None 69 | 70 | if self.has_tgt: 71 | tgt = batch.tgt.data.index_select(1, perm) 72 | else: 73 | tgt = None 74 | 75 | translations = [] 76 | for b in range(batch_size): 77 | if data_type == 'text': 78 | src_vocab = self.data.src_vocabs[inds[b]] \ 79 | if self.data.src_vocabs else None 80 | src_raw = self.data.examples[inds[b]].src 81 | else: 82 | src_vocab = None 83 | src_raw = None 84 | pred_sents = [self._build_target_tokens( 85 | src[:, b] if src is not None else None, 86 | src_vocab, src_raw, 87 | preds[b][n], attn[b][n]) 88 | for n in range(self.n_best)] 89 | gold_sent = None 90 | if tgt is not None: 91 | gold_sent = self._build_target_tokens( 92 | src[:, b] if src is not None else None, 93 | src_vocab, src_raw, 94 | tgt[1:, b] if tgt is not None else None, None) 95 | 96 | translation = Translation(src[:, b] if src is not None else None, 97 | src_raw, pred_sents, 98 | attn[b], pred_score[b], gold_sent, 99 | gold_score[b]) 100 | translations.append(translation) 101 | 102 | return translations 103 | 104 | 105 | class Translation(object): 106 | """ 107 | Container for a translated sentence. 
108 | 109 | Attributes: 110 | src (`LongTensor`): src word ids 111 | src_raw ([str]): raw src words 112 | 113 | pred_sents ([[str]]): words from the n-best translations 114 | pred_scores ([[float]]): log-probs of n-best translations 115 | attns ([`FloatTensor`]) : attention dist for each translation 116 | gold_sent ([str]): words from gold translation 117 | gold_score ([float]): log-prob of gold translation 118 | 119 | """ 120 | def __init__(self, src, src_raw, pred_sents, 121 | attn, pred_scores, tgt_sent, gold_score): 122 | self.src = src 123 | self.src_raw = src_raw 124 | self.pred_sents = pred_sents 125 | self.attns = attn 126 | self.pred_scores = pred_scores 127 | self.gold_sent = tgt_sent 128 | self.gold_score = gold_score 129 | 130 | def log(self, sent_number): 131 | """ 132 | Log translation to stdout. 133 | """ 134 | output = '\nSENT {}: {}\n'.format(sent_number, self.src_raw) 135 | 136 | best_pred = self.pred_sents[0] 137 | best_score = self.pred_scores[0] 138 | pred_sent = ' '.join(best_pred) 139 | output += 'PRED {}: {}\n'.format(sent_number, pred_sent) 140 | print("PRED SCORE: {:.4f}".format(best_score)) 141 | 142 | if self.gold_sent is not None: 143 | tgt_sent = ' '.join(self.gold_sent) 144 | output += 'GOLD {}: {}\n'.format(sent_number, tgt_sent) 145 | output += ("GOLD SCORE: {:.4f}".format(self.gold_score)) 146 | 147 | if len(self.pred_sents) > 1: 148 | print('\nBEST HYP:') 149 | for score, sent in zip(self.pred_scores, self.pred_sents): 150 | output += "[{:.4f}] {}\n".format(score, sent) 151 | 152 | return output 153 | -------------------------------------------------------------------------------- /D_pretrain/onmt/translate/Beam.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import torch 3 | 4 | 5 | class Beam(object): 6 | """ 7 | Class for managing the internals of the beam search process. 8 | 9 | Takes care of beams, back pointers, and scores. 10 | 11 | Args: 12 | size (int): beam size 13 | pad, bos, eos (int): indices of padding, beginning, and ending. 14 | n_best (int): nbest size to use 15 | cuda (bool): use gpu 16 | global_scorer (:obj:`GlobalScorer`) 17 | """ 18 | def __init__(self, size, pad, bos, eos, 19 | n_best=1, cuda=False, 20 | global_scorer=None, 21 | min_length=0): 22 | 23 | self.size = size 24 | self.tt = torch.cuda if cuda else torch 25 | 26 | # The score for each translation on the beam. 27 | self.scores = self.tt.FloatTensor(size).zero_() 28 | self.all_scores = [] 29 | 30 | # The backpointers at each time-step. 31 | self.prev_ks = [] 32 | 33 | # The outputs at each time-step. 34 | self.next_ys = [self.tt.LongTensor(size) 35 | .fill_(pad)] 36 | self.next_ys[0][0] = bos 37 | 38 | # Has EOS topped the beam yet. 39 | self._eos = eos 40 | self.eos_top = False 41 | 42 | # The attentions (matrix) for each time. 43 | self.attn = [] 44 | 45 | # Time and k pair for finished. 46 | self.finished = [] 47 | self.n_best = n_best 48 | 49 | # Information for global scoring. 50 | self.global_scorer = global_scorer 51 | self.global_state = {} 52 | 53 | # Minimum prediction length 54 | self.min_length = min_length 55 | 56 | def get_current_state(self): 57 | "Get the outputs for the current timestep." 58 | return self.next_ys[-1] 59 | 60 | def get_current_origin(self): 61 | "Get the backpointers for the current timestep." 
62 | return self.prev_ks[-1] 63 | 64 | def advance(self, word_probs, attn_out): 65 | """ 66 | Given prob over words for every last beam `wordLk` and attention 67 | `attn_out`: Compute and update the beam search. 68 | 69 | Parameters: 70 | 71 | * `word_probs`- probs of advancing from the last step (K x words) 72 | * `attn_out`- attention at the last step 73 | 74 | Returns: True if beam search is complete. 75 | """ 76 | num_words = word_probs.size(1) 77 | 78 | # force the output to be longer than self.min_length 79 | cur_len = len(self.next_ys) 80 | if cur_len < self.min_length: 81 | for k in range(len(word_probs)): 82 | word_probs[k][self._eos] = -1e20 83 | 84 | # Sum the previous scores. 85 | if len(self.prev_ks) > 0: 86 | beam_scores = word_probs + \ 87 | self.scores.unsqueeze(1).expand_as(word_probs) 88 | 89 | # Don't let EOS have children. 90 | for i in range(self.next_ys[-1].size(0)): 91 | if self.next_ys[-1][i] == self._eos: 92 | beam_scores[i] = -1e20 93 | else: 94 | beam_scores = word_probs[0] 95 | flat_beam_scores = beam_scores.view(-1) 96 | best_scores, best_scores_id = flat_beam_scores.topk(self.size, 0, 97 | True, True) 98 | 99 | self.all_scores.append(self.scores) 100 | self.scores = best_scores 101 | 102 | # best_scores_id is flattened beam x word array, so calculate which 103 | # word and beam each score came from 104 | prev_k = best_scores_id / num_words 105 | self.prev_ks.append(prev_k) 106 | self.next_ys.append((best_scores_id - prev_k * num_words)) 107 | self.attn.append(attn_out.index_select(0, prev_k)) 108 | 109 | if self.global_scorer is not None: 110 | self.global_scorer.update_global_state(self) 111 | 112 | for i in range(self.next_ys[-1].size(0)): 113 | if self.next_ys[-1][i] == self._eos: 114 | s = self.scores[i] 115 | if self.global_scorer is not None: 116 | global_scores = self.global_scorer.score(self, self.scores) 117 | s = global_scores[i] 118 | self.finished.append((s, len(self.next_ys) - 1, i)) 119 | 120 | # End condition is when top-of-beam is EOS and no global score. 121 | if self.next_ys[-1][0] == self._eos: 122 | # self.all_scores.append(self.scores) 123 | self.eos_top = True 124 | 125 | def done(self): 126 | return self.eos_top and len(self.finished) >= self.n_best 127 | 128 | def sort_finished(self, minimum=None): 129 | if minimum is not None: 130 | i = 0 131 | # Add from beam until we have minimum outputs. 132 | while len(self.finished) < minimum: 133 | s = self.scores[i] 134 | if self.global_scorer is not None: 135 | global_scores = self.global_scorer.score(self, self.scores) 136 | s = global_scores[i] 137 | self.finished.append((s, len(self.next_ys) - 1, i)) 138 | 139 | self.finished.sort(key=lambda a: -a[0]) 140 | scores = [sc for sc, _, _ in self.finished] 141 | ks = [(t, k) for _, t, k in self.finished] 142 | return scores, ks 143 | 144 | def get_hyp(self, timestep, k): 145 | """ 146 | Walk back to construct the full hypothesis. 
147 | """ 148 | hyp, attn = [], [] 149 | for j in range(len(self.prev_ks[:timestep]) - 1, -1, -1): 150 | hyp.append(self.next_ys[j+1][k]) 151 | attn.append(self.attn[j][k]) 152 | k = self.prev_ks[j][k] 153 | return hyp[::-1], torch.stack(attn[::-1]) 154 | 155 | 156 | class GNMTGlobalScorer(object): 157 | """ 158 | NMT re-ranking score from 159 | "Google's Neural Machine Translation System" :cite:`wu2016google` 160 | 161 | Args: 162 | alpha (float): length parameter 163 | beta (float): coverage parameter 164 | """ 165 | def __init__(self, alpha, beta): 166 | self.alpha = alpha 167 | self.beta = beta 168 | 169 | def score(self, beam, logprobs): 170 | "Additional term add to log probability" 171 | cov = beam.global_state["coverage"] 172 | pen = self.beta * torch.min(cov, cov.clone().fill_(1.0)).log().sum(1) 173 | l_term = (((5 + len(beam.next_ys)) ** self.alpha) / 174 | ((5 + 1) ** self.alpha)) 175 | return (logprobs / l_term) + pen 176 | 177 | def update_global_state(self, beam): 178 | "Keeps the coverage vector as sum of attens" 179 | if len(beam.prev_ks) == 1: 180 | beam.global_state["coverage"] = beam.attn[-1] 181 | else: 182 | beam.global_state["coverage"] = beam.global_state["coverage"] \ 183 | .index_select(0, beam.prev_ks[-1]).add(beam.attn[-1]) 184 | -------------------------------------------------------------------------------- /G_pretrain/onmt/translate/Beam.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import torch 3 | 4 | 5 | class Beam(object): 6 | """ 7 | Class for managing the internals of the beam search process. 8 | 9 | Takes care of beams, back pointers, and scores. 10 | 11 | Args: 12 | size (int): beam size 13 | pad, bos, eos (int): indices of padding, beginning, and ending. 14 | n_best (int): nbest size to use 15 | cuda (bool): use gpu 16 | global_scorer (:obj:`GlobalScorer`) 17 | """ 18 | def __init__(self, size, pad, bos, eos, 19 | n_best=1, cuda=False, 20 | global_scorer=None, 21 | min_length=0): 22 | 23 | self.size = size 24 | self.tt = torch.cuda if cuda else torch 25 | 26 | # The score for each translation on the beam. 27 | self.scores = self.tt.FloatTensor(size).zero_() 28 | self.all_scores = [] 29 | 30 | # The backpointers at each time-step. 31 | self.prev_ks = [] 32 | 33 | # The outputs at each time-step. 34 | self.next_ys = [self.tt.LongTensor(size) 35 | .fill_(pad)] 36 | self.next_ys[0][0] = bos 37 | 38 | # Has EOS topped the beam yet. 39 | self._eos = eos 40 | self.eos_top = False 41 | 42 | # The attentions (matrix) for each time. 43 | self.attn = [] 44 | 45 | # Time and k pair for finished. 46 | self.finished = [] 47 | self.n_best = n_best 48 | 49 | # Information for global scoring. 50 | self.global_scorer = global_scorer 51 | self.global_state = {} 52 | 53 | # Minimum prediction length 54 | self.min_length = min_length 55 | 56 | def get_current_state(self): 57 | "Get the outputs for the current timestep." 58 | return self.next_ys[-1] 59 | 60 | def get_current_origin(self): 61 | "Get the backpointers for the current timestep." 62 | return self.prev_ks[-1] 63 | 64 | def advance(self, word_probs, attn_out): 65 | """ 66 | Given prob over words for every last beam `wordLk` and attention 67 | `attn_out`: Compute and update the beam search. 68 | 69 | Parameters: 70 | 71 | * `word_probs`- probs of advancing from the last step (K x words) 72 | * `attn_out`- attention at the last step 73 | 74 | Returns: True if beam search is complete. 
75 | """ 76 | num_words = word_probs.size(1) 77 | 78 | # force the output to be longer than self.min_length 79 | cur_len = len(self.next_ys) 80 | if cur_len < self.min_length: 81 | for k in range(len(word_probs)): 82 | word_probs[k][self._eos] = -1e20 83 | 84 | # Sum the previous scores. 85 | if len(self.prev_ks) > 0: 86 | beam_scores = word_probs + \ 87 | self.scores.unsqueeze(1).expand_as(word_probs) 88 | 89 | # Don't let EOS have children. 90 | for i in range(self.next_ys[-1].size(0)): 91 | if self.next_ys[-1][i] == self._eos: 92 | beam_scores[i] = -1e20 93 | else: 94 | beam_scores = word_probs[0] 95 | flat_beam_scores = beam_scores.view(-1) 96 | best_scores, best_scores_id = flat_beam_scores.topk(self.size, 0, 97 | True, True) 98 | 99 | self.all_scores.append(self.scores) 100 | self.scores = best_scores 101 | 102 | # best_scores_id is flattened beam x word array, so calculate which 103 | # word and beam each score came from 104 | prev_k = best_scores_id / num_words 105 | self.prev_ks.append(prev_k) 106 | self.next_ys.append((best_scores_id - prev_k * num_words)) 107 | self.attn.append(attn_out.index_select(0, prev_k)) 108 | 109 | if self.global_scorer is not None: 110 | self.global_scorer.update_global_state(self) 111 | 112 | for i in range(self.next_ys[-1].size(0)): 113 | if self.next_ys[-1][i] == self._eos: 114 | s = self.scores[i] 115 | if self.global_scorer is not None: 116 | global_scores = self.global_scorer.score(self, self.scores) 117 | s = global_scores[i] 118 | self.finished.append((s, len(self.next_ys) - 1, i)) 119 | 120 | # End condition is when top-of-beam is EOS and no global score. 121 | if self.next_ys[-1][0] == self._eos: 122 | # self.all_scores.append(self.scores) 123 | self.eos_top = True 124 | 125 | def done(self): 126 | return self.eos_top and len(self.finished) >= self.n_best 127 | 128 | def sort_finished(self, minimum=None): 129 | if minimum is not None: 130 | i = 0 131 | # Add from beam until we have minimum outputs. 132 | while len(self.finished) < minimum: 133 | s = self.scores[i] 134 | if self.global_scorer is not None: 135 | global_scores = self.global_scorer.score(self, self.scores) 136 | s = global_scores[i] 137 | self.finished.append((s, len(self.next_ys) - 1, i)) 138 | 139 | self.finished.sort(key=lambda a: -a[0]) 140 | scores = [sc for sc, _, _ in self.finished] 141 | ks = [(t, k) for _, t, k in self.finished] 142 | return scores, ks 143 | 144 | def get_hyp(self, timestep, k): 145 | """ 146 | Walk back to construct the full hypothesis. 
147 | """ 148 | hyp, attn = [], [] 149 | for j in range(len(self.prev_ks[:timestep]) - 1, -1, -1): 150 | hyp.append(self.next_ys[j+1][k]) 151 | attn.append(self.attn[j][k]) 152 | k = self.prev_ks[j][k] 153 | return hyp[::-1], torch.stack(attn[::-1]) 154 | 155 | 156 | class GNMTGlobalScorer(object): 157 | """ 158 | NMT re-ranking score from 159 | "Google's Neural Machine Translation System" :cite:`wu2016google` 160 | 161 | Args: 162 | alpha (float): length parameter 163 | beta (float): coverage parameter 164 | """ 165 | def __init__(self, alpha, beta): 166 | self.alpha = alpha 167 | self.beta = beta 168 | 169 | def score(self, beam, logprobs): 170 | "Additional term add to log probability" 171 | cov = beam.global_state["coverage"] 172 | pen = self.beta * torch.min(cov, cov.clone().fill_(1.0)).log().sum(1) 173 | l_term = (((5 + len(beam.next_ys)) ** self.alpha) / 174 | ((5 + 1) ** self.alpha)) 175 | return (logprobs / l_term) + pen 176 | 177 | def update_global_state(self, beam): 178 | "Keeps the coverage vector as sum of attens" 179 | if len(beam.prev_ks) == 1: 180 | beam.global_state["coverage"] = beam.attn[-1] 181 | else: 182 | beam.global_state["coverage"] = beam.global_state["coverage"] \ 183 | .index_select(0, beam.prev_ks[-1]).add(beam.attn[-1]) 184 | --------------------------------------------------------------------------------