├── D_pretrain └── onmt │ ├── translate │ ├── __init__.py │ ├── Translation.py │ └── Beam.py │ ├── __init__.py │ ├── io │ ├── __init__.py │ └── DatasetBase.py │ ├── modules │ ├── StructuredAttention.py │ ├── StackedRNN.py │ ├── __init__.py │ ├── AudioEncoder.py │ ├── ConvMultiStepAttention.py │ ├── UtilClass.py │ ├── Gate.py │ ├── ImageEncoder.py │ └── MultiHeadedAttn.py │ ├── Utils.py │ └── Optim.py ├── G_pretrain └── onmt │ ├── translate │ ├── __init__.py │ ├── Translation.py │ └── Beam.py │ ├── __init__.py │ ├── io │ ├── __init__.py │ └── DatasetBase.py │ ├── modules │ ├── StructuredAttention.py │ ├── StackedRNN.py │ ├── __init__.py │ ├── AudioEncoder.py │ ├── ConvMultiStepAttention.py │ ├── UtilClass.py │ ├── Gate.py │ ├── ImageEncoder.py │ └── MultiHeadedAttn.py │ ├── Utils.py │ └── Optim.py ├── NLI_pretrain └── onmt │ ├── translate │ ├── __init__.py │ └── Translation.py │ ├── __init__.py │ ├── io │ ├── __init__.py │ └── DatasetBase.py │ ├── modules │ ├── StructuredAttention.py │ ├── StackedRNN.py │ ├── __init__.py │ ├── AudioEncoder.py │ ├── ConvMultiStepAttention.py │ ├── UtilClass.py │ ├── Gate.py │ ├── ImageEncoder.py │ └── MultiHeadedAttn.py │ ├── Utils.py │ └── Optim.py ├── reinforcement_train ├── onmt │ ├── translate │ │ ├── __init__.py │ │ └── Translation.py │ ├── __init__.py │ ├── io │ │ ├── __init__.py │ │ └── DatasetBase.py │ ├── modules │ │ ├── StructuredAttention.py │ │ ├── StackedRNN.py │ │ ├── __init__.py │ │ ├── AudioEncoder.py │ │ ├── ConvMultiStepAttention.py │ │ ├── UtilClass.py │ │ ├── Gate.py │ │ ├── ImageEncoder.py │ │ └── MultiHeadedAttn.py │ ├── Utils.py │ └── Optim.py └── predict.py ├── LICENSE.md └── .gitignore /D_pretrain/onmt/translate/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.translate.Translator import Translator 2 | from onmt.translate.Translation import Translation, TranslationBuilder 3 | from onmt.translate.Beam import Beam, GNMTGlobalScorer 4 | 5 | __all__ = [Translator, Translation, Beam, GNMTGlobalScorer, TranslationBuilder] 6 | -------------------------------------------------------------------------------- /G_pretrain/onmt/translate/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.translate.Translator import Translator 2 | from onmt.translate.Translation import Translation, TranslationBuilder 3 | from onmt.translate.Beam import Beam, GNMTGlobalScorer 4 | 5 | __all__ = [Translator, Translation, Beam, GNMTGlobalScorer, TranslationBuilder] 6 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/translate/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.translate.Translator import Translator 2 | from onmt.translate.Translation import Translation, TranslationBuilder 3 | from onmt.translate.Beam import Beam, GNMTGlobalScorer 4 | 5 | __all__ = [Translator, Translation, Beam, GNMTGlobalScorer, TranslationBuilder] 6 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/translate/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.translate.Translator import Translator 2 | from onmt.translate.Translation import Translation, TranslationBuilder 3 | from onmt.translate.Beam import Beam, GNMTGlobalScorer 4 | 5 | __all__ = [Translator, Translation, Beam, GNMTGlobalScorer, TranslationBuilder] 6 | 
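Note: as the tree above shows, each training stage (D_pretrain, G_pretrain, NLI_pretrain, reinforcement_train) ships its own copy of the onmt package, so imports resolve against whichever copy is first on the Python path. A minimal sketch of selecting one stage's copy before importing — the path manipulation is an illustrative assumption, not a script from this repository, and it presumes the full package (including modules not listed in this excerpt, such as onmt.translate.Translator) is present on disk:

    # Hypothetical helper: put one stage's onmt package on sys.path first.
    # Run from the repository root; swap "G_pretrain" for any other stage directory.
    import sys
    sys.path.insert(0, "G_pretrain")

    import onmt.translate  # re-exports Translator, Translation, TranslationBuilder, Beam, GNMTGlobalScorer

Each stage's onmt/translate/__init__.py above performs those re-exports, so the single import line makes the beam-search and translation classes available without importing their individual modules.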
-------------------------------------------------------------------------------- /D_pretrain/onmt/__init__.py: -------------------------------------------------------------------------------- 1 | import onmt.io 2 | import onmt.translate 3 | import onmt.Models 4 | import onmt.Loss 5 | from onmt.Trainer import Trainer, Statistics 6 | from onmt.Optim import Optim 7 | 8 | # For flake8 compatibility 9 | __all__ = [onmt.Loss, onmt.Models, 10 | Trainer, Optim, Statistics, onmt.io, onmt.translate] 11 | -------------------------------------------------------------------------------- /G_pretrain/onmt/__init__.py: -------------------------------------------------------------------------------- 1 | import onmt.io 2 | import onmt.translate 3 | import onmt.Models 4 | import onmt.Loss 5 | from onmt.Trainer import Trainer, Statistics 6 | from onmt.Optim import Optim 7 | 8 | # For flake8 compatibility 9 | __all__ = [onmt.Loss, onmt.Models, 10 | Trainer, Optim, Statistics, onmt.io, onmt.translate] 11 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/__init__.py: -------------------------------------------------------------------------------- 1 | import onmt.io 2 | import onmt.translate 3 | import onmt.Models 4 | import onmt.Loss 5 | from onmt.Trainer import Trainer, Statistics 6 | from onmt.Optim import Optim 7 | 8 | # For flake8 compatibility 9 | __all__ = [onmt.Loss, onmt.Models, 10 | Trainer, Optim, Statistics, onmt.io, onmt.translate] 11 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/__init__.py: -------------------------------------------------------------------------------- 1 | import onmt.io 2 | import onmt.translate 3 | import onmt.Models 4 | import onmt.Loss 5 | from onmt.Trainer import Trainer, Statistics 6 | from onmt.Optim import Optim 7 | 8 | # For flake8 compatibility 9 | __all__ = [onmt.Loss, onmt.Models, 10 | Trainer, Optim, Statistics, onmt.io, onmt.translate] 11 | -------------------------------------------------------------------------------- /D_pretrain/onmt/io/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.io.IO import collect_feature_vocabs, make_features, \ 2 | collect_features, get_num_features, \ 3 | load_fields_from_vocab, get_fields, \ 4 | save_fields_to_vocab, build_dataset, \ 5 | build_vocab, merge_vocabs, OrderedIterator 6 | from onmt.io.DatasetBase import ONMTDatasetBase, PAD_WORD, BOS_WORD, \ 7 | EOS_WORD, UNK 8 | from onmt.io.TextDataset import TextDataset, ShardedTextCorpusIterator 9 | from onmt.io.ImageDataset import ImageDataset 10 | from onmt.io.AudioDataset import AudioDataset 11 | 12 | 13 | __all__ = [PAD_WORD, BOS_WORD, EOS_WORD, UNK, ONMTDatasetBase, 14 | collect_feature_vocabs, make_features, 15 | collect_features, get_num_features, 16 | load_fields_from_vocab, get_fields, 17 | save_fields_to_vocab, build_dataset, 18 | build_vocab, merge_vocabs, OrderedIterator, 19 | TextDataset, ImageDataset, AudioDataset, 20 | ShardedTextCorpusIterator] 21 | -------------------------------------------------------------------------------- /G_pretrain/onmt/io/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.io.IO import collect_feature_vocabs, make_features, \ 2 | collect_features, get_num_features, \ 3 | load_fields_from_vocab, get_fields, \ 4 | save_fields_to_vocab, build_dataset, \ 5 | build_vocab, merge_vocabs, OrderedIterator 6 | from 
onmt.io.DatasetBase import ONMTDatasetBase, PAD_WORD, BOS_WORD, \ 7 | EOS_WORD, UNK 8 | from onmt.io.TextDataset import TextDataset, ShardedTextCorpusIterator 9 | from onmt.io.ImageDataset import ImageDataset 10 | from onmt.io.AudioDataset import AudioDataset 11 | 12 | 13 | __all__ = [PAD_WORD, BOS_WORD, EOS_WORD, UNK, ONMTDatasetBase, 14 | collect_feature_vocabs, make_features, 15 | collect_features, get_num_features, 16 | load_fields_from_vocab, get_fields, 17 | save_fields_to_vocab, build_dataset, 18 | build_vocab, merge_vocabs, OrderedIterator, 19 | TextDataset, ImageDataset, AudioDataset, 20 | ShardedTextCorpusIterator] 21 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/io/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.io.IO import collect_feature_vocabs, make_features, \ 2 | collect_features, get_num_features, \ 3 | load_fields_from_vocab, get_fields, \ 4 | save_fields_to_vocab, build_dataset, \ 5 | build_vocab, merge_vocabs, OrderedIterator 6 | from onmt.io.DatasetBase import ONMTDatasetBase, PAD_WORD, BOS_WORD, \ 7 | EOS_WORD, UNK 8 | from onmt.io.TextDataset import TextDataset, ShardedTextCorpusIterator 9 | from onmt.io.ImageDataset import ImageDataset 10 | from onmt.io.AudioDataset import AudioDataset 11 | 12 | 13 | __all__ = [PAD_WORD, BOS_WORD, EOS_WORD, UNK, ONMTDatasetBase, 14 | collect_feature_vocabs, make_features, 15 | collect_features, get_num_features, 16 | load_fields_from_vocab, get_fields, 17 | save_fields_to_vocab, build_dataset, 18 | build_vocab, merge_vocabs, OrderedIterator, 19 | TextDataset, ImageDataset, AudioDataset, 20 | ShardedTextCorpusIterator] 21 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/io/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.io.IO import collect_feature_vocabs, make_features, \ 2 | collect_features, get_num_features, \ 3 | load_fields_from_vocab, get_fields, \ 4 | save_fields_to_vocab, build_dataset, \ 5 | build_vocab, merge_vocabs, OrderedIterator 6 | from onmt.io.DatasetBase import ONMTDatasetBase, PAD_WORD, BOS_WORD, \ 7 | EOS_WORD, UNK 8 | from onmt.io.TextDataset import TextDataset, ShardedTextCorpusIterator 9 | from onmt.io.ImageDataset import ImageDataset 10 | from onmt.io.AudioDataset import AudioDataset 11 | 12 | 13 | __all__ = [PAD_WORD, BOS_WORD, EOS_WORD, UNK, ONMTDatasetBase, 14 | collect_feature_vocabs, make_features, 15 | collect_features, get_num_features, 16 | load_fields_from_vocab, get_fields, 17 | save_fields_to_vocab, build_dataset, 18 | build_vocab, merge_vocabs, OrderedIterator, 19 | TextDataset, ImageDataset, AudioDataset, 20 | ShardedTextCorpusIterator] 21 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 OpenNMT 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this 
permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /D_pretrain/onmt/modules/StructuredAttention.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import torch.cuda 4 | from torch.autograd import Variable 5 | 6 | 7 | class MatrixTree(nn.Module): 8 | """Implementation of the matrix-tree theorem for computing marginals 9 | of non-projective dependency parsing. This attention layer is used 10 | in the paper "Learning Structured Text Representations." 11 | 12 | 13 | :cite:`DBLP:journals/corr/LiuL17d` 14 | """ 15 | def __init__(self, eps=1e-5): 16 | self.eps = eps 17 | super(MatrixTree, self).__init__() 18 | 19 | def forward(self, input): 20 | laplacian = input.exp() + self.eps 21 | output = input.clone() 22 | for b in range(input.size(0)): 23 | lap = laplacian[b].masked_fill( 24 | Variable(torch.eye(input.size(1)).cuda().ne(0)), 0) 25 | lap = -lap + torch.diag(lap.sum(0)) 26 | # store roots on diagonal 27 | lap[0] = input[b].diag().exp() 28 | inv_laplacian = lap.inverse() 29 | 30 | factor = inv_laplacian.diag().unsqueeze(1)\ 31 | .expand_as(input[b]).transpose(0, 1) 32 | term1 = input[b].exp().mul(factor).clone() 33 | term2 = input[b].exp().mul(inv_laplacian.transpose(0, 1)).clone() 34 | term1[:, 0] = 0 35 | term2[0] = 0 36 | output[b] = term1 - term2 37 | roots_output = input[b].diag().exp().mul( 38 | inv_laplacian.transpose(0, 1)[0]) 39 | output[b] = output[b] + torch.diag(roots_output) 40 | return output 41 | 42 | 43 | if __name__ == "__main__": 44 | dtree = MatrixTree() 45 | q = torch.rand(1, 5, 5).cuda() 46 | marg = dtree.forward(Variable(q)) 47 | print(marg.sum(1)) 48 | -------------------------------------------------------------------------------- /G_pretrain/onmt/modules/StructuredAttention.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import torch.cuda 4 | from torch.autograd import Variable 5 | 6 | 7 | class MatrixTree(nn.Module): 8 | """Implementation of the matrix-tree theorem for computing marginals 9 | of non-projective dependency parsing. This attention layer is used 10 | in the paper "Learning Structured Text Representations." 
11 | 12 | 13 | :cite:`DBLP:journals/corr/LiuL17d` 14 | """ 15 | def __init__(self, eps=1e-5): 16 | self.eps = eps 17 | super(MatrixTree, self).__init__() 18 | 19 | def forward(self, input): 20 | laplacian = input.exp() + self.eps 21 | output = input.clone() 22 | for b in range(input.size(0)): 23 | lap = laplacian[b].masked_fill( 24 | Variable(torch.eye(input.size(1)).cuda().ne(0)), 0) 25 | lap = -lap + torch.diag(lap.sum(0)) 26 | # store roots on diagonal 27 | lap[0] = input[b].diag().exp() 28 | inv_laplacian = lap.inverse() 29 | 30 | factor = inv_laplacian.diag().unsqueeze(1)\ 31 | .expand_as(input[b]).transpose(0, 1) 32 | term1 = input[b].exp().mul(factor).clone() 33 | term2 = input[b].exp().mul(inv_laplacian.transpose(0, 1)).clone() 34 | term1[:, 0] = 0 35 | term2[0] = 0 36 | output[b] = term1 - term2 37 | roots_output = input[b].diag().exp().mul( 38 | inv_laplacian.transpose(0, 1)[0]) 39 | output[b] = output[b] + torch.diag(roots_output) 40 | return output 41 | 42 | 43 | if __name__ == "__main__": 44 | dtree = MatrixTree() 45 | q = torch.rand(1, 5, 5).cuda() 46 | marg = dtree.forward(Variable(q)) 47 | print(marg.sum(1)) 48 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/modules/StructuredAttention.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import torch.cuda 4 | from torch.autograd import Variable 5 | 6 | 7 | class MatrixTree(nn.Module): 8 | """Implementation of the matrix-tree theorem for computing marginals 9 | of non-projective dependency parsing. This attention layer is used 10 | in the paper "Learning Structured Text Representations." 11 | 12 | 13 | :cite:`DBLP:journals/corr/LiuL17d` 14 | """ 15 | def __init__(self, eps=1e-5): 16 | self.eps = eps 17 | super(MatrixTree, self).__init__() 18 | 19 | def forward(self, input): 20 | laplacian = input.exp() + self.eps 21 | output = input.clone() 22 | for b in range(input.size(0)): 23 | lap = laplacian[b].masked_fill( 24 | Variable(torch.eye(input.size(1)).cuda().ne(0)), 0) 25 | lap = -lap + torch.diag(lap.sum(0)) 26 | # store roots on diagonal 27 | lap[0] = input[b].diag().exp() 28 | inv_laplacian = lap.inverse() 29 | 30 | factor = inv_laplacian.diag().unsqueeze(1)\ 31 | .expand_as(input[b]).transpose(0, 1) 32 | term1 = input[b].exp().mul(factor).clone() 33 | term2 = input[b].exp().mul(inv_laplacian.transpose(0, 1)).clone() 34 | term1[:, 0] = 0 35 | term2[0] = 0 36 | output[b] = term1 - term2 37 | roots_output = input[b].diag().exp().mul( 38 | inv_laplacian.transpose(0, 1)[0]) 39 | output[b] = output[b] + torch.diag(roots_output) 40 | return output 41 | 42 | 43 | if __name__ == "__main__": 44 | dtree = MatrixTree() 45 | q = torch.rand(1, 5, 5).cuda() 46 | marg = dtree.forward(Variable(q)) 47 | print(marg.sum(1)) 48 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/modules/StructuredAttention.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import torch.cuda 4 | from torch.autograd import Variable 5 | 6 | 7 | class MatrixTree(nn.Module): 8 | """Implementation of the matrix-tree theorem for computing marginals 9 | of non-projective dependency parsing. This attention layer is used 10 | in the paper "Learning Structured Text Representations." 
11 | 12 | 13 | :cite:`DBLP:journals/corr/LiuL17d` 14 | """ 15 | def __init__(self, eps=1e-5): 16 | self.eps = eps 17 | super(MatrixTree, self).__init__() 18 | 19 | def forward(self, input): 20 | laplacian = input.exp() + self.eps 21 | output = input.clone() 22 | for b in range(input.size(0)): 23 | lap = laplacian[b].masked_fill( 24 | Variable(torch.eye(input.size(1)).cuda().ne(0)), 0) 25 | lap = -lap + torch.diag(lap.sum(0)) 26 | # store roots on diagonal 27 | lap[0] = input[b].diag().exp() 28 | inv_laplacian = lap.inverse() 29 | 30 | factor = inv_laplacian.diag().unsqueeze(1)\ 31 | .expand_as(input[b]).transpose(0, 1) 32 | term1 = input[b].exp().mul(factor).clone() 33 | term2 = input[b].exp().mul(inv_laplacian.transpose(0, 1)).clone() 34 | term1[:, 0] = 0 35 | term2[0] = 0 36 | output[b] = term1 - term2 37 | roots_output = input[b].diag().exp().mul( 38 | inv_laplacian.transpose(0, 1)[0]) 39 | output[b] = output[b] + torch.diag(roots_output) 40 | return output 41 | 42 | 43 | if __name__ == "__main__": 44 | dtree = MatrixTree() 45 | q = torch.rand(1, 5, 5).cuda() 46 | marg = dtree.forward(Variable(q)) 47 | print(marg.sum(1)) 48 | -------------------------------------------------------------------------------- /D_pretrain/onmt/modules/StackedRNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class StackedLSTM(nn.Module): 6 | """ 7 | Our own implementation of stacked LSTM. 8 | Needed for the decoder, because we do input feeding. 9 | """ 10 | def __init__(self, num_layers, input_size, rnn_size, dropout): 11 | super(StackedLSTM, self).__init__() 12 | self.dropout = nn.Dropout(dropout) 13 | self.num_layers = num_layers 14 | self.layers = nn.ModuleList() 15 | 16 | for i in range(num_layers): 17 | self.layers.append(nn.LSTMCell(input_size, rnn_size)) 18 | input_size = rnn_size 19 | 20 | def forward(self, input, hidden): 21 | h_0, c_0 = hidden 22 | h_1, c_1 = [], [] 23 | for i, layer in enumerate(self.layers): 24 | h_1_i, c_1_i = layer(input, (h_0[i], c_0[i])) 25 | input = h_1_i 26 | if i + 1 != self.num_layers: 27 | input = self.dropout(input) 28 | h_1 += [h_1_i] 29 | c_1 += [c_1_i] 30 | 31 | h_1 = torch.stack(h_1) 32 | c_1 = torch.stack(c_1) 33 | 34 | return input, (h_1, c_1) 35 | 36 | 37 | class StackedGRU(nn.Module): 38 | 39 | def __init__(self, num_layers, input_size, rnn_size, dropout): 40 | super(StackedGRU, self).__init__() 41 | self.dropout = nn.Dropout(dropout) 42 | self.num_layers = num_layers 43 | self.layers = nn.ModuleList() 44 | 45 | for i in range(num_layers): 46 | self.layers.append(nn.GRUCell(input_size, rnn_size)) 47 | input_size = rnn_size 48 | 49 | def forward(self, input, hidden): 50 | h_1 = [] 51 | for i, layer in enumerate(self.layers): 52 | h_1_i = layer(input, hidden[0][i]) 53 | input = h_1_i 54 | if i + 1 != self.num_layers: 55 | input = self.dropout(input) 56 | h_1 += [h_1_i] 57 | 58 | h_1 = torch.stack(h_1) 59 | return input, (h_1,) 60 | -------------------------------------------------------------------------------- /G_pretrain/onmt/modules/StackedRNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class StackedLSTM(nn.Module): 6 | """ 7 | Our own implementation of stacked LSTM. 8 | Needed for the decoder, because we do input feeding. 
9 | """ 10 | def __init__(self, num_layers, input_size, rnn_size, dropout): 11 | super(StackedLSTM, self).__init__() 12 | self.dropout = nn.Dropout(dropout) 13 | self.num_layers = num_layers 14 | self.layers = nn.ModuleList() 15 | 16 | for i in range(num_layers): 17 | self.layers.append(nn.LSTMCell(input_size, rnn_size)) 18 | input_size = rnn_size 19 | 20 | def forward(self, input, hidden): 21 | h_0, c_0 = hidden 22 | h_1, c_1 = [], [] 23 | for i, layer in enumerate(self.layers): 24 | h_1_i, c_1_i = layer(input, (h_0[i], c_0[i])) 25 | input = h_1_i 26 | if i + 1 != self.num_layers: 27 | input = self.dropout(input) 28 | h_1 += [h_1_i] 29 | c_1 += [c_1_i] 30 | 31 | h_1 = torch.stack(h_1) 32 | c_1 = torch.stack(c_1) 33 | 34 | return input, (h_1, c_1) 35 | 36 | 37 | class StackedGRU(nn.Module): 38 | 39 | def __init__(self, num_layers, input_size, rnn_size, dropout): 40 | super(StackedGRU, self).__init__() 41 | self.dropout = nn.Dropout(dropout) 42 | self.num_layers = num_layers 43 | self.layers = nn.ModuleList() 44 | 45 | for i in range(num_layers): 46 | self.layers.append(nn.GRUCell(input_size, rnn_size)) 47 | input_size = rnn_size 48 | 49 | def forward(self, input, hidden): 50 | h_1 = [] 51 | for i, layer in enumerate(self.layers): 52 | h_1_i = layer(input, hidden[0][i]) 53 | input = h_1_i 54 | if i + 1 != self.num_layers: 55 | input = self.dropout(input) 56 | h_1 += [h_1_i] 57 | 58 | h_1 = torch.stack(h_1) 59 | return input, (h_1,) 60 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/modules/StackedRNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class StackedLSTM(nn.Module): 6 | """ 7 | Our own implementation of stacked LSTM. 8 | Needed for the decoder, because we do input feeding. 
9 | """ 10 | def __init__(self, num_layers, input_size, rnn_size, dropout): 11 | super(StackedLSTM, self).__init__() 12 | self.dropout = nn.Dropout(dropout) 13 | self.num_layers = num_layers 14 | self.layers = nn.ModuleList() 15 | 16 | for i in range(num_layers): 17 | self.layers.append(nn.LSTMCell(input_size, rnn_size)) 18 | input_size = rnn_size 19 | 20 | def forward(self, input, hidden): 21 | h_0, c_0 = hidden 22 | h_1, c_1 = [], [] 23 | for i, layer in enumerate(self.layers): 24 | h_1_i, c_1_i = layer(input, (h_0[i], c_0[i])) 25 | input = h_1_i 26 | if i + 1 != self.num_layers: 27 | input = self.dropout(input) 28 | h_1 += [h_1_i] 29 | c_1 += [c_1_i] 30 | 31 | h_1 = torch.stack(h_1) 32 | c_1 = torch.stack(c_1) 33 | 34 | return input, (h_1, c_1) 35 | 36 | 37 | class StackedGRU(nn.Module): 38 | 39 | def __init__(self, num_layers, input_size, rnn_size, dropout): 40 | super(StackedGRU, self).__init__() 41 | self.dropout = nn.Dropout(dropout) 42 | self.num_layers = num_layers 43 | self.layers = nn.ModuleList() 44 | 45 | for i in range(num_layers): 46 | self.layers.append(nn.GRUCell(input_size, rnn_size)) 47 | input_size = rnn_size 48 | 49 | def forward(self, input, hidden): 50 | h_1 = [] 51 | for i, layer in enumerate(self.layers): 52 | h_1_i = layer(input, hidden[0][i]) 53 | input = h_1_i 54 | if i + 1 != self.num_layers: 55 | input = self.dropout(input) 56 | h_1 += [h_1_i] 57 | 58 | h_1 = torch.stack(h_1) 59 | return input, (h_1,) 60 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/modules/StackedRNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class StackedLSTM(nn.Module): 6 | """ 7 | Our own implementation of stacked LSTM. 8 | Needed for the decoder, because we do input feeding. 
9 | """ 10 | def __init__(self, num_layers, input_size, rnn_size, dropout): 11 | super(StackedLSTM, self).__init__() 12 | self.dropout = nn.Dropout(dropout) 13 | self.num_layers = num_layers 14 | self.layers = nn.ModuleList() 15 | 16 | for i in range(num_layers): 17 | self.layers.append(nn.LSTMCell(input_size, rnn_size)) 18 | input_size = rnn_size 19 | 20 | def forward(self, input, hidden): 21 | h_0, c_0 = hidden 22 | h_1, c_1 = [], [] 23 | for i, layer in enumerate(self.layers): 24 | h_1_i, c_1_i = layer(input, (h_0[i], c_0[i])) 25 | input = h_1_i 26 | if i + 1 != self.num_layers: 27 | input = self.dropout(input) 28 | h_1 += [h_1_i] 29 | c_1 += [c_1_i] 30 | 31 | h_1 = torch.stack(h_1) 32 | c_1 = torch.stack(c_1) 33 | 34 | return input, (h_1, c_1) 35 | 36 | 37 | class StackedGRU(nn.Module): 38 | 39 | def __init__(self, num_layers, input_size, rnn_size, dropout): 40 | super(StackedGRU, self).__init__() 41 | self.dropout = nn.Dropout(dropout) 42 | self.num_layers = num_layers 43 | self.layers = nn.ModuleList() 44 | 45 | for i in range(num_layers): 46 | self.layers.append(nn.GRUCell(input_size, rnn_size)) 47 | input_size = rnn_size 48 | 49 | def forward(self, input, hidden): 50 | h_1 = [] 51 | for i, layer in enumerate(self.layers): 52 | h_1_i = layer(input, hidden[0][i]) 53 | input = h_1_i 54 | if i + 1 != self.num_layers: 55 | input = self.dropout(input) 56 | h_1 += [h_1_i] 57 | 58 | h_1 = torch.stack(h_1) 59 | return input, (h_1,) 60 | -------------------------------------------------------------------------------- /D_pretrain/onmt/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.modules.UtilClass import LayerNorm, Bottle, BottleLinear, \ 2 | BottleLayerNorm, BottleSoftmax, Elementwise 3 | from onmt.modules.Gate import context_gate_factory, ContextGate 4 | from onmt.modules.GlobalAttention import GlobalAttention 5 | from onmt.modules.ConvMultiStepAttention import ConvMultiStepAttention 6 | from onmt.modules.ImageEncoder import ImageEncoder 7 | from onmt.modules.AudioEncoder import AudioEncoder 8 | from onmt.modules.CopyGenerator import CopyGenerator, CopyGeneratorLossCompute 9 | from onmt.modules.StructuredAttention import MatrixTree 10 | from onmt.modules.Transformer import \ 11 | TransformerEncoder, TransformerDecoder, PositionwiseFeedForward 12 | from onmt.modules.Conv2Conv import CNNEncoder, CNNDecoder 13 | from onmt.modules.MultiHeadedAttn import MultiHeadedAttention 14 | from onmt.modules.StackedRNN import StackedLSTM, StackedGRU 15 | from onmt.modules.Embeddings import Embeddings, PositionalEncoding 16 | from onmt.modules.WeightNorm import WeightNormConv2d 17 | from onmt.modules.Distriminitor import Disc, NLI 18 | 19 | from onmt.Models import EncoderBase, MeanEncoder, StdRNNDecoder, \ 20 | RNNDecoderBase, InputFeedRNNDecoder, RNNEncoder, NMTModel 21 | 22 | from onmt.modules.SRU import check_sru_requirement 23 | can_use_sru = check_sru_requirement() 24 | if can_use_sru: 25 | from onmt.modules.SRU import SRU 26 | 27 | 28 | # For flake8 compatibility. 
29 | __all__ = [EncoderBase, MeanEncoder, RNNDecoderBase, InputFeedRNNDecoder, 30 | RNNEncoder, NMTModel, 31 | StdRNNDecoder, ContextGate, GlobalAttention, ImageEncoder, 32 | PositionwiseFeedForward, PositionalEncoding, 33 | CopyGenerator, MultiHeadedAttention, 34 | LayerNorm, Bottle, BottleLinear, BottleLayerNorm, BottleSoftmax, 35 | TransformerEncoder, TransformerDecoder, Embeddings, Elementwise, 36 | MatrixTree, WeightNormConv2d, ConvMultiStepAttention, 37 | CNNEncoder, CNNDecoder, StackedLSTM, StackedGRU, 38 | context_gate_factory, CopyGeneratorLossCompute, AudioEncoder, 39 | Disc, NLI] 40 | 41 | if can_use_sru: 42 | __all__.extend([SRU, check_sru_requirement]) 43 | -------------------------------------------------------------------------------- /G_pretrain/onmt/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.modules.UtilClass import LayerNorm, Bottle, BottleLinear, \ 2 | BottleLayerNorm, BottleSoftmax, Elementwise 3 | from onmt.modules.Gate import context_gate_factory, ContextGate 4 | from onmt.modules.GlobalAttention import GlobalAttention 5 | from onmt.modules.ConvMultiStepAttention import ConvMultiStepAttention 6 | from onmt.modules.ImageEncoder import ImageEncoder 7 | from onmt.modules.AudioEncoder import AudioEncoder 8 | from onmt.modules.CopyGenerator import CopyGenerator, CopyGeneratorLossCompute 9 | from onmt.modules.StructuredAttention import MatrixTree 10 | from onmt.modules.Transformer import \ 11 | TransformerEncoder, TransformerDecoder, PositionwiseFeedForward 12 | from onmt.modules.Conv2Conv import CNNEncoder, CNNDecoder 13 | from onmt.modules.MultiHeadedAttn import MultiHeadedAttention 14 | from onmt.modules.StackedRNN import StackedLSTM, StackedGRU 15 | from onmt.modules.Embeddings import Embeddings, PositionalEncoding 16 | from onmt.modules.WeightNorm import WeightNormConv2d 17 | from onmt.modules.Distriminitor import Disc, NLI 18 | 19 | from onmt.Models import EncoderBase, MeanEncoder, StdRNNDecoder, \ 20 | RNNDecoderBase, InputFeedRNNDecoder, RNNEncoder, NMTModel 21 | 22 | from onmt.modules.SRU import check_sru_requirement 23 | can_use_sru = check_sru_requirement() 24 | if can_use_sru: 25 | from onmt.modules.SRU import SRU 26 | 27 | 28 | # For flake8 compatibility. 
29 | __all__ = [EncoderBase, MeanEncoder, RNNDecoderBase, InputFeedRNNDecoder, 30 | RNNEncoder, NMTModel, 31 | StdRNNDecoder, ContextGate, GlobalAttention, ImageEncoder, 32 | PositionwiseFeedForward, PositionalEncoding, 33 | CopyGenerator, MultiHeadedAttention, 34 | LayerNorm, Bottle, BottleLinear, BottleLayerNorm, BottleSoftmax, 35 | TransformerEncoder, TransformerDecoder, Embeddings, Elementwise, 36 | MatrixTree, WeightNormConv2d, ConvMultiStepAttention, 37 | CNNEncoder, CNNDecoder, StackedLSTM, StackedGRU, 38 | context_gate_factory, CopyGeneratorLossCompute, AudioEncoder, 39 | Disc, NLI] 40 | 41 | if can_use_sru: 42 | __all__.extend([SRU, check_sru_requirement]) 43 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.modules.UtilClass import LayerNorm, Bottle, BottleLinear, \ 2 | BottleLayerNorm, BottleSoftmax, Elementwise 3 | from onmt.modules.Gate import context_gate_factory, ContextGate 4 | from onmt.modules.GlobalAttention import GlobalAttention 5 | from onmt.modules.ConvMultiStepAttention import ConvMultiStepAttention 6 | from onmt.modules.ImageEncoder import ImageEncoder 7 | from onmt.modules.AudioEncoder import AudioEncoder 8 | from onmt.modules.CopyGenerator import CopyGenerator, CopyGeneratorLossCompute 9 | from onmt.modules.StructuredAttention import MatrixTree 10 | from onmt.modules.Transformer import \ 11 | TransformerEncoder, TransformerDecoder, PositionwiseFeedForward 12 | from onmt.modules.Conv2Conv import CNNEncoder, CNNDecoder 13 | from onmt.modules.MultiHeadedAttn import MultiHeadedAttention 14 | from onmt.modules.StackedRNN import StackedLSTM, StackedGRU 15 | from onmt.modules.Embeddings import Embeddings, PositionalEncoding 16 | from onmt.modules.WeightNorm import WeightNormConv2d 17 | from onmt.modules.Distriminitor import Disc, NLI 18 | 19 | from onmt.Models import EncoderBase, MeanEncoder, StdRNNDecoder, \ 20 | RNNDecoderBase, InputFeedRNNDecoder, RNNEncoder, NMTModel 21 | 22 | from onmt.modules.SRU import check_sru_requirement 23 | can_use_sru = check_sru_requirement() 24 | if can_use_sru: 25 | from onmt.modules.SRU import SRU 26 | 27 | 28 | # For flake8 compatibility. 
29 | __all__ = [EncoderBase, MeanEncoder, RNNDecoderBase, InputFeedRNNDecoder, 30 | RNNEncoder, NMTModel, 31 | StdRNNDecoder, ContextGate, GlobalAttention, ImageEncoder, 32 | PositionwiseFeedForward, PositionalEncoding, 33 | CopyGenerator, MultiHeadedAttention, 34 | LayerNorm, Bottle, BottleLinear, BottleLayerNorm, BottleSoftmax, 35 | TransformerEncoder, TransformerDecoder, Embeddings, Elementwise, 36 | MatrixTree, WeightNormConv2d, ConvMultiStepAttention, 37 | CNNEncoder, CNNDecoder, StackedLSTM, StackedGRU, 38 | context_gate_factory, CopyGeneratorLossCompute, AudioEncoder, 39 | Disc, NLI] 40 | 41 | if can_use_sru: 42 | __all__.extend([SRU, check_sru_requirement]) 43 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from onmt.modules.UtilClass import LayerNorm, Bottle, BottleLinear, \ 2 | BottleLayerNorm, BottleSoftmax, Elementwise 3 | from onmt.modules.Gate import context_gate_factory, ContextGate 4 | from onmt.modules.GlobalAttention import GlobalAttention 5 | from onmt.modules.ConvMultiStepAttention import ConvMultiStepAttention 6 | from onmt.modules.ImageEncoder import ImageEncoder 7 | from onmt.modules.AudioEncoder import AudioEncoder 8 | from onmt.modules.CopyGenerator import CopyGenerator, CopyGeneratorLossCompute 9 | from onmt.modules.StructuredAttention import MatrixTree 10 | from onmt.modules.Transformer import \ 11 | TransformerEncoder, TransformerDecoder, PositionwiseFeedForward 12 | from onmt.modules.Conv2Conv import CNNEncoder, CNNDecoder 13 | from onmt.modules.MultiHeadedAttn import MultiHeadedAttention 14 | from onmt.modules.StackedRNN import StackedLSTM, StackedGRU 15 | from onmt.modules.Embeddings import Embeddings, PositionalEncoding 16 | from onmt.modules.WeightNorm import WeightNormConv2d 17 | from onmt.modules.Distriminitor import Disc, NLI 18 | 19 | from onmt.Models import EncoderBase, MeanEncoder, StdRNNDecoder, \ 20 | RNNDecoderBase, InputFeedRNNDecoder, RNNEncoder, NMTModel 21 | 22 | from onmt.modules.SRU import check_sru_requirement 23 | can_use_sru = check_sru_requirement() 24 | if can_use_sru: 25 | from onmt.modules.SRU import SRU 26 | 27 | 28 | # For flake8 compatibility. 
29 | __all__ = [EncoderBase, MeanEncoder, RNNDecoderBase, InputFeedRNNDecoder, 30 | RNNEncoder, NMTModel, 31 | StdRNNDecoder, ContextGate, GlobalAttention, ImageEncoder, 32 | PositionwiseFeedForward, PositionalEncoding, 33 | CopyGenerator, MultiHeadedAttention, 34 | LayerNorm, Bottle, BottleLinear, BottleLayerNorm, BottleSoftmax, 35 | TransformerEncoder, TransformerDecoder, Embeddings, Elementwise, 36 | MatrixTree, WeightNormConv2d, ConvMultiStepAttention, 37 | CNNEncoder, CNNDecoder, StackedLSTM, StackedGRU, 38 | context_gate_factory, CopyGeneratorLossCompute, AudioEncoder, 39 | Disc, NLI] 40 | 41 | if can_use_sru: 42 | __all__.extend([SRU, check_sru_requirement]) 43 | -------------------------------------------------------------------------------- /D_pretrain/onmt/Utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | 4 | 5 | def aeq(*args): 6 | """ 7 | Assert all arguments have the same value 8 | """ 9 | arguments = (arg for arg in args) 10 | first = next(arguments) 11 | assert all(arg == first for arg in arguments), \ 12 | "Not all arguments have the same value: " + str(args) 13 | 14 | 15 | def sequence_mask(lengths, max_len=None): 16 | """ 17 | Creates a boolean mask from sequence lengths. 18 | """ 19 | batch_size = lengths.numel() 20 | max_len = max_len or lengths.max() 21 | return (torch.arange(0, max_len) 22 | .type_as(lengths) 23 | .repeat(batch_size, 1) 24 | .lt(lengths.unsqueeze(1))) 25 | 26 | 27 | def use_gpu(opt): 28 | return (hasattr(opt, 'gpuid') and len(opt.gpuid) > 0) or \ 29 | (hasattr(opt, 'gpu') and opt.gpu > -1) 30 | 31 | 32 | def formalize(batch, batch_length, batch_first=False): 33 | """formalize a batch to sort the batch according to its length 34 | 35 | Args: 36 | batch: batch 37 | batch_length: batch length list 38 | Returns: 39 | formalized batch 40 | """ 41 | sorted_lengths, _ = torch.sort(batch_length, descending=True) 42 | batch_length = batch_length.view(-1).tolist() 43 | index_length = [(i, l) for i, l in enumerate(batch_length)] 44 | ordered_index = sorted(index_length, key=lambda e: e[1], reverse=True) 45 | 46 | origin_new = dict([(v[0], k) for k, v in enumerate(ordered_index)]) 47 | 48 | sorted_batch = Variable(batch.data.new(batch.size())) 49 | for k, v in origin_new.items(): 50 | if batch_first: 51 | sorted_batch[v] = batch[k] 52 | else: 53 | sorted_batch[:, v] = batch[:, k] 54 | return sorted_batch, sorted_lengths, origin_new 55 | 56 | 57 | def deformalize(batch, origin_new): 58 | """reform batch in the origin order, batch is the second dimension. 
59 | 60 | Args: 61 | batch: encoded batch, length*batch_size*dim 62 | origin_new: origin->new index dict 63 | Returns: 64 | reformed batch 65 | """ 66 | desorted_batch = Variable(batch.data.new(batch.size())) 67 | for k, v in origin_new.items(): 68 | desorted_batch[:, k] = batch[:, v] 69 | return desorted_batch -------------------------------------------------------------------------------- /G_pretrain/onmt/Utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | 4 | 5 | def aeq(*args): 6 | """ 7 | Assert all arguments have the same value 8 | """ 9 | arguments = (arg for arg in args) 10 | first = next(arguments) 11 | assert all(arg == first for arg in arguments), \ 12 | "Not all arguments have the same value: " + str(args) 13 | 14 | 15 | def sequence_mask(lengths, max_len=None): 16 | """ 17 | Creates a boolean mask from sequence lengths. 18 | """ 19 | batch_size = lengths.numel() 20 | max_len = max_len or lengths.max() 21 | return (torch.arange(0, max_len) 22 | .type_as(lengths) 23 | .repeat(batch_size, 1) 24 | .lt(lengths.unsqueeze(1))) 25 | 26 | 27 | def use_gpu(opt): 28 | return (hasattr(opt, 'gpuid') and len(opt.gpuid) > 0) or \ 29 | (hasattr(opt, 'gpu') and opt.gpu > -1) 30 | 31 | 32 | def formalize(batch, batch_length, batch_first=False): 33 | """formalize a batch to sort the batch according to its length 34 | 35 | Args: 36 | batch: batch 37 | batch_length: batch length list 38 | Returns: 39 | formalized batch 40 | """ 41 | sorted_lengths, _ = torch.sort(batch_length, descending=True) 42 | batch_length = batch_length.view(-1).tolist() 43 | index_length = [(i, l) for i, l in enumerate(batch_length)] 44 | ordered_index = sorted(index_length, key=lambda e: e[1], reverse=True) 45 | 46 | origin_new = dict([(v[0], k) for k, v in enumerate(ordered_index)]) 47 | 48 | sorted_batch = Variable(batch.data.new(batch.size())) 49 | for k, v in origin_new.items(): 50 | if batch_first: 51 | sorted_batch[v] = batch[k] 52 | else: 53 | sorted_batch[:, v] = batch[:, k] 54 | return sorted_batch, sorted_lengths, origin_new 55 | 56 | 57 | def deformalize(batch, origin_new): 58 | """reform batch in the origin order, batch is the second dimension. 59 | 60 | Args: 61 | batch: encoded batch, length*batch_size*dim 62 | origin_new: origin->new index dict 63 | Returns: 64 | reformed batch 65 | """ 66 | desorted_batch = Variable(batch.data.new(batch.size())) 67 | for k, v in origin_new.items(): 68 | desorted_batch[:, k] = batch[:, v] 69 | return desorted_batch -------------------------------------------------------------------------------- /NLI_pretrain/onmt/Utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | 4 | 5 | def aeq(*args): 6 | """ 7 | Assert all arguments have the same value 8 | """ 9 | arguments = (arg for arg in args) 10 | first = next(arguments) 11 | assert all(arg == first for arg in arguments), \ 12 | "Not all arguments have the same value: " + str(args) 13 | 14 | 15 | def sequence_mask(lengths, max_len=None): 16 | """ 17 | Creates a boolean mask from sequence lengths. 
18 | """ 19 | batch_size = lengths.numel() 20 | max_len = max_len or lengths.max() 21 | return (torch.arange(0, max_len) 22 | .type_as(lengths) 23 | .repeat(batch_size, 1) 24 | .lt(lengths.unsqueeze(1))) 25 | 26 | 27 | def use_gpu(opt): 28 | return (hasattr(opt, 'gpuid') and len(opt.gpuid) > 0) or \ 29 | (hasattr(opt, 'gpu') and opt.gpu > -1) 30 | 31 | 32 | def formalize(batch, batch_length, batch_first=False): 33 | """formalize a batch to sort the batch according to its length 34 | 35 | Args: 36 | batch: batch 37 | batch_length: batch length list 38 | Returns: 39 | formalized batch 40 | """ 41 | sorted_lengths, _ = torch.sort(batch_length, descending=True) 42 | batch_length = batch_length.view(-1).tolist() 43 | index_length = [(i, l) for i, l in enumerate(batch_length)] 44 | ordered_index = sorted(index_length, key=lambda e: e[1], reverse=True) 45 | 46 | origin_new = dict([(v[0], k) for k, v in enumerate(ordered_index)]) 47 | 48 | sorted_batch = Variable(batch.data.new(batch.size())) 49 | for k, v in origin_new.items(): 50 | if batch_first: 51 | sorted_batch[v] = batch[k] 52 | else: 53 | sorted_batch[:, v] = batch[:, k] 54 | return sorted_batch, sorted_lengths, origin_new 55 | 56 | 57 | def deformalize(batch, origin_new): 58 | """reform batch in the origin order, batch is the second dimension. 59 | 60 | Args: 61 | batch: encoded batch, length*batch_size*dim 62 | origin_new: origin->new index dict 63 | Returns: 64 | reformed batch 65 | """ 66 | desorted_batch = Variable(batch.data.new(batch.size())) 67 | for k, v in origin_new.items(): 68 | desorted_batch[:, k] = batch[:, v] 69 | return desorted_batch -------------------------------------------------------------------------------- /reinforcement_train/onmt/Utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | 4 | 5 | def aeq(*args): 6 | """ 7 | Assert all arguments have the same value 8 | """ 9 | arguments = (arg for arg in args) 10 | first = next(arguments) 11 | assert all(arg == first for arg in arguments), \ 12 | "Not all arguments have the same value: " + str(args) 13 | 14 | 15 | def sequence_mask(lengths, max_len=None): 16 | """ 17 | Creates a boolean mask from sequence lengths. 
18 | """ 19 | batch_size = lengths.numel() 20 | max_len = max_len or lengths.max() 21 | return (torch.arange(0, max_len) 22 | .type_as(lengths) 23 | .repeat(batch_size, 1) 24 | .lt(lengths.unsqueeze(1))) 25 | 26 | 27 | def use_gpu(opt): 28 | return (hasattr(opt, 'gpuid') and len(opt.gpuid) > 0) or \ 29 | (hasattr(opt, 'gpu') and opt.gpu > -1) 30 | 31 | 32 | def formalize(batch, batch_length, batch_first=False): 33 | """formalize a batch to sort the batch according to its length 34 | 35 | Args: 36 | batch: batch 37 | batch_length: batch length list 38 | Returns: 39 | formalized batch 40 | """ 41 | sorted_lengths, _ = torch.sort(batch_length, descending=True) 42 | batch_length = batch_length.view(-1).tolist() 43 | index_length = [(i, l) for i, l in enumerate(batch_length)] 44 | ordered_index = sorted(index_length, key=lambda e: e[1], reverse=True) 45 | 46 | origin_new = dict([(v[0], k) for k, v in enumerate(ordered_index)]) 47 | 48 | sorted_batch = Variable(batch.data.new(batch.size())) 49 | for k, v in origin_new.items(): 50 | if batch_first: 51 | sorted_batch[v] = batch[k] 52 | else: 53 | sorted_batch[:, v] = batch[:, k] 54 | return sorted_batch, sorted_lengths, origin_new 55 | 56 | 57 | def deformalize(batch, origin_new): 58 | """reform batch in the origin order, batch is the second dimension. 59 | 60 | Args: 61 | batch: encoded batch, length*batch_size*dim 62 | origin_new: origin->new index dict 63 | Returns: 64 | reformed batch 65 | """ 66 | desorted_batch = Variable(batch.data.new(batch.size())) 67 | for k, v in origin_new.items(): 68 | desorted_batch[:, k] = batch[:, v] 69 | return desorted_batch -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /D_pretrain/onmt/modules/AudioEncoder.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class AudioEncoder(nn.Module): 7 | """ 8 | A simple encoder convolutional -> recurrent neural network for 9 | audio input. 10 | 11 | Args: 12 | num_layers (int): number of encoder layers. 13 | bidirectional (bool): bidirectional encoder. 14 | rnn_size (int): size of hidden states of the rnn. 15 | dropout (float): dropout probablity. 16 | sample_rate (float): input spec 17 | window_size (int): input spec 18 | 19 | """ 20 | def __init__(self, num_layers, bidirectional, rnn_size, dropout, 21 | sample_rate, window_size): 22 | super(AudioEncoder, self).__init__() 23 | self.num_layers = num_layers 24 | self.num_directions = 2 if bidirectional else 1 25 | self.hidden_size = rnn_size 26 | 27 | self.layer1 = nn.Conv2d(1, 32, kernel_size=(41, 11), 28 | padding=(0, 10), stride=(2, 2)) 29 | self.batch_norm1 = nn.BatchNorm2d(32) 30 | self.layer2 = nn.Conv2d(32, 32, kernel_size=(21, 11), 31 | padding=(0, 0), stride=(2, 1)) 32 | self.batch_norm2 = nn.BatchNorm2d(32) 33 | 34 | input_size = int(math.floor((sample_rate * window_size) / 2) + 1) 35 | input_size = int(math.floor(input_size - 41) / 2 + 1) 36 | input_size = int(math.floor(input_size - 21) / 2 + 1) 37 | input_size *= 32 38 | self.rnn = nn.LSTM(input_size, rnn_size, 39 | num_layers=num_layers, 40 | dropout=dropout, 41 | bidirectional=bidirectional) 42 | 43 | def load_pretrained_vectors(self, opt): 44 | # Pass in needed options only when modify function definition. 
45 | pass 46 | 47 | def forward(self, input, lengths=None): 48 | "See :obj:`onmt.modules.EncoderBase.forward()`" 49 | # (batch_size, 1, nfft, t) 50 | # layer 1 51 | input = self.batch_norm1(self.layer1(input[:, :, :, :])) 52 | 53 | # (batch_size, 32, nfft/2, t/2) 54 | input = F.hardtanh(input, 0, 20, inplace=True) 55 | 56 | # (batch_size, 32, nfft/2/2, t/2) 57 | # layer 2 58 | input = self.batch_norm2(self.layer2(input)) 59 | 60 | # (batch_size, 32, nfft/2/2, t/2) 61 | input = F.hardtanh(input, 0, 20, inplace=True) 62 | 63 | batch_size = input.size(0) 64 | length = input.size(3) 65 | input = input.view(batch_size, -1, length) 66 | input = input.transpose(0, 2).transpose(1, 2) 67 | 68 | output, hidden = self.rnn(input) 69 | 70 | return hidden, output 71 | -------------------------------------------------------------------------------- /G_pretrain/onmt/modules/AudioEncoder.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class AudioEncoder(nn.Module): 7 | """ 8 | A simple encoder convolutional -> recurrent neural network for 9 | audio input. 10 | 11 | Args: 12 | num_layers (int): number of encoder layers. 13 | bidirectional (bool): bidirectional encoder. 14 | rnn_size (int): size of hidden states of the rnn. 15 | dropout (float): dropout probablity. 16 | sample_rate (float): input spec 17 | window_size (int): input spec 18 | 19 | """ 20 | def __init__(self, num_layers, bidirectional, rnn_size, dropout, 21 | sample_rate, window_size): 22 | super(AudioEncoder, self).__init__() 23 | self.num_layers = num_layers 24 | self.num_directions = 2 if bidirectional else 1 25 | self.hidden_size = rnn_size 26 | 27 | self.layer1 = nn.Conv2d(1, 32, kernel_size=(41, 11), 28 | padding=(0, 10), stride=(2, 2)) 29 | self.batch_norm1 = nn.BatchNorm2d(32) 30 | self.layer2 = nn.Conv2d(32, 32, kernel_size=(21, 11), 31 | padding=(0, 0), stride=(2, 1)) 32 | self.batch_norm2 = nn.BatchNorm2d(32) 33 | 34 | input_size = int(math.floor((sample_rate * window_size) / 2) + 1) 35 | input_size = int(math.floor(input_size - 41) / 2 + 1) 36 | input_size = int(math.floor(input_size - 21) / 2 + 1) 37 | input_size *= 32 38 | self.rnn = nn.LSTM(input_size, rnn_size, 39 | num_layers=num_layers, 40 | dropout=dropout, 41 | bidirectional=bidirectional) 42 | 43 | def load_pretrained_vectors(self, opt): 44 | # Pass in needed options only when modify function definition. 
45 | pass 46 | 47 | def forward(self, input, lengths=None): 48 | "See :obj:`onmt.modules.EncoderBase.forward()`" 49 | # (batch_size, 1, nfft, t) 50 | # layer 1 51 | input = self.batch_norm1(self.layer1(input[:, :, :, :])) 52 | 53 | # (batch_size, 32, nfft/2, t/2) 54 | input = F.hardtanh(input, 0, 20, inplace=True) 55 | 56 | # (batch_size, 32, nfft/2/2, t/2) 57 | # layer 2 58 | input = self.batch_norm2(self.layer2(input)) 59 | 60 | # (batch_size, 32, nfft/2/2, t/2) 61 | input = F.hardtanh(input, 0, 20, inplace=True) 62 | 63 | batch_size = input.size(0) 64 | length = input.size(3) 65 | input = input.view(batch_size, -1, length) 66 | input = input.transpose(0, 2).transpose(1, 2) 67 | 68 | output, hidden = self.rnn(input) 69 | 70 | return hidden, output 71 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/modules/AudioEncoder.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class AudioEncoder(nn.Module): 7 | """ 8 | A simple encoder convolutional -> recurrent neural network for 9 | audio input. 10 | 11 | Args: 12 | num_layers (int): number of encoder layers. 13 | bidirectional (bool): bidirectional encoder. 14 | rnn_size (int): size of hidden states of the rnn. 15 | dropout (float): dropout probablity. 16 | sample_rate (float): input spec 17 | window_size (int): input spec 18 | 19 | """ 20 | def __init__(self, num_layers, bidirectional, rnn_size, dropout, 21 | sample_rate, window_size): 22 | super(AudioEncoder, self).__init__() 23 | self.num_layers = num_layers 24 | self.num_directions = 2 if bidirectional else 1 25 | self.hidden_size = rnn_size 26 | 27 | self.layer1 = nn.Conv2d(1, 32, kernel_size=(41, 11), 28 | padding=(0, 10), stride=(2, 2)) 29 | self.batch_norm1 = nn.BatchNorm2d(32) 30 | self.layer2 = nn.Conv2d(32, 32, kernel_size=(21, 11), 31 | padding=(0, 0), stride=(2, 1)) 32 | self.batch_norm2 = nn.BatchNorm2d(32) 33 | 34 | input_size = int(math.floor((sample_rate * window_size) / 2) + 1) 35 | input_size = int(math.floor(input_size - 41) / 2 + 1) 36 | input_size = int(math.floor(input_size - 21) / 2 + 1) 37 | input_size *= 32 38 | self.rnn = nn.LSTM(input_size, rnn_size, 39 | num_layers=num_layers, 40 | dropout=dropout, 41 | bidirectional=bidirectional) 42 | 43 | def load_pretrained_vectors(self, opt): 44 | # Pass in needed options only when modify function definition. 
45 | pass 46 | 47 | def forward(self, input, lengths=None): 48 | "See :obj:`onmt.modules.EncoderBase.forward()`" 49 | # (batch_size, 1, nfft, t) 50 | # layer 1 51 | input = self.batch_norm1(self.layer1(input[:, :, :, :])) 52 | 53 | # (batch_size, 32, nfft/2, t/2) 54 | input = F.hardtanh(input, 0, 20, inplace=True) 55 | 56 | # (batch_size, 32, nfft/2/2, t/2) 57 | # layer 2 58 | input = self.batch_norm2(self.layer2(input)) 59 | 60 | # (batch_size, 32, nfft/2/2, t/2) 61 | input = F.hardtanh(input, 0, 20, inplace=True) 62 | 63 | batch_size = input.size(0) 64 | length = input.size(3) 65 | input = input.view(batch_size, -1, length) 66 | input = input.transpose(0, 2).transpose(1, 2) 67 | 68 | output, hidden = self.rnn(input) 69 | 70 | return hidden, output 71 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/modules/AudioEncoder.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class AudioEncoder(nn.Module): 7 | """ 8 | A simple encoder convolutional -> recurrent neural network for 9 | audio input. 10 | 11 | Args: 12 | num_layers (int): number of encoder layers. 13 | bidirectional (bool): bidirectional encoder. 14 | rnn_size (int): size of hidden states of the rnn. 15 | dropout (float): dropout probablity. 16 | sample_rate (float): input spec 17 | window_size (int): input spec 18 | 19 | """ 20 | def __init__(self, num_layers, bidirectional, rnn_size, dropout, 21 | sample_rate, window_size): 22 | super(AudioEncoder, self).__init__() 23 | self.num_layers = num_layers 24 | self.num_directions = 2 if bidirectional else 1 25 | self.hidden_size = rnn_size 26 | 27 | self.layer1 = nn.Conv2d(1, 32, kernel_size=(41, 11), 28 | padding=(0, 10), stride=(2, 2)) 29 | self.batch_norm1 = nn.BatchNorm2d(32) 30 | self.layer2 = nn.Conv2d(32, 32, kernel_size=(21, 11), 31 | padding=(0, 0), stride=(2, 1)) 32 | self.batch_norm2 = nn.BatchNorm2d(32) 33 | 34 | input_size = int(math.floor((sample_rate * window_size) / 2) + 1) 35 | input_size = int(math.floor(input_size - 41) / 2 + 1) 36 | input_size = int(math.floor(input_size - 21) / 2 + 1) 37 | input_size *= 32 38 | self.rnn = nn.LSTM(input_size, rnn_size, 39 | num_layers=num_layers, 40 | dropout=dropout, 41 | bidirectional=bidirectional) 42 | 43 | def load_pretrained_vectors(self, opt): 44 | # Pass in needed options only when modify function definition. 
45 | pass 46 | 47 | def forward(self, input, lengths=None): 48 | "See :obj:`onmt.modules.EncoderBase.forward()`" 49 | # (batch_size, 1, nfft, t) 50 | # layer 1 51 | input = self.batch_norm1(self.layer1(input[:, :, :, :])) 52 | 53 | # (batch_size, 32, nfft/2, t/2) 54 | input = F.hardtanh(input, 0, 20, inplace=True) 55 | 56 | # (batch_size, 32, nfft/2/2, t/2) 57 | # layer 2 58 | input = self.batch_norm2(self.layer2(input)) 59 | 60 | # (batch_size, 32, nfft/2/2, t/2) 61 | input = F.hardtanh(input, 0, 20, inplace=True) 62 | 63 | batch_size = input.size(0) 64 | length = input.size(3) 65 | input = input.view(batch_size, -1, length) 66 | input = input.transpose(0, 2).transpose(1, 2) 67 | 68 | output, hidden = self.rnn(input) 69 | 70 | return hidden, output 71 | -------------------------------------------------------------------------------- /D_pretrain/onmt/modules/ConvMultiStepAttention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from onmt.Utils import aeq 5 | 6 | 7 | SCALE_WEIGHT = 0.5 ** 0.5 8 | 9 | 10 | def seq_linear(linear, x): 11 | # linear transform for 3-d tensor 12 | batch, hidden_size, length, _ = x.size() 13 | h = linear(torch.transpose(x, 1, 2).contiguous().view( 14 | batch * length, hidden_size)) 15 | return torch.transpose(h.view(batch, length, hidden_size, 1), 1, 2) 16 | 17 | 18 | class ConvMultiStepAttention(nn.Module): 19 | """ 20 | 21 | Conv attention takes a key matrix, a value matrix and a query vector. 22 | Attention weight is calculated by key matrix with the query vector 23 | and sum on the value matrix. And the same operation is applied 24 | in each decode conv layer. 25 | 26 | """ 27 | 28 | def __init__(self, input_size): 29 | super(ConvMultiStepAttention, self).__init__() 30 | self.linear_in = nn.Linear(input_size, input_size) 31 | self.mask = None 32 | 33 | def apply_mask(self, mask): 34 | self.mask = mask 35 | 36 | def forward(self, base_target_emb, input, encoder_out_top, 37 | encoder_out_combine): 38 | """ 39 | Args: 40 | base_target_emb: target emb tensor 41 | input: output of decode conv 42 | encoder_out_t: the key matrix for calculation of attetion weight, 43 | which is the top output of encode conv 44 | encoder_out_combine: 45 | the value matrix for the attention-weighted sum, 46 | which is the combination of base emb and top output of encode 47 | 48 | """ 49 | # checks 50 | batch, channel, height, width = base_target_emb.size() 51 | batch_, channel_, height_, width_ = input.size() 52 | aeq(batch, batch_) 53 | aeq(height, height_) 54 | 55 | enc_batch, enc_channel, enc_height = encoder_out_top.size() 56 | enc_batch_, enc_channel_, enc_height_ = encoder_out_combine.size() 57 | 58 | aeq(enc_batch, enc_batch_) 59 | aeq(enc_height, enc_height_) 60 | 61 | preatt = seq_linear(self.linear_in, input) 62 | target = (base_target_emb + preatt) * SCALE_WEIGHT 63 | target = torch.squeeze(target, 3) 64 | target = torch.transpose(target, 1, 2) 65 | pre_attn = torch.bmm(target, encoder_out_top) 66 | 67 | if self.mask is not None: 68 | pre_attn.data.masked_fill_(self.mask, -float('inf')) 69 | 70 | pre_attn = pre_attn.transpose(0, 2) 71 | attn = F.softmax(pre_attn) 72 | attn = attn.transpose(0, 2).contiguous() 73 | context_output = torch.bmm( 74 | attn, torch.transpose(encoder_out_combine, 1, 2)) 75 | context_output = torch.transpose( 76 | torch.unsqueeze(context_output, 3), 1, 2) 77 | return context_output, attn 78 | 
-------------------------------------------------------------------------------- /G_pretrain/onmt/modules/ConvMultiStepAttention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from onmt.Utils import aeq 5 | 6 | 7 | SCALE_WEIGHT = 0.5 ** 0.5 8 | 9 | 10 | def seq_linear(linear, x): 11 | # linear transform for 3-d tensor 12 | batch, hidden_size, length, _ = x.size() 13 | h = linear(torch.transpose(x, 1, 2).contiguous().view( 14 | batch * length, hidden_size)) 15 | return torch.transpose(h.view(batch, length, hidden_size, 1), 1, 2) 16 | 17 | 18 | class ConvMultiStepAttention(nn.Module): 19 | """ 20 | 21 | Conv attention takes a key matrix, a value matrix and a query vector. 22 | Attention weight is calculated by key matrix with the query vector 23 | and sum on the value matrix. And the same operation is applied 24 | in each decode conv layer. 25 | 26 | """ 27 | 28 | def __init__(self, input_size): 29 | super(ConvMultiStepAttention, self).__init__() 30 | self.linear_in = nn.Linear(input_size, input_size) 31 | self.mask = None 32 | 33 | def apply_mask(self, mask): 34 | self.mask = mask 35 | 36 | def forward(self, base_target_emb, input, encoder_out_top, 37 | encoder_out_combine): 38 | """ 39 | Args: 40 | base_target_emb: target emb tensor 41 | input: output of decode conv 42 | encoder_out_t: the key matrix for calculation of attetion weight, 43 | which is the top output of encode conv 44 | encoder_out_combine: 45 | the value matrix for the attention-weighted sum, 46 | which is the combination of base emb and top output of encode 47 | 48 | """ 49 | # checks 50 | batch, channel, height, width = base_target_emb.size() 51 | batch_, channel_, height_, width_ = input.size() 52 | aeq(batch, batch_) 53 | aeq(height, height_) 54 | 55 | enc_batch, enc_channel, enc_height = encoder_out_top.size() 56 | enc_batch_, enc_channel_, enc_height_ = encoder_out_combine.size() 57 | 58 | aeq(enc_batch, enc_batch_) 59 | aeq(enc_height, enc_height_) 60 | 61 | preatt = seq_linear(self.linear_in, input) 62 | target = (base_target_emb + preatt) * SCALE_WEIGHT 63 | target = torch.squeeze(target, 3) 64 | target = torch.transpose(target, 1, 2) 65 | pre_attn = torch.bmm(target, encoder_out_top) 66 | 67 | if self.mask is not None: 68 | pre_attn.data.masked_fill_(self.mask, -float('inf')) 69 | 70 | pre_attn = pre_attn.transpose(0, 2) 71 | attn = F.softmax(pre_attn) 72 | attn = attn.transpose(0, 2).contiguous() 73 | context_output = torch.bmm( 74 | attn, torch.transpose(encoder_out_combine, 1, 2)) 75 | context_output = torch.transpose( 76 | torch.unsqueeze(context_output, 3), 1, 2) 77 | return context_output, attn 78 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/modules/ConvMultiStepAttention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from onmt.Utils import aeq 5 | 6 | 7 | SCALE_WEIGHT = 0.5 ** 0.5 8 | 9 | 10 | def seq_linear(linear, x): 11 | # linear transform for 3-d tensor 12 | batch, hidden_size, length, _ = x.size() 13 | h = linear(torch.transpose(x, 1, 2).contiguous().view( 14 | batch * length, hidden_size)) 15 | return torch.transpose(h.view(batch, length, hidden_size, 1), 1, 2) 16 | 17 | 18 | class ConvMultiStepAttention(nn.Module): 19 | """ 20 | 21 | Conv attention takes a key matrix, a value matrix and a query vector. 
22 | Attention weight is calculated by key matrix with the query vector 23 | and sum on the value matrix. And the same operation is applied 24 | in each decode conv layer. 25 | 26 | """ 27 | 28 | def __init__(self, input_size): 29 | super(ConvMultiStepAttention, self).__init__() 30 | self.linear_in = nn.Linear(input_size, input_size) 31 | self.mask = None 32 | 33 | def apply_mask(self, mask): 34 | self.mask = mask 35 | 36 | def forward(self, base_target_emb, input, encoder_out_top, 37 | encoder_out_combine): 38 | """ 39 | Args: 40 | base_target_emb: target emb tensor 41 | input: output of decode conv 42 | encoder_out_t: the key matrix for calculation of attetion weight, 43 | which is the top output of encode conv 44 | encoder_out_combine: 45 | the value matrix for the attention-weighted sum, 46 | which is the combination of base emb and top output of encode 47 | 48 | """ 49 | # checks 50 | batch, channel, height, width = base_target_emb.size() 51 | batch_, channel_, height_, width_ = input.size() 52 | aeq(batch, batch_) 53 | aeq(height, height_) 54 | 55 | enc_batch, enc_channel, enc_height = encoder_out_top.size() 56 | enc_batch_, enc_channel_, enc_height_ = encoder_out_combine.size() 57 | 58 | aeq(enc_batch, enc_batch_) 59 | aeq(enc_height, enc_height_) 60 | 61 | preatt = seq_linear(self.linear_in, input) 62 | target = (base_target_emb + preatt) * SCALE_WEIGHT 63 | target = torch.squeeze(target, 3) 64 | target = torch.transpose(target, 1, 2) 65 | pre_attn = torch.bmm(target, encoder_out_top) 66 | 67 | if self.mask is not None: 68 | pre_attn.data.masked_fill_(self.mask, -float('inf')) 69 | 70 | pre_attn = pre_attn.transpose(0, 2) 71 | attn = F.softmax(pre_attn) 72 | attn = attn.transpose(0, 2).contiguous() 73 | context_output = torch.bmm( 74 | attn, torch.transpose(encoder_out_combine, 1, 2)) 75 | context_output = torch.transpose( 76 | torch.unsqueeze(context_output, 3), 1, 2) 77 | return context_output, attn 78 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/modules/ConvMultiStepAttention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from onmt.Utils import aeq 5 | 6 | 7 | SCALE_WEIGHT = 0.5 ** 0.5 8 | 9 | 10 | def seq_linear(linear, x): 11 | # linear transform for 3-d tensor 12 | batch, hidden_size, length, _ = x.size() 13 | h = linear(torch.transpose(x, 1, 2).contiguous().view( 14 | batch * length, hidden_size)) 15 | return torch.transpose(h.view(batch, length, hidden_size, 1), 1, 2) 16 | 17 | 18 | class ConvMultiStepAttention(nn.Module): 19 | """ 20 | 21 | Conv attention takes a key matrix, a value matrix and a query vector. 22 | Attention weight is calculated by key matrix with the query vector 23 | and sum on the value matrix. And the same operation is applied 24 | in each decode conv layer. 
25 | 26 | """ 27 | 28 | def __init__(self, input_size): 29 | super(ConvMultiStepAttention, self).__init__() 30 | self.linear_in = nn.Linear(input_size, input_size) 31 | self.mask = None 32 | 33 | def apply_mask(self, mask): 34 | self.mask = mask 35 | 36 | def forward(self, base_target_emb, input, encoder_out_top, 37 | encoder_out_combine): 38 | """ 39 | Args: 40 | base_target_emb: target emb tensor 41 | input: output of decode conv 42 | encoder_out_t: the key matrix for calculation of attetion weight, 43 | which is the top output of encode conv 44 | encoder_out_combine: 45 | the value matrix for the attention-weighted sum, 46 | which is the combination of base emb and top output of encode 47 | 48 | """ 49 | # checks 50 | batch, channel, height, width = base_target_emb.size() 51 | batch_, channel_, height_, width_ = input.size() 52 | aeq(batch, batch_) 53 | aeq(height, height_) 54 | 55 | enc_batch, enc_channel, enc_height = encoder_out_top.size() 56 | enc_batch_, enc_channel_, enc_height_ = encoder_out_combine.size() 57 | 58 | aeq(enc_batch, enc_batch_) 59 | aeq(enc_height, enc_height_) 60 | 61 | preatt = seq_linear(self.linear_in, input) 62 | target = (base_target_emb + preatt) * SCALE_WEIGHT 63 | target = torch.squeeze(target, 3) 64 | target = torch.transpose(target, 1, 2) 65 | pre_attn = torch.bmm(target, encoder_out_top) 66 | 67 | if self.mask is not None: 68 | pre_attn.data.masked_fill_(self.mask, -float('inf')) 69 | 70 | pre_attn = pre_attn.transpose(0, 2) 71 | attn = F.softmax(pre_attn) 72 | attn = attn.transpose(0, 2).contiguous() 73 | context_output = torch.bmm( 74 | attn, torch.transpose(encoder_out_combine, 1, 2)) 75 | context_output = torch.transpose( 76 | torch.unsqueeze(context_output, 3), 1, 2) 77 | return context_output, attn 78 | -------------------------------------------------------------------------------- /D_pretrain/onmt/modules/UtilClass.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class Bottle(nn.Module): 6 | def forward(self, input): 7 | if len(input.size()) <= 2: 8 | return super(Bottle, self).forward(input) 9 | size = input.size()[:2] 10 | out = super(Bottle, self).forward(input.view(size[0]*size[1], -1)) 11 | return out.contiguous().view(size[0], size[1], -1) 12 | 13 | 14 | class Bottle2(nn.Module): 15 | def forward(self, input): 16 | if len(input.size()) <= 3: 17 | return super(Bottle2, self).forward(input) 18 | size = input.size() 19 | out = super(Bottle2, self).forward(input.view(size[0]*size[1], 20 | size[2], size[3])) 21 | return out.contiguous().view(size[0], size[1], size[2], size[3]) 22 | 23 | 24 | class LayerNorm(nn.Module): 25 | ''' Layer normalization module ''' 26 | 27 | def __init__(self, d_hid, eps=1e-3): 28 | super(LayerNorm, self).__init__() 29 | 30 | self.eps = eps 31 | self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True) 32 | self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True) 33 | 34 | def forward(self, z): 35 | if z.size(1) == 1: 36 | return z 37 | mu = torch.mean(z, dim=1) 38 | sigma = torch.std(z, dim=1) 39 | # HACK. 
PyTorch is changing behavior 40 | if mu.dim() == 1: 41 | mu = mu.unsqueeze(1) 42 | sigma = sigma.unsqueeze(1) 43 | ln_out = (z - mu.expand_as(z)) / (sigma.expand_as(z) + self.eps) 44 | ln_out = ln_out.mul(self.a_2.expand_as(ln_out)) \ 45 | + self.b_2.expand_as(ln_out) 46 | return ln_out 47 | 48 | 49 | class BottleLinear(Bottle, nn.Linear): 50 | pass 51 | 52 | 53 | class BottleLayerNorm(Bottle, LayerNorm): 54 | pass 55 | 56 | 57 | class BottleSoftmax(Bottle, nn.Softmax): 58 | pass 59 | 60 | 61 | class Elementwise(nn.ModuleList): 62 | """ 63 | A simple network container. 64 | Parameters are a list of modules. 65 | Inputs are a 3d Variable whose last dimension is the same length 66 | as the list. 67 | Outputs are the result of applying modules to inputs elementwise. 68 | An optional merge parameter allows the outputs to be reduced to a 69 | single Variable. 70 | """ 71 | 72 | def __init__(self, merge=None, *args): 73 | assert merge in [None, 'first', 'concat', 'sum', 'mlp'] 74 | self.merge = merge 75 | super(Elementwise, self).__init__(*args) 76 | 77 | def forward(self, input): 78 | inputs = [feat.squeeze(2) for feat in input.split(1, dim=2)] 79 | assert len(self) == len(inputs) 80 | outputs = [f(x) for f, x in zip(self, inputs)] 81 | if self.merge == 'first': 82 | return outputs[0] 83 | elif self.merge == 'concat' or self.merge == 'mlp': 84 | return torch.cat(outputs, 2) 85 | elif self.merge == 'sum': 86 | return sum(outputs) 87 | else: 88 | return outputs 89 | -------------------------------------------------------------------------------- /G_pretrain/onmt/modules/UtilClass.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class Bottle(nn.Module): 6 | def forward(self, input): 7 | if len(input.size()) <= 2: 8 | return super(Bottle, self).forward(input) 9 | size = input.size()[:2] 10 | out = super(Bottle, self).forward(input.view(size[0]*size[1], -1)) 11 | return out.contiguous().view(size[0], size[1], -1) 12 | 13 | 14 | class Bottle2(nn.Module): 15 | def forward(self, input): 16 | if len(input.size()) <= 3: 17 | return super(Bottle2, self).forward(input) 18 | size = input.size() 19 | out = super(Bottle2, self).forward(input.view(size[0]*size[1], 20 | size[2], size[3])) 21 | return out.contiguous().view(size[0], size[1], size[2], size[3]) 22 | 23 | 24 | class LayerNorm(nn.Module): 25 | ''' Layer normalization module ''' 26 | 27 | def __init__(self, d_hid, eps=1e-3): 28 | super(LayerNorm, self).__init__() 29 | 30 | self.eps = eps 31 | self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True) 32 | self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True) 33 | 34 | def forward(self, z): 35 | if z.size(1) == 1: 36 | return z 37 | mu = torch.mean(z, dim=1) 38 | sigma = torch.std(z, dim=1) 39 | # HACK. PyTorch is changing behavior 40 | if mu.dim() == 1: 41 | mu = mu.unsqueeze(1) 42 | sigma = sigma.unsqueeze(1) 43 | ln_out = (z - mu.expand_as(z)) / (sigma.expand_as(z) + self.eps) 44 | ln_out = ln_out.mul(self.a_2.expand_as(ln_out)) \ 45 | + self.b_2.expand_as(ln_out) 46 | return ln_out 47 | 48 | 49 | class BottleLinear(Bottle, nn.Linear): 50 | pass 51 | 52 | 53 | class BottleLayerNorm(Bottle, LayerNorm): 54 | pass 55 | 56 | 57 | class BottleSoftmax(Bottle, nn.Softmax): 58 | pass 59 | 60 | 61 | class Elementwise(nn.ModuleList): 62 | """ 63 | A simple network container. 64 | Parameters are a list of modules. 65 | Inputs are a 3d Variable whose last dimension is the same length 66 | as the list. 
67 | Outputs are the result of applying modules to inputs elementwise. 68 | An optional merge parameter allows the outputs to be reduced to a 69 | single Variable. 70 | """ 71 | 72 | def __init__(self, merge=None, *args): 73 | assert merge in [None, 'first', 'concat', 'sum', 'mlp'] 74 | self.merge = merge 75 | super(Elementwise, self).__init__(*args) 76 | 77 | def forward(self, input): 78 | inputs = [feat.squeeze(2) for feat in input.split(1, dim=2)] 79 | assert len(self) == len(inputs) 80 | outputs = [f(x) for f, x in zip(self, inputs)] 81 | if self.merge == 'first': 82 | return outputs[0] 83 | elif self.merge == 'concat' or self.merge == 'mlp': 84 | return torch.cat(outputs, 2) 85 | elif self.merge == 'sum': 86 | return sum(outputs) 87 | else: 88 | return outputs 89 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/modules/UtilClass.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class Bottle(nn.Module): 6 | def forward(self, input): 7 | if len(input.size()) <= 2: 8 | return super(Bottle, self).forward(input) 9 | size = input.size()[:2] 10 | out = super(Bottle, self).forward(input.view(size[0]*size[1], -1)) 11 | return out.contiguous().view(size[0], size[1], -1) 12 | 13 | 14 | class Bottle2(nn.Module): 15 | def forward(self, input): 16 | if len(input.size()) <= 3: 17 | return super(Bottle2, self).forward(input) 18 | size = input.size() 19 | out = super(Bottle2, self).forward(input.view(size[0]*size[1], 20 | size[2], size[3])) 21 | return out.contiguous().view(size[0], size[1], size[2], size[3]) 22 | 23 | 24 | class LayerNorm(nn.Module): 25 | ''' Layer normalization module ''' 26 | 27 | def __init__(self, d_hid, eps=1e-3): 28 | super(LayerNorm, self).__init__() 29 | 30 | self.eps = eps 31 | self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True) 32 | self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True) 33 | 34 | def forward(self, z): 35 | if z.size(1) == 1: 36 | return z 37 | mu = torch.mean(z, dim=1) 38 | sigma = torch.std(z, dim=1) 39 | # HACK. PyTorch is changing behavior 40 | if mu.dim() == 1: 41 | mu = mu.unsqueeze(1) 42 | sigma = sigma.unsqueeze(1) 43 | ln_out = (z - mu.expand_as(z)) / (sigma.expand_as(z) + self.eps) 44 | ln_out = ln_out.mul(self.a_2.expand_as(ln_out)) \ 45 | + self.b_2.expand_as(ln_out) 46 | return ln_out 47 | 48 | 49 | class BottleLinear(Bottle, nn.Linear): 50 | pass 51 | 52 | 53 | class BottleLayerNorm(Bottle, LayerNorm): 54 | pass 55 | 56 | 57 | class BottleSoftmax(Bottle, nn.Softmax): 58 | pass 59 | 60 | 61 | class Elementwise(nn.ModuleList): 62 | """ 63 | A simple network container. 64 | Parameters are a list of modules. 65 | Inputs are a 3d Variable whose last dimension is the same length 66 | as the list. 67 | Outputs are the result of applying modules to inputs elementwise. 68 | An optional merge parameter allows the outputs to be reduced to a 69 | single Variable. 
70 | """ 71 | 72 | def __init__(self, merge=None, *args): 73 | assert merge in [None, 'first', 'concat', 'sum', 'mlp'] 74 | self.merge = merge 75 | super(Elementwise, self).__init__(*args) 76 | 77 | def forward(self, input): 78 | inputs = [feat.squeeze(2) for feat in input.split(1, dim=2)] 79 | assert len(self) == len(inputs) 80 | outputs = [f(x) for f, x in zip(self, inputs)] 81 | if self.merge == 'first': 82 | return outputs[0] 83 | elif self.merge == 'concat' or self.merge == 'mlp': 84 | return torch.cat(outputs, 2) 85 | elif self.merge == 'sum': 86 | return sum(outputs) 87 | else: 88 | return outputs 89 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/modules/UtilClass.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class Bottle(nn.Module): 6 | def forward(self, input): 7 | if len(input.size()) <= 2: 8 | return super(Bottle, self).forward(input) 9 | size = input.size()[:2] 10 | out = super(Bottle, self).forward(input.view(size[0]*size[1], -1)) 11 | return out.contiguous().view(size[0], size[1], -1) 12 | 13 | 14 | class Bottle2(nn.Module): 15 | def forward(self, input): 16 | if len(input.size()) <= 3: 17 | return super(Bottle2, self).forward(input) 18 | size = input.size() 19 | out = super(Bottle2, self).forward(input.view(size[0]*size[1], 20 | size[2], size[3])) 21 | return out.contiguous().view(size[0], size[1], size[2], size[3]) 22 | 23 | 24 | class LayerNorm(nn.Module): 25 | ''' Layer normalization module ''' 26 | 27 | def __init__(self, d_hid, eps=1e-3): 28 | super(LayerNorm, self).__init__() 29 | 30 | self.eps = eps 31 | self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True) 32 | self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True) 33 | 34 | def forward(self, z): 35 | if z.size(1) == 1: 36 | return z 37 | mu = torch.mean(z, dim=1) 38 | sigma = torch.std(z, dim=1) 39 | # HACK. PyTorch is changing behavior 40 | if mu.dim() == 1: 41 | mu = mu.unsqueeze(1) 42 | sigma = sigma.unsqueeze(1) 43 | ln_out = (z - mu.expand_as(z)) / (sigma.expand_as(z) + self.eps) 44 | ln_out = ln_out.mul(self.a_2.expand_as(ln_out)) \ 45 | + self.b_2.expand_as(ln_out) 46 | return ln_out 47 | 48 | 49 | class BottleLinear(Bottle, nn.Linear): 50 | pass 51 | 52 | 53 | class BottleLayerNorm(Bottle, LayerNorm): 54 | pass 55 | 56 | 57 | class BottleSoftmax(Bottle, nn.Softmax): 58 | pass 59 | 60 | 61 | class Elementwise(nn.ModuleList): 62 | """ 63 | A simple network container. 64 | Parameters are a list of modules. 65 | Inputs are a 3d Variable whose last dimension is the same length 66 | as the list. 67 | Outputs are the result of applying modules to inputs elementwise. 68 | An optional merge parameter allows the outputs to be reduced to a 69 | single Variable. 
70 | """ 71 | 72 | def __init__(self, merge=None, *args): 73 | assert merge in [None, 'first', 'concat', 'sum', 'mlp'] 74 | self.merge = merge 75 | super(Elementwise, self).__init__(*args) 76 | 77 | def forward(self, input): 78 | inputs = [feat.squeeze(2) for feat in input.split(1, dim=2)] 79 | assert len(self) == len(inputs) 80 | outputs = [f(x) for f, x in zip(self, inputs)] 81 | if self.merge == 'first': 82 | return outputs[0] 83 | elif self.merge == 'concat' or self.merge == 'mlp': 84 | return torch.cat(outputs, 2) 85 | elif self.merge == 'sum': 86 | return sum(outputs) 87 | else: 88 | return outputs 89 | -------------------------------------------------------------------------------- /D_pretrain/onmt/modules/Gate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def context_gate_factory(type, embeddings_size, decoder_size, 6 | attention_size, output_size): 7 | """Returns the correct ContextGate class""" 8 | 9 | gate_types = {'source': SourceContextGate, 10 | 'target': TargetContextGate, 11 | 'both': BothContextGate} 12 | 13 | assert type in gate_types, "Not valid ContextGate type: {0}".format(type) 14 | return gate_types[type](embeddings_size, decoder_size, attention_size, 15 | output_size) 16 | 17 | 18 | class ContextGate(nn.Module): 19 | """ 20 | Context gate is a decoder module that takes as input the previous word 21 | embedding, the current decoder state and the attention state, and 22 | produces a gate. 23 | The gate can be used to select the input from the target side context 24 | (decoder state), from the source context (attention state) or both. 25 | """ 26 | def __init__(self, embeddings_size, decoder_size, 27 | attention_size, output_size): 28 | super(ContextGate, self).__init__() 29 | input_size = embeddings_size + decoder_size + attention_size 30 | self.gate = nn.Linear(input_size, output_size, bias=True) 31 | self.sig = nn.Sigmoid() 32 | self.source_proj = nn.Linear(attention_size, output_size) 33 | self.target_proj = nn.Linear(embeddings_size + decoder_size, 34 | output_size) 35 | 36 | def forward(self, prev_emb, dec_state, attn_state): 37 | input_tensor = torch.cat((prev_emb, dec_state, attn_state), dim=1) 38 | z = self.sig(self.gate(input_tensor)) 39 | proj_source = self.source_proj(attn_state) 40 | proj_target = self.target_proj( 41 | torch.cat((prev_emb, dec_state), dim=1)) 42 | return z, proj_source, proj_target 43 | 44 | 45 | class SourceContextGate(nn.Module): 46 | """Apply the context gate only to the source context""" 47 | 48 | def __init__(self, embeddings_size, decoder_size, 49 | attention_size, output_size): 50 | super(SourceContextGate, self).__init__() 51 | self.context_gate = ContextGate(embeddings_size, decoder_size, 52 | attention_size, output_size) 53 | self.tanh = nn.Tanh() 54 | 55 | def forward(self, prev_emb, dec_state, attn_state): 56 | z, source, target = self.context_gate( 57 | prev_emb, dec_state, attn_state) 58 | return self.tanh(target + z * source) 59 | 60 | 61 | class TargetContextGate(nn.Module): 62 | """Apply the context gate only to the target context""" 63 | 64 | def __init__(self, embeddings_size, decoder_size, 65 | attention_size, output_size): 66 | super(TargetContextGate, self).__init__() 67 | self.context_gate = ContextGate(embeddings_size, decoder_size, 68 | attention_size, output_size) 69 | self.tanh = nn.Tanh() 70 | 71 | def forward(self, prev_emb, dec_state, attn_state): 72 | z, source, target = self.context_gate(prev_emb, dec_state, 
attn_state) 73 | return self.tanh(z * target + source) 74 | 75 | 76 | class BothContextGate(nn.Module): 77 | """Apply the context gate to both contexts""" 78 | 79 | def __init__(self, embeddings_size, decoder_size, 80 | attention_size, output_size): 81 | super(BothContextGate, self).__init__() 82 | self.context_gate = ContextGate(embeddings_size, decoder_size, 83 | attention_size, output_size) 84 | self.tanh = nn.Tanh() 85 | 86 | def forward(self, prev_emb, dec_state, attn_state): 87 | z, source, target = self.context_gate(prev_emb, dec_state, attn_state) 88 | return self.tanh((1. - z) * target + z * source) 89 | -------------------------------------------------------------------------------- /G_pretrain/onmt/modules/Gate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def context_gate_factory(type, embeddings_size, decoder_size, 6 | attention_size, output_size): 7 | """Returns the correct ContextGate class""" 8 | 9 | gate_types = {'source': SourceContextGate, 10 | 'target': TargetContextGate, 11 | 'both': BothContextGate} 12 | 13 | assert type in gate_types, "Not valid ContextGate type: {0}".format(type) 14 | return gate_types[type](embeddings_size, decoder_size, attention_size, 15 | output_size) 16 | 17 | 18 | class ContextGate(nn.Module): 19 | """ 20 | Context gate is a decoder module that takes as input the previous word 21 | embedding, the current decoder state and the attention state, and 22 | produces a gate. 23 | The gate can be used to select the input from the target side context 24 | (decoder state), from the source context (attention state) or both. 25 | """ 26 | def __init__(self, embeddings_size, decoder_size, 27 | attention_size, output_size): 28 | super(ContextGate, self).__init__() 29 | input_size = embeddings_size + decoder_size + attention_size 30 | self.gate = nn.Linear(input_size, output_size, bias=True) 31 | self.sig = nn.Sigmoid() 32 | self.source_proj = nn.Linear(attention_size, output_size) 33 | self.target_proj = nn.Linear(embeddings_size + decoder_size, 34 | output_size) 35 | 36 | def forward(self, prev_emb, dec_state, attn_state): 37 | input_tensor = torch.cat((prev_emb, dec_state, attn_state), dim=1) 38 | z = self.sig(self.gate(input_tensor)) 39 | proj_source = self.source_proj(attn_state) 40 | proj_target = self.target_proj( 41 | torch.cat((prev_emb, dec_state), dim=1)) 42 | return z, proj_source, proj_target 43 | 44 | 45 | class SourceContextGate(nn.Module): 46 | """Apply the context gate only to the source context""" 47 | 48 | def __init__(self, embeddings_size, decoder_size, 49 | attention_size, output_size): 50 | super(SourceContextGate, self).__init__() 51 | self.context_gate = ContextGate(embeddings_size, decoder_size, 52 | attention_size, output_size) 53 | self.tanh = nn.Tanh() 54 | 55 | def forward(self, prev_emb, dec_state, attn_state): 56 | z, source, target = self.context_gate( 57 | prev_emb, dec_state, attn_state) 58 | return self.tanh(target + z * source) 59 | 60 | 61 | class TargetContextGate(nn.Module): 62 | """Apply the context gate only to the target context""" 63 | 64 | def __init__(self, embeddings_size, decoder_size, 65 | attention_size, output_size): 66 | super(TargetContextGate, self).__init__() 67 | self.context_gate = ContextGate(embeddings_size, decoder_size, 68 | attention_size, output_size) 69 | self.tanh = nn.Tanh() 70 | 71 | def forward(self, prev_emb, dec_state, attn_state): 72 | z, source, target = self.context_gate(prev_emb, dec_state, 
attn_state) 73 | return self.tanh(z * target + source) 74 | 75 | 76 | class BothContextGate(nn.Module): 77 | """Apply the context gate to both contexts""" 78 | 79 | def __init__(self, embeddings_size, decoder_size, 80 | attention_size, output_size): 81 | super(BothContextGate, self).__init__() 82 | self.context_gate = ContextGate(embeddings_size, decoder_size, 83 | attention_size, output_size) 84 | self.tanh = nn.Tanh() 85 | 86 | def forward(self, prev_emb, dec_state, attn_state): 87 | z, source, target = self.context_gate(prev_emb, dec_state, attn_state) 88 | return self.tanh((1. - z) * target + z * source) 89 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/modules/Gate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def context_gate_factory(type, embeddings_size, decoder_size, 6 | attention_size, output_size): 7 | """Returns the correct ContextGate class""" 8 | 9 | gate_types = {'source': SourceContextGate, 10 | 'target': TargetContextGate, 11 | 'both': BothContextGate} 12 | 13 | assert type in gate_types, "Not valid ContextGate type: {0}".format(type) 14 | return gate_types[type](embeddings_size, decoder_size, attention_size, 15 | output_size) 16 | 17 | 18 | class ContextGate(nn.Module): 19 | """ 20 | Context gate is a decoder module that takes as input the previous word 21 | embedding, the current decoder state and the attention state, and 22 | produces a gate. 23 | The gate can be used to select the input from the target side context 24 | (decoder state), from the source context (attention state) or both. 25 | """ 26 | def __init__(self, embeddings_size, decoder_size, 27 | attention_size, output_size): 28 | super(ContextGate, self).__init__() 29 | input_size = embeddings_size + decoder_size + attention_size 30 | self.gate = nn.Linear(input_size, output_size, bias=True) 31 | self.sig = nn.Sigmoid() 32 | self.source_proj = nn.Linear(attention_size, output_size) 33 | self.target_proj = nn.Linear(embeddings_size + decoder_size, 34 | output_size) 35 | 36 | def forward(self, prev_emb, dec_state, attn_state): 37 | input_tensor = torch.cat((prev_emb, dec_state, attn_state), dim=1) 38 | z = self.sig(self.gate(input_tensor)) 39 | proj_source = self.source_proj(attn_state) 40 | proj_target = self.target_proj( 41 | torch.cat((prev_emb, dec_state), dim=1)) 42 | return z, proj_source, proj_target 43 | 44 | 45 | class SourceContextGate(nn.Module): 46 | """Apply the context gate only to the source context""" 47 | 48 | def __init__(self, embeddings_size, decoder_size, 49 | attention_size, output_size): 50 | super(SourceContextGate, self).__init__() 51 | self.context_gate = ContextGate(embeddings_size, decoder_size, 52 | attention_size, output_size) 53 | self.tanh = nn.Tanh() 54 | 55 | def forward(self, prev_emb, dec_state, attn_state): 56 | z, source, target = self.context_gate( 57 | prev_emb, dec_state, attn_state) 58 | return self.tanh(target + z * source) 59 | 60 | 61 | class TargetContextGate(nn.Module): 62 | """Apply the context gate only to the target context""" 63 | 64 | def __init__(self, embeddings_size, decoder_size, 65 | attention_size, output_size): 66 | super(TargetContextGate, self).__init__() 67 | self.context_gate = ContextGate(embeddings_size, decoder_size, 68 | attention_size, output_size) 69 | self.tanh = nn.Tanh() 70 | 71 | def forward(self, prev_emb, dec_state, attn_state): 72 | z, source, target = self.context_gate(prev_emb, dec_state, 
attn_state) 73 | return self.tanh(z * target + source) 74 | 75 | 76 | class BothContextGate(nn.Module): 77 | """Apply the context gate to both contexts""" 78 | 79 | def __init__(self, embeddings_size, decoder_size, 80 | attention_size, output_size): 81 | super(BothContextGate, self).__init__() 82 | self.context_gate = ContextGate(embeddings_size, decoder_size, 83 | attention_size, output_size) 84 | self.tanh = nn.Tanh() 85 | 86 | def forward(self, prev_emb, dec_state, attn_state): 87 | z, source, target = self.context_gate(prev_emb, dec_state, attn_state) 88 | return self.tanh((1. - z) * target + z * source) 89 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/modules/Gate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | def context_gate_factory(type, embeddings_size, decoder_size, 6 | attention_size, output_size): 7 | """Returns the correct ContextGate class""" 8 | 9 | gate_types = {'source': SourceContextGate, 10 | 'target': TargetContextGate, 11 | 'both': BothContextGate} 12 | 13 | assert type in gate_types, "Not valid ContextGate type: {0}".format(type) 14 | return gate_types[type](embeddings_size, decoder_size, attention_size, 15 | output_size) 16 | 17 | 18 | class ContextGate(nn.Module): 19 | """ 20 | Context gate is a decoder module that takes as input the previous word 21 | embedding, the current decoder state and the attention state, and 22 | produces a gate. 23 | The gate can be used to select the input from the target side context 24 | (decoder state), from the source context (attention state) or both. 25 | """ 26 | def __init__(self, embeddings_size, decoder_size, 27 | attention_size, output_size): 28 | super(ContextGate, self).__init__() 29 | input_size = embeddings_size + decoder_size + attention_size 30 | self.gate = nn.Linear(input_size, output_size, bias=True) 31 | self.sig = nn.Sigmoid() 32 | self.source_proj = nn.Linear(attention_size, output_size) 33 | self.target_proj = nn.Linear(embeddings_size + decoder_size, 34 | output_size) 35 | 36 | def forward(self, prev_emb, dec_state, attn_state): 37 | input_tensor = torch.cat((prev_emb, dec_state, attn_state), dim=1) 38 | z = self.sig(self.gate(input_tensor)) 39 | proj_source = self.source_proj(attn_state) 40 | proj_target = self.target_proj( 41 | torch.cat((prev_emb, dec_state), dim=1)) 42 | return z, proj_source, proj_target 43 | 44 | 45 | class SourceContextGate(nn.Module): 46 | """Apply the context gate only to the source context""" 47 | 48 | def __init__(self, embeddings_size, decoder_size, 49 | attention_size, output_size): 50 | super(SourceContextGate, self).__init__() 51 | self.context_gate = ContextGate(embeddings_size, decoder_size, 52 | attention_size, output_size) 53 | self.tanh = nn.Tanh() 54 | 55 | def forward(self, prev_emb, dec_state, attn_state): 56 | z, source, target = self.context_gate( 57 | prev_emb, dec_state, attn_state) 58 | return self.tanh(target + z * source) 59 | 60 | 61 | class TargetContextGate(nn.Module): 62 | """Apply the context gate only to the target context""" 63 | 64 | def __init__(self, embeddings_size, decoder_size, 65 | attention_size, output_size): 66 | super(TargetContextGate, self).__init__() 67 | self.context_gate = ContextGate(embeddings_size, decoder_size, 68 | attention_size, output_size) 69 | self.tanh = nn.Tanh() 70 | 71 | def forward(self, prev_emb, dec_state, attn_state): 72 | z, source, target = self.context_gate(prev_emb, 
dec_state, attn_state) 73 | return self.tanh(z * target + source) 74 | 75 | 76 | class BothContextGate(nn.Module): 77 | """Apply the context gate to both contexts""" 78 | 79 | def __init__(self, embeddings_size, decoder_size, 80 | attention_size, output_size): 81 | super(BothContextGate, self).__init__() 82 | self.context_gate = ContextGate(embeddings_size, decoder_size, 83 | attention_size, output_size) 84 | self.tanh = nn.Tanh() 85 | 86 | def forward(self, prev_emb, dec_state, attn_state): 87 | z, source, target = self.context_gate(prev_emb, dec_state, attn_state) 88 | return self.tanh((1. - z) * target + z * source) 89 | -------------------------------------------------------------------------------- /D_pretrain/onmt/modules/ImageEncoder.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import torch 4 | from torch.autograd import Variable 5 | 6 | 7 | class ImageEncoder(nn.Module): 8 | """ 9 | A simple encoder convolutional -> recurrent neural network for 10 | image input. 11 | 12 | Args: 13 | num_layers (int): number of encoder layers. 14 | bidirectional (bool): bidirectional encoder. 15 | rnn_size (int): size of hidden states of the rnn. 16 | dropout (float): dropout probablity. 17 | """ 18 | def __init__(self, num_layers, bidirectional, rnn_size, dropout): 19 | super(ImageEncoder, self).__init__() 20 | self.num_layers = num_layers 21 | self.num_directions = 2 if bidirectional else 1 22 | self.hidden_size = rnn_size 23 | 24 | self.layer1 = nn.Conv2d(3, 64, kernel_size=(3, 3), 25 | padding=(1, 1), stride=(1, 1)) 26 | self.layer2 = nn.Conv2d(64, 128, kernel_size=(3, 3), 27 | padding=(1, 1), stride=(1, 1)) 28 | self.layer3 = nn.Conv2d(128, 256, kernel_size=(3, 3), 29 | padding=(1, 1), stride=(1, 1)) 30 | self.layer4 = nn.Conv2d(256, 256, kernel_size=(3, 3), 31 | padding=(1, 1), stride=(1, 1)) 32 | self.layer5 = nn.Conv2d(256, 512, kernel_size=(3, 3), 33 | padding=(1, 1), stride=(1, 1)) 34 | self.layer6 = nn.Conv2d(512, 512, kernel_size=(3, 3), 35 | padding=(1, 1), stride=(1, 1)) 36 | 37 | self.batch_norm1 = nn.BatchNorm2d(256) 38 | self.batch_norm2 = nn.BatchNorm2d(512) 39 | self.batch_norm3 = nn.BatchNorm2d(512) 40 | 41 | input_size = 512 42 | self.rnn = nn.LSTM(input_size, rnn_size, 43 | num_layers=num_layers, 44 | dropout=dropout, 45 | bidirectional=bidirectional) 46 | self.pos_lut = nn.Embedding(1000, input_size) 47 | 48 | def load_pretrained_vectors(self, opt): 49 | # Pass in needed options only when modify function definition. 
50 | pass 51 | 52 | def forward(self, input, lengths=None): 53 | "See :obj:`onmt.modules.EncoderBase.forward()`" 54 | 55 | batch_size = input.size(0) 56 | # (batch_size, 64, imgH, imgW) 57 | # layer 1 58 | input = F.relu(self.layer1(input[:, :, :, :]-0.5), True) 59 | 60 | # (batch_size, 64, imgH/2, imgW/2) 61 | input = F.max_pool2d(input, kernel_size=(2, 2), stride=(2, 2)) 62 | 63 | # (batch_size, 128, imgH/2, imgW/2) 64 | # layer 2 65 | input = F.relu(self.layer2(input), True) 66 | 67 | # (batch_size, 128, imgH/2/2, imgW/2/2) 68 | input = F.max_pool2d(input, kernel_size=(2, 2), stride=(2, 2)) 69 | 70 | # (batch_size, 256, imgH/2/2, imgW/2/2) 71 | # layer 3 72 | # batch norm 1 73 | input = F.relu(self.batch_norm1(self.layer3(input)), True) 74 | 75 | # (batch_size, 256, imgH/2/2, imgW/2/2) 76 | # layer4 77 | input = F.relu(self.layer4(input), True) 78 | 79 | # (batch_size, 256, imgH/2/2/2, imgW/2/2) 80 | input = F.max_pool2d(input, kernel_size=(1, 2), stride=(1, 2)) 81 | 82 | # (batch_size, 512, imgH/2/2/2, imgW/2/2) 83 | # layer 5 84 | # batch norm 2 85 | input = F.relu(self.batch_norm2(self.layer5(input)), True) 86 | 87 | # (batch_size, 512, imgH/2/2/2, imgW/2/2/2) 88 | input = F.max_pool2d(input, kernel_size=(2, 1), stride=(2, 1)) 89 | 90 | # (batch_size, 512, imgH/2/2/2, imgW/2/2/2) 91 | input = F.relu(self.batch_norm3(self.layer6(input)), True) 92 | 93 | # # (batch_size, 512, H, W) 94 | all_outputs = [] 95 | for row in range(input.size(2)): 96 | inp = input[:, :, row, :].transpose(0, 2)\ 97 | .transpose(1, 2) 98 | row_vec = torch.Tensor(batch_size).type_as(inp.data)\ 99 | .long().fill_(row) 100 | pos_emb = self.pos_lut(Variable(row_vec)) 101 | with_pos = torch.cat( 102 | (pos_emb.view(1, pos_emb.size(0), pos_emb.size(1)), inp), 0) 103 | outputs, hidden_t = self.rnn(with_pos) 104 | all_outputs.append(outputs) 105 | out = torch.cat(all_outputs, 0) 106 | 107 | return hidden_t, out 108 | -------------------------------------------------------------------------------- /G_pretrain/onmt/modules/ImageEncoder.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import torch 4 | from torch.autograd import Variable 5 | 6 | 7 | class ImageEncoder(nn.Module): 8 | """ 9 | A simple encoder convolutional -> recurrent neural network for 10 | image input. 11 | 12 | Args: 13 | num_layers (int): number of encoder layers. 14 | bidirectional (bool): bidirectional encoder. 15 | rnn_size (int): size of hidden states of the rnn. 16 | dropout (float): dropout probablity. 
17 | """ 18 | def __init__(self, num_layers, bidirectional, rnn_size, dropout): 19 | super(ImageEncoder, self).__init__() 20 | self.num_layers = num_layers 21 | self.num_directions = 2 if bidirectional else 1 22 | self.hidden_size = rnn_size 23 | 24 | self.layer1 = nn.Conv2d(3, 64, kernel_size=(3, 3), 25 | padding=(1, 1), stride=(1, 1)) 26 | self.layer2 = nn.Conv2d(64, 128, kernel_size=(3, 3), 27 | padding=(1, 1), stride=(1, 1)) 28 | self.layer3 = nn.Conv2d(128, 256, kernel_size=(3, 3), 29 | padding=(1, 1), stride=(1, 1)) 30 | self.layer4 = nn.Conv2d(256, 256, kernel_size=(3, 3), 31 | padding=(1, 1), stride=(1, 1)) 32 | self.layer5 = nn.Conv2d(256, 512, kernel_size=(3, 3), 33 | padding=(1, 1), stride=(1, 1)) 34 | self.layer6 = nn.Conv2d(512, 512, kernel_size=(3, 3), 35 | padding=(1, 1), stride=(1, 1)) 36 | 37 | self.batch_norm1 = nn.BatchNorm2d(256) 38 | self.batch_norm2 = nn.BatchNorm2d(512) 39 | self.batch_norm3 = nn.BatchNorm2d(512) 40 | 41 | input_size = 512 42 | self.rnn = nn.LSTM(input_size, rnn_size, 43 | num_layers=num_layers, 44 | dropout=dropout, 45 | bidirectional=bidirectional) 46 | self.pos_lut = nn.Embedding(1000, input_size) 47 | 48 | def load_pretrained_vectors(self, opt): 49 | # Pass in needed options only when modify function definition. 50 | pass 51 | 52 | def forward(self, input, lengths=None): 53 | "See :obj:`onmt.modules.EncoderBase.forward()`" 54 | 55 | batch_size = input.size(0) 56 | # (batch_size, 64, imgH, imgW) 57 | # layer 1 58 | input = F.relu(self.layer1(input[:, :, :, :]-0.5), True) 59 | 60 | # (batch_size, 64, imgH/2, imgW/2) 61 | input = F.max_pool2d(input, kernel_size=(2, 2), stride=(2, 2)) 62 | 63 | # (batch_size, 128, imgH/2, imgW/2) 64 | # layer 2 65 | input = F.relu(self.layer2(input), True) 66 | 67 | # (batch_size, 128, imgH/2/2, imgW/2/2) 68 | input = F.max_pool2d(input, kernel_size=(2, 2), stride=(2, 2)) 69 | 70 | # (batch_size, 256, imgH/2/2, imgW/2/2) 71 | # layer 3 72 | # batch norm 1 73 | input = F.relu(self.batch_norm1(self.layer3(input)), True) 74 | 75 | # (batch_size, 256, imgH/2/2, imgW/2/2) 76 | # layer4 77 | input = F.relu(self.layer4(input), True) 78 | 79 | # (batch_size, 256, imgH/2/2/2, imgW/2/2) 80 | input = F.max_pool2d(input, kernel_size=(1, 2), stride=(1, 2)) 81 | 82 | # (batch_size, 512, imgH/2/2/2, imgW/2/2) 83 | # layer 5 84 | # batch norm 2 85 | input = F.relu(self.batch_norm2(self.layer5(input)), True) 86 | 87 | # (batch_size, 512, imgH/2/2/2, imgW/2/2/2) 88 | input = F.max_pool2d(input, kernel_size=(2, 1), stride=(2, 1)) 89 | 90 | # (batch_size, 512, imgH/2/2/2, imgW/2/2/2) 91 | input = F.relu(self.batch_norm3(self.layer6(input)), True) 92 | 93 | # # (batch_size, 512, H, W) 94 | all_outputs = [] 95 | for row in range(input.size(2)): 96 | inp = input[:, :, row, :].transpose(0, 2)\ 97 | .transpose(1, 2) 98 | row_vec = torch.Tensor(batch_size).type_as(inp.data)\ 99 | .long().fill_(row) 100 | pos_emb = self.pos_lut(Variable(row_vec)) 101 | with_pos = torch.cat( 102 | (pos_emb.view(1, pos_emb.size(0), pos_emb.size(1)), inp), 0) 103 | outputs, hidden_t = self.rnn(with_pos) 104 | all_outputs.append(outputs) 105 | out = torch.cat(all_outputs, 0) 106 | 107 | return hidden_t, out 108 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/modules/ImageEncoder.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import torch 4 | from torch.autograd import Variable 5 | 6 | 7 | class 
ImageEncoder(nn.Module): 8 | """ 9 | A simple encoder convolutional -> recurrent neural network for 10 | image input. 11 | 12 | Args: 13 | num_layers (int): number of encoder layers. 14 | bidirectional (bool): bidirectional encoder. 15 | rnn_size (int): size of hidden states of the rnn. 16 | dropout (float): dropout probablity. 17 | """ 18 | def __init__(self, num_layers, bidirectional, rnn_size, dropout): 19 | super(ImageEncoder, self).__init__() 20 | self.num_layers = num_layers 21 | self.num_directions = 2 if bidirectional else 1 22 | self.hidden_size = rnn_size 23 | 24 | self.layer1 = nn.Conv2d(3, 64, kernel_size=(3, 3), 25 | padding=(1, 1), stride=(1, 1)) 26 | self.layer2 = nn.Conv2d(64, 128, kernel_size=(3, 3), 27 | padding=(1, 1), stride=(1, 1)) 28 | self.layer3 = nn.Conv2d(128, 256, kernel_size=(3, 3), 29 | padding=(1, 1), stride=(1, 1)) 30 | self.layer4 = nn.Conv2d(256, 256, kernel_size=(3, 3), 31 | padding=(1, 1), stride=(1, 1)) 32 | self.layer5 = nn.Conv2d(256, 512, kernel_size=(3, 3), 33 | padding=(1, 1), stride=(1, 1)) 34 | self.layer6 = nn.Conv2d(512, 512, kernel_size=(3, 3), 35 | padding=(1, 1), stride=(1, 1)) 36 | 37 | self.batch_norm1 = nn.BatchNorm2d(256) 38 | self.batch_norm2 = nn.BatchNorm2d(512) 39 | self.batch_norm3 = nn.BatchNorm2d(512) 40 | 41 | input_size = 512 42 | self.rnn = nn.LSTM(input_size, rnn_size, 43 | num_layers=num_layers, 44 | dropout=dropout, 45 | bidirectional=bidirectional) 46 | self.pos_lut = nn.Embedding(1000, input_size) 47 | 48 | def load_pretrained_vectors(self, opt): 49 | # Pass in needed options only when modify function definition. 50 | pass 51 | 52 | def forward(self, input, lengths=None): 53 | "See :obj:`onmt.modules.EncoderBase.forward()`" 54 | 55 | batch_size = input.size(0) 56 | # (batch_size, 64, imgH, imgW) 57 | # layer 1 58 | input = F.relu(self.layer1(input[:, :, :, :]-0.5), True) 59 | 60 | # (batch_size, 64, imgH/2, imgW/2) 61 | input = F.max_pool2d(input, kernel_size=(2, 2), stride=(2, 2)) 62 | 63 | # (batch_size, 128, imgH/2, imgW/2) 64 | # layer 2 65 | input = F.relu(self.layer2(input), True) 66 | 67 | # (batch_size, 128, imgH/2/2, imgW/2/2) 68 | input = F.max_pool2d(input, kernel_size=(2, 2), stride=(2, 2)) 69 | 70 | # (batch_size, 256, imgH/2/2, imgW/2/2) 71 | # layer 3 72 | # batch norm 1 73 | input = F.relu(self.batch_norm1(self.layer3(input)), True) 74 | 75 | # (batch_size, 256, imgH/2/2, imgW/2/2) 76 | # layer4 77 | input = F.relu(self.layer4(input), True) 78 | 79 | # (batch_size, 256, imgH/2/2/2, imgW/2/2) 80 | input = F.max_pool2d(input, kernel_size=(1, 2), stride=(1, 2)) 81 | 82 | # (batch_size, 512, imgH/2/2/2, imgW/2/2) 83 | # layer 5 84 | # batch norm 2 85 | input = F.relu(self.batch_norm2(self.layer5(input)), True) 86 | 87 | # (batch_size, 512, imgH/2/2/2, imgW/2/2/2) 88 | input = F.max_pool2d(input, kernel_size=(2, 1), stride=(2, 1)) 89 | 90 | # (batch_size, 512, imgH/2/2/2, imgW/2/2/2) 91 | input = F.relu(self.batch_norm3(self.layer6(input)), True) 92 | 93 | # # (batch_size, 512, H, W) 94 | all_outputs = [] 95 | for row in range(input.size(2)): 96 | inp = input[:, :, row, :].transpose(0, 2)\ 97 | .transpose(1, 2) 98 | row_vec = torch.Tensor(batch_size).type_as(inp.data)\ 99 | .long().fill_(row) 100 | pos_emb = self.pos_lut(Variable(row_vec)) 101 | with_pos = torch.cat( 102 | (pos_emb.view(1, pos_emb.size(0), pos_emb.size(1)), inp), 0) 103 | outputs, hidden_t = self.rnn(with_pos) 104 | all_outputs.append(outputs) 105 | out = torch.cat(all_outputs, 0) 106 | 107 | return hidden_t, out 108 | 
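As a quick sanity check on the pooling arithmetic in ImageEncoder: the two 2x2 max-pools followed by the (1, 2) and (2, 1) pools reduce an H x W image to H/8 rows of W/8 columns before the row-wise RNN. The sketch below is a minimal, hypothetical smoke test; the image size and hyperparameters are arbitrary assumptions, not values used by this project.

import torch
from torch.autograd import Variable

from onmt.modules.ImageEncoder import ImageEncoder

encoder = ImageEncoder(num_layers=2, bidirectional=True,
                       rnn_size=128, dropout=0.3)

# A fake batch of 4 RGB images, 32 pixels high and 96 pixels wide
# (both divisible by 8, so every pooling stage divides evenly).
images = Variable(torch.randn(4, 3, 32, 96))

hidden_t, out = encoder(images)
# One RNN output sequence per remaining image row, each prefixed with a
# positional embedding: rows * (cols + 1) = 4 * 13 = 52 time steps.
print(out.size())          # (52, 4, 256) for a bidirectional rnn_size of 128
print(hidden_t[0].size())  # (num_layers * num_directions, batch, rnn_size)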
-------------------------------------------------------------------------------- /reinforcement_train/onmt/modules/ImageEncoder.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | import torch 4 | from torch.autograd import Variable 5 | 6 | 7 | class ImageEncoder(nn.Module): 8 | """ 9 | A simple encoder convolutional -> recurrent neural network for 10 | image input. 11 | 12 | Args: 13 | num_layers (int): number of encoder layers. 14 | bidirectional (bool): bidirectional encoder. 15 | rnn_size (int): size of hidden states of the rnn. 16 | dropout (float): dropout probablity. 17 | """ 18 | def __init__(self, num_layers, bidirectional, rnn_size, dropout): 19 | super(ImageEncoder, self).__init__() 20 | self.num_layers = num_layers 21 | self.num_directions = 2 if bidirectional else 1 22 | self.hidden_size = rnn_size 23 | 24 | self.layer1 = nn.Conv2d(3, 64, kernel_size=(3, 3), 25 | padding=(1, 1), stride=(1, 1)) 26 | self.layer2 = nn.Conv2d(64, 128, kernel_size=(3, 3), 27 | padding=(1, 1), stride=(1, 1)) 28 | self.layer3 = nn.Conv2d(128, 256, kernel_size=(3, 3), 29 | padding=(1, 1), stride=(1, 1)) 30 | self.layer4 = nn.Conv2d(256, 256, kernel_size=(3, 3), 31 | padding=(1, 1), stride=(1, 1)) 32 | self.layer5 = nn.Conv2d(256, 512, kernel_size=(3, 3), 33 | padding=(1, 1), stride=(1, 1)) 34 | self.layer6 = nn.Conv2d(512, 512, kernel_size=(3, 3), 35 | padding=(1, 1), stride=(1, 1)) 36 | 37 | self.batch_norm1 = nn.BatchNorm2d(256) 38 | self.batch_norm2 = nn.BatchNorm2d(512) 39 | self.batch_norm3 = nn.BatchNorm2d(512) 40 | 41 | input_size = 512 42 | self.rnn = nn.LSTM(input_size, rnn_size, 43 | num_layers=num_layers, 44 | dropout=dropout, 45 | bidirectional=bidirectional) 46 | self.pos_lut = nn.Embedding(1000, input_size) 47 | 48 | def load_pretrained_vectors(self, opt): 49 | # Pass in needed options only when modify function definition. 
50 | pass 51 | 52 | def forward(self, input, lengths=None): 53 | "See :obj:`onmt.modules.EncoderBase.forward()`" 54 | 55 | batch_size = input.size(0) 56 | # (batch_size, 64, imgH, imgW) 57 | # layer 1 58 | input = F.relu(self.layer1(input[:, :, :, :]-0.5), True) 59 | 60 | # (batch_size, 64, imgH/2, imgW/2) 61 | input = F.max_pool2d(input, kernel_size=(2, 2), stride=(2, 2)) 62 | 63 | # (batch_size, 128, imgH/2, imgW/2) 64 | # layer 2 65 | input = F.relu(self.layer2(input), True) 66 | 67 | # (batch_size, 128, imgH/2/2, imgW/2/2) 68 | input = F.max_pool2d(input, kernel_size=(2, 2), stride=(2, 2)) 69 | 70 | # (batch_size, 256, imgH/2/2, imgW/2/2) 71 | # layer 3 72 | # batch norm 1 73 | input = F.relu(self.batch_norm1(self.layer3(input)), True) 74 | 75 | # (batch_size, 256, imgH/2/2, imgW/2/2) 76 | # layer4 77 | input = F.relu(self.layer4(input), True) 78 | 79 | # (batch_size, 256, imgH/2/2/2, imgW/2/2) 80 | input = F.max_pool2d(input, kernel_size=(1, 2), stride=(1, 2)) 81 | 82 | # (batch_size, 512, imgH/2/2/2, imgW/2/2) 83 | # layer 5 84 | # batch norm 2 85 | input = F.relu(self.batch_norm2(self.layer5(input)), True) 86 | 87 | # (batch_size, 512, imgH/2/2/2, imgW/2/2/2) 88 | input = F.max_pool2d(input, kernel_size=(2, 1), stride=(2, 1)) 89 | 90 | # (batch_size, 512, imgH/2/2/2, imgW/2/2/2) 91 | input = F.relu(self.batch_norm3(self.layer6(input)), True) 92 | 93 | # # (batch_size, 512, H, W) 94 | all_outputs = [] 95 | for row in range(input.size(2)): 96 | inp = input[:, :, row, :].transpose(0, 2)\ 97 | .transpose(1, 2) 98 | row_vec = torch.Tensor(batch_size).type_as(inp.data)\ 99 | .long().fill_(row) 100 | pos_emb = self.pos_lut(Variable(row_vec)) 101 | with_pos = torch.cat( 102 | (pos_emb.view(1, pos_emb.size(0), pos_emb.size(1)), inp), 0) 103 | outputs, hidden_t = self.rnn(with_pos) 104 | all_outputs.append(outputs) 105 | out = torch.cat(all_outputs, 0) 106 | 107 | return hidden_t, out 108 | -------------------------------------------------------------------------------- /D_pretrain/onmt/Optim.py: -------------------------------------------------------------------------------- 1 | import torch.optim as optim 2 | from torch.nn.utils import clip_grad_norm 3 | 4 | 5 | class Optim(object): 6 | """ 7 | Controller class for optimization. Mostly a thin 8 | wrapper for `optim`, but also useful for implementing 9 | rate scheduling beyond what is currently available. 10 | Also implements necessary methods for training RNNs such 11 | as grad manipulations. 12 | 13 | Args: 14 | method (:obj:`str`): one of [sgd, adagrad, adadelta, adam] 15 | lr (float): learning rate 16 | lr_decay (float, optional): learning rate decay multiplier 17 | start_decay_at (int, optional): epoch to start learning rate decay 18 | beta1, beta2 (float, optional): parameters for adam 19 | adagrad_accum (float, optional): initialization parameter for adagrad 20 | decay_method (str, option): custom decay options 21 | warmup_steps (int, option): parameter for `noam` decay 22 | model_size (int, option): parameter for `noam` decay 23 | """ 24 | # We use the default parameters for Adam that are suggested by 25 | # the original paper https://arxiv.org/pdf/1412.6980.pdf 26 | # These values are also used by other established implementations, 27 | # e.g. 
https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer 28 | # https://keras.io/optimizers/ 29 | # Recently there are slightly different values used in the paper 30 | # "Attention is all you need" 31 | # https://arxiv.org/pdf/1706.03762.pdf, particularly the value beta2=0.98 32 | # was used there however, beta2=0.999 is still arguably the more 33 | # established value, so we use that here as well 34 | def __init__(self, method, lr, max_grad_norm, 35 | lr_decay=1, start_decay_at=None, 36 | beta1=0.9, beta2=0.999, 37 | adagrad_accum=0.0, 38 | decay_method=None, 39 | warmup_steps=4000, 40 | model_size=None): 41 | self.last_ppl = None 42 | self.lr = lr 43 | self.original_lr = lr 44 | self.max_grad_norm = max_grad_norm 45 | self.method = method 46 | self.lr_decay = lr_decay 47 | self.start_decay_at = start_decay_at 48 | self.start_decay = False 49 | self._step = 0 50 | self.betas = [beta1, beta2] 51 | self.adagrad_accum = adagrad_accum 52 | self.decay_method = decay_method 53 | self.warmup_steps = warmup_steps 54 | self.model_size = model_size 55 | 56 | def set_parameters(self, params): 57 | self.params = [p for p in params if p.requires_grad] 58 | if self.method == 'sgd': 59 | self.optimizer = optim.SGD(self.params, lr=self.lr) 60 | elif self.method == 'adagrad': 61 | self.optimizer = optim.Adagrad(self.params, lr=self.lr) 62 | for group in self.optimizer.param_groups: 63 | for p in group['params']: 64 | self.optimizer.state[p]['sum'] = self.optimizer\ 65 | .state[p]['sum'].fill_(self.adagrad_accum) 66 | elif self.method == 'adadelta': 67 | self.optimizer = optim.Adadelta(self.params, lr=self.lr) 68 | elif self.method == 'adam': 69 | self.optimizer = optim.Adam(self.params, lr=self.lr, 70 | betas=self.betas, eps=1e-9) 71 | else: 72 | raise RuntimeError("Invalid optim method: " + self.method) 73 | 74 | def _set_rate(self, lr): 75 | self.lr = lr 76 | self.optimizer.param_groups[0]['lr'] = self.lr 77 | 78 | def step(self): 79 | """Update the model parameters based on current gradients. 80 | 81 | Optionally, will employ gradient modification or update learning 82 | rate. 83 | """ 84 | self._step += 1 85 | 86 | # Decay method used in tensor2tensor. 87 | if self.decay_method == "noam": 88 | self._set_rate( 89 | self.original_lr * 90 | (self.model_size ** (-0.5) * 91 | min(self._step ** (-0.5), 92 | self._step * self.warmup_steps**(-1.5)))) 93 | 94 | if self.max_grad_norm: 95 | clip_grad_norm(self.params, self.max_grad_norm) 96 | self.optimizer.step() 97 | 98 | def update_learning_rate(self, ppl, epoch): 99 | """ 100 | Decay learning rate if val perf does not improve 101 | or we hit the start_decay_at limit. 102 | """ 103 | 104 | if self.start_decay_at is not None and epoch >= self.start_decay_at: 105 | self.start_decay = True 106 | if self.last_ppl is not None and ppl > self.last_ppl: 107 | self.start_decay = True 108 | 109 | if self.start_decay: 110 | self.lr = self.lr * self.lr_decay 111 | print("Decaying learning rate to %g" % self.lr) 112 | 113 | self.last_ppl = ppl 114 | self.optimizer.param_groups[0]['lr'] = self.lr 115 | -------------------------------------------------------------------------------- /G_pretrain/onmt/Optim.py: -------------------------------------------------------------------------------- 1 | import torch.optim as optim 2 | from torch.nn.utils import clip_grad_norm 3 | 4 | 5 | class Optim(object): 6 | """ 7 | Controller class for optimization. 
Mostly a thin 8 | wrapper for `optim`, but also useful for implementing 9 | rate scheduling beyond what is currently available. 10 | Also implements necessary methods for training RNNs such 11 | as grad manipulations. 12 | 13 | Args: 14 | method (:obj:`str`): one of [sgd, adagrad, adadelta, adam] 15 | lr (float): learning rate 16 | lr_decay (float, optional): learning rate decay multiplier 17 | start_decay_at (int, optional): epoch to start learning rate decay 18 | beta1, beta2 (float, optional): parameters for adam 19 | adagrad_accum (float, optional): initialization parameter for adagrad 20 | decay_method (str, option): custom decay options 21 | warmup_steps (int, option): parameter for `noam` decay 22 | model_size (int, option): parameter for `noam` decay 23 | """ 24 | # We use the default parameters for Adam that are suggested by 25 | # the original paper https://arxiv.org/pdf/1412.6980.pdf 26 | # These values are also used by other established implementations, 27 | # e.g. https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer 28 | # https://keras.io/optimizers/ 29 | # Recently there are slightly different values used in the paper 30 | # "Attention is all you need" 31 | # https://arxiv.org/pdf/1706.03762.pdf, particularly the value beta2=0.98 32 | # was used there however, beta2=0.999 is still arguably the more 33 | # established value, so we use that here as well 34 | def __init__(self, method, lr, max_grad_norm, 35 | lr_decay=1, start_decay_at=None, 36 | beta1=0.9, beta2=0.999, 37 | adagrad_accum=0.0, 38 | decay_method=None, 39 | warmup_steps=4000, 40 | model_size=None): 41 | self.last_ppl = None 42 | self.lr = lr 43 | self.original_lr = lr 44 | self.max_grad_norm = max_grad_norm 45 | self.method = method 46 | self.lr_decay = lr_decay 47 | self.start_decay_at = start_decay_at 48 | self.start_decay = False 49 | self._step = 0 50 | self.betas = [beta1, beta2] 51 | self.adagrad_accum = adagrad_accum 52 | self.decay_method = decay_method 53 | self.warmup_steps = warmup_steps 54 | self.model_size = model_size 55 | 56 | def set_parameters(self, params): 57 | self.params = [p for p in params if p.requires_grad] 58 | if self.method == 'sgd': 59 | self.optimizer = optim.SGD(self.params, lr=self.lr) 60 | elif self.method == 'adagrad': 61 | self.optimizer = optim.Adagrad(self.params, lr=self.lr) 62 | for group in self.optimizer.param_groups: 63 | for p in group['params']: 64 | self.optimizer.state[p]['sum'] = self.optimizer\ 65 | .state[p]['sum'].fill_(self.adagrad_accum) 66 | elif self.method == 'adadelta': 67 | self.optimizer = optim.Adadelta(self.params, lr=self.lr) 68 | elif self.method == 'adam': 69 | self.optimizer = optim.Adam(self.params, lr=self.lr, 70 | betas=self.betas, eps=1e-9) 71 | else: 72 | raise RuntimeError("Invalid optim method: " + self.method) 73 | 74 | def _set_rate(self, lr): 75 | self.lr = lr 76 | self.optimizer.param_groups[0]['lr'] = self.lr 77 | 78 | def step(self): 79 | """Update the model parameters based on current gradients. 80 | 81 | Optionally, will employ gradient modification or update learning 82 | rate. 83 | """ 84 | self._step += 1 85 | 86 | # Decay method used in tensor2tensor. 
87 | if self.decay_method == "noam": 88 | self._set_rate( 89 | self.original_lr * 90 | (self.model_size ** (-0.5) * 91 | min(self._step ** (-0.5), 92 | self._step * self.warmup_steps**(-1.5)))) 93 | 94 | if self.max_grad_norm: 95 | clip_grad_norm(self.params, self.max_grad_norm) 96 | self.optimizer.step() 97 | 98 | def update_learning_rate(self, ppl, epoch): 99 | """ 100 | Decay learning rate if val perf does not improve 101 | or we hit the start_decay_at limit. 102 | """ 103 | 104 | if self.start_decay_at is not None and epoch >= self.start_decay_at: 105 | self.start_decay = True 106 | if self.last_ppl is not None and ppl > self.last_ppl: 107 | self.start_decay = True 108 | 109 | if self.start_decay: 110 | self.lr = self.lr * self.lr_decay 111 | print("Decaying learning rate to %g" % self.lr) 112 | 113 | self.last_ppl = ppl 114 | self.optimizer.param_groups[0]['lr'] = self.lr 115 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/Optim.py: -------------------------------------------------------------------------------- 1 | import torch.optim as optim 2 | from torch.nn.utils import clip_grad_norm 3 | 4 | 5 | class Optim(object): 6 | """ 7 | Controller class for optimization. Mostly a thin 8 | wrapper for `optim`, but also useful for implementing 9 | rate scheduling beyond what is currently available. 10 | Also implements necessary methods for training RNNs such 11 | as grad manipulations. 12 | 13 | Args: 14 | method (:obj:`str`): one of [sgd, adagrad, adadelta, adam] 15 | lr (float): learning rate 16 | lr_decay (float, optional): learning rate decay multiplier 17 | start_decay_at (int, optional): epoch to start learning rate decay 18 | beta1, beta2 (float, optional): parameters for adam 19 | adagrad_accum (float, optional): initialization parameter for adagrad 20 | decay_method (str, option): custom decay options 21 | warmup_steps (int, option): parameter for `noam` decay 22 | model_size (int, option): parameter for `noam` decay 23 | """ 24 | # We use the default parameters for Adam that are suggested by 25 | # the original paper https://arxiv.org/pdf/1412.6980.pdf 26 | # These values are also used by other established implementations, 27 | # e.g. 
https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer 28 | # https://keras.io/optimizers/ 29 | # Recently there are slightly different values used in the paper 30 | # "Attention is all you need" 31 | # https://arxiv.org/pdf/1706.03762.pdf, particularly the value beta2=0.98 32 | # was used there however, beta2=0.999 is still arguably the more 33 | # established value, so we use that here as well 34 | def __init__(self, method, lr, max_grad_norm, 35 | lr_decay=1, start_decay_at=None, 36 | beta1=0.9, beta2=0.999, 37 | adagrad_accum=0.0, 38 | decay_method=None, 39 | warmup_steps=4000, 40 | model_size=None): 41 | self.last_ppl = None 42 | self.lr = lr 43 | self.original_lr = lr 44 | self.max_grad_norm = max_grad_norm 45 | self.method = method 46 | self.lr_decay = lr_decay 47 | self.start_decay_at = start_decay_at 48 | self.start_decay = False 49 | self._step = 0 50 | self.betas = [beta1, beta2] 51 | self.adagrad_accum = adagrad_accum 52 | self.decay_method = decay_method 53 | self.warmup_steps = warmup_steps 54 | self.model_size = model_size 55 | 56 | def set_parameters(self, params): 57 | self.params = [p for p in params if p.requires_grad] 58 | if self.method == 'sgd': 59 | self.optimizer = optim.SGD(self.params, lr=self.lr) 60 | elif self.method == 'adagrad': 61 | self.optimizer = optim.Adagrad(self.params, lr=self.lr) 62 | for group in self.optimizer.param_groups: 63 | for p in group['params']: 64 | self.optimizer.state[p]['sum'] = self.optimizer\ 65 | .state[p]['sum'].fill_(self.adagrad_accum) 66 | elif self.method == 'adadelta': 67 | self.optimizer = optim.Adadelta(self.params, lr=self.lr) 68 | elif self.method == 'adam': 69 | self.optimizer = optim.Adam(self.params, lr=self.lr, 70 | betas=self.betas, eps=1e-9) 71 | else: 72 | raise RuntimeError("Invalid optim method: " + self.method) 73 | 74 | def _set_rate(self, lr): 75 | self.lr = lr 76 | self.optimizer.param_groups[0]['lr'] = self.lr 77 | 78 | def step(self): 79 | """Update the model parameters based on current gradients. 80 | 81 | Optionally, will employ gradient modification or update learning 82 | rate. 83 | """ 84 | self._step += 1 85 | 86 | # Decay method used in tensor2tensor. 87 | if self.decay_method == "noam": 88 | self._set_rate( 89 | self.original_lr * 90 | (self.model_size ** (-0.5) * 91 | min(self._step ** (-0.5), 92 | self._step * self.warmup_steps**(-1.5)))) 93 | 94 | if self.max_grad_norm: 95 | clip_grad_norm(self.params, self.max_grad_norm) 96 | self.optimizer.step() 97 | 98 | def update_learning_rate(self, ppl, epoch): 99 | """ 100 | Decay learning rate if val perf does not improve 101 | or we hit the start_decay_at limit. 102 | """ 103 | 104 | if self.start_decay_at is not None and epoch >= self.start_decay_at: 105 | self.start_decay = True 106 | if self.last_ppl is not None and ppl > self.last_ppl: 107 | self.start_decay = True 108 | 109 | if self.start_decay: 110 | self.lr = self.lr * self.lr_decay 111 | print("Decaying learning rate to %g" % self.lr) 112 | 113 | self.last_ppl = ppl 114 | self.optimizer.param_groups[0]['lr'] = self.lr 115 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/Optim.py: -------------------------------------------------------------------------------- 1 | import torch.optim as optim 2 | from torch.nn.utils import clip_grad_norm 3 | 4 | 5 | class Optim(object): 6 | """ 7 | Controller class for optimization. 
Mostly a thin 8 | wrapper for `optim`, but also useful for implementing 9 | rate scheduling beyond what is currently available. 10 | Also implements necessary methods for training RNNs such 11 | as grad manipulations. 12 | 13 | Args: 14 | method (:obj:`str`): one of [sgd, adagrad, adadelta, adam] 15 | lr (float): learning rate 16 | lr_decay (float, optional): learning rate decay multiplier 17 | start_decay_at (int, optional): epoch to start learning rate decay 18 | beta1, beta2 (float, optional): parameters for adam 19 | adagrad_accum (float, optional): initialization parameter for adagrad 20 | decay_method (str, option): custom decay options 21 | warmup_steps (int, option): parameter for `noam` decay 22 | model_size (int, option): parameter for `noam` decay 23 | """ 24 | # We use the default parameters for Adam that are suggested by 25 | # the original paper https://arxiv.org/pdf/1412.6980.pdf 26 | # These values are also used by other established implementations, 27 | # e.g. https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer 28 | # https://keras.io/optimizers/ 29 | # Recently there are slightly different values used in the paper 30 | # "Attention is all you need" 31 | # https://arxiv.org/pdf/1706.03762.pdf, particularly the value beta2=0.98 32 | # was used there however, beta2=0.999 is still arguably the more 33 | # established value, so we use that here as well 34 | def __init__(self, method, lr, max_grad_norm, 35 | lr_decay=1, start_decay_at=None, 36 | beta1=0.9, beta2=0.999, 37 | adagrad_accum=0.0, 38 | decay_method=None, 39 | warmup_steps=4000, 40 | model_size=None): 41 | self.last_ppl = None 42 | self.lr = lr 43 | self.original_lr = lr 44 | self.max_grad_norm = max_grad_norm 45 | self.method = method 46 | self.lr_decay = lr_decay 47 | self.start_decay_at = start_decay_at 48 | self.start_decay = False 49 | self._step = 0 50 | self.betas = [beta1, beta2] 51 | self.adagrad_accum = adagrad_accum 52 | self.decay_method = decay_method 53 | self.warmup_steps = warmup_steps 54 | self.model_size = model_size 55 | 56 | def set_parameters(self, params): 57 | self.params = [p for p in params if p.requires_grad] 58 | if self.method == 'sgd': 59 | self.optimizer = optim.SGD(self.params, lr=self.lr) 60 | elif self.method == 'adagrad': 61 | self.optimizer = optim.Adagrad(self.params, lr=self.lr) 62 | for group in self.optimizer.param_groups: 63 | for p in group['params']: 64 | self.optimizer.state[p]['sum'] = self.optimizer\ 65 | .state[p]['sum'].fill_(self.adagrad_accum) 66 | elif self.method == 'adadelta': 67 | self.optimizer = optim.Adadelta(self.params, lr=self.lr) 68 | elif self.method == 'adam': 69 | self.optimizer = optim.Adam(self.params, lr=self.lr, 70 | betas=self.betas, eps=1e-9) 71 | else: 72 | raise RuntimeError("Invalid optim method: " + self.method) 73 | 74 | def _set_rate(self, lr): 75 | self.lr = lr 76 | self.optimizer.param_groups[0]['lr'] = self.lr 77 | 78 | def step(self): 79 | """Update the model parameters based on current gradients. 80 | 81 | Optionally, will employ gradient modification or update learning 82 | rate. 83 | """ 84 | self._step += 1 85 | 86 | # Decay method used in tensor2tensor. 
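        # At step == warmup_steps the two terms inside min() coincide (both
        # equal warmup_steps**(-0.5)); that is where the warmup ramp hands
        # over to the 1/sqrt(step) decay. For example, with warmup_steps=4000
        # and a (hypothetical) model_size=512, the peak rate is
        # original_lr * 512**(-0.5) * 4000**(-0.5), roughly original_lr * 7e-4.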
87 | if self.decay_method == "noam": 88 | self._set_rate( 89 | self.original_lr * 90 | (self.model_size ** (-0.5) * 91 | min(self._step ** (-0.5), 92 | self._step * self.warmup_steps**(-1.5)))) 93 | 94 | if self.max_grad_norm: 95 | clip_grad_norm(self.params, self.max_grad_norm) 96 | self.optimizer.step() 97 | 98 | def update_learning_rate(self, ppl, epoch): 99 | """ 100 | Decay learning rate if val perf does not improve 101 | or we hit the start_decay_at limit. 102 | """ 103 | 104 | if self.start_decay_at is not None and epoch >= self.start_decay_at: 105 | self.start_decay = True 106 | if self.last_ppl is not None and ppl > self.last_ppl: 107 | self.start_decay = True 108 | 109 | if self.start_decay: 110 | self.lr = self.lr * self.lr_decay 111 | print("Decaying learning rate to %g" % self.lr) 112 | 113 | self.last_ppl = ppl 114 | self.optimizer.param_groups[0]['lr'] = self.lr 115 | -------------------------------------------------------------------------------- /D_pretrain/onmt/io/DatasetBase.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from itertools import chain 4 | import torchtext 5 | from onmt.Utils import aeq 6 | 7 | 8 | PAD_WORD = '' 9 | UNK = 0 10 | BOS_WORD = '' 11 | EOS_WORD = '' 12 | 13 | 14 | class ONMTDatasetBase(torchtext.data.Dataset): 15 | """ 16 | A dataset basically supports iteration over all the examples 17 | it contains. We currently have 3 datasets inheriting this base 18 | for 3 types of corpus respectively: "text", "img", "audio". 19 | 20 | Internally it initializes an `torchtext.data.Dataset` object with 21 | the following attributes: 22 | 23 | `examples`: a sequence of `torchtext.data.Example` objects. 24 | `fields`: a dictionary associating str keys with `torchtext.data.Field` 25 | objects, and not necessarily having the same keys as the input fields. 26 | """ 27 | def __getstate__(self): 28 | return self.__dict__ 29 | 30 | def __setstate__(self, d): 31 | self.__dict__.update(d) 32 | 33 | def __reduce_ex__(self, proto): 34 | "This is a hack. Something is broken with torch pickle." 35 | return super(ONMTDatasetBase, self).__reduce_ex__() 36 | 37 | def load_fields(self, vocab_dict): 38 | """ Load fields from vocab.pt, and set the `fields` attribute. 39 | 40 | Args: 41 | vocab_dict (dict): a dict of loaded vocab from vocab.pt file. 42 | """ 43 | from onmt.io.IO import load_fields_from_vocab 44 | 45 | fields = load_fields_from_vocab(vocab_dict.items(), self.data_type) 46 | self.fields = dict([(k, f) for (k, f) in fields.items() 47 | if k in self.examples[0].__dict__]) 48 | 49 | @staticmethod 50 | def coalesce_datasets(datasets): 51 | """Coalesce all dataset instances. """ 52 | final = datasets[0] 53 | for d in datasets[1:]: 54 | # `src_vocabs` is a list of `torchtext.vocab.Vocab`. 55 | # Each sentence transforms into on Vocab. 56 | # Coalesce them into one big list. 57 | final.src_vocabs += d.src_vocabs 58 | 59 | # All datasets have same number of features. 60 | aeq(final.n_src_feats, d.n_src_feats) 61 | aeq(final.n_tgt_feats, d.n_tgt_feats) 62 | 63 | # `examples` is a list of `torchtext.data.Example`. 64 | # Coalesce them into one big list. 65 | final.examples += d.examples 66 | 67 | # All datasets have same fields, no need to update. 68 | 69 | return final 70 | 71 | @staticmethod 72 | def extract_text_features(tokens): 73 | """ 74 | Args: 75 | tokens: A list of tokens, where each token consists of a word, 76 | optionally followed by u"│"-delimited features. 
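                For example (illustrative values), the token u"dog│NN│sg"
                would yield the word "dog" and the features ("NN", "sg");
                the delimiter is the character u"│".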
77 | Returns: 78 | A sequence of words, a sequence of features, and num of features. 79 | """ 80 | if not tokens: 81 | return [], [], -1 82 | 83 | split_tokens = [token.split(u"│") for token in tokens] 84 | split_tokens = [token for token in split_tokens if token[0]] 85 | token_size = len(split_tokens[0]) 86 | 87 | assert all(len(token) == token_size for token in split_tokens), \ 88 | "all words must have the same number of features" 89 | words_and_features = list(zip(*split_tokens)) 90 | words = words_and_features[0] 91 | features = words_and_features[1:] 92 | 93 | return words, features, token_size - 1 94 | 95 | # Below are helper functions for intra-class use only. 96 | 97 | def _join_dicts(self, *args): 98 | """ 99 | Args: 100 | dictionaries with disjoint keys. 101 | 102 | Returns: 103 | a single dictionary that has the union of these keys. 104 | """ 105 | return dict(chain(*[d.items() for d in args])) 106 | 107 | def _peek(self, seq): 108 | """ 109 | Args: 110 | seq: an iterator. 111 | 112 | Returns: 113 | the first thing returned by calling next() on the iterator 114 | and an iterator created by re-chaining that value to the beginning 115 | of the iterator. 116 | """ 117 | first = next(seq) 118 | return first, chain([first], seq) 119 | 120 | def _construct_example_fromlist(self, data, fields): 121 | """ 122 | Args: 123 | data: the data to be set as the value of the attributes of 124 | the to-be-created `Example`, associating with respective 125 | `Field` objects with same key. 126 | fields: a dict of `torchtext.data.Field` objects. The keys 127 | are attributes of the to-be-created `Example`. 128 | 129 | Returns: 130 | the created `Example` object. 131 | """ 132 | ex = torchtext.data.Example() 133 | for (name, field), val in zip(fields, data): 134 | if field is not None: 135 | setattr(ex, name, field.preprocess(val)) 136 | else: 137 | setattr(ex, name, val) 138 | return ex 139 | -------------------------------------------------------------------------------- /G_pretrain/onmt/io/DatasetBase.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from itertools import chain 4 | import torchtext 5 | from onmt.Utils import aeq 6 | 7 | 8 | PAD_WORD = '' 9 | UNK = 0 10 | BOS_WORD = '' 11 | EOS_WORD = '' 12 | 13 | 14 | class ONMTDatasetBase(torchtext.data.Dataset): 15 | """ 16 | A dataset basically supports iteration over all the examples 17 | it contains. We currently have 3 datasets inheriting this base 18 | for 3 types of corpus respectively: "text", "img", "audio". 19 | 20 | Internally it initializes an `torchtext.data.Dataset` object with 21 | the following attributes: 22 | 23 | `examples`: a sequence of `torchtext.data.Example` objects. 24 | `fields`: a dictionary associating str keys with `torchtext.data.Field` 25 | objects, and not necessarily having the same keys as the input fields. 26 | """ 27 | def __getstate__(self): 28 | return self.__dict__ 29 | 30 | def __setstate__(self, d): 31 | self.__dict__.update(d) 32 | 33 | def __reduce_ex__(self, proto): 34 | "This is a hack. Something is broken with torch pickle." 35 | return super(ONMTDatasetBase, self).__reduce_ex__() 36 | 37 | def load_fields(self, vocab_dict): 38 | """ Load fields from vocab.pt, and set the `fields` attribute. 39 | 40 | Args: 41 | vocab_dict (dict): a dict of loaded vocab from vocab.pt file. 
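        Note:
            Only fields whose keys actually appear on the loaded examples
            are kept; other entries from vocab.pt are dropped.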
42 | """ 43 | from onmt.io.IO import load_fields_from_vocab 44 | 45 | fields = load_fields_from_vocab(vocab_dict.items(), self.data_type) 46 | self.fields = dict([(k, f) for (k, f) in fields.items() 47 | if k in self.examples[0].__dict__]) 48 | 49 | @staticmethod 50 | def coalesce_datasets(datasets): 51 | """Coalesce all dataset instances. """ 52 | final = datasets[0] 53 | for d in datasets[1:]: 54 | # `src_vocabs` is a list of `torchtext.vocab.Vocab`. 55 | # Each sentence transforms into on Vocab. 56 | # Coalesce them into one big list. 57 | final.src_vocabs += d.src_vocabs 58 | 59 | # All datasets have same number of features. 60 | aeq(final.n_src_feats, d.n_src_feats) 61 | aeq(final.n_tgt_feats, d.n_tgt_feats) 62 | 63 | # `examples` is a list of `torchtext.data.Example`. 64 | # Coalesce them into one big list. 65 | final.examples += d.examples 66 | 67 | # All datasets have same fields, no need to update. 68 | 69 | return final 70 | 71 | @staticmethod 72 | def extract_text_features(tokens): 73 | """ 74 | Args: 75 | tokens: A list of tokens, where each token consists of a word, 76 | optionally followed by u"│"-delimited features. 77 | Returns: 78 | A sequence of words, a sequence of features, and num of features. 79 | """ 80 | if not tokens: 81 | return [], [], -1 82 | 83 | split_tokens = [token.split(u"│") for token in tokens] 84 | split_tokens = [token for token in split_tokens if token[0]] 85 | token_size = len(split_tokens[0]) 86 | 87 | assert all(len(token) == token_size for token in split_tokens), \ 88 | "all words must have the same number of features" 89 | words_and_features = list(zip(*split_tokens)) 90 | words = words_and_features[0] 91 | features = words_and_features[1:] 92 | 93 | return words, features, token_size - 1 94 | 95 | # Below are helper functions for intra-class use only. 96 | 97 | def _join_dicts(self, *args): 98 | """ 99 | Args: 100 | dictionaries with disjoint keys. 101 | 102 | Returns: 103 | a single dictionary that has the union of these keys. 104 | """ 105 | return dict(chain(*[d.items() for d in args])) 106 | 107 | def _peek(self, seq): 108 | """ 109 | Args: 110 | seq: an iterator. 111 | 112 | Returns: 113 | the first thing returned by calling next() on the iterator 114 | and an iterator created by re-chaining that value to the beginning 115 | of the iterator. 116 | """ 117 | first = next(seq) 118 | return first, chain([first], seq) 119 | 120 | def _construct_example_fromlist(self, data, fields): 121 | """ 122 | Args: 123 | data: the data to be set as the value of the attributes of 124 | the to-be-created `Example`, associating with respective 125 | `Field` objects with same key. 126 | fields: a dict of `torchtext.data.Field` objects. The keys 127 | are attributes of the to-be-created `Example`. 128 | 129 | Returns: 130 | the created `Example` object. 
131 | """ 132 | ex = torchtext.data.Example() 133 | for (name, field), val in zip(fields, data): 134 | if field is not None: 135 | setattr(ex, name, field.preprocess(val)) 136 | else: 137 | setattr(ex, name, val) 138 | return ex 139 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/io/DatasetBase.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from itertools import chain 4 | import torchtext 5 | from onmt.Utils import aeq 6 | 7 | 8 | PAD_WORD = '' 9 | UNK = 0 10 | BOS_WORD = '' 11 | EOS_WORD = '' 12 | 13 | 14 | class ONMTDatasetBase(torchtext.data.Dataset): 15 | """ 16 | A dataset basically supports iteration over all the examples 17 | it contains. We currently have 3 datasets inheriting this base 18 | for 3 types of corpus respectively: "text", "img", "audio". 19 | 20 | Internally it initializes an `torchtext.data.Dataset` object with 21 | the following attributes: 22 | 23 | `examples`: a sequence of `torchtext.data.Example` objects. 24 | `fields`: a dictionary associating str keys with `torchtext.data.Field` 25 | objects, and not necessarily having the same keys as the input fields. 26 | """ 27 | def __getstate__(self): 28 | return self.__dict__ 29 | 30 | def __setstate__(self, d): 31 | self.__dict__.update(d) 32 | 33 | def __reduce_ex__(self, proto): 34 | "This is a hack. Something is broken with torch pickle." 35 | return super(ONMTDatasetBase, self).__reduce_ex__() 36 | 37 | def load_fields(self, vocab_dict): 38 | """ Load fields from vocab.pt, and set the `fields` attribute. 39 | 40 | Args: 41 | vocab_dict (dict): a dict of loaded vocab from vocab.pt file. 42 | """ 43 | from onmt.io.IO import load_fields_from_vocab 44 | 45 | fields = load_fields_from_vocab(vocab_dict.items(), self.data_type) 46 | self.fields = dict([(k, f) for (k, f) in fields.items() 47 | if k in self.examples[0].__dict__]) 48 | 49 | @staticmethod 50 | def coalesce_datasets(datasets): 51 | """Coalesce all dataset instances. """ 52 | final = datasets[0] 53 | for d in datasets[1:]: 54 | # `src_vocabs` is a list of `torchtext.vocab.Vocab`. 55 | # Each sentence transforms into on Vocab. 56 | # Coalesce them into one big list. 57 | final.src_vocabs += d.src_vocabs 58 | 59 | # All datasets have same number of features. 60 | aeq(final.n_src_feats, d.n_src_feats) 61 | aeq(final.n_tgt_feats, d.n_tgt_feats) 62 | 63 | # `examples` is a list of `torchtext.data.Example`. 64 | # Coalesce them into one big list. 65 | final.examples += d.examples 66 | 67 | # All datasets have same fields, no need to update. 68 | 69 | return final 70 | 71 | @staticmethod 72 | def extract_text_features(tokens): 73 | """ 74 | Args: 75 | tokens: A list of tokens, where each token consists of a word, 76 | optionally followed by u"│"-delimited features. 77 | Returns: 78 | A sequence of words, a sequence of features, and num of features. 79 | """ 80 | if not tokens: 81 | return [], [], -1 82 | 83 | split_tokens = [token.split(u"│") for token in tokens] 84 | split_tokens = [token for token in split_tokens if token[0]] 85 | token_size = len(split_tokens[0]) 86 | 87 | assert all(len(token) == token_size for token in split_tokens), \ 88 | "all words must have the same number of features" 89 | words_and_features = list(zip(*split_tokens)) 90 | words = words_and_features[0] 91 | features = words_and_features[1:] 92 | 93 | return words, features, token_size - 1 94 | 95 | # Below are helper functions for intra-class use only. 
96 | 97 | def _join_dicts(self, *args): 98 | """ 99 | Args: 100 | dictionaries with disjoint keys. 101 | 102 | Returns: 103 | a single dictionary that has the union of these keys. 104 | """ 105 | return dict(chain(*[d.items() for d in args])) 106 | 107 | def _peek(self, seq): 108 | """ 109 | Args: 110 | seq: an iterator. 111 | 112 | Returns: 113 | the first thing returned by calling next() on the iterator 114 | and an iterator created by re-chaining that value to the beginning 115 | of the iterator. 116 | """ 117 | first = next(seq) 118 | return first, chain([first], seq) 119 | 120 | def _construct_example_fromlist(self, data, fields): 121 | """ 122 | Args: 123 | data: the data to be set as the value of the attributes of 124 | the to-be-created `Example`, associating with respective 125 | `Field` objects with same key. 126 | fields: a dict of `torchtext.data.Field` objects. The keys 127 | are attributes of the to-be-created `Example`. 128 | 129 | Returns: 130 | the created `Example` object. 131 | """ 132 | ex = torchtext.data.Example() 133 | for (name, field), val in zip(fields, data): 134 | if field is not None: 135 | setattr(ex, name, field.preprocess(val)) 136 | else: 137 | setattr(ex, name, val) 138 | return ex 139 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/io/DatasetBase.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from itertools import chain 4 | import torchtext 5 | from onmt.Utils import aeq 6 | 7 | 8 | PAD_WORD = '' 9 | UNK = 0 10 | BOS_WORD = '' 11 | EOS_WORD = '' 12 | 13 | 14 | class ONMTDatasetBase(torchtext.data.Dataset): 15 | """ 16 | A dataset basically supports iteration over all the examples 17 | it contains. We currently have 3 datasets inheriting this base 18 | for 3 types of corpus respectively: "text", "img", "audio". 19 | 20 | Internally it initializes an `torchtext.data.Dataset` object with 21 | the following attributes: 22 | 23 | `examples`: a sequence of `torchtext.data.Example` objects. 24 | `fields`: a dictionary associating str keys with `torchtext.data.Field` 25 | objects, and not necessarily having the same keys as the input fields. 26 | """ 27 | def __getstate__(self): 28 | return self.__dict__ 29 | 30 | def __setstate__(self, d): 31 | self.__dict__.update(d) 32 | 33 | def __reduce_ex__(self, proto): 34 | "This is a hack. Something is broken with torch pickle." 35 | return super(ONMTDatasetBase, self).__reduce_ex__() 36 | 37 | def load_fields(self, vocab_dict): 38 | """ Load fields from vocab.pt, and set the `fields` attribute. 39 | 40 | Args: 41 | vocab_dict (dict): a dict of loaded vocab from vocab.pt file. 42 | """ 43 | from onmt.io.IO import load_fields_from_vocab 44 | 45 | fields = load_fields_from_vocab(vocab_dict.items(), self.data_type) 46 | self.fields = dict([(k, f) for (k, f) in fields.items() 47 | if k in self.examples[0].__dict__]) 48 | 49 | @staticmethod 50 | def coalesce_datasets(datasets): 51 | """Coalesce all dataset instances. """ 52 | final = datasets[0] 53 | for d in datasets[1:]: 54 | # `src_vocabs` is a list of `torchtext.vocab.Vocab`. 55 | # Each sentence transforms into on Vocab. 56 | # Coalesce them into one big list. 57 | final.src_vocabs += d.src_vocabs 58 | 59 | # All datasets have same number of features. 60 | aeq(final.n_src_feats, d.n_src_feats) 61 | aeq(final.n_tgt_feats, d.n_tgt_feats) 62 | 63 | # `examples` is a list of `torchtext.data.Example`. 64 | # Coalesce them into one big list. 
65 | final.examples += d.examples 66 | 67 | # All datasets have same fields, no need to update. 68 | 69 | return final 70 | 71 | @staticmethod 72 | def extract_text_features(tokens): 73 | """ 74 | Args: 75 | tokens: A list of tokens, where each token consists of a word, 76 | optionally followed by u"│"-delimited features. 77 | Returns: 78 | A sequence of words, a sequence of features, and num of features. 79 | """ 80 | if not tokens: 81 | return [], [], -1 82 | 83 | split_tokens = [token.split(u"│") for token in tokens] 84 | split_tokens = [token for token in split_tokens if token[0]] 85 | token_size = len(split_tokens[0]) 86 | 87 | assert all(len(token) == token_size for token in split_tokens), \ 88 | "all words must have the same number of features" 89 | words_and_features = list(zip(*split_tokens)) 90 | words = words_and_features[0] 91 | features = words_and_features[1:] 92 | 93 | return words, features, token_size - 1 94 | 95 | # Below are helper functions for intra-class use only. 96 | 97 | def _join_dicts(self, *args): 98 | """ 99 | Args: 100 | dictionaries with disjoint keys. 101 | 102 | Returns: 103 | a single dictionary that has the union of these keys. 104 | """ 105 | return dict(chain(*[d.items() for d in args])) 106 | 107 | def _peek(self, seq): 108 | """ 109 | Args: 110 | seq: an iterator. 111 | 112 | Returns: 113 | the first thing returned by calling next() on the iterator 114 | and an iterator created by re-chaining that value to the beginning 115 | of the iterator. 116 | """ 117 | first = next(seq) 118 | return first, chain([first], seq) 119 | 120 | def _construct_example_fromlist(self, data, fields): 121 | """ 122 | Args: 123 | data: the data to be set as the value of the attributes of 124 | the to-be-created `Example`, associating with respective 125 | `Field` objects with same key. 126 | fields: a dict of `torchtext.data.Field` objects. The keys 127 | are attributes of the to-be-created `Example`. 128 | 129 | Returns: 130 | the created `Example` object. 
131 | """ 132 | ex = torchtext.data.Example() 133 | for (name, field), val in zip(fields, data): 134 | if field is not None: 135 | setattr(ex, name, field.preprocess(val)) 136 | else: 137 | setattr(ex, name, val) 138 | return ex 139 | -------------------------------------------------------------------------------- /reinforcement_train/predict.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import division, unicode_literals 4 | import os 5 | import argparse 6 | import math 7 | import codecs 8 | import torch 9 | 10 | from itertools import count 11 | 12 | import onmt.io 13 | import onmt.translate 14 | import onmt 15 | import onmt.ModelConstructor 16 | import onmt.modules 17 | import opts 18 | 19 | parser = argparse.ArgumentParser( 20 | description='translate.py', 21 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 22 | opts.add_md_help_argument(parser) 23 | opts.translate_opts(parser) 24 | 25 | opt = parser.parse_args() 26 | 27 | 28 | def _report_score(name, score_total, words_total): 29 | print("%s AVG SCORE: %.4f, %s PPL: %.4f" % ( 30 | name, score_total / words_total, 31 | name, math.exp(-score_total / words_total))) 32 | 33 | 34 | def _report_bleu(): 35 | import subprocess 36 | print() 37 | res = subprocess.check_output( 38 | "perl tools/multi-bleu.perl %s < %s" % (opt.tgt, opt.output), 39 | shell=True).decode("utf-8") 40 | print(">> " + res.strip()) 41 | 42 | 43 | def _report_rouge(): 44 | import subprocess 45 | res = subprocess.check_output( 46 | "python tools/test_rouge.py -r %s -c %s" % (opt.tgt, opt.output), 47 | shell=True).decode("utf-8") 48 | print(res.strip()) 49 | 50 | 51 | def main(): 52 | dummy_parser = argparse.ArgumentParser(description='train.py') 53 | opts.model_opts(dummy_parser) 54 | dummy_opt = dummy_parser.parse_known_args([])[0] 55 | 56 | opt.cuda = opt.gpu > -1 57 | if opt.cuda: 58 | torch.cuda.set_device(opt.gpu) 59 | 60 | # Load the model. 61 | fields, model, model_opt = \ 62 | onmt.ModelConstructor.load_test_model(opt, dummy_opt.__dict__) 63 | 64 | # File to write sentences to. 65 | out_file = codecs.open(opt.output, 'w', 'utf-8') 66 | 67 | # Test data 68 | data = onmt.io.build_dataset(fields, opt.data_type, 69 | opt.src, opt.tgt, opt.per, opt.nli, 70 | src_dir=opt.src_dir, 71 | sample_rate=opt.sample_rate, 72 | window_size=opt.window_size, 73 | window_stride=opt.window_stride, 74 | window=opt.window, 75 | use_filter_pred=False) 76 | 77 | # Sort batch by decreasing lengths of sentence required by pytorch. 78 | # sort=False means "Use dataset's sortkey instead of iterator's". 
79 | data_iter = onmt.io.OrderedIterator( 80 | dataset=data, device=opt.gpu, 81 | batch_size=opt.batch_size, train=False, sort=False, 82 | sort_within_batch=True, shuffle=False) 83 | 84 | # Translator 85 | scorer = onmt.translate.GNMTGlobalScorer(opt.alpha, opt.beta) 86 | translator = onmt.translate.Translator(model, fields, 87 | beam_size=opt.beam_size, 88 | n_best=opt.n_best, 89 | global_scorer=scorer, 90 | max_length=opt.max_length, 91 | copy_attn=model_opt.copy_attn, 92 | cuda=opt.cuda, 93 | beam_trace=opt.dump_beam != "", 94 | min_length=opt.min_length) 95 | builder = onmt.translate.TranslationBuilder( 96 | data, translator.fields, 97 | opt.n_best, opt.replace_unk, opt.tgt) 98 | 99 | # Statistics 100 | counter = count(1) 101 | pred_score_total, pred_words_total = 0, 0 102 | gold_score_total, gold_words_total = 0, 0 103 | 104 | for batch in data_iter: 105 | batch_data = translator.translate_batch(batch, data) 106 | translations = builder.from_batch(batch_data) 107 | 108 | for trans in translations: 109 | pred_score_total += trans.pred_scores[0] 110 | pred_words_total += len(trans.pred_sents[0]) 111 | if opt.tgt: 112 | gold_score_total += trans.gold_score 113 | gold_words_total += len(trans.gold_sent) 114 | 115 | n_best_preds = [" ".join(pred) 116 | for pred in trans.pred_sents[:opt.n_best]] 117 | out_file.write('\n'.join(n_best_preds)) 118 | out_file.write('\n') 119 | out_file.flush() 120 | 121 | if opt.verbose: 122 | sent_number = next(counter) 123 | output = trans.log(sent_number) 124 | os.write(1, output.encode('utf-8')) 125 | 126 | _report_score('PRED', pred_score_total, pred_words_total) 127 | if opt.tgt: 128 | _report_score('GOLD', gold_score_total, gold_words_total) 129 | if opt.report_bleu: 130 | _report_bleu() 131 | if opt.report_rouge: 132 | _report_rouge() 133 | 134 | if opt.dump_beam: 135 | import json 136 | json.dump(translator.beam_accum, 137 | codecs.open(opt.dump_beam, 'w', 'utf-8')) 138 | 139 | 140 | if __name__ == "__main__": 141 | main() 142 | -------------------------------------------------------------------------------- /D_pretrain/onmt/modules/MultiHeadedAttn.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | from torch.autograd import Variable 5 | 6 | from onmt.Utils import aeq 7 | from onmt.modules.UtilClass import BottleLinear, BottleSoftmax 8 | 9 | 10 | class MultiHeadedAttention(nn.Module): 11 | """ 12 | Multi-Head Attention module from 13 | "Attention is All You Need" 14 | :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`. 15 | 16 | Similar to standard `dot` attention but uses 17 | multiple attention distributions simulataneously 18 | to select relevant items. 19 | 20 | .. mermaid:: 21 | 22 | graph BT 23 | A[key] 24 | B[value] 25 | C[query] 26 | O[output] 27 | subgraph Attn 28 | D[Attn 1] 29 | E[Attn 2] 30 | F[Attn N] 31 | end 32 | A --> D 33 | C --> D 34 | A --> E 35 | C --> E 36 | A --> F 37 | C --> F 38 | D --> O 39 | E --> O 40 | F --> O 41 | B --> O 42 | 43 | Also includes several additional tricks. 
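    Concretely, each of the `head_count` heads computes
    softmax(Q K^T / sqrt(dim_per_head)) V on its own `dim_per_head`-sized
    projections of the query, key and value inputs, and the per-head
    outputs are concatenated back to `model_dim`.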
44 | 45 | Args: 46 | head_count (int): number of parallel heads 47 | model_dim (int): the dimension of keys/values/queries, 48 | must be divisible by head_count 49 | dropout (float): dropout parameter 50 | """ 51 | def __init__(self, head_count, model_dim, dropout=0.1): 52 | assert model_dim % head_count == 0 53 | self.dim_per_head = model_dim // head_count 54 | self.model_dim = model_dim 55 | 56 | super(MultiHeadedAttention, self).__init__() 57 | self.head_count = head_count 58 | 59 | self.linear_keys = BottleLinear(model_dim, 60 | head_count * self.dim_per_head, 61 | bias=False) 62 | self.linear_values = BottleLinear(model_dim, 63 | head_count * self.dim_per_head, 64 | bias=False) 65 | self.linear_query = BottleLinear(model_dim, 66 | head_count * self.dim_per_head, 67 | bias=False) 68 | self.sm = BottleSoftmax() 69 | self.activation = nn.ReLU() 70 | self.dropout = nn.Dropout(dropout) 71 | self.res_dropout = nn.Dropout(dropout) 72 | 73 | def forward(self, key, value, query, mask=None): 74 | """ 75 | Compute the context vector and the attention vectors. 76 | 77 | Args: 78 | key (`FloatTensor`): set of `key_len` 79 | key vectors `[batch, key_len, dim]` 80 | value (`FloatTensor`): set of `key_len` 81 | value vectors `[batch, key_len, dim]` 82 | query (`FloatTensor`): set of `query_len` 83 | query vectors `[batch, query_len, dim]` 84 | mask: binary mask indicating which keys have 85 | non-zero attention `[batch, query_len, key_len]` 86 | Returns: 87 | (`FloatTensor`, `FloatTensor`) : 88 | 89 | * output context vectors `[batch, query_len, dim]` 90 | * one of the attention vectors `[batch, query_len, key_len]` 91 | """ 92 | 93 | # CHECKS 94 | batch, k_len, d = key.size() 95 | batch_, k_len_, d_ = value.size() 96 | aeq(batch, batch_) 97 | aeq(k_len, k_len_) 98 | aeq(d, d_) 99 | batch_, q_len, d_ = query.size() 100 | aeq(batch, batch_) 101 | aeq(d, d_) 102 | aeq(self.model_dim % 8, 0) 103 | if mask is not None: 104 | batch_, q_len_, k_len_ = mask.size() 105 | aeq(batch_, batch) 106 | aeq(k_len_, k_len) 107 | aeq(q_len_ == q_len) 108 | # END CHECKS 109 | 110 | def shape_projection(x): 111 | b, l, d = x.size() 112 | return x.view(b, l, self.head_count, self.dim_per_head) \ 113 | .transpose(1, 2).contiguous() \ 114 | .view(b * self.head_count, l, self.dim_per_head) 115 | 116 | def unshape_projection(x, q): 117 | b, l, d = q.size() 118 | return x.view(b, self.head_count, l, self.dim_per_head) \ 119 | .transpose(1, 2).contiguous() \ 120 | .view(b, l, self.head_count * self.dim_per_head) 121 | 122 | residual = query 123 | key_up = shape_projection(self.linear_keys(key)) 124 | value_up = shape_projection(self.linear_values(value)) 125 | query_up = shape_projection(self.linear_query(query)) 126 | 127 | scaled = torch.bmm(query_up, key_up.transpose(1, 2)) 128 | scaled = scaled / math.sqrt(self.dim_per_head) 129 | bh, l, dim_per_head = scaled.size() 130 | b = bh // self.head_count 131 | if mask is not None: 132 | 133 | scaled = scaled.view(b, self.head_count, l, dim_per_head) 134 | mask = mask.unsqueeze(1).expand_as(scaled) 135 | scaled = scaled.masked_fill(Variable(mask), -1e18) \ 136 | .view(bh, l, dim_per_head) 137 | attn = self.sm(scaled) 138 | # Return one attn 139 | top_attn = attn \ 140 | .view(b, self.head_count, l, dim_per_head)[:, 0, :, :] \ 141 | .contiguous() 142 | 143 | drop_attn = self.dropout(self.sm(scaled)) 144 | 145 | # values : (batch * 8) x qlen x dim 146 | out = unshape_projection(torch.bmm(drop_attn, value_up), residual) 147 | 148 | # Residual and layer norm 149 | ret = 
self.res_dropout(out) 150 | 151 | # CHECK 152 | batch_, q_len_, d_ = ret.size() 153 | aeq(q_len, q_len_) 154 | aeq(batch, batch_) 155 | aeq(d, d_) 156 | # END CHECK 157 | return ret, top_attn 158 | -------------------------------------------------------------------------------- /G_pretrain/onmt/modules/MultiHeadedAttn.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | from torch.autograd import Variable 5 | 6 | from onmt.Utils import aeq 7 | from onmt.modules.UtilClass import BottleLinear, BottleSoftmax 8 | 9 | 10 | class MultiHeadedAttention(nn.Module): 11 | """ 12 | Multi-Head Attention module from 13 | "Attention is All You Need" 14 | :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`. 15 | 16 | Similar to standard `dot` attention but uses 17 | multiple attention distributions simulataneously 18 | to select relevant items. 19 | 20 | .. mermaid:: 21 | 22 | graph BT 23 | A[key] 24 | B[value] 25 | C[query] 26 | O[output] 27 | subgraph Attn 28 | D[Attn 1] 29 | E[Attn 2] 30 | F[Attn N] 31 | end 32 | A --> D 33 | C --> D 34 | A --> E 35 | C --> E 36 | A --> F 37 | C --> F 38 | D --> O 39 | E --> O 40 | F --> O 41 | B --> O 42 | 43 | Also includes several additional tricks. 44 | 45 | Args: 46 | head_count (int): number of parallel heads 47 | model_dim (int): the dimension of keys/values/queries, 48 | must be divisible by head_count 49 | dropout (float): dropout parameter 50 | """ 51 | def __init__(self, head_count, model_dim, dropout=0.1): 52 | assert model_dim % head_count == 0 53 | self.dim_per_head = model_dim // head_count 54 | self.model_dim = model_dim 55 | 56 | super(MultiHeadedAttention, self).__init__() 57 | self.head_count = head_count 58 | 59 | self.linear_keys = BottleLinear(model_dim, 60 | head_count * self.dim_per_head, 61 | bias=False) 62 | self.linear_values = BottleLinear(model_dim, 63 | head_count * self.dim_per_head, 64 | bias=False) 65 | self.linear_query = BottleLinear(model_dim, 66 | head_count * self.dim_per_head, 67 | bias=False) 68 | self.sm = BottleSoftmax() 69 | self.activation = nn.ReLU() 70 | self.dropout = nn.Dropout(dropout) 71 | self.res_dropout = nn.Dropout(dropout) 72 | 73 | def forward(self, key, value, query, mask=None): 74 | """ 75 | Compute the context vector and the attention vectors. 
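        Key positions selected by `mask` are filled with -1e18 before the
        softmax, so they end up with (effectively) zero attention weight.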
76 | 77 | Args: 78 | key (`FloatTensor`): set of `key_len` 79 | key vectors `[batch, key_len, dim]` 80 | value (`FloatTensor`): set of `key_len` 81 | value vectors `[batch, key_len, dim]` 82 | query (`FloatTensor`): set of `query_len` 83 | query vectors `[batch, query_len, dim]` 84 | mask: binary mask indicating which keys have 85 | non-zero attention `[batch, query_len, key_len]` 86 | Returns: 87 | (`FloatTensor`, `FloatTensor`) : 88 | 89 | * output context vectors `[batch, query_len, dim]` 90 | * one of the attention vectors `[batch, query_len, key_len]` 91 | """ 92 | 93 | # CHECKS 94 | batch, k_len, d = key.size() 95 | batch_, k_len_, d_ = value.size() 96 | aeq(batch, batch_) 97 | aeq(k_len, k_len_) 98 | aeq(d, d_) 99 | batch_, q_len, d_ = query.size() 100 | aeq(batch, batch_) 101 | aeq(d, d_) 102 | aeq(self.model_dim % 8, 0) 103 | if mask is not None: 104 | batch_, q_len_, k_len_ = mask.size() 105 | aeq(batch_, batch) 106 | aeq(k_len_, k_len) 107 | aeq(q_len_ == q_len) 108 | # END CHECKS 109 | 110 | def shape_projection(x): 111 | b, l, d = x.size() 112 | return x.view(b, l, self.head_count, self.dim_per_head) \ 113 | .transpose(1, 2).contiguous() \ 114 | .view(b * self.head_count, l, self.dim_per_head) 115 | 116 | def unshape_projection(x, q): 117 | b, l, d = q.size() 118 | return x.view(b, self.head_count, l, self.dim_per_head) \ 119 | .transpose(1, 2).contiguous() \ 120 | .view(b, l, self.head_count * self.dim_per_head) 121 | 122 | residual = query 123 | key_up = shape_projection(self.linear_keys(key)) 124 | value_up = shape_projection(self.linear_values(value)) 125 | query_up = shape_projection(self.linear_query(query)) 126 | 127 | scaled = torch.bmm(query_up, key_up.transpose(1, 2)) 128 | scaled = scaled / math.sqrt(self.dim_per_head) 129 | bh, l, dim_per_head = scaled.size() 130 | b = bh // self.head_count 131 | if mask is not None: 132 | 133 | scaled = scaled.view(b, self.head_count, l, dim_per_head) 134 | mask = mask.unsqueeze(1).expand_as(scaled) 135 | scaled = scaled.masked_fill(Variable(mask), -1e18) \ 136 | .view(bh, l, dim_per_head) 137 | attn = self.sm(scaled) 138 | # Return one attn 139 | top_attn = attn \ 140 | .view(b, self.head_count, l, dim_per_head)[:, 0, :, :] \ 141 | .contiguous() 142 | 143 | drop_attn = self.dropout(self.sm(scaled)) 144 | 145 | # values : (batch * 8) x qlen x dim 146 | out = unshape_projection(torch.bmm(drop_attn, value_up), residual) 147 | 148 | # Residual and layer norm 149 | ret = self.res_dropout(out) 150 | 151 | # CHECK 152 | batch_, q_len_, d_ = ret.size() 153 | aeq(q_len, q_len_) 154 | aeq(batch, batch_) 155 | aeq(d, d_) 156 | # END CHECK 157 | return ret, top_attn 158 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/modules/MultiHeadedAttn.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | from torch.autograd import Variable 5 | 6 | from onmt.Utils import aeq 7 | from onmt.modules.UtilClass import BottleLinear, BottleSoftmax 8 | 9 | 10 | class MultiHeadedAttention(nn.Module): 11 | """ 12 | Multi-Head Attention module from 13 | "Attention is All You Need" 14 | :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`. 15 | 16 | Similar to standard `dot` attention but uses 17 | multiple attention distributions simulataneously 18 | to select relevant items. 19 | 20 | .. 
mermaid:: 21 | 22 | graph BT 23 | A[key] 24 | B[value] 25 | C[query] 26 | O[output] 27 | subgraph Attn 28 | D[Attn 1] 29 | E[Attn 2] 30 | F[Attn N] 31 | end 32 | A --> D 33 | C --> D 34 | A --> E 35 | C --> E 36 | A --> F 37 | C --> F 38 | D --> O 39 | E --> O 40 | F --> O 41 | B --> O 42 | 43 | Also includes several additional tricks. 44 | 45 | Args: 46 | head_count (int): number of parallel heads 47 | model_dim (int): the dimension of keys/values/queries, 48 | must be divisible by head_count 49 | dropout (float): dropout parameter 50 | """ 51 | def __init__(self, head_count, model_dim, dropout=0.1): 52 | assert model_dim % head_count == 0 53 | self.dim_per_head = model_dim // head_count 54 | self.model_dim = model_dim 55 | 56 | super(MultiHeadedAttention, self).__init__() 57 | self.head_count = head_count 58 | 59 | self.linear_keys = BottleLinear(model_dim, 60 | head_count * self.dim_per_head, 61 | bias=False) 62 | self.linear_values = BottleLinear(model_dim, 63 | head_count * self.dim_per_head, 64 | bias=False) 65 | self.linear_query = BottleLinear(model_dim, 66 | head_count * self.dim_per_head, 67 | bias=False) 68 | self.sm = BottleSoftmax() 69 | self.activation = nn.ReLU() 70 | self.dropout = nn.Dropout(dropout) 71 | self.res_dropout = nn.Dropout(dropout) 72 | 73 | def forward(self, key, value, query, mask=None): 74 | """ 75 | Compute the context vector and the attention vectors. 76 | 77 | Args: 78 | key (`FloatTensor`): set of `key_len` 79 | key vectors `[batch, key_len, dim]` 80 | value (`FloatTensor`): set of `key_len` 81 | value vectors `[batch, key_len, dim]` 82 | query (`FloatTensor`): set of `query_len` 83 | query vectors `[batch, query_len, dim]` 84 | mask: binary mask indicating which keys have 85 | non-zero attention `[batch, query_len, key_len]` 86 | Returns: 87 | (`FloatTensor`, `FloatTensor`) : 88 | 89 | * output context vectors `[batch, query_len, dim]` 90 | * one of the attention vectors `[batch, query_len, key_len]` 91 | """ 92 | 93 | # CHECKS 94 | batch, k_len, d = key.size() 95 | batch_, k_len_, d_ = value.size() 96 | aeq(batch, batch_) 97 | aeq(k_len, k_len_) 98 | aeq(d, d_) 99 | batch_, q_len, d_ = query.size() 100 | aeq(batch, batch_) 101 | aeq(d, d_) 102 | aeq(self.model_dim % 8, 0) 103 | if mask is not None: 104 | batch_, q_len_, k_len_ = mask.size() 105 | aeq(batch_, batch) 106 | aeq(k_len_, k_len) 107 | aeq(q_len_ == q_len) 108 | # END CHECKS 109 | 110 | def shape_projection(x): 111 | b, l, d = x.size() 112 | return x.view(b, l, self.head_count, self.dim_per_head) \ 113 | .transpose(1, 2).contiguous() \ 114 | .view(b * self.head_count, l, self.dim_per_head) 115 | 116 | def unshape_projection(x, q): 117 | b, l, d = q.size() 118 | return x.view(b, self.head_count, l, self.dim_per_head) \ 119 | .transpose(1, 2).contiguous() \ 120 | .view(b, l, self.head_count * self.dim_per_head) 121 | 122 | residual = query 123 | key_up = shape_projection(self.linear_keys(key)) 124 | value_up = shape_projection(self.linear_values(value)) 125 | query_up = shape_projection(self.linear_query(query)) 126 | 127 | scaled = torch.bmm(query_up, key_up.transpose(1, 2)) 128 | scaled = scaled / math.sqrt(self.dim_per_head) 129 | bh, l, dim_per_head = scaled.size() 130 | b = bh // self.head_count 131 | if mask is not None: 132 | 133 | scaled = scaled.view(b, self.head_count, l, dim_per_head) 134 | mask = mask.unsqueeze(1).expand_as(scaled) 135 | scaled = scaled.masked_fill(Variable(mask), -1e18) \ 136 | .view(bh, l, dim_per_head) 137 | attn = self.sm(scaled) 138 | # Return one attn 139 | 
top_attn = attn \ 140 | .view(b, self.head_count, l, dim_per_head)[:, 0, :, :] \ 141 | .contiguous() 142 | 143 | drop_attn = self.dropout(self.sm(scaled)) 144 | 145 | # values : (batch * 8) x qlen x dim 146 | out = unshape_projection(torch.bmm(drop_attn, value_up), residual) 147 | 148 | # Residual and layer norm 149 | ret = self.res_dropout(out) 150 | 151 | # CHECK 152 | batch_, q_len_, d_ = ret.size() 153 | aeq(q_len, q_len_) 154 | aeq(batch, batch_) 155 | aeq(d, d_) 156 | # END CHECK 157 | return ret, top_attn 158 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/modules/MultiHeadedAttn.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | from torch.autograd import Variable 5 | 6 | from onmt.Utils import aeq 7 | from onmt.modules.UtilClass import BottleLinear, BottleSoftmax 8 | 9 | 10 | class MultiHeadedAttention(nn.Module): 11 | """ 12 | Multi-Head Attention module from 13 | "Attention is All You Need" 14 | :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`. 15 | 16 | Similar to standard `dot` attention but uses 17 | multiple attention distributions simulataneously 18 | to select relevant items. 19 | 20 | .. mermaid:: 21 | 22 | graph BT 23 | A[key] 24 | B[value] 25 | C[query] 26 | O[output] 27 | subgraph Attn 28 | D[Attn 1] 29 | E[Attn 2] 30 | F[Attn N] 31 | end 32 | A --> D 33 | C --> D 34 | A --> E 35 | C --> E 36 | A --> F 37 | C --> F 38 | D --> O 39 | E --> O 40 | F --> O 41 | B --> O 42 | 43 | Also includes several additional tricks. 44 | 45 | Args: 46 | head_count (int): number of parallel heads 47 | model_dim (int): the dimension of keys/values/queries, 48 | must be divisible by head_count 49 | dropout (float): dropout parameter 50 | """ 51 | def __init__(self, head_count, model_dim, dropout=0.1): 52 | assert model_dim % head_count == 0 53 | self.dim_per_head = model_dim // head_count 54 | self.model_dim = model_dim 55 | 56 | super(MultiHeadedAttention, self).__init__() 57 | self.head_count = head_count 58 | 59 | self.linear_keys = BottleLinear(model_dim, 60 | head_count * self.dim_per_head, 61 | bias=False) 62 | self.linear_values = BottleLinear(model_dim, 63 | head_count * self.dim_per_head, 64 | bias=False) 65 | self.linear_query = BottleLinear(model_dim, 66 | head_count * self.dim_per_head, 67 | bias=False) 68 | self.sm = BottleSoftmax() 69 | self.activation = nn.ReLU() 70 | self.dropout = nn.Dropout(dropout) 71 | self.res_dropout = nn.Dropout(dropout) 72 | 73 | def forward(self, key, value, query, mask=None): 74 | """ 75 | Compute the context vector and the attention vectors. 
76 | 77 | Args: 78 | key (`FloatTensor`): set of `key_len` 79 | key vectors `[batch, key_len, dim]` 80 | value (`FloatTensor`): set of `key_len` 81 | value vectors `[batch, key_len, dim]` 82 | query (`FloatTensor`): set of `query_len` 83 | query vectors `[batch, query_len, dim]` 84 | mask: binary mask indicating which keys have 85 | non-zero attention `[batch, query_len, key_len]` 86 | Returns: 87 | (`FloatTensor`, `FloatTensor`) : 88 | 89 | * output context vectors `[batch, query_len, dim]` 90 | * one of the attention vectors `[batch, query_len, key_len]` 91 | """ 92 | 93 | # CHECKS 94 | batch, k_len, d = key.size() 95 | batch_, k_len_, d_ = value.size() 96 | aeq(batch, batch_) 97 | aeq(k_len, k_len_) 98 | aeq(d, d_) 99 | batch_, q_len, d_ = query.size() 100 | aeq(batch, batch_) 101 | aeq(d, d_) 102 | aeq(self.model_dim % 8, 0) 103 | if mask is not None: 104 | batch_, q_len_, k_len_ = mask.size() 105 | aeq(batch_, batch) 106 | aeq(k_len_, k_len) 107 | aeq(q_len_ == q_len) 108 | # END CHECKS 109 | 110 | def shape_projection(x): 111 | b, l, d = x.size() 112 | return x.view(b, l, self.head_count, self.dim_per_head) \ 113 | .transpose(1, 2).contiguous() \ 114 | .view(b * self.head_count, l, self.dim_per_head) 115 | 116 | def unshape_projection(x, q): 117 | b, l, d = q.size() 118 | return x.view(b, self.head_count, l, self.dim_per_head) \ 119 | .transpose(1, 2).contiguous() \ 120 | .view(b, l, self.head_count * self.dim_per_head) 121 | 122 | residual = query 123 | key_up = shape_projection(self.linear_keys(key)) 124 | value_up = shape_projection(self.linear_values(value)) 125 | query_up = shape_projection(self.linear_query(query)) 126 | 127 | scaled = torch.bmm(query_up, key_up.transpose(1, 2)) 128 | scaled = scaled / math.sqrt(self.dim_per_head) 129 | bh, l, dim_per_head = scaled.size() 130 | b = bh // self.head_count 131 | if mask is not None: 132 | 133 | scaled = scaled.view(b, self.head_count, l, dim_per_head) 134 | mask = mask.unsqueeze(1).expand_as(scaled) 135 | scaled = scaled.masked_fill(Variable(mask), -1e18) \ 136 | .view(bh, l, dim_per_head) 137 | attn = self.sm(scaled) 138 | # Return one attn 139 | top_attn = attn \ 140 | .view(b, self.head_count, l, dim_per_head)[:, 0, :, :] \ 141 | .contiguous() 142 | 143 | drop_attn = self.dropout(self.sm(scaled)) 144 | 145 | # values : (batch * 8) x qlen x dim 146 | out = unshape_projection(torch.bmm(drop_attn, value_up), residual) 147 | 148 | # Residual and layer norm 149 | ret = self.res_dropout(out) 150 | 151 | # CHECK 152 | batch_, q_len_, d_ = ret.size() 153 | aeq(q_len, q_len_) 154 | aeq(batch, batch_) 155 | aeq(d, d_) 156 | # END CHECK 157 | return ret, top_attn 158 | -------------------------------------------------------------------------------- /D_pretrain/onmt/translate/Translation.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, unicode_literals 2 | 3 | import torch 4 | import onmt.io 5 | 6 | 7 | class TranslationBuilder(object): 8 | """ 9 | Build a word-based translation from the batch output 10 | of translator and the underlying dictionaries. 
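    Predicted token ids that fall outside the shared target vocabulary are
    looked up in the per-example source vocabulary, i.e. they correspond to
    source words copied by the model.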
11 | 12 | Replacement based on "Addressing the Rare Word 13 | Problem in Neural Machine Translation" :cite:`Luong2015b` 14 | 15 | Args: 16 | data (DataSet): 17 | fields (dict of Fields): data fields 18 | n_best (int): number of translations produced 19 | replace_unk (bool): replace unknown words using attention 20 | has_tgt (bool): will the batch have gold targets 21 | """ 22 | def __init__(self, data, fields, n_best=1, replace_unk=False, 23 | has_tgt=False): 24 | self.data = data 25 | self.fields = fields 26 | self.n_best = n_best 27 | self.replace_unk = replace_unk 28 | self.has_tgt = has_tgt 29 | 30 | def _build_target_tokens(self, src, src_vocab, src_raw, pred, attn): 31 | vocab = self.fields["tgt"].vocab 32 | tokens = [] 33 | for tok in pred: 34 | if tok < len(vocab): 35 | tokens.append(vocab.itos[tok]) 36 | else: 37 | tokens.append(src_vocab.itos[tok - len(vocab)]) 38 | if tokens[-1] == onmt.io.EOS_WORD: 39 | tokens = tokens[:-1] 40 | break 41 | if self.replace_unk and (attn is not None) and (src is not None): 42 | for i in range(len(tokens)): 43 | if tokens[i] == vocab.itos[onmt.io.UNK]: 44 | _, maxIndex = attn[i].max(0) 45 | tokens[i] = src_raw[maxIndex[0]] 46 | return tokens 47 | 48 | def from_batch(self, translation_batch): 49 | batch = translation_batch["batch"] 50 | assert(len(translation_batch["gold_score"]) == 51 | len(translation_batch["predictions"])) 52 | batch_size = batch.batch_size 53 | 54 | preds, pred_score, attn, gold_score, indices = list(zip( 55 | *sorted(zip(translation_batch["predictions"], 56 | translation_batch["scores"], 57 | translation_batch["attention"], 58 | translation_batch["gold_score"], 59 | batch.indices.data), 60 | key=lambda x: x[-1]))) 61 | 62 | # Sorting 63 | inds, perm = torch.sort(batch.indices.data) 64 | data_type = self.data.data_type 65 | if data_type == 'text': 66 | src = batch.src[0].data.index_select(1, perm) 67 | else: 68 | src = None 69 | 70 | if self.has_tgt: 71 | tgt = batch.tgt.data.index_select(1, perm) 72 | else: 73 | tgt = None 74 | 75 | translations = [] 76 | for b in range(batch_size): 77 | if data_type == 'text': 78 | src_vocab = self.data.src_vocabs[inds[b]] \ 79 | if self.data.src_vocabs else None 80 | src_raw = self.data.examples[inds[b]].src 81 | else: 82 | src_vocab = None 83 | src_raw = None 84 | pred_sents = [self._build_target_tokens( 85 | src[:, b] if src is not None else None, 86 | src_vocab, src_raw, 87 | preds[b][n], attn[b][n]) 88 | for n in range(self.n_best)] 89 | gold_sent = None 90 | if tgt is not None: 91 | gold_sent = self._build_target_tokens( 92 | src[:, b] if src is not None else None, 93 | src_vocab, src_raw, 94 | tgt[1:, b] if tgt is not None else None, None) 95 | 96 | translation = Translation(src[:, b] if src is not None else None, 97 | src_raw, pred_sents, 98 | attn[b], pred_score[b], gold_sent, 99 | gold_score[b]) 100 | translations.append(translation) 101 | 102 | return translations 103 | 104 | 105 | class Translation(object): 106 | """ 107 | Container for a translated sentence. 
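    Scores are accumulated log-probabilities, so a corpus-level perplexity
    can be recovered as exp(-total_score / total_words), which is what
    predict.py's _report_score does.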
108 | 109 | Attributes: 110 | src (`LongTensor`): src word ids 111 | src_raw ([str]): raw src words 112 | 113 | pred_sents ([[str]]): words from the n-best translations 114 | pred_scores ([[float]]): log-probs of n-best translations 115 | attns ([`FloatTensor`]) : attention dist for each translation 116 | gold_sent ([str]): words from gold translation 117 | gold_score ([float]): log-prob of gold translation 118 | 119 | """ 120 | def __init__(self, src, src_raw, pred_sents, 121 | attn, pred_scores, tgt_sent, gold_score): 122 | self.src = src 123 | self.src_raw = src_raw 124 | self.pred_sents = pred_sents 125 | self.attns = attn 126 | self.pred_scores = pred_scores 127 | self.gold_sent = tgt_sent 128 | self.gold_score = gold_score 129 | 130 | def log(self, sent_number): 131 | """ 132 | Log translation to stdout. 133 | """ 134 | output = '\nSENT {}: {}\n'.format(sent_number, self.src_raw) 135 | 136 | best_pred = self.pred_sents[0] 137 | best_score = self.pred_scores[0] 138 | pred_sent = ' '.join(best_pred) 139 | output += 'PRED {}: {}\n'.format(sent_number, pred_sent) 140 | print("PRED SCORE: {:.4f}".format(best_score)) 141 | 142 | if self.gold_sent is not None: 143 | tgt_sent = ' '.join(self.gold_sent) 144 | output += 'GOLD {}: {}\n'.format(sent_number, tgt_sent) 145 | output += ("GOLD SCORE: {:.4f}".format(self.gold_score)) 146 | 147 | if len(self.pred_sents) > 1: 148 | print('\nBEST HYP:') 149 | for score, sent in zip(self.pred_scores, self.pred_sents): 150 | output += "[{:.4f}] {}\n".format(score, sent) 151 | 152 | return output 153 | -------------------------------------------------------------------------------- /G_pretrain/onmt/translate/Translation.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, unicode_literals 2 | 3 | import torch 4 | import onmt.io 5 | 6 | 7 | class TranslationBuilder(object): 8 | """ 9 | Build a word-based translation from the batch output 10 | of translator and the underlying dictionaries. 
11 | 12 | Replacement based on "Addressing the Rare Word 13 | Problem in Neural Machine Translation" :cite:`Luong2015b` 14 | 15 | Args: 16 | data (DataSet): 17 | fields (dict of Fields): data fields 18 | n_best (int): number of translations produced 19 | replace_unk (bool): replace unknown words using attention 20 | has_tgt (bool): will the batch have gold targets 21 | """ 22 | def __init__(self, data, fields, n_best=1, replace_unk=False, 23 | has_tgt=False): 24 | self.data = data 25 | self.fields = fields 26 | self.n_best = n_best 27 | self.replace_unk = replace_unk 28 | self.has_tgt = has_tgt 29 | 30 | def _build_target_tokens(self, src, src_vocab, src_raw, pred, attn): 31 | vocab = self.fields["tgt"].vocab 32 | tokens = [] 33 | for tok in pred: 34 | if tok < len(vocab): 35 | tokens.append(vocab.itos[tok]) 36 | else: 37 | tokens.append(src_vocab.itos[tok - len(vocab)]) 38 | if tokens[-1] == onmt.io.EOS_WORD: 39 | tokens = tokens[:-1] 40 | break 41 | if self.replace_unk and (attn is not None) and (src is not None): 42 | for i in range(len(tokens)): 43 | if tokens[i] == vocab.itos[onmt.io.UNK]: 44 | _, maxIndex = attn[i].max(0) 45 | tokens[i] = src_raw[maxIndex[0]] 46 | return tokens 47 | 48 | def from_batch(self, translation_batch): 49 | batch = translation_batch["batch"] 50 | assert(len(translation_batch["gold_score"]) == 51 | len(translation_batch["predictions"])) 52 | batch_size = batch.batch_size 53 | 54 | preds, pred_score, attn, gold_score, indices = list(zip( 55 | *sorted(zip(translation_batch["predictions"], 56 | translation_batch["scores"], 57 | translation_batch["attention"], 58 | translation_batch["gold_score"], 59 | batch.indices.data), 60 | key=lambda x: x[-1]))) 61 | 62 | # Sorting 63 | inds, perm = torch.sort(batch.indices.data) 64 | data_type = self.data.data_type 65 | if data_type == 'text': 66 | src = batch.src[0].data.index_select(1, perm) 67 | else: 68 | src = None 69 | 70 | if self.has_tgt: 71 | tgt = batch.tgt.data.index_select(1, perm) 72 | else: 73 | tgt = None 74 | 75 | translations = [] 76 | for b in range(batch_size): 77 | if data_type == 'text': 78 | src_vocab = self.data.src_vocabs[inds[b]] \ 79 | if self.data.src_vocabs else None 80 | src_raw = self.data.examples[inds[b]].src 81 | else: 82 | src_vocab = None 83 | src_raw = None 84 | pred_sents = [self._build_target_tokens( 85 | src[:, b] if src is not None else None, 86 | src_vocab, src_raw, 87 | preds[b][n], attn[b][n]) 88 | for n in range(self.n_best)] 89 | gold_sent = None 90 | if tgt is not None: 91 | gold_sent = self._build_target_tokens( 92 | src[:, b] if src is not None else None, 93 | src_vocab, src_raw, 94 | tgt[1:, b] if tgt is not None else None, None) 95 | 96 | translation = Translation(src[:, b] if src is not None else None, 97 | src_raw, pred_sents, 98 | attn[b], pred_score[b], gold_sent, 99 | gold_score[b]) 100 | translations.append(translation) 101 | 102 | return translations 103 | 104 | 105 | class Translation(object): 106 | """ 107 | Container for a translated sentence. 
108 | 109 | Attributes: 110 | src (`LongTensor`): src word ids 111 | src_raw ([str]): raw src words 112 | 113 | pred_sents ([[str]]): words from the n-best translations 114 | pred_scores ([[float]]): log-probs of n-best translations 115 | attns ([`FloatTensor`]) : attention dist for each translation 116 | gold_sent ([str]): words from gold translation 117 | gold_score ([float]): log-prob of gold translation 118 | 119 | """ 120 | def __init__(self, src, src_raw, pred_sents, 121 | attn, pred_scores, tgt_sent, gold_score): 122 | self.src = src 123 | self.src_raw = src_raw 124 | self.pred_sents = pred_sents 125 | self.attns = attn 126 | self.pred_scores = pred_scores 127 | self.gold_sent = tgt_sent 128 | self.gold_score = gold_score 129 | 130 | def log(self, sent_number): 131 | """ 132 | Log translation to stdout. 133 | """ 134 | output = '\nSENT {}: {}\n'.format(sent_number, self.src_raw) 135 | 136 | best_pred = self.pred_sents[0] 137 | best_score = self.pred_scores[0] 138 | pred_sent = ' '.join(best_pred) 139 | output += 'PRED {}: {}\n'.format(sent_number, pred_sent) 140 | print("PRED SCORE: {:.4f}".format(best_score)) 141 | 142 | if self.gold_sent is not None: 143 | tgt_sent = ' '.join(self.gold_sent) 144 | output += 'GOLD {}: {}\n'.format(sent_number, tgt_sent) 145 | output += ("GOLD SCORE: {:.4f}".format(self.gold_score)) 146 | 147 | if len(self.pred_sents) > 1: 148 | print('\nBEST HYP:') 149 | for score, sent in zip(self.pred_scores, self.pred_sents): 150 | output += "[{:.4f}] {}\n".format(score, sent) 151 | 152 | return output 153 | -------------------------------------------------------------------------------- /NLI_pretrain/onmt/translate/Translation.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, unicode_literals 2 | 3 | import torch 4 | import onmt.io 5 | 6 | 7 | class TranslationBuilder(object): 8 | """ 9 | Build a word-based translation from the batch output 10 | of translator and the underlying dictionaries. 
11 | 12 | Replacement based on "Addressing the Rare Word 13 | Problem in Neural Machine Translation" :cite:`Luong2015b` 14 | 15 | Args: 16 | data (DataSet): 17 | fields (dict of Fields): data fields 18 | n_best (int): number of translations produced 19 | replace_unk (bool): replace unknown words using attention 20 | has_tgt (bool): will the batch have gold targets 21 | """ 22 | def __init__(self, data, fields, n_best=1, replace_unk=False, 23 | has_tgt=False): 24 | self.data = data 25 | self.fields = fields 26 | self.n_best = n_best 27 | self.replace_unk = replace_unk 28 | self.has_tgt = has_tgt 29 | 30 | def _build_target_tokens(self, src, src_vocab, src_raw, pred, attn): 31 | vocab = self.fields["tgt"].vocab 32 | tokens = [] 33 | for tok in pred: 34 | if tok < len(vocab): 35 | tokens.append(vocab.itos[tok]) 36 | else: 37 | tokens.append(src_vocab.itos[tok - len(vocab)]) 38 | if tokens[-1] == onmt.io.EOS_WORD: 39 | tokens = tokens[:-1] 40 | break 41 | if self.replace_unk and (attn is not None) and (src is not None): 42 | for i in range(len(tokens)): 43 | if tokens[i] == vocab.itos[onmt.io.UNK]: 44 | _, maxIndex = attn[i].max(0) 45 | tokens[i] = src_raw[maxIndex[0]] 46 | return tokens 47 | 48 | def from_batch(self, translation_batch): 49 | batch = translation_batch["batch"] 50 | assert(len(translation_batch["gold_score"]) == 51 | len(translation_batch["predictions"])) 52 | batch_size = batch.batch_size 53 | 54 | preds, pred_score, attn, gold_score, indices = list(zip( 55 | *sorted(zip(translation_batch["predictions"], 56 | translation_batch["scores"], 57 | translation_batch["attention"], 58 | translation_batch["gold_score"], 59 | batch.indices.data), 60 | key=lambda x: x[-1]))) 61 | 62 | # Sorting 63 | inds, perm = torch.sort(batch.indices.data) 64 | data_type = self.data.data_type 65 | if data_type == 'text': 66 | src = batch.src[0].data.index_select(1, perm) 67 | else: 68 | src = None 69 | 70 | if self.has_tgt: 71 | tgt = batch.tgt.data.index_select(1, perm) 72 | else: 73 | tgt = None 74 | 75 | translations = [] 76 | for b in range(batch_size): 77 | if data_type == 'text': 78 | src_vocab = self.data.src_vocabs[inds[b]] \ 79 | if self.data.src_vocabs else None 80 | src_raw = self.data.examples[inds[b]].src 81 | else: 82 | src_vocab = None 83 | src_raw = None 84 | pred_sents = [self._build_target_tokens( 85 | src[:, b] if src is not None else None, 86 | src_vocab, src_raw, 87 | preds[b][n], attn[b][n]) 88 | for n in range(self.n_best)] 89 | gold_sent = None 90 | if tgt is not None: 91 | gold_sent = self._build_target_tokens( 92 | src[:, b] if src is not None else None, 93 | src_vocab, src_raw, 94 | tgt[1:, b] if tgt is not None else None, None) 95 | 96 | translation = Translation(src[:, b] if src is not None else None, 97 | src_raw, pred_sents, 98 | attn[b], pred_score[b], gold_sent, 99 | gold_score[b]) 100 | translations.append(translation) 101 | 102 | return translations 103 | 104 | 105 | class Translation(object): 106 | """ 107 | Container for a translated sentence. 
108 | 109 | Attributes: 110 | src (`LongTensor`): src word ids 111 | src_raw ([str]): raw src words 112 | 113 | pred_sents ([[str]]): words from the n-best translations 114 | pred_scores ([[float]]): log-probs of n-best translations 115 | attns ([`FloatTensor`]) : attention dist for each translation 116 | gold_sent ([str]): words from gold translation 117 | gold_score ([float]): log-prob of gold translation 118 | 119 | """ 120 | def __init__(self, src, src_raw, pred_sents, 121 | attn, pred_scores, tgt_sent, gold_score): 122 | self.src = src 123 | self.src_raw = src_raw 124 | self.pred_sents = pred_sents 125 | self.attns = attn 126 | self.pred_scores = pred_scores 127 | self.gold_sent = tgt_sent 128 | self.gold_score = gold_score 129 | 130 | def log(self, sent_number): 131 | """ 132 | Log translation to stdout. 133 | """ 134 | output = '\nSENT {}: {}\n'.format(sent_number, self.src_raw) 135 | 136 | best_pred = self.pred_sents[0] 137 | best_score = self.pred_scores[0] 138 | pred_sent = ' '.join(best_pred) 139 | output += 'PRED {}: {}\n'.format(sent_number, pred_sent) 140 | print("PRED SCORE: {:.4f}".format(best_score)) 141 | 142 | if self.gold_sent is not None: 143 | tgt_sent = ' '.join(self.gold_sent) 144 | output += 'GOLD {}: {}\n'.format(sent_number, tgt_sent) 145 | output += ("GOLD SCORE: {:.4f}".format(self.gold_score)) 146 | 147 | if len(self.pred_sents) > 1: 148 | print('\nBEST HYP:') 149 | for score, sent in zip(self.pred_scores, self.pred_sents): 150 | output += "[{:.4f}] {}\n".format(score, sent) 151 | 152 | return output 153 | -------------------------------------------------------------------------------- /reinforcement_train/onmt/translate/Translation.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, unicode_literals 2 | 3 | import torch 4 | import onmt.io 5 | 6 | 7 | class TranslationBuilder(object): 8 | """ 9 | Build a word-based translation from the batch output 10 | of translator and the underlying dictionaries. 
11 | 12 | Replacement based on "Addressing the Rare Word 13 | Problem in Neural Machine Translation" :cite:`Luong2015b` 14 | 15 | Args: 16 | data (DataSet): 17 | fields (dict of Fields): data fields 18 | n_best (int): number of translations produced 19 | replace_unk (bool): replace unknown words using attention 20 | has_tgt (bool): will the batch have gold targets 21 | """ 22 | def __init__(self, data, fields, n_best=1, replace_unk=False, 23 | has_tgt=False): 24 | self.data = data 25 | self.fields = fields 26 | self.n_best = n_best 27 | self.replace_unk = replace_unk 28 | self.has_tgt = has_tgt 29 | 30 | def _build_target_tokens(self, src, src_vocab, src_raw, pred, attn): 31 | vocab = self.fields["tgt"].vocab 32 | tokens = [] 33 | for tok in pred: 34 | if tok < len(vocab): 35 | tokens.append(vocab.itos[tok]) 36 | else: 37 | tokens.append(src_vocab.itos[tok - len(vocab)]) 38 | if tokens[-1] == onmt.io.EOS_WORD: 39 | tokens = tokens[:-1] 40 | break 41 | if self.replace_unk and (attn is not None) and (src is not None): 42 | for i in range(len(tokens)): 43 | if tokens[i] == vocab.itos[onmt.io.UNK]: 44 | _, maxIndex = attn[i].max(0) 45 | tokens[i] = src_raw[maxIndex[0]] 46 | return tokens 47 | 48 | def from_batch(self, translation_batch): 49 | batch = translation_batch["batch"] 50 | assert(len(translation_batch["gold_score"]) == 51 | len(translation_batch["predictions"])) 52 | batch_size = batch.batch_size 53 | 54 | preds, pred_score, attn, gold_score, indices = list(zip( 55 | *sorted(zip(translation_batch["predictions"], 56 | translation_batch["scores"], 57 | translation_batch["attention"], 58 | translation_batch["gold_score"], 59 | batch.indices.data), 60 | key=lambda x: x[-1]))) 61 | 62 | # Sorting 63 | inds, perm = torch.sort(batch.indices.data) 64 | data_type = self.data.data_type 65 | if data_type == 'text': 66 | src = batch.src[0].data.index_select(1, perm) 67 | else: 68 | src = None 69 | 70 | if self.has_tgt: 71 | tgt = batch.tgt.data.index_select(1, perm) 72 | else: 73 | tgt = None 74 | 75 | translations = [] 76 | for b in range(batch_size): 77 | if data_type == 'text': 78 | src_vocab = self.data.src_vocabs[inds[b]] \ 79 | if self.data.src_vocabs else None 80 | src_raw = self.data.examples[inds[b]].src 81 | else: 82 | src_vocab = None 83 | src_raw = None 84 | pred_sents = [self._build_target_tokens( 85 | src[:, b] if src is not None else None, 86 | src_vocab, src_raw, 87 | preds[b][n], attn[b][n]) 88 | for n in range(self.n_best)] 89 | gold_sent = None 90 | if tgt is not None: 91 | gold_sent = self._build_target_tokens( 92 | src[:, b] if src is not None else None, 93 | src_vocab, src_raw, 94 | tgt[1:, b] if tgt is not None else None, None) 95 | 96 | translation = Translation(src[:, b] if src is not None else None, 97 | src_raw, pred_sents, 98 | attn[b], pred_score[b], gold_sent, 99 | gold_score[b]) 100 | translations.append(translation) 101 | 102 | return translations 103 | 104 | 105 | class Translation(object): 106 | """ 107 | Container for a translated sentence. 
108 | 109 | Attributes: 110 | src (`LongTensor`): src word ids 111 | src_raw ([str]): raw src words 112 | 113 | pred_sents ([[str]]): words from the n-best translations 114 | pred_scores ([[float]]): log-probs of n-best translations 115 | attns ([`FloatTensor`]) : attention dist for each translation 116 | gold_sent ([str]): words from gold translation 117 | gold_score ([float]): log-prob of gold translation 118 | 119 | """ 120 | def __init__(self, src, src_raw, pred_sents, 121 | attn, pred_scores, tgt_sent, gold_score): 122 | self.src = src 123 | self.src_raw = src_raw 124 | self.pred_sents = pred_sents 125 | self.attns = attn 126 | self.pred_scores = pred_scores 127 | self.gold_sent = tgt_sent 128 | self.gold_score = gold_score 129 | 130 | def log(self, sent_number): 131 | """ 132 | Log translation to stdout. 133 | """ 134 | output = '\nSENT {}: {}\n'.format(sent_number, self.src_raw) 135 | 136 | best_pred = self.pred_sents[0] 137 | best_score = self.pred_scores[0] 138 | pred_sent = ' '.join(best_pred) 139 | output += 'PRED {}: {}\n'.format(sent_number, pred_sent) 140 | print("PRED SCORE: {:.4f}".format(best_score)) 141 | 142 | if self.gold_sent is not None: 143 | tgt_sent = ' '.join(self.gold_sent) 144 | output += 'GOLD {}: {}\n'.format(sent_number, tgt_sent) 145 | output += ("GOLD SCORE: {:.4f}".format(self.gold_score)) 146 | 147 | if len(self.pred_sents) > 1: 148 | print('\nBEST HYP:') 149 | for score, sent in zip(self.pred_scores, self.pred_sents): 150 | output += "[{:.4f}] {}\n".format(score, sent) 151 | 152 | return output 153 | -------------------------------------------------------------------------------- /D_pretrain/onmt/translate/Beam.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import torch 3 | 4 | 5 | class Beam(object): 6 | """ 7 | Class for managing the internals of the beam search process. 8 | 9 | Takes care of beams, back pointers, and scores. 10 | 11 | Args: 12 | size (int): beam size 13 | pad, bos, eos (int): indices of padding, beginning, and ending. 14 | n_best (int): nbest size to use 15 | cuda (bool): use gpu 16 | global_scorer (:obj:`GlobalScorer`) 17 | """ 18 | def __init__(self, size, pad, bos, eos, 19 | n_best=1, cuda=False, 20 | global_scorer=None, 21 | min_length=0): 22 | 23 | self.size = size 24 | self.tt = torch.cuda if cuda else torch 25 | 26 | # The score for each translation on the beam. 27 | self.scores = self.tt.FloatTensor(size).zero_() 28 | self.all_scores = [] 29 | 30 | # The backpointers at each time-step. 31 | self.prev_ks = [] 32 | 33 | # The outputs at each time-step. 34 | self.next_ys = [self.tt.LongTensor(size) 35 | .fill_(pad)] 36 | self.next_ys[0][0] = bos 37 | 38 | # Has EOS topped the beam yet. 39 | self._eos = eos 40 | self.eos_top = False 41 | 42 | # The attentions (matrix) for each time. 43 | self.attn = [] 44 | 45 | # Time and k pair for finished. 46 | self.finished = [] 47 | self.n_best = n_best 48 | 49 | # Information for global scoring. 50 | self.global_scorer = global_scorer 51 | self.global_state = {} 52 | 53 | # Minimum prediction length 54 | self.min_length = min_length 55 | 56 | def get_current_state(self): 57 | "Get the outputs for the current timestep." 58 | return self.next_ys[-1] 59 | 60 | def get_current_origin(self): 61 | "Get the backpointers for the current timestep." 
62 | return self.prev_ks[-1] 63 | 64 | def advance(self, word_probs, attn_out): 65 | """ 66 | Given prob over words for every last beam `wordLk` and attention 67 | `attn_out`: Compute and update the beam search. 68 | 69 | Parameters: 70 | 71 | * `word_probs`- probs of advancing from the last step (K x words) 72 | * `attn_out`- attention at the last step 73 | 74 | Returns: True if beam search is complete. 75 | """ 76 | num_words = word_probs.size(1) 77 | 78 | # force the output to be longer than self.min_length 79 | cur_len = len(self.next_ys) 80 | if cur_len < self.min_length: 81 | for k in range(len(word_probs)): 82 | word_probs[k][self._eos] = -1e20 83 | 84 | # Sum the previous scores. 85 | if len(self.prev_ks) > 0: 86 | beam_scores = word_probs + \ 87 | self.scores.unsqueeze(1).expand_as(word_probs) 88 | 89 | # Don't let EOS have children. 90 | for i in range(self.next_ys[-1].size(0)): 91 | if self.next_ys[-1][i] == self._eos: 92 | beam_scores[i] = -1e20 93 | else: 94 | beam_scores = word_probs[0] 95 | flat_beam_scores = beam_scores.view(-1) 96 | best_scores, best_scores_id = flat_beam_scores.topk(self.size, 0, 97 | True, True) 98 | 99 | self.all_scores.append(self.scores) 100 | self.scores = best_scores 101 | 102 | # best_scores_id is flattened beam x word array, so calculate which 103 | # word and beam each score came from 104 | prev_k = best_scores_id / num_words 105 | self.prev_ks.append(prev_k) 106 | self.next_ys.append((best_scores_id - prev_k * num_words)) 107 | self.attn.append(attn_out.index_select(0, prev_k)) 108 | 109 | if self.global_scorer is not None: 110 | self.global_scorer.update_global_state(self) 111 | 112 | for i in range(self.next_ys[-1].size(0)): 113 | if self.next_ys[-1][i] == self._eos: 114 | s = self.scores[i] 115 | if self.global_scorer is not None: 116 | global_scores = self.global_scorer.score(self, self.scores) 117 | s = global_scores[i] 118 | self.finished.append((s, len(self.next_ys) - 1, i)) 119 | 120 | # End condition is when top-of-beam is EOS and no global score. 121 | if self.next_ys[-1][0] == self._eos: 122 | # self.all_scores.append(self.scores) 123 | self.eos_top = True 124 | 125 | def done(self): 126 | return self.eos_top and len(self.finished) >= self.n_best 127 | 128 | def sort_finished(self, minimum=None): 129 | if minimum is not None: 130 | i = 0 131 | # Add from beam until we have minimum outputs. 132 | while len(self.finished) < minimum: 133 | s = self.scores[i] 134 | if self.global_scorer is not None: 135 | global_scores = self.global_scorer.score(self, self.scores) 136 | s = global_scores[i] 137 | self.finished.append((s, len(self.next_ys) - 1, i)) 138 | 139 | self.finished.sort(key=lambda a: -a[0]) 140 | scores = [sc for sc, _, _ in self.finished] 141 | ks = [(t, k) for _, t, k in self.finished] 142 | return scores, ks 143 | 144 | def get_hyp(self, timestep, k): 145 | """ 146 | Walk back to construct the full hypothesis. 
147 | """ 148 | hyp, attn = [], [] 149 | for j in range(len(self.prev_ks[:timestep]) - 1, -1, -1): 150 | hyp.append(self.next_ys[j+1][k]) 151 | attn.append(self.attn[j][k]) 152 | k = self.prev_ks[j][k] 153 | return hyp[::-1], torch.stack(attn[::-1]) 154 | 155 | 156 | class GNMTGlobalScorer(object): 157 | """ 158 | NMT re-ranking score from 159 | "Google's Neural Machine Translation System" :cite:`wu2016google` 160 | 161 | Args: 162 | alpha (float): length parameter 163 | beta (float): coverage parameter 164 | """ 165 | def __init__(self, alpha, beta): 166 | self.alpha = alpha 167 | self.beta = beta 168 | 169 | def score(self, beam, logprobs): 170 | "Additional term add to log probability" 171 | cov = beam.global_state["coverage"] 172 | pen = self.beta * torch.min(cov, cov.clone().fill_(1.0)).log().sum(1) 173 | l_term = (((5 + len(beam.next_ys)) ** self.alpha) / 174 | ((5 + 1) ** self.alpha)) 175 | return (logprobs / l_term) + pen 176 | 177 | def update_global_state(self, beam): 178 | "Keeps the coverage vector as sum of attens" 179 | if len(beam.prev_ks) == 1: 180 | beam.global_state["coverage"] = beam.attn[-1] 181 | else: 182 | beam.global_state["coverage"] = beam.global_state["coverage"] \ 183 | .index_select(0, beam.prev_ks[-1]).add(beam.attn[-1]) 184 | -------------------------------------------------------------------------------- /G_pretrain/onmt/translate/Beam.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import torch 3 | 4 | 5 | class Beam(object): 6 | """ 7 | Class for managing the internals of the beam search process. 8 | 9 | Takes care of beams, back pointers, and scores. 10 | 11 | Args: 12 | size (int): beam size 13 | pad, bos, eos (int): indices of padding, beginning, and ending. 14 | n_best (int): nbest size to use 15 | cuda (bool): use gpu 16 | global_scorer (:obj:`GlobalScorer`) 17 | """ 18 | def __init__(self, size, pad, bos, eos, 19 | n_best=1, cuda=False, 20 | global_scorer=None, 21 | min_length=0): 22 | 23 | self.size = size 24 | self.tt = torch.cuda if cuda else torch 25 | 26 | # The score for each translation on the beam. 27 | self.scores = self.tt.FloatTensor(size).zero_() 28 | self.all_scores = [] 29 | 30 | # The backpointers at each time-step. 31 | self.prev_ks = [] 32 | 33 | # The outputs at each time-step. 34 | self.next_ys = [self.tt.LongTensor(size) 35 | .fill_(pad)] 36 | self.next_ys[0][0] = bos 37 | 38 | # Has EOS topped the beam yet. 39 | self._eos = eos 40 | self.eos_top = False 41 | 42 | # The attentions (matrix) for each time. 43 | self.attn = [] 44 | 45 | # Time and k pair for finished. 46 | self.finished = [] 47 | self.n_best = n_best 48 | 49 | # Information for global scoring. 50 | self.global_scorer = global_scorer 51 | self.global_state = {} 52 | 53 | # Minimum prediction length 54 | self.min_length = min_length 55 | 56 | def get_current_state(self): 57 | "Get the outputs for the current timestep." 58 | return self.next_ys[-1] 59 | 60 | def get_current_origin(self): 61 | "Get the backpointers for the current timestep." 62 | return self.prev_ks[-1] 63 | 64 | def advance(self, word_probs, attn_out): 65 | """ 66 | Given prob over words for every last beam `wordLk` and attention 67 | `attn_out`: Compute and update the beam search. 68 | 69 | Parameters: 70 | 71 | * `word_probs`- probs of advancing from the last step (K x words) 72 | * `attn_out`- attention at the last step 73 | 74 | Returns: True if beam search is complete. 
75 | """ 76 | num_words = word_probs.size(1) 77 | 78 | # force the output to be longer than self.min_length 79 | cur_len = len(self.next_ys) 80 | if cur_len < self.min_length: 81 | for k in range(len(word_probs)): 82 | word_probs[k][self._eos] = -1e20 83 | 84 | # Sum the previous scores. 85 | if len(self.prev_ks) > 0: 86 | beam_scores = word_probs + \ 87 | self.scores.unsqueeze(1).expand_as(word_probs) 88 | 89 | # Don't let EOS have children. 90 | for i in range(self.next_ys[-1].size(0)): 91 | if self.next_ys[-1][i] == self._eos: 92 | beam_scores[i] = -1e20 93 | else: 94 | beam_scores = word_probs[0] 95 | flat_beam_scores = beam_scores.view(-1) 96 | best_scores, best_scores_id = flat_beam_scores.topk(self.size, 0, 97 | True, True) 98 | 99 | self.all_scores.append(self.scores) 100 | self.scores = best_scores 101 | 102 | # best_scores_id is flattened beam x word array, so calculate which 103 | # word and beam each score came from 104 | prev_k = best_scores_id / num_words 105 | self.prev_ks.append(prev_k) 106 | self.next_ys.append((best_scores_id - prev_k * num_words)) 107 | self.attn.append(attn_out.index_select(0, prev_k)) 108 | 109 | if self.global_scorer is not None: 110 | self.global_scorer.update_global_state(self) 111 | 112 | for i in range(self.next_ys[-1].size(0)): 113 | if self.next_ys[-1][i] == self._eos: 114 | s = self.scores[i] 115 | if self.global_scorer is not None: 116 | global_scores = self.global_scorer.score(self, self.scores) 117 | s = global_scores[i] 118 | self.finished.append((s, len(self.next_ys) - 1, i)) 119 | 120 | # End condition is when top-of-beam is EOS and no global score. 121 | if self.next_ys[-1][0] == self._eos: 122 | # self.all_scores.append(self.scores) 123 | self.eos_top = True 124 | 125 | def done(self): 126 | return self.eos_top and len(self.finished) >= self.n_best 127 | 128 | def sort_finished(self, minimum=None): 129 | if minimum is not None: 130 | i = 0 131 | # Add from beam until we have minimum outputs. 132 | while len(self.finished) < minimum: 133 | s = self.scores[i] 134 | if self.global_scorer is not None: 135 | global_scores = self.global_scorer.score(self, self.scores) 136 | s = global_scores[i] 137 | self.finished.append((s, len(self.next_ys) - 1, i)) 138 | 139 | self.finished.sort(key=lambda a: -a[0]) 140 | scores = [sc for sc, _, _ in self.finished] 141 | ks = [(t, k) for _, t, k in self.finished] 142 | return scores, ks 143 | 144 | def get_hyp(self, timestep, k): 145 | """ 146 | Walk back to construct the full hypothesis. 
147 | """ 148 | hyp, attn = [], [] 149 | for j in range(len(self.prev_ks[:timestep]) - 1, -1, -1): 150 | hyp.append(self.next_ys[j+1][k]) 151 | attn.append(self.attn[j][k]) 152 | k = self.prev_ks[j][k] 153 | return hyp[::-1], torch.stack(attn[::-1]) 154 | 155 | 156 | class GNMTGlobalScorer(object): 157 | """ 158 | NMT re-ranking score from 159 | "Google's Neural Machine Translation System" :cite:`wu2016google` 160 | 161 | Args: 162 | alpha (float): length parameter 163 | beta (float): coverage parameter 164 | """ 165 | def __init__(self, alpha, beta): 166 | self.alpha = alpha 167 | self.beta = beta 168 | 169 | def score(self, beam, logprobs): 170 | "Additional term add to log probability" 171 | cov = beam.global_state["coverage"] 172 | pen = self.beta * torch.min(cov, cov.clone().fill_(1.0)).log().sum(1) 173 | l_term = (((5 + len(beam.next_ys)) ** self.alpha) / 174 | ((5 + 1) ** self.alpha)) 175 | return (logprobs / l_term) + pen 176 | 177 | def update_global_state(self, beam): 178 | "Keeps the coverage vector as sum of attens" 179 | if len(beam.prev_ks) == 1: 180 | beam.global_state["coverage"] = beam.attn[-1] 181 | else: 182 | beam.global_state["coverage"] = beam.global_state["coverage"] \ 183 | .index_select(0, beam.prev_ks[-1]).add(beam.attn[-1]) 184 | --------------------------------------------------------------------------------