├── README.md
├── data
├── model
    ├── Allinone_Model.py
    ├── Attention.py
    ├── Decoder.py
    ├── DecoderRNN.py
    ├── EncoderRNN.py
    ├── Model.py
    ├── S2VTAttModel.py
    ├── S2VTModel.py
    ├── S2VT_EncoderRNN.py
    ├── TransformerDecoderModel.py
    ├── TransformerModel.py
    ├── __init__.py
    └── transformer
    │   ├── Beam.py
    │   ├── Constants.py
    │   ├── Layers.py
    │   ├── Modules.py
    │   ├── Optim.py
    │   ├── SubLayers.py
    │   ├── Transformers.py
    │   ├── Translator.py
    │   ├── __init__.py
    │   ├── cap2cms_Translator.py
    │   ├── cap2cms_Translator_transformer.py
    │   ├── cap2cms_Translator_transformerDecoder.py
    │   └── cap_cms_Translator.py
├── opts.py
├── others
    ├── generation.py
    ├── test_RNN.py
    ├── test_attention_Video2text.py
    ├── test_transformer.py
    ├── train_RNN.py
    └── train_transformer.py
├── pictures
    ├── arch.png
    └── v2c.png
├── test.py
├── train.py
└── utils
    ├── allinone_dataloader.py
    ├── cocoeval.py
    ├── dataloader.py
    ├── gt_cap_dataloader.py
    ├── gt_caps_dataloader.py
    ├── gt_human_cap_dataloader.py
    ├── prepro_feats.py
    ├── prepro_ngrams.py
    ├── prepro_vocab.py
    ├── rouge.py
    └── utils.py


/data:
--------------------------------------------------------------------------------
1 | /media/drive1/Data/MSR-VTT


--------------------------------------------------------------------------------
/model/Allinone_Model.py:
--------------------------------------------------------------------------------
 1 | ''' Define the Transformer model '''
 2 | from utils.utils import *
 3 | from model.Decoder import Decoder
 4 | from model.EncoderRNN import EncoderRNN
 5 | 
 6 | __author__ = 'Jacob Zhiyuan Fang'
 7 | 
 8 | 
 9 | class Model(nn.Module):
10 |     ''' A sequence to sequence model with attention mechanism. '''
11 | 
12 |     def __init__(
13 |             self,
14 |             n_cap_vocab, cap_max_seq, vis_emb=2048,
15 |             d_word_vec=512, d_model=512, d_inner=2048,
16 |             n_layers=6, n_head=8, d_k=64, d_v=64, dropout=0.1,
17 |             tgt_emb_prj_weight_sharing=True):
18 | 
19 |         super().__init__()
20 | 
21 |         self.encoder = EncoderRNN(vis_emb, d_model, bidirectional=0)
22 |         # self.encoder = nn.Linear(vis_emb, d_model)
23 | 
24 |         self.decoder = Decoder(
25 |             n_tgt_vocab=n_cap_vocab, len_max_seq=cap_max_seq,
26 |             d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
27 |             n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
28 |             dropout=dropout)
29 | 
30 |         self.cap_word_prj = nn.Linear(d_model, n_cap_vocab, bias=False)
31 |         nn.init.xavier_normal_(self.cap_word_prj.weight)
32 | 
33 |         assert d_model == d_word_vec, \
34 |             'To facilitate the residual connections, ' \
35 |             'the dimensions of all module outputs shall be the same.'
36 | 
37 |         if tgt_emb_prj_weight_sharing:
38 |             # Share the weight matrix between target word embedding & the final logit dense layer
39 |             self.cap_word_prj.weight = self.decoder.tgt_word_emb.weight
40 |             self.x_logit_scale = (d_model ** -0.5)
41 |         else:
42 |             self.x_logit_scale = 1.
43 | 
44 |     def forward(self, vis_feat, tgt_seq, tgt_pos):
45 | 
46 |         tgt_seq, tgt_pos = tgt_seq[:, :-1], tgt_pos[:, :-1]
47 | 
48 |         enc_output, *_ = self.encoder(vis_feat)
49 |         dec_output, *_ = self.decoder(tgt_seq, tgt_pos, vis_feat, enc_output)
50 |         seq_logit = self.cap_word_prj(dec_output) * self.x_logit_scale
51 | 
52 |         return seq_logit.view(-1, seq_logit.size(2))
53 | 
54 | 


--------------------------------------------------------------------------------
/model/Attention.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import torch.nn.functional as F
 4 | 
 5 | 
 6 | class Attention(nn.Module):
 7 |     """
 8 |     Applies an attention mechanism on the output features from the decoder.
 9 |     """
10 | 
11 |     def __init__(self, dim):
12 |         super(Attention, self).__init__()
13 |         self.dim = dim
14 |         self.linear1 = nn.Linear(dim * 2, dim)
15 |         self.linear2 = nn.Linear(dim, 1, bias=False)
16 |         #self._init_hidden()
17 | 
18 |     def _init_hidden(self):
19 |         nn.init.xavier_normal_(self.linear1.weight)
20 |         nn.init.xavier_normal_(self.linear2.weight)
21 | 
22 |     def forward(self, hidden_state, encoder_outputs):
23 |         """
24 |         Arguments:
25 |             hidden_state {Variable} -- batch_size x dim
26 |             encoder_outputs {Variable} -- batch_size x seq_len x dim
27 | 
28 |         Returns:
29 |             Variable -- context vector of size batch_size x dim
30 |         """
31 |         batch_size, seq_len, _ = encoder_outputs.size()
32 |         hidden_state = hidden_state.unsqueeze(1).repeat(1, seq_len, 1)
33 |         inputs = torch.cat((encoder_outputs, hidden_state),
34 |                            2).view(-1, self.dim * 2)
35 |         o = self.linear2(F.tanh(self.linear1(inputs)))
36 |         e = o.view(batch_size, seq_len)
37 |         alpha = F.softmax(e, dim=1)
38 |         context = torch.bmm(alpha.unsqueeze(1), encoder_outputs).squeeze(1)
39 |         return context
40 | 


--------------------------------------------------------------------------------
/model/Decoder.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import numpy as np
  4 | from .transformer import Constants as Constants
  5 | from .transformer.Layers import DecoderLayer
  6 | 
  7 | 
  8 | def get_non_pad_mask(seq):
  9 |     assert seq.dim() == 2
 10 |     return seq.ne(Constants.PAD).type(torch.float).unsqueeze(-1).cuda()
 11 | 
 12 | 
 13 | def get_attn_key_pad_mask(seq_k, seq_q):
 14 |     ''' For masking out the padding part of key sequence. '''
 15 | 
 16 |     # Expand to fit the shape of key query attention matrix.
 17 |     len_q = seq_q.size(1)
 18 |     padding_mask = seq_k.eq(Constants.PAD)
 19 |     padding_mask = padding_mask.unsqueeze(1).expand(-1, len_q, -1)  # b x lq x lk
 20 | 
 21 |     return padding_mask.cuda()
 22 | 
 23 | 
 24 | def get_subsequent_mask(seq):
 25 |     ''' For masking out the subsequent info. '''
 26 | 
 27 |     sz_b, len_s = seq.size()
 28 |     subsequent_mask = torch.triu(
 29 |         torch.ones((len_s, len_s), device=seq.device, dtype=torch.uint8), diagonal=1)
 30 |     subsequent_mask = subsequent_mask.unsqueeze(0).expand(sz_b, -1, -1)  # b x ls x ls
 31 | 
 32 |     return subsequent_mask.cuda()
 33 | 
 34 | 
 35 | def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
 36 |     ''' Sinusoid position encoding table '''
 37 | 
 38 |     def cal_angle(position, hid_idx):
 39 |         return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)
 40 | 
 41 |     def get_posi_angle_vec(position):
 42 |         return [cal_angle(position, hid_j) for hid_j in range(d_hid)]
 43 | 
 44 |     sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
 45 | 
 46 |     sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
 47 |     sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
 48 | 
 49 |     if padding_idx is not None:
 50 |         # zero vector for padding dimension
 51 |         sinusoid_table[padding_idx] = 0.
 52 | 
 53 |     return torch.FloatTensor(sinusoid_table)
 54 | 
 55 | 
 56 | class Decoder(nn.Module):
 57 |     ''' A decoder model with self attention mechanism. '''
 58 | 
 59 |     def __init__(
 60 |             self, n_tgt_vocab, len_max_seq, d_word_vec,
 61 |             n_layers, n_head, d_k, d_v,
 62 |             d_model, d_inner, dropout=0.1):
 63 | 
 64 |         super().__init__()
 65 |         n_position = len_max_seq + 1
 66 | 
 67 |         self.tgt_word_emb = nn.Embedding(
 68 |             n_tgt_vocab, d_word_vec, padding_idx=Constants.PAD)
 69 | 
 70 |         self.position_enc = nn.Embedding.from_pretrained(
 71 |             get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0),
 72 |             freeze=True)
 73 | 
 74 |         self.layer_stack = nn.ModuleList([
 75 |             DecoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
 76 |             for _ in range(n_layers)])
 77 | 
 78 |     def forward(self, tgt_seq, tgt_pos, src_seq, enc_output, return_attns=False):
 79 | 
 80 |         dec_slf_attn_list, dec_enc_attn_list = [], []
 81 | 
 82 |         # -- Prepare masks
 83 |         non_pad_mask = get_non_pad_mask(tgt_seq)
 84 | 
 85 |         slf_attn_mask_subseq = get_subsequent_mask(tgt_seq)
 86 |         slf_attn_mask_keypad = get_attn_key_pad_mask(seq_k=tgt_seq, seq_q=tgt_seq)
 87 |         slf_attn_mask = (slf_attn_mask_keypad + slf_attn_mask_subseq).gt(0)
 88 | 
 89 |         src_tmp = torch.ones(src_seq.shape[0], src_seq.shape[1]).cuda()
 90 |         dec_enc_attn_mask = get_attn_key_pad_mask(seq_k=src_tmp, seq_q=tgt_seq)
 91 | 
 92 |         # -- Forward
 93 |         dec_output = self.tgt_word_emb(tgt_seq) + self.position_enc(tgt_pos)
 94 | 
 95 |         for dec_layer in self.layer_stack:
 96 |             dec_output, dec_slf_attn, dec_enc_attn = dec_layer(
 97 |                 dec_output, enc_output,
 98 |                 non_pad_mask=non_pad_mask,
 99 |                 slf_attn_mask=slf_attn_mask,
100 |                 dec_enc_attn_mask=dec_enc_attn_mask)
101 | 
102 |             if return_attns:
103 |                 dec_slf_attn_list += [dec_slf_attn]
104 |                 dec_enc_attn_list += [dec_enc_attn]
105 | 
106 |         if return_attns:
107 |             return dec_output, dec_slf_attn_list, dec_enc_attn_list
108 |         return dec_output,
109 | 


--------------------------------------------------------------------------------
/model/DecoderRNN.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | 
  3 | import torch
  4 | import torch.nn as nn
  5 | import torch.nn.functional as F
  6 | 
  7 | from .Attention import Attention
  8 | 
  9 | 
 10 | class DecoderRNN(nn.Module):
 11 |     """
 12 |     Provides functionality for decoding in a seq2seq framework, with an option for attention.
 13 |     Args:
 14 |         vocab_size (int): size of the vocabulary
 15 |         max_len (int): a maximum allowed length for the sequence to be processed
 16 |         dim_hidden (int): the number of features in the hidden state `h`
 17 |         n_layers (int, optional): number of recurrent layers (default: 1)
 18 |         rnn_cell (str, optional): type of RNN cell (default: gru)
 19 |         bidirectional (bool, optional): if the encoder is bidirectional (default False)
 20 |         input_dropout_p (float, optional): dropout probability for the input sequence (default: 0)
 21 |         rnn_dropout_p (float, optional): dropout probability for the output sequence (default: 0)
 22 | 
 23 |     """
 24 | 
 25 |     def __init__(self,
 26 |                  vocab_size,
 27 |                  max_len,
 28 |                  dim_hidden,
 29 |                  dim_word,
 30 |                  n_layers=1,
 31 |                  rnn_cell='gru',
 32 |                  bidirectional=False,
 33 |                  input_dropout_p=0.1,
 34 |                  rnn_dropout_p=0.1):
 35 |         super(DecoderRNN, self).__init__()
 36 | 
 37 |         self.bidirectional_encoder = bidirectional
 38 | 
 39 |         self.dim_output = vocab_size
 40 |         self.dim_hidden = dim_hidden * 2 if bidirectional else dim_hidden
 41 |         self.dim_word = dim_word
 42 |         self.max_length = max_len
 43 |         self.sos_id = 1
 44 |         self.eos_id = 0
 45 |         self.input_dropout = nn.Dropout(input_dropout_p)
 46 |         self.embedding = nn.Embedding(self.dim_output, dim_word)
 47 |         self.attention = Attention(self.dim_hidden)
 48 |         if rnn_cell.lower() == 'lstm':
 49 |             self.rnn_cell = nn.LSTM
 50 |         elif rnn_cell.lower() == 'gru':
 51 |             self.rnn_cell = nn.GRU
 52 |         self.rnn = self.rnn_cell(
 53 |             self.dim_hidden + dim_word,
 54 |             self.dim_hidden,
 55 |             n_layers,
 56 |             batch_first=True,
 57 |             dropout=rnn_dropout_p)
 58 | 
 59 |         self.out = nn.Linear(self.dim_hidden, self.dim_output)
 60 | 
 61 |         self._init_weights()
 62 | 
 63 |     def forward(self,
 64 |                 encoder_outputs,
 65 |                 encoder_hidden,
 66 |                 targets=None,
 67 |                 mode='train',
 68 |                 opt={}):
 69 |         """
 70 | 
 71 |         Inputs: inputs, encoder_hidden, encoder_outputs, function, teacher_forcing_ratio
 72 |         - **encoder_hidden** (num_layers * num_directions, batch_size, dim_hidden): tensor containing the features in the
 73 |           hidden state `h` of encoder. Used as the initial hidden state of the decoder. (default `None`)
 74 |         - **encoder_outputs** (batch, seq_len, dim_hidden * num_directions): (default is `None`).
 75 |         - **targets** (batch, max_length): targets labels of the ground truth sentences
 76 | 
 77 |         Outputs: seq_probs,
 78 |         - **seq_logprobs** (batch_size, max_length, vocab_size): tensors containing the outputs of the decoding function.
 79 |         - **seq_preds** (batch_size, max_length): predicted symbols
 80 |         """
 81 |         sample_max = 1
 82 |         beam_size = 1
 83 |         temperature = 1.0
 84 | 
 85 |         batch_size, _, _ = encoder_outputs.size()
 86 |         decoder_hidden = self._init_rnn_state(encoder_hidden)
 87 | 
 88 |         seq_logprobs = []
 89 |         seq_preds = []
 90 |         final_encoding = []
 91 |         self.rnn.flatten_parameters()
 92 |         if mode == 'train':
 93 |             # use targets as rnn inputs
 94 |             targets_emb = self.embedding(targets)
 95 |             for i in range(self.max_length - 1):
 96 |                 current_words = targets_emb[:, i, :]
 97 |                 context = self.attention(decoder_hidden.squeeze(0), encoder_outputs)
 98 |                 decoder_input = torch.cat([current_words, context], dim=1)
 99 |                 decoder_input = self.input_dropout(decoder_input).unsqueeze(1)
100 |                 decoder_output, decoder_hidden = self.rnn(
101 |                     decoder_input, decoder_hidden)
102 |                 final_encoding.append(decoder_output)
103 |                 logprobs = F.log_softmax(
104 |                     self.out(decoder_output.squeeze(1)), dim=1)
105 |                 seq_logprobs.append(logprobs.unsqueeze(1))
106 |             final_encoding = torch.cat(final_encoding, 1)
107 |             seq_logprobs = torch.cat(seq_logprobs, 1)
108 | 
109 |         elif mode == 'inference':
110 |             if beam_size > 1:
111 |                 return self.sample_beam(encoder_outputs, decoder_hidden, opt)
112 | 
113 |             for t in range(self.max_length - 1):
114 |                 context = self.attention(
115 |                     decoder_hidden.squeeze(0), encoder_outputs)
116 | 
117 |                 if t == 0:  # input <bos>
118 |                     it = torch.LongTensor([self.sos_id] * batch_size).cuda()
119 |                 elif sample_max:
120 |                     sampleLogprobs, it = torch.max(logprobs, 1)
121 |                     seq_logprobs.append(sampleLogprobs.view(-1, 1))
122 |                     it = it.view(-1).long()
123 | 
124 |                 else:
125 |                     # sample according to distribuition
126 |                     if temperature == 1.0:
127 |                         prob_prev = torch.exp(logprobs)
128 |                     else:
129 |                         # scale logprobs by temperature
130 |                         prob_prev = torch.exp(torch.div(logprobs, temperature))
131 |                     it = torch.multinomial(prob_prev, 1).cuda()
132 |                     sampleLogprobs = logprobs.gather(1, it)
133 |                     seq_logprobs.append(sampleLogprobs.view(-1, 1))
134 |                     it = it.view(-1).long()
135 | 
136 |                 seq_preds.append(it.view(-1, 1))
137 | 
138 |                 xt = self.embedding(it)
139 |                 decoder_input = torch.cat([xt, context], dim=1)
140 |                 decoder_input = self.input_dropout(decoder_input).unsqueeze(1)
141 |                 decoder_output, decoder_hidden = self.rnn(
142 |                     decoder_input, decoder_hidden)
143 |                 final_encoding.append(decoder_output)
144 |                 logprobs = F.log_softmax(
145 |                     self.out(decoder_output.squeeze(1)), dim=1)
146 | 
147 |             seq_logprobs = torch.cat(seq_logprobs, 1)
148 |             seq_preds = torch.cat(seq_preds[1:], 1)
149 |             final_encoding = torch.cat(final_encoding, 1)
150 |         return seq_logprobs, seq_preds, final_encoding, decoder_hidden
151 | 
152 |     def _init_weights(self):
153 |         """ init the weight of some layers
154 |         """
155 |         nn.init.xavier_normal_(self.out.weight)
156 | 
157 |     def _init_rnn_state(self, encoder_hidden):
158 |         """ Initialize the encoder hidden state. """
159 |         if encoder_hidden is None:
160 |             return None
161 |         if isinstance(encoder_hidden, tuple):
162 |             encoder_hidden = tuple(
163 |                 [self._cat_directions(h) for h in encoder_hidden])
164 |         else:
165 |             encoder_hidden = self._cat_directions(encoder_hidden)
166 |         return encoder_hidden
167 | 
168 |     def _cat_directions(self, h):
169 |         """ If the encoder is bidirectional, do the following transformation.
170 |             (#directions * #layers, #batch, dim_hidden) -> (#layers, #batch, #directions * dim_hidden)
171 |         """
172 |         if self.bidirectional_encoder:
173 |             h = torch.cat([h[0:h.size(0):2], h[1:h.size(0):2]], 2)
174 |         return h
175 | 


--------------------------------------------------------------------------------
/model/EncoderRNN.py:
--------------------------------------------------------------------------------
 1 | import torch.nn as nn
 2 | 
 3 | 
 4 | class EncoderRNN(nn.Module):
 5 |     def __init__(self, dim_vid, dim_hidden, input_dropout_p=0.2, rnn_dropout_p=0.5,
 6 |                  n_layers=1, bidirectional=False, rnn_cell='gru'):
 7 |         """
 8 | 
 9 |         Args:
10 |             hidden_dim (int): dim of hidden state of rnn
11 |             input_dropout_p (int): dropout probability for the input sequence
12 |             dropout_p (float): dropout probability for the output sequence
13 |             n_layers (int): number of rnn layers
14 |             rnn_cell (str): type of RNN cell ('LSTM'/'GRU')
15 |         """
16 |         super(EncoderRNN, self).__init__()
17 |         self.dim_vid = dim_vid
18 |         self.dim_hidden = dim_hidden
19 |         self.input_dropout_p = input_dropout_p
20 |         self.rnn_dropout_p = rnn_dropout_p
21 |         self.n_layers = n_layers
22 |         self.bidirectional = bidirectional
23 |         self.rnn_cell = rnn_cell
24 | 
25 |         self.vid2hid = nn.Linear(dim_vid, dim_hidden)
26 |         self.input_dropout = nn.Dropout(input_dropout_p)
27 | 
28 |         if rnn_cell.lower() == 'lstm':
29 |             self.rnn_cell = nn.LSTM
30 |         elif rnn_cell.lower() == 'gru':
31 |             self.rnn_cell = nn.GRU
32 | 
33 |         self.rnn = self.rnn_cell(dim_hidden, dim_hidden, n_layers, batch_first=True,
34 |                                  dropout=self.rnn_dropout_p)
35 | 
36 |         self._init_hidden()
37 | 
38 |     def _init_hidden(self):
39 |         nn.init.xavier_normal_(self.vid2hid.weight)
40 | 
41 |     def forward(self, vid_feats):
42 |         """
43 |         Applies a multi-layer RNN to an input sequence.
44 |         Args:
45 |             input_var (batch, seq_len): tensor containing the features of the input sequence.
46 |             input_lengths (list of int, optional): A list that contains the lengths of sequences
47 |               in the mini-batch
48 |         Returns: output, hidden
49 |             - **output** (batch, seq_len, hidden_size): variable containing the encoded features of the input sequence
50 |             - **hidden** (num_layers * num_directions, batch, hidden_size): variable containing the features in the hidden state h
51 |         """
52 |         batch_size, seq_len, dim_vid = vid_feats.size()
53 |         vid_feats = self.vid2hid(vid_feats.view(-1, dim_vid))
54 |         vid_feats = self.input_dropout(vid_feats)
55 |         vid_feats = vid_feats.view(batch_size, seq_len, self.dim_hidden)
56 |         self.rnn.flatten_parameters()
57 |         state1 = None
58 |         output, hidden = self.rnn(vid_feats, state1)
59 |         return output, hidden
60 | 
61 | 


--------------------------------------------------------------------------------
/model/Model.py:
--------------------------------------------------------------------------------
 1 | ''' Define the Transformer model '''
 2 | from utils.utils import *
 3 | from model.Decoder import Decoder
 4 | from model.EncoderRNN import EncoderRNN
 5 | 
 6 | __author__ = 'Jacob Zhiyuan Fang'
 7 | 
 8 | 
 9 | class Model(nn.Module):
10 |     ''' A sequence to sequence model with attention mechanism. '''
11 | 
12 |     def __init__(
13 |             self,
14 |             n_cap_vocab, n_cms_vocab, cap_max_seq, cms_max_seq, vis_emb=2048,
15 |             d_word_vec=512, d_model=512, d_inner=2048, n_layers=6, rnn_layers=1,
16 |             n_head=8, d_k=64, d_v=64, dropout=0.1, tgt_emb_prj_weight_sharing=True):
17 | 
18 |         super().__init__()
19 | 
20 |         # set RNN layers at 1 or 2 yield better performance.
21 |         self.encoder = EncoderRNN(vis_emb, d_model, n_layers=rnn_layers,
22 |                                   bidirectional=0)
23 | 
24 |         self.decoder = Decoder(
25 |             n_tgt_vocab=n_cap_vocab, len_max_seq=cap_max_seq,
26 |             d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
27 |             n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
28 |             dropout=dropout)
29 | 
30 |         self.cms_decoder = Decoder(
31 |             n_tgt_vocab=n_cms_vocab, len_max_seq=cms_max_seq,
32 |             d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
33 |             n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
34 |             dropout=dropout)
35 | 
36 |         self.cap_word_prj = nn.Linear(d_model, n_cap_vocab, bias=False)
37 |         self.cms_word_prj = nn.Linear(d_model, n_cms_vocab, bias=False)
38 | 
39 |         nn.init.xavier_normal_(self.cap_word_prj.weight)
40 |         nn.init.xavier_normal_(self.cms_word_prj.weight)
41 | 
42 |         assert d_model == d_word_vec, \
43 |             'To facilitate the residual connections, ' \
44 |             'the dimensions of all module outputs shall be the same.'
45 | 
46 |         if tgt_emb_prj_weight_sharing:
47 |             # Share the weight matrix between target word embedding & the final logit dense layer
48 |             self.cap_word_prj.weight = self.decoder.tgt_word_emb.weight
49 |             self.cms_word_prj.weight = self.cms_decoder.tgt_word_emb.weight
50 |             self.x_logit_scale = (d_model ** -0.5)
51 |         else:
52 |             self.x_logit_scale = 1.
53 | 
54 |     def forward(self, vis_feat, tgt_seq, tgt_pos, cms_seq, cms_pos):
55 | 
56 |         tgt_seq, tgt_pos = tgt_seq[:, :-1], tgt_pos[:, :-1]
57 |         cms_seq, cms_pos = cms_seq[:, :-1], cms_pos[:, :-1]
58 | 
59 |         enc_output, *_ = self.encoder(vis_feat)
60 |         dec_output, *_ = self.decoder(tgt_seq, tgt_pos, vis_feat, enc_output)
61 |         seq_logit = self.cap_word_prj(dec_output) * self.x_logit_scale
62 | 
63 |         # Concatenate visual and caption encoding
64 |         cat_output = torch.cat((enc_output, dec_output), 1)
65 | 
66 |         cms_dec_output, *_ = self.cms_decoder(cms_seq, cms_pos, cat_output, cat_output)
67 |         cms_logit = self.cms_word_prj(cms_dec_output) * self.x_logit_scale
68 | 
69 |         return seq_logit.view(-1, seq_logit.size(2)), cms_logit.view(-1, cms_logit.size(2))
70 | 
71 | 


--------------------------------------------------------------------------------
/model/S2VTAttModel.py:
--------------------------------------------------------------------------------
 1 | import torch.nn as nn
 2 | import torch
 3 | 
 4 | class S2VTAttModel(nn.Module):
 5 |     def __init__(self, encoder, decoder, cms_decoder):
 6 |         """
 7 | 
 8 |         Args:
 9 |             encoder (nn.Module): Encoder rnn
10 |             decoder (nn.Module): Decoder rnn
11 |         """
12 |         super(S2VTAttModel, self).__init__()
13 |         self.encoder = encoder
14 |         self.decoder = decoder
15 |         self.cms_decoder = cms_decoder
16 | 
17 |     def forward(self, vid_feats, cap_labels=None, cms_labels=None, mode='train', opt={}):
18 |         """
19 | 
20 |         Args:
21 |             vid_feats (Variable): video feats of shape [batch_size, seq_len, dim_vid]
22 |             target_variable (None, optional): ground truth labels
23 | 
24 |         Returns:
25 |             seq_prob: Variable of shape [batch_size, max_len-1, vocab_size]
26 |             seq_preds: [] or Variable of shape [batch_size, max_len-1]
27 |         """
28 |         encoder_outputs, encoder_hidden = self.encoder(vid_feats)
29 |         # seq_prob, _, cap_encoding, cap_hidden = self.decoder(encoder_outputs,encoder_hidden, cap_labels, 'train', opt)
30 |         _, seq_prob, cap_encoding, cap_hidden = self.decoder(encoder_outputs, encoder_hidden,
31 |                                                           None, 'inference', opt)
32 | 
33 |         cat_encoding = torch.cat((encoder_outputs, cap_encoding), 1)
34 |         if mode == 'test':
35 |             _, cms_seq_prob, _, _ = self.cms_decoder(cat_encoding, cap_hidden, targets=None, mode='inference', opt=opt)
36 |         else:
37 |             cms_seq_prob, _, _, _ = self.cms_decoder(cat_encoding, cap_hidden, cms_labels, mode='train', opt=opt)
38 |         return seq_prob, cms_seq_prob
39 | 


--------------------------------------------------------------------------------
/model/S2VTModel.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from torch import nn
  3 | import torch.nn.functional as F
  4 | from torch.autograd import Variable
  5 | 
  6 | 
  7 | class S2VTModel(nn.Module):
  8 |     def __init__(self, vocab_size, cms_vocab_size, max_len, cms_max_len, dim_hidden, dim_word, dim_vid=2048, sos_id=2, eos_id=3,
  9 |                  n_layers=1, rnn_cell='gru', rnn_dropout_p=0.2):
 10 |         super(S2VTModel, self).__init__()
 11 | 
 12 |         if rnn_cell.lower() == 'lstm':
 13 |             self.rnn_cell = nn.LSTM
 14 |         elif rnn_cell.lower() == 'gru':
 15 |             self.rnn_cell = nn.GRU
 16 | 
 17 |         self.rnn1 = self.rnn_cell(dim_vid, dim_hidden, n_layers,
 18 |                                   batch_first=True, dropout=rnn_dropout_p)
 19 | 
 20 |         self.rnn2 = self.rnn_cell(dim_hidden + dim_word, dim_hidden, n_layers,
 21 |                                   batch_first=True, dropout=rnn_dropout_p)
 22 | 
 23 |         self.rnn3 = self.rnn_cell(dim_hidden + dim_word, dim_hidden, n_layers,
 24 |                                   batch_first=True, dropout=rnn_dropout_p)
 25 | 
 26 |         self.dim_vid = dim_vid
 27 |         self.dim_output = vocab_size
 28 |         self.cms_dim_output = cms_vocab_size
 29 |         self.dim_hidden = dim_hidden
 30 |         self.dim_word = dim_word
 31 |         self.max_length = max_len
 32 |         self.cms_max_length = cms_max_len
 33 |         self.sos_id = sos_id
 34 |         self.eos_id = eos_id
 35 |         self.embedding = nn.Embedding(self.dim_output, self.dim_word)
 36 | 
 37 |         self.out = nn.Linear(self.dim_hidden, self.dim_output)
 38 |         self.cms_out = nn.Linear(self.dim_hidden, self.cms_dim_output)
 39 | 
 40 |     def forward(self, vid_feats, target_variable=None, cms_target_variable=None, mode='train', opt={}):
 41 |         batch_size, n_frames, _ = vid_feats.shape
 42 | 
 43 |         padding_words = torch.zeros((batch_size, n_frames, self.dim_word)).cuda()
 44 |         padding_frames = torch.zeros((batch_size, 1, self.dim_vid)).cuda()
 45 |         state1 = None
 46 |         state2 = None
 47 | 
 48 |         output1, state1 = self.rnn1(vid_feats, state1)
 49 |         input2 = torch.cat((output1, padding_words), dim=2)
 50 |         output2, state2 = self.rnn2(input2, state2)
 51 | 
 52 |         seq_probs = []
 53 |         seq_preds = []
 54 |         cms_seq_probs = []
 55 |         cms_seq_preds = []
 56 |         if mode == 'train':
 57 |             for i in range(self.max_length - 1):
 58 |                 # <eos> doesn't input to the network
 59 |                 current_words = self.embedding(target_variable[:, i])
 60 |                 self.rnn1.flatten_parameters()
 61 |                 self.rnn2.flatten_parameters()
 62 |                 output1, state1 = self.rnn1(padding_frames, state1)
 63 |                 input2 = torch.cat(
 64 |                     (output1, current_words.unsqueeze(1)), dim=2)
 65 |                 output2, state2 = self.rnn2(input2, state2)
 66 |                 logits = self.out(output2.squeeze(1))
 67 |                 logits = F.log_softmax(logits, dim=1)
 68 |                 seq_probs.append(logits.unsqueeze(1))
 69 |             seq_probs = torch.cat(seq_probs, 1)
 70 | 
 71 |             # CMS decoding training
 72 |             state3 = state2
 73 |             for i in range(self.cms_max_length - 1):
 74 |                 # <eos> doesn't input to the network
 75 |                 current_words = self.embedding(cms_target_variable[:, i])
 76 |                 self.rnn3.flatten_parameters()
 77 |                 input3 = torch.cat(
 78 |                     (output2, current_words.unsqueeze(1)), dim=2)
 79 | 
 80 |                 output3, state3 = self.rnn3(input3, state3)
 81 |                 logits = self.cms_out(output3.squeeze(1))
 82 |                 logits = F.log_softmax(logits, dim=1)
 83 |                 cms_seq_probs.append(logits.unsqueeze(1))
 84 |             cms_seq_probs = torch.cat(cms_seq_probs, 1)
 85 | 
 86 |         else:
 87 |             for i in range(self.max_length - 1):
 88 |                 # <eos> doesn't input to the network
 89 |                 current_words = self.embedding(target_variable[:, i])
 90 |                 self.rnn1.flatten_parameters()
 91 |                 self.rnn2.flatten_parameters()
 92 |                 output1, state1 = self.rnn1(padding_frames, state1)
 93 |                 input2 = torch.cat(
 94 |                     (output1, current_words.unsqueeze(1)), dim=2)
 95 |                 output2, state2 = self.rnn2(input2, state2)
 96 |                 logits = self.out(output2.squeeze(1))
 97 |                 logits = F.log_softmax(logits, dim=1)
 98 |                 seq_probs.append(logits.unsqueeze(1))
 99 |             seq_probs = torch.cat(seq_probs, 1)
100 | 
101 |             state3 = state2
102 |             current_words = self.embedding(
103 |                 Variable(torch.LongTensor([self.sos_id] * batch_size)).cuda())
104 |             for i in range(self.cms_max_length - 1):
105 |                 # current_words = self.embedding(cms_target_variable[:, i])
106 |                 self.rnn3.flatten_parameters()
107 |                 input3 = torch.cat((output2, current_words.unsqueeze(1)), dim=2)
108 |                 output3, state3 = self.rnn3(input3, state3)
109 | 
110 |                 logits = self.cms_out(output3.squeeze(1))
111 |                 logits = F.log_softmax(logits, dim=1)
112 |                 cms_seq_probs.append(logits.unsqueeze(1))
113 | 
114 |                 _, preds = torch.max(logits, 1)
115 |                 current_words = self.embedding(preds)
116 |                 cms_seq_preds.append(preds.unsqueeze(1))
117 | 
118 |             cms_seq_probs = torch.cat(cms_seq_probs, 1)
119 |             cms_seq_preds = torch.cat(cms_seq_preds, 1)
120 |         return seq_probs, seq_preds, cms_seq_probs, cms_seq_preds


--------------------------------------------------------------------------------
/model/S2VT_EncoderRNN.py:
--------------------------------------------------------------------------------
 1 | import torch.nn as nn
 2 | 
 3 | 
 4 | class EncoderRNN(nn.Module):
 5 |     def __init__(self, dim_vid, dim_hidden, input_dropout_p=0.2, rnn_dropout_p=0.5,
 6 |                  n_layers=1, bidirectional=False, rnn_cell='gru'):
 7 |         """
 8 |         Args:
 9 |             hidden_dim (int): dim of hidden state of rnn
10 |             input_dropout_p (int): dropout probability for the input sequence
11 |             dropout_p (float): dropout probability for the output sequence
12 |             n_layers (int): number of rnn layers
13 |             rnn_cell (str): type of RNN cell ('LSTM'/'GRU')
14 |         """
15 |         super(EncoderRNN, self).__init__()
16 |         self.dim_vid = dim_vid
17 |         self.dim_hidden = dim_hidden
18 |         self.input_dropout_p = input_dropout_p
19 |         self.rnn_dropout_p = rnn_dropout_p
20 |         self.n_layers = n_layers
21 |         self.bidirectional = bidirectional
22 |         self.rnn_cell = rnn_cell
23 | 
24 |         self.vid2hid = nn.Linear(dim_vid, dim_hidden)
25 |         self.input_dropout = nn.Dropout(input_dropout_p)
26 | 
27 |         if rnn_cell.lower() == 'lstm':
28 |             self.rnn_cell = nn.LSTM
29 |         elif rnn_cell.lower() == 'gru':
30 |             self.rnn_cell = nn.GRU
31 | 
32 |         self.rnn = self.rnn_cell(dim_hidden, dim_hidden, n_layers, batch_first=True,
33 |                                  dropout=self.rnn_dropout_p)
34 | 
35 |         self._init_hidden()
36 | 
37 |     def _init_hidden(self):
38 |         nn.init.xavier_normal_(self.vid2hid.weight)
39 | 
40 |     def forward(self, vid_feats):
41 |         """
42 |         Applies a multi-layer RNN to an input sequence.
43 |         Args:
44 |             input_var (batch, seq_len): tensor containing the features of the input sequence.
45 |             input_lengths (list of int, optional): A list that contains the lengths of sequences
46 |               in the mini-batch
47 |         Returns: output, hidden
48 |             - **output** (batch, seq_len, hidden_size): variable containing the encoded features of the input sequence
49 |             - **hidden** (num_layers * num_directions, batch, hidden_size): variable containing the features in the hidden state h
50 |         """
51 |         batch_size, seq_len, dim_vid = vid_feats.size()
52 |         vid_feats = self.vid2hid(vid_feats.view(-1, dim_vid))
53 |         vid_feats = self.input_dropout(vid_feats)
54 |         vid_feats = vid_feats.view(batch_size, seq_len, self.dim_hidden)
55 |         self.rnn.flatten_parameters()
56 |         state1 = None
57 |         output, hidden = self.rnn(vid_feats, state1)
58 |         return output, hidden
59 | 


--------------------------------------------------------------------------------
/model/TransformerDecoderModel.py:
--------------------------------------------------------------------------------
 1 | ''' Define the Transformer model '''
 2 | import numpy as np
 3 | from utils.utils import *
 4 | from model.Decoder import Decoder
 5 | from model.transformer.Layers import EncoderLayer
 6 | 
 7 | __author__ = 'Yu-Hsiang Huang'
 8 | __AugmentedBy__ = 'Jacob Zhiyuan Fang'
 9 | 
10 | 
11 | class Model(nn.Module):
12 |     ''' A sequence to sequence model with attention mechanism. '''
13 | 
14 |     def __init__(
15 |             self,
16 |             n_cap_vocab, n_cms_vocab, cap_max_seq, cms_max_seq, vis_emb=2048,
17 |             d_word_vec=512, d_model=512, d_inner=2048, n_layers=6, rnn_layers=1,
18 |             n_head=8, d_k=64, d_v=64, dropout=0.1, tgt_emb_prj_weight_sharing=True):
19 | 
20 |         super().__init__()
21 | 
22 |         # set RNN layers at 1 or 2 yield better performance.
23 |         self.vis_emb = nn.Linear(vis_emb, d_model)
24 | 
25 |         self.decoder = Decoder(
26 |             n_tgt_vocab=n_cap_vocab, len_max_seq=cap_max_seq,
27 |             d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
28 |             n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
29 |             dropout=dropout)
30 | 
31 |         self.cms_decoder = Decoder(
32 |             n_tgt_vocab=n_cms_vocab, len_max_seq=cms_max_seq,
33 |             d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
34 |             n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
35 |             dropout=dropout)
36 | 
37 |         self.cap_word_prj = nn.Linear(d_model, n_cap_vocab, bias=False)
38 |         self.cms_word_prj = nn.Linear(d_model, n_cms_vocab, bias=False)
39 | 
40 |         nn.init.xavier_normal_(self.cap_word_prj.weight)
41 |         nn.init.xavier_normal_(self.cms_word_prj.weight)
42 | 
43 |         assert d_model == d_word_vec, \
44 |             'To facilitate the residual connections, ' \
45 |             'the dimensions of all module outputs shall be the same.'
46 | 
47 |         if tgt_emb_prj_weight_sharing:
48 |             # Share the weight matrix between target word embedding & the final logit dense layer
49 |             self.cap_word_prj.weight = self.decoder.tgt_word_emb.weight
50 |             self.cms_word_prj.weight = self.cms_decoder.tgt_word_emb.weight
51 |             self.x_logit_scale = (d_model ** -0.5)
52 |         else:
53 |             self.x_logit_scale = 1.
54 | 
55 |     def forward(self, vis_feat, tgt_seq, tgt_pos, cms_seq, cms_pos):
56 |         enc_output = self.vis_emb(vis_feat)
57 |         tgt_seq, tgt_pos = tgt_seq[:, :-1], tgt_pos[:, :-1]
58 |         cms_seq, cms_pos = cms_seq[:, :-1], cms_pos[:, :-1]
59 | 
60 |         dec_output, *_ = self.decoder(tgt_seq, tgt_pos, vis_feat, enc_output)
61 |         seq_logit = self.cap_word_prj(dec_output) * self.x_logit_scale
62 | 
63 |         # Concatenate visual and caption encoding
64 |         cat_output = torch.cat((enc_output, dec_output), 1)
65 | 
66 |         cms_dec_output, *_ = self.cms_decoder(cms_seq, cms_pos, cat_output, cat_output)
67 |         cms_logit = self.cms_word_prj(cms_dec_output) * self.x_logit_scale
68 | 
69 |         return seq_logit.view(-1, seq_logit.size(2)), cms_logit.view(-1, cms_logit.size(2))
70 | 
71 | 


--------------------------------------------------------------------------------
/model/TransformerModel.py:
--------------------------------------------------------------------------------
  1 | ''' Define the Transformer model '''
  2 | import numpy as np
  3 | from utils.utils import *
  4 | from model.Decoder import Decoder
  5 | from model.transformer.Layers import EncoderLayer
  6 | 
  7 | __author__ = 'Yu-Hsiang Huang'
  8 | __AugmentedBy__ = 'Jacob Zhiyuan Fang'
  9 | 
 10 | 
 11 | def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
 12 |     ''' Sinusoid position encoding table '''
 13 | 
 14 |     def cal_angle(position, hid_idx):
 15 |         return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)
 16 | 
 17 |     def get_posi_angle_vec(position):
 18 |         return [cal_angle(position, hid_j) for hid_j in range(d_hid)]
 19 | 
 20 |     sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
 21 | 
 22 |     sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
 23 |     sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
 24 | 
 25 |     if padding_idx is not None:
 26 |         # zero vector for padding dimension
 27 |         sinusoid_table[padding_idx] = 0.
 28 | 
 29 |     return torch.FloatTensor(sinusoid_table)
 30 | 
 31 | 
 32 | def get_non_pad_mask(seq):
 33 |     assert seq.dim() == 2
 34 |     return seq.ne(Constants.PAD).type(torch.float).unsqueeze(-1).cuda()
 35 | 
 36 | 
 37 | def get_attn_key_pad_mask(seq_k, seq_q):
 38 |     ''' For masking out the padding part of key sequence. '''
 39 | 
 40 |     # Expand to fit the shape of key query attention matrix.
 41 |     len_q = seq_q.size(1)
 42 |     padding_mask = seq_k.eq(Constants.PAD)
 43 |     padding_mask = padding_mask.unsqueeze(1).expand(-1, len_q, -1)  # b x lq x lk
 44 | 
 45 |     return padding_mask.cuda()
 46 | 
 47 | class Encoder(nn.Module):
 48 |     ''' A encoder model with self attention mechanism. '''
 49 | 
 50 |     def __init__(
 51 |             self,
 52 |             len_max_seq, d_word_vec,
 53 |             n_layers, n_head, d_k, d_v,
 54 |             d_model, d_inner, dropout=0.1):
 55 | 
 56 |         super().__init__()
 57 | 
 58 |         n_position = len_max_seq + 1
 59 | 
 60 |         self.position_enc = nn.Embedding.from_pretrained(
 61 |             get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0), freeze=True)
 62 | 
 63 |         self.layer_stack = nn.ModuleList([
 64 |             EncoderLayer(d_word_vec, d_inner, n_head, d_k, d_v, dropout=dropout)
 65 |             for _ in range(n_layers)])
 66 | 
 67 |     def forward(self, src_emb, src_pos, return_attns=False):
 68 | 
 69 |         enc_slf_attn_list = []
 70 | 
 71 |         # -- Prepare masks
 72 |         _ = torch.rand(src_emb.shape[0], src_emb.shape[1])
 73 |         slf_attn_mask = get_attn_key_pad_mask(seq_k=_, seq_q=_)
 74 |         non_pad_mask = get_non_pad_mask(_)
 75 | 
 76 |         # -- Forward
 77 |         enc_output = src_emb + self.position_enc(src_pos)
 78 | 
 79 |         for enc_layer in self.layer_stack:
 80 |             enc_output, enc_slf_attn = enc_layer(enc_output, non_pad_mask=non_pad_mask,
 81 |                                                  slf_attn_mask=slf_attn_mask)
 82 |             if return_attns:
 83 |                 enc_slf_attn_list += [enc_slf_attn]
 84 | 
 85 |         if return_attns:
 86 |             return enc_output, enc_slf_attn_list
 87 |         return enc_output,
 88 | 
 89 | class Model(nn.Module):
 90 |     ''' A sequence to sequence model with attention mechanism. '''
 91 | 
 92 |     def __init__(
 93 |             self,
 94 |             n_cap_vocab, n_cms_vocab, cap_max_seq, cms_max_seq, vis_emb=2048,
 95 |             d_word_vec=512, d_model=512, d_inner=2048, n_layers=6, rnn_layers=1,
 96 |             n_head=8, d_k=64, d_v=64, dropout=0.1, tgt_emb_prj_weight_sharing=True):
 97 | 
 98 |         super().__init__()
 99 | 
100 |         # set RNN layers at 1 or 2 yield better performance.
101 |         self.vis_emb = nn.Linear(vis_emb, d_model)
102 |         self.encoder = Encoder(40, d_model, rnn_layers, n_head, d_k, d_v,
103 |                                d_model, d_inner, dropout=0.1)
104 | 
105 |         self.decoder = Decoder(
106 |             n_tgt_vocab=n_cap_vocab, len_max_seq=cap_max_seq,
107 |             d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
108 |             n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
109 |             dropout=dropout)
110 | 
111 |         self.cms_decoder = Decoder(
112 |             n_tgt_vocab=n_cms_vocab, len_max_seq=cms_max_seq,
113 |             d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
114 |             n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
115 |             dropout=dropout)
116 | 
117 |         self.cap_word_prj = nn.Linear(d_model, n_cap_vocab, bias=False)
118 |         self.cms_word_prj = nn.Linear(d_model, n_cms_vocab, bias=False)
119 | 
120 |         nn.init.xavier_normal_(self.cap_word_prj.weight)
121 |         nn.init.xavier_normal_(self.cms_word_prj.weight)
122 | 
123 |         assert d_model == d_word_vec, \
124 |             'To facilitate the residual connections, ' \
125 |             'the dimensions of all module outputs shall be the same.'
126 | 
127 |         if tgt_emb_prj_weight_sharing:
128 |             # Share the weight matrix between target word embedding & the final logit dense layer
129 |             self.cap_word_prj.weight = self.decoder.tgt_word_emb.weight
130 |             self.cms_word_prj.weight = self.cms_decoder.tgt_word_emb.weight
131 |             self.x_logit_scale = (d_model ** -0.5)
132 |         else:
133 |             self.x_logit_scale = 1.
134 | 
135 |     def forward(self, vis_feat, tgt_seq, tgt_pos, cms_seq, cms_pos):
136 |         vis_feat = self.vis_emb(vis_feat)
137 |         tgt_seq, tgt_pos = tgt_seq[:, :-1], tgt_pos[:, :-1]
138 |         cms_seq, cms_pos = cms_seq[:, :-1], cms_pos[:, :-1]
139 | 
140 |         vis_pos = torch.tensor(list(range(0, 40))).cuda().unsqueeze(0).repeat(vis_feat.shape[0], 1)
141 |         enc_output, *_ = self.encoder(vis_feat, vis_pos)
142 |         dec_output, *_ = self.decoder(tgt_seq, tgt_pos, vis_feat, enc_output)
143 |         seq_logit = self.cap_word_prj(dec_output) * self.x_logit_scale
144 | 
145 |         # Concatenate visual and caption encoding
146 |         cat_output = torch.cat((enc_output, dec_output), 1)
147 | 
148 |         cms_dec_output, *_ = self.cms_decoder(cms_seq, cms_pos, cat_output, cat_output)
149 |         cms_logit = self.cms_word_prj(cms_dec_output) * self.x_logit_scale
150 | 
151 |         return seq_logit.view(-1, seq_logit.size(2)), cms_logit.view(-1, cms_logit.size(2))
152 | 
153 | 


--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jacobswan1/Video2Commonsense/4dcef76360a29702fd90b7030a39a123da6db19e/model/__init__.py


--------------------------------------------------------------------------------
/model/transformer/Beam.py:
--------------------------------------------------------------------------------
  1 | """ Manage beam search info structure.
  2 | 
  3 |     Heavily borrowed from OpenNMT-py.
  4 |     For code in OpenNMT-py, please check the following link:
  5 |     https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/Beam.py
  6 | """
  7 | 
  8 | import torch
  9 | from ..transformer import Constants as Constants
 10 | # import Constants as Constants
 11 | 
 12 | 
 13 | class Beam():
 14 |     ''' Beam search '''
 15 | 
 16 |     def __init__(self, size, device=False):
 17 | 
 18 |         self.size = size
 19 |         self._done = False
 20 | 
 21 |         # The score for each translation on the beam.
 22 |         self.scores = torch.zeros((size,), dtype=torch.float, device=device)
 23 |         self.all_scores = []
 24 | 
 25 |         # The backpointers at each time-step.
 26 |         self.prev_ks = []
 27 | 
 28 |         # The outputs at each time-step.
 29 |         self.next_ys = [torch.full((size,), Constants.PAD, dtype=torch.long, device=device)]
 30 |         self.next_ys[0][0] = Constants.BOS
 31 | 
 32 |     def get_current_state(self):
 33 |         "Get the outputs for the current timestep."
 34 |         return self.get_tentative_hypothesis()
 35 | 
 36 |     def get_current_origin(self):
 37 |         "Get the backpointers for the current timestep."
 38 |         return self.prev_ks[-1]
 39 | 
 40 |     @property
 41 |     def done(self):
 42 |         return self._done
 43 | 
 44 |     def advance(self, word_prob):
 45 |         "Update beam status and check if finished or not."
 46 |         num_words = word_prob.size(1)
 47 | 
 48 |         # Sum the previous scores.
 49 |         if len(self.prev_ks) > 0:
 50 |             beam_lk = word_prob + self.scores.unsqueeze(1).expand_as(word_prob)
 51 |         else:
 52 |             beam_lk = word_prob[0]
 53 | 
 54 |         flat_beam_lk = beam_lk.view(-1)
 55 | 
 56 |         best_scores, best_scores_id = flat_beam_lk.topk(self.size, 0, True, True) # 1st sort
 57 |         best_scores, best_scores_id = flat_beam_lk.topk(self.size, 0, True, True) # 2nd sort
 58 | 
 59 |         self.all_scores.append(self.scores)
 60 |         self.scores = best_scores
 61 | 
 62 |         # bestScoresId is flattened as a (beam x word) array,
 63 |         # so we need to calculate which word and beam each score came from
 64 |         prev_k = best_scores_id / num_words
 65 |         self.prev_ks.append(prev_k)
 66 |         self.next_ys.append(best_scores_id - prev_k * num_words)
 67 | 
 68 |         # End condition is when top-of-beam is EOS.
 69 |         if self.next_ys[-1][0].item() == Constants.EOS:
 70 |             self._done = True
 71 |             self.all_scores.append(self.scores)
 72 | 
 73 |         return self._done
 74 | 
 75 |     def sort_scores(self):
 76 |         "Sort the scores."
 77 |         return torch.sort(self.scores, 0, True)
 78 | 
 79 |     def get_the_best_score_and_idx(self):
 80 |         "Get the score of the best in the beam."
 81 |         scores, ids = self.sort_scores()
 82 |         return scores[1], ids[1]
 83 | 
 84 |     def get_tentative_hypothesis(self):
 85 |         "Get the decoded sequence for the current timestep."
 86 | 
 87 |         if len(self.next_ys) == 1:
 88 |             dec_seq = self.next_ys[0].unsqueeze(1)
 89 |         else:
 90 |             _, keys = self.sort_scores()
 91 |             hyps = [self.get_hypothesis(k) for k in keys]
 92 |             hyps = [[Constants.BOS] + h for h in hyps]
 93 |             dec_seq = torch.LongTensor(hyps)
 94 | 
 95 |         return dec_seq
 96 | 
 97 |     def get_hypothesis(self, k):
 98 |         """ Walk back to construct the full hypothesis. """
 99 |         hyp = []
100 |         for j in range(len(self.prev_ks) - 1, -1, -1):
101 |             hyp.append(self.next_ys[j+1][k])
102 |             k = self.prev_ks[j][k]
103 | 
104 |         return list(map(lambda x: x.item(), hyp[::-1]))
105 | 


--------------------------------------------------------------------------------
/model/transformer/Constants.py:
--------------------------------------------------------------------------------
 1 | 
 2 | PAD = 0
 3 | UNK = 1
 4 | BOS = 2
 5 | EOS = 3
 6 | SEP = 4
 7 | 
 8 | CAP_PAD = 0
 9 | CAP_UNK = 1
10 | CAP_BOS = 2
11 | CAP_EOS = 3
12 | 
13 | # PAD_WORD = '<blank>'
14 | # UNK_WORD = '<unk>'
15 | # BOS_WORD = '<s>'
16 | # EOS_WORD = '</s>'
17 | 


--------------------------------------------------------------------------------
/model/transformer/Layers.py:
--------------------------------------------------------------------------------
 1 | ''' Define the Layers '''
 2 | import torch.nn as nn
 3 | from model.transformer.SubLayers import MultiHeadAttention, PositionwiseFeedForward
 4 | 
 5 | __author__ = 'Yu-Hsiang Huang'
 6 | __RevisedBy__ = 'Jacob Zhiyuan Fang'
 7 | 
 8 | 
 9 | class EncoderLayer(nn.Module):
10 |     ''' Compose with two layers '''
11 | 
12 |     def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
13 |         super(EncoderLayer, self).__init__()
14 |         self.slf_attn = MultiHeadAttention(
15 |             n_head, d_model, d_k, d_v, dropout=dropout)
16 |         self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
17 | 
18 |     def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
19 |         enc_output, enc_slf_attn = self.slf_attn(
20 |             enc_input, enc_input, enc_input, mask=slf_attn_mask)
21 |         enc_output *= non_pad_mask
22 | 
23 |         enc_output = self.pos_ffn(enc_output)
24 |         enc_output *= non_pad_mask
25 | 
26 |         return enc_output, enc_slf_attn
27 | 
28 | 
29 | class DecoderLayer(nn.Module):
30 |     ''' Compose with three layers '''
31 | 
32 |     def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
33 |         super(DecoderLayer, self).__init__()
34 |         self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
35 |         self.enc_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
36 |         self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
37 | 
38 |     def forward(self, dec_input, enc_output, non_pad_mask=None, slf_attn_mask=None, dec_enc_attn_mask=None):
39 |         dec_output, dec_slf_attn = self.slf_attn(
40 |             dec_input, dec_input, dec_input, mask=slf_attn_mask)
41 |         dec_output *= non_pad_mask
42 | 
43 |         dec_output, dec_enc_attn = self.enc_attn(
44 |             dec_output, enc_output, enc_output, mask=dec_enc_attn_mask)
45 |         dec_output *= non_pad_mask
46 | 
47 |         dec_output = self.pos_ffn(dec_output)
48 |         dec_output *= non_pad_mask
49 | 
50 |         return dec_output, dec_slf_attn, dec_enc_attn
51 | 


--------------------------------------------------------------------------------
/model/transformer/Modules.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import numpy as np
 4 | 
 5 | __author__ = "Yu-Hsiang Huang"
 6 | 
 7 | class ScaledDotProductAttention(nn.Module):
 8 |     ''' Scaled Dot-Product Attention '''
 9 | 
10 |     def __init__(self, temperature, attn_dropout=0.1):
11 |         super().__init__()
12 |         self.temperature = temperature
13 |         self.dropout = nn.Dropout(attn_dropout)
14 |         self.softmax = nn.Softmax(dim=2)
15 | 
16 |     def forward(self, q, k, v, mask=None):
17 | 
18 |         attn = torch.bmm(q, k.transpose(1, 2))
19 |         attn = attn / self.temperature
20 | 
21 |         if mask is not None:
22 |             attn = attn.masked_fill(mask, -np.inf)
23 | 
24 |         attn = self.softmax(attn)
25 |         attn = self.dropout(attn)
26 |         output = torch.bmm(attn, v)
27 | 
28 |         return output, attn
29 | 


--------------------------------------------------------------------------------
/model/transformer/Optim.py:
--------------------------------------------------------------------------------
 1 | '''A wrapper class for optimizer '''
 2 | import numpy as np
 3 | 
 4 | 
 5 | class ScheduledOptim():
 6 |     '''A simple wrapper class for learning rate scheduling'''
 7 | 
 8 |     def __init__(self, optimizer, d_model, n_warmup_steps):
 9 |         self._optimizer = optimizer
10 |         self.n_warmup_steps = n_warmup_steps
11 |         self.n_current_steps = 0
12 |         self.init_lr = np.power(d_model, -0.5)
13 | 
14 |     def step_and_update_lr(self):
15 |         "Step with the inner optimizer"
16 |         self._update_learning_rate()
17 |         self._optimizer.step()
18 | 
19 |     def zero_grad(self):
20 |         "Zero out the gradients by the inner optimizer"
21 |         self._optimizer.zero_grad()
22 | 
23 |     def _get_lr_scale(self):
24 |         return np.min([
25 |             np.power(self.n_current_steps, -0.8),
26 |             np.power(self.n_warmup_steps, -1.8) * self.n_current_steps])
27 | 
28 |     def _update_learning_rate(self):
29 |         ''' Learning rate scheduling per step '''
30 | 
31 |         self.n_current_steps += 1
32 |         lr = self.init_lr * self._get_lr_scale()
33 | 
34 |         for param_group in self._optimizer.param_groups:
35 |             param_group['lr'] = lr
36 | 
37 | 


--------------------------------------------------------------------------------
/model/transformer/SubLayers.py:
--------------------------------------------------------------------------------
 1 | ''' Define the sublayers in encoder/decoder layer '''
 2 | import numpy as np
 3 | import torch.nn as nn
 4 | import torch.nn.functional as F
 5 | from model.transformer.Modules import ScaledDotProductAttention
 6 | 
 7 | __author__ = "Yu-Hsiang Huang"
 8 | __RevisedBy__ = 'Jacob Zhiyuan Fang'
 9 | 
10 | 
11 | class MultiHeadAttention(nn.Module):
12 |     ''' Multi-Head Attention module '''
13 | 
14 |     def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
15 |         super().__init__()
16 | 
17 |         self.n_head = n_head
18 |         self.d_k = d_k
19 |         self.d_v = d_v
20 | 
21 |         self.w_qs = nn.Linear(d_model, n_head * d_k)
22 |         self.w_ks = nn.Linear(d_model, n_head * d_k)
23 |         self.w_vs = nn.Linear(d_model, n_head * d_v)
24 |         nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
25 |         nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
26 |         nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))
27 | 
28 |         self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5))
29 |         self.layer_norm = nn.LayerNorm(d_model)
30 | 
31 |         self.fc = nn.Linear(n_head * d_v, d_model)
32 |         nn.init.xavier_normal_(self.fc.weight)
33 |         self.dropout = nn.Dropout(dropout)
34 | 
35 |     def forward(self, q, k, v, mask=None):
36 |         d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
37 | 
38 |         sz_b, len_q, _ = q.size()
39 |         sz_b, len_k, _ = k.size()
40 |         sz_b, len_v, _ = v.size()
41 | 
42 |         residual = q
43 | 
44 |         q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
45 |         k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
46 |         v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
47 | 
48 |         q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) # (n*b) x lq x dk
49 |         k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k) # (n*b) x lk x dk
50 |         v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v) # (n*b) x lv x dv
51 | 
52 |         if mask is not None:
53 |             mask = mask.repeat(n_head, 1, 1) #
54 | 
55 |         output, attn = self.attention(q, k, v, mask=mask)
56 | 
57 |         output = output.view(n_head, sz_b, len_q, d_v)
58 |         output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1) # b x lq x (n*dv)
59 | 
60 |         output = self.dropout(self.fc(output))
61 |         output = self.layer_norm(output + residual)
62 | 
63 |         return output, attn
64 | 
65 | 
66 | class PositionwiseFeedForward(nn.Module):
67 |     ''' A two-feed-forward-layer module '''
68 | 
69 |     def __init__(self, d_in, d_hid, dropout=0.1):
70 |         super().__init__()
71 |         self.w_1 = nn.Conv1d(d_in, d_hid, 1) # position-wise
72 |         self.w_2 = nn.Conv1d(d_hid, d_in, 1) # position-wise
73 |         self.layer_norm = nn.LayerNorm(d_in)
74 |         self.dropout = nn.Dropout(dropout)
75 | 
76 |     def forward(self, x):
77 |         residual = x
78 |         output = x.transpose(1, 2)
79 |         output = self.w_2(F.relu(self.w_1(output)))
80 |         output = output.transpose(1, 2)
81 |         output = self.dropout(output)
82 |         output = self.layer_norm(output + residual)
83 |         return output
84 | 


--------------------------------------------------------------------------------
/model/transformer/Transformers.py:
--------------------------------------------------------------------------------
  1 | ''' Define the Transformer model '''
  2 | import torch
  3 | import numpy as np
  4 | import torch.nn as nn
  5 | import model.transformer.Constants as Constants
  6 | from model.transformer.Layers import EncoderLayer, DecoderLayer
  7 | 
  8 | __author__ = 'Yu-Hsiang Huang'
  9 | __AugmentedBy__ = 'Jacob Zhiyuan Fang'
 10 | 
 11 | 
 12 | def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
 13 |     ''' Sinusoid position encoding table '''
 14 | 
 15 |     def cal_angle(position, hid_idx):
 16 |         return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)
 17 | 
 18 |     def get_posi_angle_vec(position):
 19 |         return [cal_angle(position, hid_j) for hid_j in range(d_hid)]
 20 | 
 21 |     sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
 22 | 
 23 |     sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
 24 |     sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
 25 | 
 26 |     if padding_idx is not None:
 27 |         # zero vector for padding dimension
 28 |         sinusoid_table[padding_idx] = 0.
 29 | 
 30 |     return torch.FloatTensor(sinusoid_table)
 31 | 
 32 | 
 33 | def get_non_pad_mask(seq):
 34 |     assert seq.dim() == 2
 35 |     return seq.ne(Constants.PAD).type(torch.float).unsqueeze(-1).cuda()
 36 | 
 37 | 
 38 | def get_attn_key_pad_mask(seq_k, seq_q):
 39 |     ''' For masking out the padding part of key sequence. '''
 40 | 
 41 |     # Expand to fit the shape of key query attention matrix.
 42 |     len_q = seq_q.size(1)
 43 |     padding_mask = seq_k.eq(Constants.PAD)
 44 |     padding_mask = padding_mask.unsqueeze(1).expand(-1, len_q, -1)  # b x lq x lk
 45 | 
 46 |     return padding_mask.cuda()
 47 | 
 48 | 
 49 | def get_subsequent_mask(seq):
 50 |     ''' For masking out the subsequent info. '''
 51 | 
 52 |     sz_b, len_s = seq.size()
 53 |     subsequent_mask = torch.triu(
 54 |         torch.ones((len_s, len_s), device=seq.device, dtype=torch.uint8), diagonal=1)
 55 |     subsequent_mask = subsequent_mask.unsqueeze(0).expand(sz_b, -1, -1)  # b x ls x ls
 56 | 
 57 |     return subsequent_mask.cuda()
 58 | 
 59 | 
 60 | class Encoder(nn.Module):
 61 |     ''' A encoder model with self attention mechanism. '''
 62 | 
 63 |     def __init__(
 64 |             self,
 65 |             len_max_seq, d_word_vec,
 66 |             n_layers, n_head, d_k, d_v,
 67 |             d_model, d_inner, dropout=0.1):
 68 | 
 69 |         super().__init__()
 70 | 
 71 |         n_position = len_max_seq + 1
 72 | 
 73 |         self.position_enc = nn.Embedding.from_pretrained(
 74 |             get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0), freeze=True)
 75 | 
 76 |         self.layer_stack = nn.ModuleList([
 77 |             EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
 78 |             for _ in range(n_layers)])
 79 | 
 80 |     def forward(self, src_emb, src_pos, return_attns=False):
 81 | 
 82 |         enc_slf_attn_list = []
 83 | 
 84 |         # -- Prepare masks
 85 |         _ = torch.rand(src_emb.shape[0], src_emb.shape[1])
 86 |         slf_attn_mask = get_attn_key_pad_mask(seq_k=_, seq_q=_)
 87 |         non_pad_mask = get_non_pad_mask(_)
 88 | 
 89 |         # -- Forward
 90 |         enc_output = src_emb + self.position_enc(src_pos)
 91 | 
 92 |         for enc_layer in self.layer_stack:
 93 |             enc_output, enc_slf_attn = enc_layer(enc_output, non_pad_mask=non_pad_mask, slf_attn_mask=slf_attn_mask)
 94 |             if return_attns:
 95 |                 enc_slf_attn_list += [enc_slf_attn]
 96 | 
 97 |         if return_attns:
 98 |             return enc_output, enc_slf_attn_list
 99 |         return enc_output,
100 | 
101 | 
102 | class Decoder(nn.Module):
103 |     ''' A decoder model with self attention mechanism. '''
104 | 
105 |     def __init__(
106 |             self, n_tgt_vocab, len_max_seq, d_word_vec,
107 |             n_layers, n_head, d_k, d_v,
108 |             d_model, d_inner, dropout=0.1):
109 | 
110 |         super().__init__()
111 |         n_position = len_max_seq + 1
112 | 
113 |         self.tgt_word_emb = nn.Embedding(
114 |             n_tgt_vocab, d_word_vec, padding_idx=Constants.PAD)
115 | 
116 |         self.position_enc = nn.Embedding.from_pretrained(
117 |             get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0),
118 |             freeze=True)
119 | 
120 |         self.layer_stack = nn.ModuleList([
121 |             DecoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
122 |             for _ in range(n_layers)])
123 | 
124 |     def forward(self, tgt_seq, tgt_pos, src_seq, enc_output, return_attns=False):
125 | 
126 |         dec_slf_attn_list, dec_enc_attn_list = [], []
127 | 
128 |         # -- Prepare masks
129 |         non_pad_mask = get_non_pad_mask(tgt_seq)
130 | 
131 |         slf_attn_mask_subseq = get_subsequent_mask(tgt_seq)
132 |         slf_attn_mask_keypad = get_attn_key_pad_mask(seq_k=tgt_seq, seq_q=tgt_seq)
133 |         slf_attn_mask = (slf_attn_mask_keypad + slf_attn_mask_subseq).gt(0)
134 | 
135 |         src_tmp = torch.ones(src_seq.shape[0], src_seq.shape[1]).cuda()
136 |         dec_enc_attn_mask = get_attn_key_pad_mask(seq_k=src_tmp, seq_q=tgt_seq)
137 | 
138 |         # -- Forward
139 |         dec_output = self.tgt_word_emb(tgt_seq) + self.position_enc(tgt_pos)
140 | 
141 |         for dec_layer in self.layer_stack:
142 |             dec_output, dec_slf_attn, dec_enc_attn = dec_layer(
143 |                 dec_output, enc_output,
144 |                 non_pad_mask=non_pad_mask,
145 |                 slf_attn_mask=slf_attn_mask,
146 |                 dec_enc_attn_mask=dec_enc_attn_mask)
147 | 
148 |             if return_attns:
149 |                 dec_slf_attn_list += [dec_slf_attn]
150 |                 dec_enc_attn_list += [dec_enc_attn]
151 | 
152 |         if return_attns:
153 |             return dec_output, dec_slf_attn_list, dec_enc_attn_list
154 |         return dec_output,
155 | 
156 | 
157 | class Transformer(nn.Module):
158 |     ''' A sequence to sequence model with attention mechanism. '''
159 | 
160 |     def __init__(
161 |             self,
162 |             n_tgt_vocab, len_max_seq, vis_emb=2048,
163 |             d_word_vec=512, d_model=512, d_inner=2048,
164 |             n_layers=6, n_head=8, d_k=64, d_v=64, dropout=0.1,
165 |             tgt_emb_prj_weight_sharing=True):
166 | 
167 |         super().__init__()
168 | 
169 |         self.vis_emb = nn.Linear(vis_emb, d_model)
170 | 
171 |         self.encoder = Encoder(
172 |             len_max_seq=40,
173 |             d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
174 |             n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
175 |             dropout=dropout)
176 | 
177 |         self.decoder = Decoder(
178 |             n_tgt_vocab=n_tgt_vocab, len_max_seq=len_max_seq,
179 |             d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
180 |             n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
181 |             dropout=dropout)
182 | 
183 |         self.tgt_word_prj = nn.Linear(d_model, n_tgt_vocab, bias=False)
184 |         nn.init.xavier_normal_(self.tgt_word_prj.weight)
185 | 
186 |         assert d_model == d_word_vec, \
187 |             'To facilitate the residual connections, ' \
188 |             'the dimensions of all module outputs shall be the same.'
189 | 
190 |         if tgt_emb_prj_weight_sharing:
191 |             # Share the weight matrix between target word embedding & the final logit dense layer
192 |             self.tgt_word_prj.weight = self.decoder.tgt_word_emb.weight
193 |             self.x_logit_scale = (d_model ** -0.5)
194 |         else:
195 |             self.x_logit_scale = 1.
196 | 
197 |     def forward(self, src_emb, src_pos, tgt_seq, tgt_pos):
198 | 
199 |         src_emb = self.vis_emb(src_emb)
200 |         tgt_seq, tgt_pos = tgt_seq[:, :-1], tgt_pos[:, :-1]
201 | 
202 |         enc_output, *_ = self.encoder(src_emb, src_pos)
203 |         dec_output, *_ = self.decoder(tgt_seq, tgt_pos, src_emb, enc_output)
204 |         seq_logit = self.tgt_word_prj(dec_output) * self.x_logit_scale
205 | 
206 |         return seq_logit.view(-1, seq_logit.size(2))
207 | 
208 | 


--------------------------------------------------------------------------------
/model/transformer/Translator.py:
--------------------------------------------------------------------------------
  1 | ''' This module will handle the text generation with beam search. '''
  2 | import numpy as np
  3 | from utils.utils import *
  4 | import torch.nn.functional as F
  5 | from model.transformer.Beam import Beam
  6 | 
  7 | __author__ = 'Jacob Zhiyuan Fang'
  8 | 
  9 | 
 10 | def pos_emb_generation(word_labels):
 11 |     '''
 12 |         Generate the position embedding input for Transformers.
 13 |     '''
 14 | 
 15 |     seq = list(range(1, word_labels.shape[1] + 1))
 16 |     tgt_pos = torch.tensor([seq] * word_labels.shape[0]).cuda()
 17 |     binary_mask = (word_labels != 0).long()
 18 | 
 19 |     return tgt_pos*binary_mask
 20 | 
 21 | 
 22 | def translate_batch(model, src_emb, opt):
 23 |     ''' Translation work in one batch '''
 24 | 
 25 |     def get_inst_idx_to_tensor_position_map(inst_idx_list):
 26 |         ''' Indicate the position of an instance in a tensor. '''
 27 |         return {inst_idx: tensor_position for tensor_position, inst_idx in enumerate(inst_idx_list)}
 28 | 
 29 |     def collect_active_part(beamed_tensor, curr_active_inst_idx, n_prev_active_inst, n_bm):
 30 |         ''' Collect tensor parts associated to active instances. '''
 31 | 
 32 |         _, *d_hs = beamed_tensor.size()
 33 |         n_curr_active_inst = len(curr_active_inst_idx)
 34 |         new_shape = (n_curr_active_inst * n_bm, *d_hs)
 35 | 
 36 |         beamed_tensor = beamed_tensor.view(n_prev_active_inst, -1)
 37 |         beamed_tensor = beamed_tensor.index_select(0, curr_active_inst_idx)
 38 |         beamed_tensor = beamed_tensor.view(*new_shape)
 39 | 
 40 |         return beamed_tensor
 41 | 
 42 |     def collate_active_info(src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list):
 43 |         # Sentences which are still active are collected,
 44 |         # so the decoder will not run on completed sentences.
 45 |         n_prev_active_inst = len(inst_idx_to_position_map)
 46 |         active_inst_idx = [inst_idx_to_position_map[k] for k in active_inst_idx_list]
 47 |         active_inst_idx = torch.LongTensor(active_inst_idx).cuda()
 48 | 
 49 |         active_src_seq = collect_active_part(src_seq, active_inst_idx, n_prev_active_inst, n_bm)
 50 |         active_src_enc = collect_active_part(src_enc, active_inst_idx, n_prev_active_inst, n_bm)
 51 |         active_inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list)
 52 | 
 53 |         return active_src_seq, active_src_enc, active_inst_idx_to_position_map
 54 | 
 55 |     def beam_decode_step(inst_dec_beams, len_dec_seq, src_seq, enc_output, inst_idx_to_position_map, n_bm, mode):
 56 |         ''' Decode and update beam status, and then return active beam idx '''
 57 | 
 58 |         def prepare_beam_dec_seq(inst_dec_beams, len_dec_seq):
 59 |             dec_partial_seq = [b.get_current_state() for b in inst_dec_beams if not b.done]
 60 |             dec_partial_seq = torch.stack(dec_partial_seq).cuda()
 61 |             dec_partial_seq = dec_partial_seq.view(-1, len_dec_seq)
 62 |             return dec_partial_seq
 63 | 
 64 |         def prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm):
 65 |             dec_partial_pos = torch.arange(1, len_dec_seq + 1, dtype=torch.long).cuda()
 66 |             dec_partial_pos = dec_partial_pos.unsqueeze(0).repeat(n_active_inst * n_bm, 1)
 67 |             return dec_partial_pos
 68 | 
 69 |         def predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm):
 70 |             if mode == 'cap':
 71 |                 dec_output, dec_slf_attn_list, dec_enc_attn_list = model.decoder\
 72 |                     (dec_seq, dec_pos, src_seq, enc_output, return_attns=True)
 73 |                 # print(dec_enc_attn_list[-1][0])
 74 |                 dec_output = dec_output[:, -1, :]  # Pick the last step: (bh * bm) * d_h
 75 |                 word_prob = F.log_softmax(model.cap_word_prj(dec_output), dim=1)
 76 |                 word_prob = word_prob.view(n_active_inst, n_bm, -1)
 77 | 
 78 |             elif mode == 'int':
 79 |                 dec_output, dec_slf_attn_list, dec_enc_attn_list = model.cms_decoder\
 80 |                     (dec_seq, dec_pos, src_seq, enc_output, return_attns=True)
 81 |                 # print(dec_enc_attn_list[-1][0])
 82 |                 dec_output = dec_output[:, -1, :]  # Pick the last step: (bh * bm) * d_h
 83 |                 word_prob = F.log_softmax(model.cms_word_prj(dec_output), dim=1)
 84 |                 word_prob = word_prob.view(n_active_inst, n_bm, -1)
 85 |             return word_prob
 86 | 
 87 |         def collect_active_inst_idx_list(inst_beams, word_prob, inst_idx_to_position_map):
 88 |             active_inst_idx_list = []
 89 |             for inst_idx, inst_position in inst_idx_to_position_map.items():
 90 |                 is_inst_complete = inst_beams[inst_idx].advance(word_prob[inst_position])
 91 |                 if not is_inst_complete:
 92 |                     active_inst_idx_list += [inst_idx]
 93 | 
 94 |             return active_inst_idx_list
 95 | 
 96 |         n_active_inst = len(inst_idx_to_position_map)
 97 | 
 98 |         dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq)
 99 |         dec_pos = prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm)
100 |         word_prob = predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm)
101 | 
102 |         # Update the beam with predicted word prob information and collect incomplete instances
103 |         active_inst_idx_list = collect_active_inst_idx_list(inst_dec_beams, word_prob, inst_idx_to_position_map)
104 | 
105 |         return active_inst_idx_list
106 | 
107 |     def collect_hypothesis_and_scores(inst_dec_beams, n_best):
108 |         all_hyp, all_scores = [], []
109 |         for inst_idx in range(len(inst_dec_beams)):
110 |             scores, tail_idxs = inst_dec_beams[inst_idx].sort_scores()
111 |             all_scores += [scores[:n_best]]
112 | 
113 |             hyps = [inst_dec_beams[inst_idx].get_hypothesis(i) for i in tail_idxs[:n_best]]
114 |             all_hyp += [hyps]
115 |         return all_hyp, all_scores
116 | 
117 |     with torch.no_grad():
118 |         # Encode
119 |         src_seq = src_emb.cuda()
120 |         src_enc, *_ = model.encoder(src_seq)
121 |         video_encoding = src_enc
122 | 
123 |         # Repeat data for beam search
124 |         n_bm = 1
125 |         n_inst, len_s, d_h = src_enc.size()
126 |         src_enc = src_enc.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, d_h)
127 |         src_seq = src_seq.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, -1)
128 | 
129 |         # Prepare beams
130 |         inst_dec_beams = [Beam(n_bm, device='cuda') for _ in range(n_inst)]
131 | 
132 |         # Bookkeeping for active or not
133 |         active_inst_idx_list = list(range(n_inst))
134 |         inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list)
135 | 
136 |         # <---------------------------------------------Decode CAP ---------------------------------------------------->
137 |         for len_dec_seq in range(1, 28 + 1):
138 | 
139 |             active_inst_idx_list = beam_decode_step(
140 |                 inst_dec_beams, len_dec_seq, src_seq, src_enc, inst_idx_to_position_map, n_bm, mode='cap')
141 | 
142 |             if not active_inst_idx_list:
143 |                 break  # all instances have finished their path to <EOS>
144 | 
145 |             src_seq, src_enc, inst_idx_to_position_map = collate_active_info(
146 |                 src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list)
147 | 
148 |         batch_hyp, batch_scores = collect_hypothesis_and_scores(inst_dec_beams, n_bm)
149 | 
150 |         # <---------------------------------------------Decode CMS ---------------------------------------------------->
151 |         cms_batch_hyp = []
152 |         for cap_idx in range(n_bm):
153 |             [_[0].insert(0, 2) for _ in batch_hyp]  # Start with <BOS> symbol
154 |             dec_seq = np.zeros((opt['batch_size'], opt['cap_max_len']))
155 |             for idx, seq in enumerate(batch_hyp):
156 |                 dec_seq[idx, :len(seq[cap_idx])] = seq[cap_idx]
157 |             dec_seq = torch.as_tensor(dec_seq).cuda().long()
158 |             dec_pos = pos_emb_generation(dec_seq).long()
159 |             dec_output_, *_ = model.decoder(dec_seq[:, :-1], dec_pos[:, :-1], src_emb.cuda(), video_encoding)
160 | 
161 |             # Concatenate visual-captioning encodings
162 |             cat_encoding = torch.cat((video_encoding, dec_output_), 1)
163 | 
164 |             # Repeat data for beam search for CMS
165 |             n_inst, len_s, d_h = cat_encoding.size()
166 |             src_enc = cat_encoding.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, d_h)
167 |             src_seq = cat_encoding.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, -1)
168 | 
169 |             # Prepare beams
170 |             inst_dec_beams = [Beam(n_bm, device='cuda') for _ in range(n_inst)]
171 | 
172 |             # Bookkeeping for active or not
173 |             active_inst_idx_list = list(range(n_inst))
174 |             inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list)
175 | 
176 |             for len_dec_seq in range(1, opt['eff_max_len'] - 1):
177 | 
178 |                 active_inst_idx_list = beam_decode_step(
179 |                     inst_dec_beams, len_dec_seq, src_seq, src_enc, inst_idx_to_position_map, n_bm, mode='int')
180 | 
181 |                 if not active_inst_idx_list:
182 |                     break  # all instances have finished their path to <EOS>
183 | 
184 |                 src_seq, src_enc, inst_idx_to_position_map = collate_active_info(
185 |                     src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list)
186 | 
187 |             cms_hyp, cms_batch_scores = collect_hypothesis_and_scores(inst_dec_beams, 3)
188 |             cms_batch_hyp.append(cms_hyp)
189 |     # only return the top-1 cms beam searched result
190 |     return batch_hyp, cms_batch_hyp[0]
191 | 
192 | 
193 | 
194 | 


--------------------------------------------------------------------------------
/model/transformer/__init__.py:
--------------------------------------------------------------------------------
 1 | import model.transformer.Constants
 2 | import model.transformer.Modules
 3 | import model.transformer.Layers
 4 | import model.transformer.SubLayers
 5 | import model.Model
 6 | import model.transformer.Translator
 7 | import model.transformer.Beam
 8 | import model.transformer.Optim
 9 | 
10 | # __all__ = [
11 | #     model.transformer.Constants, model.transformer.Modules, model.transformer.Layers,
12 | #     model.transformer.SubLayers, model.transformer.Models, model.transformer.Optim,
13 | #     model.transformer.Translator, model.transformer.Beam]
14 | 


--------------------------------------------------------------------------------
/model/transformer/cap2cms_Translator.py:
--------------------------------------------------------------------------------
  1 | ''' This module will handle the text generation with beam search. '''
  2 | import numpy as np
  3 | from utils.utils import *
  4 | import torch.nn.functional as F
  5 | from model.transformer.Beam import Beam
  6 | 
  7 | __author__ = 'Jacob Zhiyuan Fang'
  8 | 
  9 | 
 10 | def pos_emb_generation(word_labels):
 11 |     '''
 12 |         Generate the position embedding input for Transformers.
 13 |     '''
 14 | 
 15 |     seq = list(range(1, word_labels.shape[1] + 1))
 16 |     tgt_pos = torch.tensor([seq] * word_labels.shape[0]).cuda()
 17 |     binary_mask = (word_labels != 0).long()
 18 | 
 19 |     return tgt_pos*binary_mask
 20 | 
 21 | 
 22 | def translate_batch(model, src_emb, cap_label, opt):
 23 |     ''' Translation work in one batch '''
 24 | 
 25 |     def get_inst_idx_to_tensor_position_map(inst_idx_list):
 26 |         ''' Indicate the position of an instance in a tensor. '''
 27 |         return {inst_idx: tensor_position for tensor_position, inst_idx in enumerate(inst_idx_list)}
 28 | 
 29 |     def collect_active_part(beamed_tensor, curr_active_inst_idx, n_prev_active_inst, n_bm):
 30 |         ''' Collect tensor parts associated to active instances. '''
 31 | 
 32 |         _, *d_hs = beamed_tensor.size()
 33 |         n_curr_active_inst = len(curr_active_inst_idx)
 34 |         new_shape = (n_curr_active_inst * n_bm, *d_hs)
 35 | 
 36 |         beamed_tensor = beamed_tensor.view(n_prev_active_inst, -1)
 37 |         beamed_tensor = beamed_tensor.index_select(0, curr_active_inst_idx)
 38 |         beamed_tensor = beamed_tensor.view(*new_shape)
 39 | 
 40 |         return beamed_tensor
 41 | 
 42 |     def collate_active_info(src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list):
 43 |         # Sentences which are still active are collected,
 44 |         # so the decoder will not run on completed sentences.
 45 |         n_prev_active_inst = len(inst_idx_to_position_map)
 46 |         active_inst_idx = [inst_idx_to_position_map[k] for k in active_inst_idx_list]
 47 |         active_inst_idx = torch.LongTensor(active_inst_idx).cuda()
 48 | 
 49 |         active_src_seq = collect_active_part(src_seq, active_inst_idx, n_prev_active_inst, n_bm)
 50 |         active_src_enc = collect_active_part(src_enc, active_inst_idx, n_prev_active_inst, n_bm)
 51 |         active_inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list)
 52 | 
 53 |         return active_src_seq, active_src_enc, active_inst_idx_to_position_map
 54 | 
 55 |     def beam_decode_step(inst_dec_beams, len_dec_seq, src_seq, enc_output, inst_idx_to_position_map, n_bm, mode):
 56 |         ''' Decode and update beam status, and then return active beam idx '''
 57 | 
 58 |         def prepare_beam_dec_seq(inst_dec_beams, len_dec_seq):
 59 |             dec_partial_seq = [b.get_current_state() for b in inst_dec_beams if not b.done]
 60 |             dec_partial_seq = torch.stack(dec_partial_seq).cuda()
 61 |             dec_partial_seq = dec_partial_seq.view(-1, len_dec_seq)
 62 |             return dec_partial_seq
 63 | 
 64 |         def prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm):
 65 |             dec_partial_pos = torch.arange(1, len_dec_seq + 1, dtype=torch.long).cuda()
 66 |             dec_partial_pos = dec_partial_pos.unsqueeze(0).repeat(n_active_inst * n_bm, 1)
 67 |             return dec_partial_pos
 68 | 
 69 |         def predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm):
 70 |             if mode == 'cap':
 71 |                 dec_output, *_ = model.decoder(dec_seq, dec_pos, src_seq, enc_output)
 72 |                 dec_output = dec_output[:, -1, :]  # Pick the last step: (bh * bm) * d_h
 73 |                 word_prob = F.log_softmax(model.cap_word_prj(dec_output), dim=1)
 74 |                 word_prob = word_prob.view(n_active_inst, n_bm, -1)
 75 | 
 76 |             elif mode == 'int':
 77 |                 dec_output, *_ = model.cms_decoder(dec_seq, dec_pos, src_seq, enc_output)
 78 |                 dec_output = dec_output[:, -1, :]  # Pick the last step: (bh * bm) * d_h
 79 |                 word_prob = F.log_softmax(model.cms_word_prj(dec_output), dim=1)
 80 |                 word_prob = word_prob.view(n_active_inst, n_bm, -1)
 81 |             return word_prob
 82 | 
 83 |         def collect_active_inst_idx_list(inst_beams, word_prob, inst_idx_to_position_map):
 84 |             active_inst_idx_list = []
 85 |             for inst_idx, inst_position in inst_idx_to_position_map.items():
 86 |                 is_inst_complete = inst_beams[inst_idx].advance(word_prob[inst_position])
 87 |                 if not is_inst_complete:
 88 |                     active_inst_idx_list += [inst_idx]
 89 | 
 90 |             return active_inst_idx_list
 91 | 
 92 |         n_active_inst = len(inst_idx_to_position_map)
 93 | 
 94 |         dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq)
 95 |         dec_pos = prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm)
 96 |         word_prob = predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm)
 97 | 
 98 |         # Update the beam with predicted word prob information and collect incomplete instances
 99 |         active_inst_idx_list = collect_active_inst_idx_list(inst_dec_beams, word_prob, inst_idx_to_position_map)
100 | 
101 |         return active_inst_idx_list
102 | 
103 |     def collect_hypothesis_and_scores(inst_dec_beams, n_best):
104 |         all_hyp, all_scores = [], []
105 |         for inst_idx in range(len(inst_dec_beams)):
106 |             scores, tail_idxs = inst_dec_beams[inst_idx].sort_scores()
107 |             all_scores += [scores[:n_best]]
108 | 
109 |             hyps = [inst_dec_beams[inst_idx].get_hypothesis(i) for i in tail_idxs[:n_best]]
110 |             all_hyp += [hyps]
111 |         return all_hyp, all_scores
112 | 
113 |     with torch.no_grad():
114 |         # <--------------------------------------------- Decode Video ------------------------------------------------->
115 |         src_seq = src_emb.cuda()
116 |         src_enc, *_ = model.encoder(src_seq)
117 |         video_encoding = src_enc
118 | 
119 |         # <--------------------------------------------- Decode CAP --------------------------------------------------->
120 |         cap_pos = pos_emb_generation(cap_label)
121 |         cap_label, cap_pos = cap_label[:, :-1], cap_pos[:, :-1]
122 | 
123 |         cap_dec_output, *_ = model.decoder(cap_label, cap_pos, video_encoding, video_encoding)
124 | 
125 |         # Concatenate visual and caption encoding
126 |         cat_encoding = torch.cat((video_encoding, cap_dec_output), 1)
127 |         # cat_encoding = cap_dec_output
128 | 
129 |         # <--------------------------------------------- Decode CMS --------------------------------------------------->
130 |         # Repeat data for beam search for CMS
131 |         n_bm = 2
132 |         n_inst, len_s, d_h = cat_encoding.size()
133 |         src_enc = cat_encoding.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, d_h)
134 |         src_seq = cat_encoding.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, -1)
135 | 
136 |         # Prepare beams
137 |         inst_dec_beams = [Beam(n_bm, device='cuda') for _ in range(n_inst)]
138 | 
139 |         # Bookkeeping for active or not
140 |         active_inst_idx_list = list(range(n_inst))
141 |         inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list)
142 | 
143 |         for len_dec_seq in range(1, opt['eff_max_len'] - 1):
144 | 
145 |             active_inst_idx_list = beam_decode_step(
146 |                 inst_dec_beams, len_dec_seq, src_seq, src_enc, inst_idx_to_position_map, n_bm, mode='int')
147 | 
148 |             if not active_inst_idx_list:
149 |                 break  # all instances have finished their path to <EOS>
150 | 
151 |             src_seq, src_enc, inst_idx_to_position_map = collate_active_info(
152 |                 src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list)
153 | 
154 |         cms_batch_hyp, cms_batch_scores = collect_hypothesis_and_scores(inst_dec_beams, 1)
155 | 
156 |     return cms_batch_hyp
157 | 
158 | 
159 | 
160 | 


--------------------------------------------------------------------------------
/model/transformer/cap2cms_Translator_transformer.py:
--------------------------------------------------------------------------------
  1 | ''' This module will handle the text generation with beam search. '''
  2 | import numpy as np
  3 | from utils.utils import *
  4 | import torch.nn.functional as F
  5 | from model.transformer.Beam import Beam
  6 | 
  7 | __author__ = 'Jacob Zhiyuan Fang'
  8 | 
  9 | 
 10 | def pos_emb_generation(word_labels):
 11 |     '''
 12 |         Generate the position embedding input for Transformers.
 13 |     '''
 14 | 
 15 |     seq = list(range(1, word_labels.shape[1] + 1))
 16 |     tgt_pos = torch.tensor([seq] * word_labels.shape[0]).cuda()
 17 |     binary_mask = (word_labels != 0).long()
 18 | 
 19 |     return tgt_pos*binary_mask
 20 | 
 21 | 
 22 | def translate_batch(model, src_emb, cap_label, opt):
 23 |     ''' Translation work in one batch '''
 24 | 
 25 |     def get_inst_idx_to_tensor_position_map(inst_idx_list):
 26 |         ''' Indicate the position of an instance in a tensor. '''
 27 |         return {inst_idx: tensor_position for tensor_position, inst_idx in enumerate(inst_idx_list)}
 28 | 
 29 |     def collect_active_part(beamed_tensor, curr_active_inst_idx, n_prev_active_inst, n_bm):
 30 |         ''' Collect tensor parts associated to active instances. '''
 31 | 
 32 |         _, *d_hs = beamed_tensor.size()
 33 |         n_curr_active_inst = len(curr_active_inst_idx)
 34 |         new_shape = (n_curr_active_inst * n_bm, *d_hs)
 35 | 
 36 |         beamed_tensor = beamed_tensor.view(n_prev_active_inst, -1)
 37 |         beamed_tensor = beamed_tensor.index_select(0, curr_active_inst_idx)
 38 |         beamed_tensor = beamed_tensor.view(*new_shape)
 39 | 
 40 |         return beamed_tensor
 41 | 
 42 |     def collate_active_info(src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list):
 43 |         # Sentences which are still active are collected,
 44 |         # so the decoder will not run on completed sentences.
 45 |         n_prev_active_inst = len(inst_idx_to_position_map)
 46 |         active_inst_idx = [inst_idx_to_position_map[k] for k in active_inst_idx_list]
 47 |         active_inst_idx = torch.LongTensor(active_inst_idx).cuda()
 48 | 
 49 |         active_src_seq = collect_active_part(src_seq, active_inst_idx, n_prev_active_inst, n_bm)
 50 |         active_src_enc = collect_active_part(src_enc, active_inst_idx, n_prev_active_inst, n_bm)
 51 |         active_inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list)
 52 | 
 53 |         return active_src_seq, active_src_enc, active_inst_idx_to_position_map
 54 | 
 55 |     def beam_decode_step(inst_dec_beams, len_dec_seq, src_seq, enc_output, inst_idx_to_position_map, n_bm, mode):
 56 |         ''' Decode and update beam status, and then return active beam idx '''
 57 | 
 58 |         def prepare_beam_dec_seq(inst_dec_beams, len_dec_seq):
 59 |             dec_partial_seq = [b.get_current_state() for b in inst_dec_beams if not b.done]
 60 |             dec_partial_seq = torch.stack(dec_partial_seq).cuda()
 61 |             dec_partial_seq = dec_partial_seq.view(-1, len_dec_seq)
 62 |             return dec_partial_seq
 63 | 
 64 |         def prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm):
 65 |             dec_partial_pos = torch.arange(1, len_dec_seq + 1, dtype=torch.long).cuda()
 66 |             dec_partial_pos = dec_partial_pos.unsqueeze(0).repeat(n_active_inst * n_bm, 1)
 67 |             return dec_partial_pos
 68 | 
 69 |         def predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm):
 70 |             if mode == 'cap':
 71 |                 dec_output, *_ = model.decoder(dec_seq, dec_pos, src_seq, enc_output)
 72 |                 dec_output = dec_output[:, -1, :]  # Pick the last step: (bh * bm) * d_h
 73 |                 word_prob = F.log_softmax(model.cap_word_prj(dec_output), dim=1)
 74 |                 word_prob = word_prob.view(n_active_inst, n_bm, -1)
 75 | 
 76 |             elif mode == 'int':
 77 |                 dec_output, *_ = model.cms_decoder(dec_seq, dec_pos, src_seq, enc_output)
 78 |                 dec_output = dec_output[:, -1, :]  # Pick the last step: (bh * bm) * d_h
 79 |                 word_prob = F.log_softmax(model.cms_word_prj(dec_output), dim=1)
 80 |                 word_prob = word_prob.view(n_active_inst, n_bm, -1)
 81 |             return word_prob
 82 | 
 83 |         def collect_active_inst_idx_list(inst_beams, word_prob, inst_idx_to_position_map):
 84 |             active_inst_idx_list = []
 85 |             for inst_idx, inst_position in inst_idx_to_position_map.items():
 86 |                 is_inst_complete = inst_beams[inst_idx].advance(word_prob[inst_position])
 87 |                 if not is_inst_complete:
 88 |                     active_inst_idx_list += [inst_idx]
 89 | 
 90 |             return active_inst_idx_list
 91 | 
 92 |         n_active_inst = len(inst_idx_to_position_map)
 93 | 
 94 |         dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq)
 95 |         dec_pos = prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm)
 96 |         word_prob = predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm)
 97 | 
 98 |         # Update the beam with predicted word prob information and collect incomplete instances
 99 |         active_inst_idx_list = collect_active_inst_idx_list(inst_dec_beams, word_prob, inst_idx_to_position_map)
100 | 
101 |         return active_inst_idx_list
102 | 
103 |     def collect_hypothesis_and_scores(inst_dec_beams, n_best):
104 |         all_hyp, all_scores = [], []
105 |         for inst_idx in range(len(inst_dec_beams)):
106 |             scores, tail_idxs = inst_dec_beams[inst_idx].sort_scores()
107 |             all_scores += [scores[:n_best]]
108 | 
109 |             hyps = [inst_dec_beams[inst_idx].get_hypothesis(i) for i in tail_idxs[:n_best]]
110 |             all_hyp += [hyps]
111 |         return all_hyp, all_scores
112 | 
113 |     with torch.no_grad():
114 |         # <--------------------------------------------- Decode Video ------------------------------------------------->
115 |         src_seq = src_emb.cuda()
116 | 
117 |         src_seq = model.vis_emb(src_seq)
118 |         vis_pos = torch.tensor(list(range(0, 40))).cuda().unsqueeze(0).repeat(src_seq.shape[0], 1)
119 |         video_encoding, *_ = model.encoder(src_seq, vis_pos)
120 | 
121 |         # <--------------------------------------------- Decode CAP --------------------------------------------------->
122 |         cap_pos = pos_emb_generation(cap_label)
123 |         cap_label, cap_pos = cap_label[:, :-1], cap_pos[:, :-1]
124 | 
125 |         cap_dec_output, *_ = model.decoder(cap_label, cap_pos, video_encoding, video_encoding)
126 | 
127 |         # Concatenate visual and caption encoding
128 |         cat_encoding = torch.cat((video_encoding, cap_dec_output), 1)
129 |         # cat_encoding = cap_dec_output
130 | 
131 |         # <--------------------------------------------- Decode CMS --------------------------------------------------->
132 |         # Repeat data for beam search for CMS
133 |         n_bm = 2
134 |         n_inst, len_s, d_h = cat_encoding.size()
135 |         src_enc = cat_encoding.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, d_h)
136 |         src_seq = cat_encoding.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, -1)
137 | 
138 |         # Prepare beams
139 |         inst_dec_beams = [Beam(n_bm, device='cuda') for _ in range(n_inst)]
140 | 
141 |         # Bookkeeping for active or not
142 |         active_inst_idx_list = list(range(n_inst))
143 |         inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list)
144 | 
145 |         for len_dec_seq in range(1, opt['eff_max_len'] - 1):
146 | 
147 |             active_inst_idx_list = beam_decode_step(
148 |                 inst_dec_beams, len_dec_seq, src_seq, src_enc, inst_idx_to_position_map, n_bm, mode='int')
149 | 
150 |             if not active_inst_idx_list:
151 |                 break  # all instances have finished their path to <EOS>
152 | 
153 |             src_seq, src_enc, inst_idx_to_position_map = collate_active_info(
154 |                 src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list)
155 | 
156 |         cms_batch_hyp, cms_batch_scores = collect_hypothesis_and_scores(inst_dec_beams, 1)
157 | 
158 |     return cms_batch_hyp
159 | 
160 | 
161 | 
162 | 


--------------------------------------------------------------------------------
/model/transformer/cap2cms_Translator_transformerDecoder.py:
--------------------------------------------------------------------------------
  1 | ''' This module will handle the text generation with beam search. '''
  2 | import numpy as np
  3 | from utils.utils import *
  4 | import torch.nn.functional as F
  5 | from model.transformer.Beam import Beam
  6 | 
  7 | __author__ = 'Jacob Zhiyuan Fang'
  8 | 
  9 | 
 10 | def pos_emb_generation(word_labels):
 11 |     '''
 12 |         Generate the position embedding input for Transformers.
 13 |     '''
 14 | 
 15 |     seq = list(range(1, word_labels.shape[1] + 1))
 16 |     tgt_pos = torch.tensor([seq] * word_labels.shape[0]).cuda()
 17 |     binary_mask = (word_labels != 0).long()
 18 | 
 19 |     return tgt_pos*binary_mask
 20 | 
 21 | 
 22 | def translate_batch(model, src_emb, cap_label, opt):
 23 |     ''' Translation work in one batch '''
 24 | 
 25 |     def get_inst_idx_to_tensor_position_map(inst_idx_list):
 26 |         ''' Indicate the position of an instance in a tensor. '''
 27 |         return {inst_idx: tensor_position for tensor_position, inst_idx in enumerate(inst_idx_list)}
 28 | 
 29 |     def collect_active_part(beamed_tensor, curr_active_inst_idx, n_prev_active_inst, n_bm):
 30 |         ''' Collect tensor parts associated to active instances. '''
 31 | 
 32 |         _, *d_hs = beamed_tensor.size()
 33 |         n_curr_active_inst = len(curr_active_inst_idx)
 34 |         new_shape = (n_curr_active_inst * n_bm, *d_hs)
 35 | 
 36 |         beamed_tensor = beamed_tensor.view(n_prev_active_inst, -1)
 37 |         beamed_tensor = beamed_tensor.index_select(0, curr_active_inst_idx)
 38 |         beamed_tensor = beamed_tensor.view(*new_shape)
 39 | 
 40 |         return beamed_tensor
 41 | 
 42 |     def collate_active_info(src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list):
 43 |         # Sentences which are still active are collected,
 44 |         # so the decoder will not run on completed sentences.
 45 |         n_prev_active_inst = len(inst_idx_to_position_map)
 46 |         active_inst_idx = [inst_idx_to_position_map[k] for k in active_inst_idx_list]
 47 |         active_inst_idx = torch.LongTensor(active_inst_idx).cuda()
 48 | 
 49 |         active_src_seq = collect_active_part(src_seq, active_inst_idx, n_prev_active_inst, n_bm)
 50 |         active_src_enc = collect_active_part(src_enc, active_inst_idx, n_prev_active_inst, n_bm)
 51 |         active_inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list)
 52 | 
 53 |         return active_src_seq, active_src_enc, active_inst_idx_to_position_map
 54 | 
 55 |     def beam_decode_step(inst_dec_beams, len_dec_seq, src_seq, enc_output, inst_idx_to_position_map, n_bm, mode):
 56 |         ''' Decode and update beam status, and then return active beam idx '''
 57 | 
 58 |         def prepare_beam_dec_seq(inst_dec_beams, len_dec_seq):
 59 |             dec_partial_seq = [b.get_current_state() for b in inst_dec_beams if not b.done]
 60 |             dec_partial_seq = torch.stack(dec_partial_seq).cuda()
 61 |             dec_partial_seq = dec_partial_seq.view(-1, len_dec_seq)
 62 |             return dec_partial_seq
 63 | 
 64 |         def prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm):
 65 |             dec_partial_pos = torch.arange(1, len_dec_seq + 1, dtype=torch.long).cuda()
 66 |             dec_partial_pos = dec_partial_pos.unsqueeze(0).repeat(n_active_inst * n_bm, 1)
 67 |             return dec_partial_pos
 68 | 
 69 |         def predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm):
 70 |             if mode == 'cap':
 71 |                 dec_output, *_ = model.decoder(dec_seq, dec_pos, src_seq, enc_output)
 72 |                 dec_output = dec_output[:, -1, :]  # Pick the last step: (bh * bm) * d_h
 73 |                 word_prob = F.log_softmax(model.cap_word_prj(dec_output), dim=1)
 74 |                 word_prob = word_prob.view(n_active_inst, n_bm, -1)
 75 | 
 76 |             elif mode == 'int':
 77 |                 dec_output, *_ = model.cms_decoder(dec_seq, dec_pos, src_seq, enc_output)
 78 |                 dec_output = dec_output[:, -1, :]  # Pick the last step: (bh * bm) * d_h
 79 |                 word_prob = F.log_softmax(model.cms_word_prj(dec_output), dim=1)
 80 |                 word_prob = word_prob.view(n_active_inst, n_bm, -1)
 81 |             return word_prob
 82 | 
 83 |         def collect_active_inst_idx_list(inst_beams, word_prob, inst_idx_to_position_map):
 84 |             active_inst_idx_list = []
 85 |             for inst_idx, inst_position in inst_idx_to_position_map.items():
 86 |                 is_inst_complete = inst_beams[inst_idx].advance(word_prob[inst_position])
 87 |                 if not is_inst_complete:
 88 |                     active_inst_idx_list += [inst_idx]
 89 | 
 90 |             return active_inst_idx_list
 91 | 
 92 |         n_active_inst = len(inst_idx_to_position_map)
 93 | 
 94 |         dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq)
 95 |         dec_pos = prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm)
 96 |         word_prob = predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm)
 97 | 
 98 |         # Update the beam with predicted word prob information and collect incomplete instances
 99 |         active_inst_idx_list = collect_active_inst_idx_list(inst_dec_beams, word_prob, inst_idx_to_position_map)
100 | 
101 |         return active_inst_idx_list
102 | 
103 |     def collect_hypothesis_and_scores(inst_dec_beams, n_best):
104 |         all_hyp, all_scores = [], []
105 |         for inst_idx in range(len(inst_dec_beams)):
106 |             scores, tail_idxs = inst_dec_beams[inst_idx].sort_scores()
107 |             all_scores += [scores[:n_best]]
108 | 
109 |             hyps = [inst_dec_beams[inst_idx].get_hypothesis(i) for i in tail_idxs[:n_best]]
110 |             all_hyp += [hyps]
111 |         return all_hyp, all_scores
112 | 
113 |     with torch.no_grad():
114 |         # <--------------------------------------------- Decode Video ------------------------------------------------->
115 |         src_seq = src_emb.cuda()
116 |         video_encoding = model.vis_emb(src_seq)
117 | 
118 |         # <--------------------------------------------- Decode CAP --------------------------------------------------->
119 |         cap_pos = pos_emb_generation(cap_label)
120 |         cap_label, cap_pos = cap_label[:, :-1], cap_pos[:, :-1]
121 | 
122 |         cap_dec_output, *_ = model.decoder(cap_label, cap_pos, video_encoding, video_encoding)
123 | 
124 |         # Concatenate visual and caption encoding
125 |         cat_encoding = torch.cat((video_encoding, cap_dec_output), 1)
126 |         # cat_encoding = cap_dec_output
127 | 
128 |         # <--------------------------------------------- Decode CMS --------------------------------------------------->
129 |         # Repeat data for beam search for CMS
130 |         n_bm = 2
131 |         n_inst, len_s, d_h = cat_encoding.size()
132 |         src_enc = cat_encoding.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, d_h)
133 |         src_seq = cat_encoding.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, -1)
134 | 
135 |         # Prepare beams
136 |         inst_dec_beams = [Beam(n_bm, device='cuda') for _ in range(n_inst)]
137 | 
138 |         # Bookkeeping for active or not
139 |         active_inst_idx_list = list(range(n_inst))
140 |         inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list)
141 | 
142 |         for len_dec_seq in range(1, opt['eff_max_len'] - 1):
143 | 
144 |             active_inst_idx_list = beam_decode_step(
145 |                 inst_dec_beams, len_dec_seq, src_seq, src_enc, inst_idx_to_position_map, n_bm, mode='int')
146 | 
147 |             if not active_inst_idx_list:
148 |                 break  # all instances have finished their path to <EOS>
149 | 
150 |             src_seq, src_enc, inst_idx_to_position_map = collate_active_info(
151 |                 src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list)
152 | 
153 |         cms_batch_hyp, cms_batch_scores = collect_hypothesis_and_scores(inst_dec_beams, 1)
154 | 
155 |     return cms_batch_hyp
156 | 
157 | 
158 | 
159 | 


--------------------------------------------------------------------------------
/model/transformer/cap_cms_Translator.py:
--------------------------------------------------------------------------------
  1 | ''' This module will handle the text generation with beam search. '''
  2 | import numpy as np
  3 | from utils.utils import *
  4 | import torch.nn.functional as F
  5 | from model.transformer.Beam import Beam
  6 | 
  7 | __author__ = 'Jacob Zhiyuan Fang'
  8 | 
  9 | 
 10 | def pos_emb_generation(word_labels):
 11 |     '''
 12 |         Generate the position embedding input for Transformers.
 13 |     '''
 14 | 
 15 |     seq = list(range(1, word_labels.shape[1] + 1))
 16 |     tgt_pos = torch.tensor([seq] * word_labels.shape[0]).cuda()
 17 |     binary_mask = (word_labels != 0).long()
 18 | 
 19 |     return tgt_pos*binary_mask
 20 | 
 21 | 
 22 | def translate_batch(model, src_emb, cap_label, opt):
 23 |     ''' Translation work in one batch '''
 24 | 
 25 |     def get_inst_idx_to_tensor_position_map(inst_idx_list):
 26 |         ''' Indicate the position of an instance in a tensor. '''
 27 |         return {inst_idx: tensor_position for tensor_position, inst_idx in enumerate(inst_idx_list)}
 28 | 
 29 |     def collect_active_part(beamed_tensor, curr_active_inst_idx, n_prev_active_inst, n_bm):
 30 |         ''' Collect tensor parts associated to active instances. '''
 31 | 
 32 |         _, *d_hs = beamed_tensor.size()
 33 |         n_curr_active_inst = len(curr_active_inst_idx)
 34 |         new_shape = (n_curr_active_inst * n_bm, *d_hs)
 35 | 
 36 |         beamed_tensor = beamed_tensor.view(n_prev_active_inst, -1)
 37 |         beamed_tensor = beamed_tensor.index_select(0, curr_active_inst_idx)
 38 |         beamed_tensor = beamed_tensor.view(*new_shape)
 39 | 
 40 |         return beamed_tensor
 41 | 
 42 |     def collate_active_info(src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list):
 43 |         # Sentences which are still active are collected,
 44 |         # so the decoder will not run on completed sentences.
 45 |         n_prev_active_inst = len(inst_idx_to_position_map)
 46 |         active_inst_idx = [inst_idx_to_position_map[k] for k in active_inst_idx_list]
 47 |         active_inst_idx = torch.LongTensor(active_inst_idx).cuda()
 48 | 
 49 |         active_src_seq = collect_active_part(src_seq, active_inst_idx, n_prev_active_inst, n_bm)
 50 |         active_src_enc = collect_active_part(src_enc, active_inst_idx, n_prev_active_inst, n_bm)
 51 |         active_inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list)
 52 | 
 53 |         return active_src_seq, active_src_enc, active_inst_idx_to_position_map
 54 | 
 55 |     def beam_decode_step(inst_dec_beams, len_dec_seq, src_seq, enc_output, inst_idx_to_position_map, n_bm, mode):
 56 |         ''' Decode and update beam status, and then return active beam idx '''
 57 | 
 58 |         def prepare_beam_dec_seq(inst_dec_beams, len_dec_seq):
 59 |             dec_partial_seq = [b.get_current_state() for b in inst_dec_beams if not b.done]
 60 |             dec_partial_seq = torch.stack(dec_partial_seq).cuda()
 61 |             dec_partial_seq = dec_partial_seq.view(-1, len_dec_seq)
 62 |             return dec_partial_seq
 63 | 
 64 |         def prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm):
 65 |             dec_partial_pos = torch.arange(1, len_dec_seq + 1, dtype=torch.long).cuda()
 66 |             dec_partial_pos = dec_partial_pos.unsqueeze(0).repeat(n_active_inst * n_bm, 1)
 67 |             return dec_partial_pos
 68 | 
 69 |         def predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm):
 70 |             if mode == 'cap':
 71 |                 dec_output, *_ = model.decoder(dec_seq, dec_pos, src_seq, enc_output)
 72 |                 dec_output = dec_output[:, -1, :]  # Pick the last step: (bh * bm) * d_h
 73 |                 word_prob = F.log_softmax(model.cap_word_prj(dec_output), dim=1)
 74 |                 word_prob = word_prob.view(n_active_inst, n_bm, -1)
 75 | 
 76 |             elif mode == 'int':
 77 |                 dec_output, *_ = model.cms_decoder(dec_seq, dec_pos, src_seq, enc_output)
 78 |                 dec_output = dec_output[:, -1, :]  # Pick the last step: (bh * bm) * d_h
 79 |                 word_prob = F.log_softmax(model.cms_word_prj(dec_output), dim=1)
 80 |                 word_prob = word_prob.view(n_active_inst, n_bm, -1)
 81 |             return word_prob
 82 | 
 83 |         def collect_active_inst_idx_list(inst_beams, word_prob, inst_idx_to_position_map):
 84 |             active_inst_idx_list = []
 85 |             for inst_idx, inst_position in inst_idx_to_position_map.items():
 86 |                 is_inst_complete = inst_beams[inst_idx].advance(word_prob[inst_position])
 87 |                 if not is_inst_complete:
 88 |                     active_inst_idx_list += [inst_idx]
 89 | 
 90 |             return active_inst_idx_list
 91 | 
 92 |         n_active_inst = len(inst_idx_to_position_map)
 93 | 
 94 |         dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq)
 95 |         dec_pos = prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm)
 96 |         word_prob = predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm)
 97 | 
 98 |         # Update the beam with predicted word prob information and collect incomplete instances
 99 |         active_inst_idx_list = collect_active_inst_idx_list(inst_dec_beams, word_prob, inst_idx_to_position_map)
100 | 
101 |         return active_inst_idx_list
102 | 
103 |     def collect_hypothesis_and_scores(inst_dec_beams, n_best):
104 |         all_hyp, all_scores = [], []
105 |         for inst_idx in range(len(inst_dec_beams)):
106 |             scores, tail_idxs = inst_dec_beams[inst_idx].sort_scores()
107 |             all_scores += [scores[:n_best]]
108 | 
109 |             hyps = [inst_dec_beams[inst_idx].get_hypothesis(i) for i in tail_idxs[:n_best]]
110 |             all_hyp += [hyps]
111 |         return all_hyp, all_scores
112 | 
113 |     with torch.no_grad():
114 |         # Encode
115 |         src_seq = src_emb.cuda()
116 |         src_enc, *_ = model.encoder(src_seq)
117 |         video_encoding = src_enc
118 | 
119 |         # <---------------------------------------------Decode CAP ---------------------------------------------------->
120 |         cap_pos = pos_emb_generation(cap_label)
121 |         cap_label, cap_pos = cap_label[:, :-1], cap_pos[:, :-1]
122 | 
123 |         cap_dec_output, *_ = model.decoder(cap_label, cap_pos, video_encoding, video_encoding)
124 | 
125 |         # Concatenate visual and caption encoding
126 |         cat_encoding = torch.cat((video_encoding, cap_dec_output), 1)
127 | 
128 |         # <---------------------------------------------Decode CMS ---------------------------------------------------->
129 |         # Repeat data for beam search for CMS
130 |         n_bm = 2
131 |         n_inst, len_s, d_h = cat_encoding.size()
132 |         src_enc = cat_encoding.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, d_h)
133 |         src_seq = cat_encoding.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, -1)
134 | 
135 |         # Prepare beams
136 |         inst_dec_beams = [Beam(n_bm, device='cuda') for _ in range(n_inst)]
137 | 
138 |         # Bookkeeping for active or not
139 |         active_inst_idx_list = list(range(n_inst))
140 |         inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list)
141 | 
142 |         for len_dec_seq in range(1, opt['eff_max_len'] - 1):
143 | 
144 |             active_inst_idx_list = beam_decode_step(
145 |                 inst_dec_beams, len_dec_seq, src_seq, src_enc, inst_idx_to_position_map, n_bm, mode='int')
146 | 
147 |             if not active_inst_idx_list:
148 |                 break  # all instances have finished their path to <EOS>
149 | 
150 |             src_seq, src_enc, inst_idx_to_position_map = collate_active_info(
151 |                 src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list)
152 | 
153 |         cms_batch_hyp, cms_batch_scores = collect_hypothesis_and_scores(inst_dec_beams, 1)
154 | 
155 |     return cms_batch_hyp
156 | 
157 | 
158 | 
159 | 


--------------------------------------------------------------------------------
/opts.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | 
  3 | 
  4 | def parse_opt():
  5 |     parser = argparse.ArgumentParser()
  6 | 
  7 |     # Data input settings
  8 |     parser.add_argument(
  9 |         '--info_json',
 10 |         type=str,
 11 |         default='data/v2c_info.json',
 12 |         help='path to the json file containing additional info and vocab')
 13 | 
 14 |     parser.add_argument(
 15 |         '--cap_info_json',
 16 |         type=str,
 17 |         default='data/msrvtt_new_info.json',
 18 |         help='path to the json file containing additional info and vocab')
 19 | 
 20 |     parser.add_argument(
 21 |         '--caption_json',
 22 |         type=str,
 23 |         # raw dataset: V2C_MSR-VTT_caption.json;
 24 |         # Human dataset: train_cvpr_humanRank_V2C_caption.json
 25 |         default='data/V2C_MSR-VTT_caption.json',
 26 |         help='path to the processed video caption json')
 27 | 
 28 |     parser.add_argument(
 29 |         '--feats_dir',
 30 |         nargs='*',
 31 |         type=str,
 32 |         default=['data/feats/resnet152/'],
 33 |         help='path to the directory containing the preprocessed fc feats')
 34 | 
 35 |     # Model settings
 36 |     parser.add_argument(
 37 |         "--cap_max_len",
 38 |         type=int,
 39 |         default=28,
 40 |         help='max length of captions(containing <sos>, <eos>)')
 41 | 
 42 |     parser.add_argument(
 43 |         "--int_max_len",
 44 |         type=int,
 45 |         default=21,
 46 |         help='max length of captions(containing <sos>, <eos>)')
 47 | 
 48 |     parser.add_argument(
 49 |         "--eff_max_len",
 50 |         type=int,
 51 |         default=26,
 52 |         help='max length of captions(containing <sos>, <eos>)')
 53 | 
 54 |     parser.add_argument(
 55 |         "--att_max_len",
 56 |         type=int,
 57 |         default=8,
 58 |         help='max length of captions(containing <sos>, <eos>)')
 59 | 
 60 |     parser.add_argument(
 61 |         '--input_dropout_p',
 62 |         type=float,
 63 |         default=0.2,
 64 |         help='strength of dropout in the Language Model RNN')
 65 | 
 66 |     parser.add_argument(
 67 |         '--dropout',
 68 |         type=float,
 69 |         default=0.1,
 70 |         help='Dropout rate for Transformer')
 71 | 
 72 |     parser.add_argument(
 73 |         '--dim_word',
 74 |         type=int,
 75 |         default=512,
 76 |         help='the encoding size of each token in the vocabulary, and the video.')
 77 | 
 78 |     parser.add_argument(
 79 |         '--dim_model',
 80 |         type=int,
 81 |         default=512,
 82 |         help='size of the rnn hidden layer')
 83 | 
 84 |     parser.add_argument(
 85 |         '--dim_vis_feat',
 86 |         type=int,
 87 |         default=2048,
 88 |         help='dim of features of video frames')
 89 | 
 90 |     # 12-12 8 6
 91 |     parser.add_argument(
 92 |         '--num_head',
 93 |         type=int,
 94 |         default=8,
 95 |         help='Numbers of head in transformers.')
 96 | 
 97 |     parser.add_argument(
 98 |         '--num_layer',
 99 |         type=int,
100 |         default=6,
101 |         help='Numbers of layers in transformers.')
102 | 
103 |     parser.add_argument(
104 |         '--rnn_layer',
105 |         type=int,
106 |         default=1,
107 |         help='Numbers of layers in Video Encoder, RNN.')
108 | 
109 |     parser.add_argument(
110 |         '--dim_head',
111 |         type=int,
112 |         default=64,
113 |         help='Dimension of the attention head.')
114 | 
115 |     parser.add_argument(
116 |         '--dim_inner',
117 |         type=int,
118 |         default=1024,
119 |         help='Dimension of inner feature in Encoder/Decoder.')
120 | 
121 |     # Optimization: General
122 |     parser.add_argument(
123 |         '--epochs',
124 |         type=int,
125 |         default=100,
126 |         help='number of epochs')
127 | 
128 |     parser.add_argument(
129 |         '--warm_up_steps',
130 |         type=int,
131 |         default=5000,
132 |         help='Warm up steps.')
133 | 
134 |     parser.add_argument(
135 |         '--batch_size',
136 |         type=int,
137 |         default=64,
138 |         help='minibatch size')
139 | 
140 |     parser.add_argument(
141 |         '--save_checkpoint_every',
142 |         type=int,
143 |         default=10,
144 |         help='how often to save a model checkpoint (in epoch)?')
145 | 
146 |     parser.add_argument(
147 |         '--print_loss_every',
148 |         type=int,
149 |         default=20,
150 |         help='how often to print the loss information (in iterations)?')
151 | 
152 |     parser.add_argument(
153 |         '--checkpoint_path',
154 |         type=str,
155 |         default='save',
156 |         help='directory to store check pointed models')
157 | 
158 |     parser.add_argument(
159 |         '--load_checkpoint',
160 |         type=str,
161 |         default='',
162 |         help='directory to load check pointed models')
163 | 
164 |     parser.add_argument(
165 |         '--gpu',
166 |         type=str,
167 |         default='0',
168 |         help='gpu device number')
169 | 
170 |     # other setting
171 |     parser.add_argument(
172 |         '--show_predict',
173 |         action='store_true',
174 |         help='whether to display intermediate generations during training/inference')
175 | 
176 |     parser.add_argument(
177 |         '--cuda',
178 |         action='store_true',
179 |         help='Use CUDA for training.')
180 | 
181 |     parser.add_argument(
182 |         '--resume',
183 |         action='store_true',
184 |         help='Resume from a midway checkpoint.')
185 | 
186 |     parser.add_argument(
187 |         '--cms',
188 |         choices=['int', 'eff', 'att'],
189 |         default='eff',
190 |         help='Type of Commonsense Knowledge.')
191 | 
192 |     args = parser.parse_args()
193 | 
194 |     return args
195 | 


--------------------------------------------------------------------------------
/others/generation.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import torch
  3 | import random
  4 | import numpy as np
  5 | from opts import *
  6 | from model.Model import Model
  7 | from pycocoevalcap.bleu.bleu import Bleu
  8 | from pycocoevalcap.rouge.rouge import Rouge
  9 | from pycocoevalcap.cider.cider import Cider
 10 | from pycocoevalcap.meteor.meteor import Meteor
 11 | from torch.utils.data import DataLoader
 12 | from utils.dataloader import VideoDataset
 13 | from model.transformer.Constants import *
 14 | from nltk.translate.bleu_score import corpus_bleu
 15 | from model.transformer.Translator import translate_batch
 16 | 
 17 | import sys
 18 | sys.path.append("utils/pycocoevalcap/")
 19 | 
 20 | 
 21 | def pos_emb_generation(visual_feats):
 22 |     '''
 23 |         Generate the position embedding input for Transformers.
 24 |     '''
 25 |     seq = list(range(1, visual_feats.shape[1] + 1))
 26 |     src_pos = torch.tensor([seq] * visual_feats.shape[0]).cuda()
 27 |     return src_pos
 28 | 
 29 | 
 30 | def list_to_sentence(list):
 31 |     sentence = ''
 32 |     for element in list:
 33 |         sentence += ' ' + element
 34 |     return sentence
 35 | 
 36 | 
 37 | def test(loader, model, opt, cap_vocab, cms_vocab):
 38 |     bleu_scores = []
 39 |     write_to_txt = []
 40 | 
 41 |     gts = []
 42 |     res = []
 43 |     for batch_id, data in enumerate(loader):
 44 | 
 45 |         fc_feats = data['fc_feats'].cuda()
 46 |         cap_labels = data['cap_labels'].cuda()
 47 |         video_ids = data['video_ids']
 48 | 
 49 |         with torch.no_grad():
 50 |             # Beam Search Starts From Here
 51 |             try:
 52 |                 batch_hyp, cms_batch_hyp = translate_batch(model, fc_feats, opt)
 53 |             except:
 54 |                 continue
 55 | 
 56 |         # Stack all GTs captions
 57 |         references = []
 58 |         for video in video_ids:
 59 |             video_caps = []
 60 |             for cap in opt['captions'][video]:
 61 |                 for _ in cap['attribute']:
 62 |                     video_caps.append(cap['final_caption'][1:-1] + _[1][1:-1])
 63 |             references.append(video_caps)
 64 | 
 65 |         # Stack all Predicted Captions
 66 |         hypotheses = []
 67 |         for cms_predict, predict in zip(cms_batch_hyp, batch_hyp):
 68 |             _ = []
 69 |             if CAP_EOS in predict[0]:
 70 |                 sep_id = predict[0].index(CAP_EOS)
 71 |             else:
 72 |                 sep_id = -1
 73 |             for word in predict[0][1: sep_id]:
 74 |                 _.append(cap_vocab[str(word)])
 75 | 
 76 |             if CAP_EOS in cms_predict[0]:
 77 |                 sep_id = cms_predict[0].index(CAP_EOS)
 78 |             else:
 79 |                 sep_id = -1
 80 |             for word in cms_predict[0][0: sep_id]:
 81 |                 _.append(cms_vocab[str(word)])
 82 |             hypotheses.append(_)
 83 | 
 84 |         # Print out the predicted sentences and GT
 85 |         for random_id in range(5):
 86 |             if 0 in batch_hyp[random_id][0]:
 87 |                 stop_idx = batch_hyp[random_id][0].index(EOS)
 88 |             else:
 89 |                 stop_idx = -1
 90 | 
 91 |             video_id = video_ids[random_id]
 92 |             cap = list_to_sentence([cap_vocab[str(widx)] for widx in batch_hyp[random_id][0][1: stop_idx] if widx != 0])
 93 |             cms = list_to_sentence([cms_vocab[str(widx)] for widx in cms_batch_hyp[random_id][0][: -1] if widx != 0])
 94 |             cap_gt = list_to_sentence([cap_vocab[str(word.cpu().numpy())] for word in cap_labels[random_id, 1:] if word != 0][0:-1])
 95 |             _ = str(video_id + ',' + cap + ',' + cms + ',' + cap_gt)
 96 |             write_to_txt.append(_)
 97 |             print('Generated Caption:', cap, ' ', 'Generated CMS:', cms)
 98 |             print('GT Caption:', cap_gt)
 99 |             print('\n')
100 |             print(batch_id, ' ', batch_id * opt['batch_size'], ' out of ', '3010')
101 | 
102 |         # Compute the BLEU-4 score
103 |         bleu_1 = corpus_bleu(references, hypotheses, weights=[1, 0, 0, 0])
104 |         bleu_2 = corpus_bleu(references, hypotheses, weights=[0.5, 0.5, 0, 0])
105 |         bleu_3 = corpus_bleu(references, hypotheses, weights=[0.333, 0.333, 0.333, 0])
106 |         bleu_4 = corpus_bleu(references, hypotheses, weights=[0.25, 0.25, 0.25, 0.25])
107 |         bleu_scores.append([bleu_1, bleu_2, bleu_3, bleu_4])
108 | 
109 |     print("Bleu scores 1-4:", np.mean(np.asarray(bleu_scores), 0))
110 | 
111 | 
112 | def main(opt):
113 |     dataset = VideoDataset(opt, 'test')
114 |     dataloader = DataLoader(dataset, batch_size=opt['batch_size'], shuffle=False)
115 |     opt['cms_vocab_size'] = dataset.get_cms_vocab_size()
116 |     opt['cap_vocab_size'] = dataset.get_cap_vocab_size()
117 | 
118 |     if opt['cms'] == 'int':
119 |         cms_text_length = opt['int_max_len']
120 |     elif opt['cms'] == 'eff':
121 |         cms_text_length = opt['eff_max_len']
122 |     else:
123 |         cms_text_length = opt['att_max_len']
124 | 
125 |     model = Model(
126 |         dataset.get_cap_vocab_size(),
127 |         dataset.get_cms_vocab_size(),
128 |         cap_max_seq=opt['cap_max_len'],
129 |         cms_max_seq=cms_text_length,
130 |         tgt_emb_prj_weight_sharing=True,
131 |         vis_emb=opt['dim_vis_feat'],
132 |         rnn_layers=opt['rnn_layer'],
133 |         d_k=opt['dim_head'],
134 |         d_v=opt['dim_head'],
135 |         d_model=opt['dim_model'],
136 |         d_word_vec=opt['dim_word'],
137 |         d_inner=opt['dim_inner'],
138 |         n_layers=opt['num_layer'],
139 |         n_head=opt['num_head'],
140 |         dropout=opt['dropout'])
141 | 
142 |     if len(opt['load_checkpoint']) != 0:
143 |         state_dict = torch.load(opt['load_checkpoint'])
144 |         model.load_state_dict(state_dict)
145 | 
146 |     model = model.cuda()
147 |     model.eval()
148 |     test(dataloader, model, opt, dataset.get_cap_vocab(), dataset.get_cms_vocab())
149 | 
150 | 
151 | if __name__ == '__main__':
152 |     opt = parse_opt()
153 |     opt = vars(opt)
154 |     opt['captions'] = json.load(open(opt['caption_json']))
155 |     opt['batch_size'] = 30
156 |     main(opt)


--------------------------------------------------------------------------------
/others/test_RNN.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import json
  3 | import torch
  4 | from opts import *
  5 | import numpy as np
  6 | import nltk
  7 | from utils.utils import *
  8 | from pycocoevalcap.bleu.bleu import Bleu
  9 | from pycocoevalcap.rouge.rouge import Rouge
 10 | from pycocoevalcap.cider.cider import Cider
 11 | from pycocoevalcap.meteor.meteor import Meteor
 12 | from model.Model import Model
 13 | from torch.utils.data import DataLoader
 14 | from model.transformer.Constants import *
 15 | from utils.gt_caps_dataloader import VideoDataset
 16 | from model.transformer.cap2cms_Translator import translate_batch
 17 | 
 18 | 
 19 | def test(loader, model, opt, cap_vocab, cms_vocab):
 20 |     res = {}
 21 |     gts = {}
 22 |     eval_id = 0
 23 | 
 24 |     total_cms = set()
 25 |     ppl_scores = []
 26 | 
 27 |     for batch_id, raw_data in enumerate(loader):
 28 |         if opt['cuda']: torch.cuda.synchronize()
 29 | 
 30 |         # iterate each video within the batch
 31 |         for iterate_id in range(len(raw_data)):
 32 |             fc_feats = raw_data[iterate_id][0]['fc_feats'].unsqueeze(0)
 33 |             video_ids = raw_data[iterate_id][0]['video_ids']
 34 |             cap_labels = raw_data[iterate_id][0]['cap_labels']
 35 | 
 36 |             if opt['cms'] == 'int':
 37 |                 cms_list = raw_data[iterate_id][1]
 38 |             elif opt['cms'] == 'eff':
 39 |                 cms_list = raw_data[iterate_id][2]
 40 |             else:
 41 |                 cms_list = raw_data[iterate_id][3]
 42 | 
 43 |             if opt['cuda']:
 44 |                 # cms_list = cms_list.cuda()
 45 |                 cap_labels = cap_labels.cuda()
 46 |                 fc_feats = fc_feats.cuda()
 47 | 
 48 |             # repeat the fc features for num_cap times
 49 |             fc_feats = fc_feats.repeat(len(cap_labels), 1, 1)
 50 | 
 51 |             # iterate through all captions per video
 52 |             with torch.no_grad():
 53 | 
 54 |                 # Note, currently we used BEAM search to decode the captions, while greedy strategy
 55 |                 #  should yield close or even better results.
 56 |                 # cms_batch_hyp = translate_batch(model, fc_feats, cap_labels, opt)
 57 |                 _, _, _, cms_batch_hyp  = model(fc_feats, target_variable=cap_labels, cms_target_variable=None,
 58 |                                                 mode='test')
 59 | 
 60 |             for random_id in range(cap_labels.shape[0]):
 61 |                 # Print out the predicted sentences and GT
 62 |                 if EOS in cms_batch_hyp[random_id]:
 63 |                     stop_id = list(cms_batch_hyp[random_id]).index(EOS)
 64 |                 else:
 65 |                     stop_id = -1
 66 | 
 67 |                 cms = list_to_sentence([cms_vocab[str(widx.detach().cpu().numpy())] for widx in
 68 |                                         cms_batch_hyp[random_id][: stop_id] if widx != 0])
 69 |                 cap_gt = list_to_sentence([cap_vocab[str(word.cpu().numpy())] for word in
 70 |                                            cap_labels[random_id, 1:] if word != 0][0:-1])
 71 | 
 72 |                 print(video_ids, '\n', 'Predicted CMS: ', cms)
 73 |                 print('GT CMS Caption: ', cap_gt)
 74 |                 print('GT CMS Knowledge: ', cms_list[random_id].split(';')[1:])
 75 |                 print('\n')
 76 |                 print(batch_id * opt['batch_size'], ' out of ', '3010')
 77 | 
 78 |                 # Save for evaluation
 79 |                 cmses = cms_list[random_id].split(';')[1:]
 80 |                 res[eval_id] = [cms]
 81 |                 gts[eval_id] = cmses
 82 | 
 83 |                 eval_id += 1
 84 | 
 85 |                 ppl_corpus = ''
 86 |                 for c in cmses:
 87 |                     total_cms.add(c.lower())
 88 |                     ppl_corpus += ' ' + c.lower()
 89 |                 tokens = nltk.word_tokenize(ppl_corpus)
 90 |                 unigram_model = unigram(tokens)
 91 |                 ppl_scores.append(perplexity(cms.lower(), unigram_model))
 92 | 
 93 |     # Compute PPL score
 94 |     print('Perplexity score: ', sum(ppl_scores)/len(ppl_scores))
 95 | 
 96 |     avg_bleu_score, bleu_scores = Bleu(4).compute_score(gts, res)
 97 |     avg_cider_score, cider_scores = Cider().compute_score(gts, res)
 98 |     avg_meteor_score, meteor_scores = Meteor().compute_score(gts, res)
 99 |     avg_rouge_score, rouge_scores = Rouge().compute_score(gts, res)
100 |     print('C, M, R, B:', avg_cider_score, avg_meteor_score, avg_rouge_score, avg_bleu_score)
101 | 
102 | 
103 | def main(opt):
104 |     dataset = VideoDataset(opt, 'test')
105 |     dataloader = DataLoader(dataset, collate_fn=test_collate_fn, batch_size=opt['batch_size'],
106 |                             shuffle=False)
107 |     opt['cms_vocab_size'] = dataset.get_cms_vocab_size()
108 |     opt['cap_vocab_size'] = dataset.get_cap_vocab_size()
109 | 
110 |     if opt['cms'] == 'int':
111 |         cms_text_length = opt['int_max_len']
112 |     elif opt['cms'] == 'eff':
113 |         cms_text_length = opt['eff_max_len']
114 |     else:
115 |         cms_text_length = opt['att_max_len']
116 | 
117 |     from model.S2VTModel import S2VTModel
118 |     model = S2VTModel(dataset.get_cap_vocab_size(),
119 |                       dataset.get_cms_vocab_size(),
120 |                       opt['cap_max_len'],
121 |                       cms_text_length,
122 |                       opt["dim_model"],
123 |                       opt["dim_word"],
124 |                       opt['dim_vis_feat'],
125 |                       n_layers=opt['rnn_layer'])
126 | 
127 |     model_parameters = filter(lambda p: p.requires_grad, model.parameters())
128 |     params = sum([np.prod(p.size()) for p in model_parameters])
129 |     print(params)
130 | 
131 |     if len(opt['load_checkpoint']) != 0:
132 |         state_dict = torch.load(opt['load_checkpoint'])
133 |         model.load_state_dict(state_dict)
134 | 
135 |     if opt['cuda']:
136 |         model = model.cuda()
137 | 
138 | 
139 |     model.eval()
140 | 
141 |     test(dataloader, model, opt, dataset.get_cap_vocab(), dataset.get_cms_vocab())
142 | 
143 | 
144 | if __name__ == '__main__':
145 |     opt = parse_opt()
146 |     opt = vars(opt)
147 |     opt['captions'] = json.load(open(opt['caption_json']))
148 |     opt['batch_size'] = 30
149 | 
150 |     main(opt)
151 | 


--------------------------------------------------------------------------------
/others/test_attention_Video2text.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import json
  3 | import torch
  4 | from opts import *
  5 | import numpy as np
  6 | import nltk
  7 | from utils.utils import *
  8 | from pycocoevalcap.bleu.bleu import Bleu
  9 | from pycocoevalcap.rouge.rouge import Rouge
 10 | from pycocoevalcap.cider.cider import Cider
 11 | from pycocoevalcap.meteor.meteor import Meteor
 12 | from torch.utils.data import DataLoader
 13 | from model.transformer.Constants import *
 14 | from utils.gt_caps_dataloader import VideoDataset
 15 | 
 16 | # sys.path.append("./pycocoevalcap/")
 17 | 
 18 | 
 19 | def test(loader, model, opt, cap_vocab, cms_vocab):
 20 |     res = {}
 21 |     gts = {}
 22 |     eval_id = 0
 23 | 
 24 |     total_cms = set()
 25 |     ppl_scores = []
 26 | 
 27 |     for batch_id, raw_data in enumerate(loader):
 28 |         if opt['cuda']: torch.cuda.synchronize()
 29 | 
 30 |         # iterate each video within the batch
 31 |         for iterate_id in range(len(raw_data)):
 32 |             fc_feats = raw_data[iterate_id][0]['fc_feats'].unsqueeze(0)
 33 |             video_ids = raw_data[iterate_id][0]['video_ids']
 34 |             cap_labels = raw_data[iterate_id][0]['cap_labels']
 35 | 
 36 |             if opt['cms'] == 'int':
 37 |                 cms_list = raw_data[iterate_id][1]
 38 |             elif opt['cms'] == 'eff':
 39 |                 cms_list = raw_data[iterate_id][2]
 40 |             else:
 41 |                 cms_list = raw_data[iterate_id][3]
 42 | 
 43 |             if opt['cuda']:
 44 |                 # cms_list = cms_list.cuda()
 45 |                 cap_labels = cap_labels.cuda()
 46 |                 fc_feats = fc_feats.cuda()
 47 | 
 48 |             # repeat the fc features for num_cap times
 49 |             fc_feats = fc_feats.repeat(len(cap_labels), 1, 1)
 50 | 
 51 |             # iterate through all captions per video
 52 |             with torch.no_grad():
 53 | 
 54 |                 # Note, currently we used BEAM search to decode the captions, while greedy strategy
 55 |                 #  should yield close or even better results.
 56 | 
 57 |                 # Beam Search Starts From Here
 58 |                 _, cms_batch_hyp = model(fc_feats, cap_labels=cap_labels, cms_labels=None, mode='test')
 59 | 
 60 |             for random_id in range(cap_labels.shape[0]):
 61 |                 # Print out the predicted sentences and GT
 62 |                 if EOS in cms_batch_hyp[random_id]:
 63 |                     stop_id = list(cms_batch_hyp[random_id]).index(EOS)
 64 |                 else:
 65 |                     stop_id = -1
 66 | 
 67 |                 cms = list_to_sentence([cms_vocab[str(widx.detach().cpu().numpy())] for widx in
 68 |                                         cms_batch_hyp[random_id][: stop_id] if widx != 0])
 69 |                 cap_gt = list_to_sentence([cap_vocab[str(word.cpu().numpy())] for word in
 70 |                                            cap_labels[random_id, 1:] if word != 0][0:-1])
 71 | 
 72 |                 print(video_ids, '\n', 'Predicted CMS: ', cms)
 73 |                 print('GT Caption: ', cap_gt)
 74 |                 print('GT CMS Knowledge: ', cms_list[random_id].split(';')[1:])
 75 |                 print('\n')
 76 |                 print(batch_id * opt['batch_size'], ' out of ', '3010')
 77 | 
 78 |                 # Save for evaluation
 79 |                 cmses = cms_list[random_id].split(';')[1:]
 80 |                 res[eval_id] = [cms]
 81 |                 gts[eval_id] = cmses
 82 | 
 83 |                 eval_id += 1
 84 | 
 85 |                 ppl_corpus = ''
 86 |                 for c in cmses:
 87 |                     total_cms.add(c.lower())
 88 |                     ppl_corpus += ' ' + c.lower()
 89 |                 tokens = nltk.word_tokenize(ppl_corpus)
 90 |                 unigram_model = unigram(tokens)
 91 |                 ppl_scores.append(perplexity(cms.lower(), unigram_model))
 92 | 
 93 |     # Compute PPL score
 94 |     print('Perplexity score: ', sum(ppl_scores)/len(ppl_scores))
 95 | 
 96 |     avg_bleu_score, bleu_scores = Bleu(4).compute_score(gts, res)
 97 |     avg_cider_score, cider_scores = Cider().compute_score(gts, res)
 98 |     avg_meteor_score, meteor_scores = Meteor().compute_score(gts, res)
 99 |     avg_rouge_score, rouge_scores = Rouge().compute_score(gts, res)
100 |     print('C, M, R, B:', avg_cider_score, avg_meteor_score, avg_rouge_score, avg_bleu_score)
101 | 
102 | 
103 | def main(opt):
104 |     dataset = VideoDataset(opt, 'test')
105 |     dataloader = DataLoader(dataset, collate_fn=test_collate_fn, batch_size=opt['batch_size'],
106 |                             shuffle=False)
107 |     opt['cms_vocab_size'] = dataset.get_cms_vocab_size()
108 |     opt['cap_vocab_size'] = dataset.get_cap_vocab_size()
109 | 
110 |     if opt['cms'] == 'int':
111 |         cms_text_length = opt['int_max_len']
112 |     elif opt['cms'] == 'eff':
113 |         cms_text_length = opt['eff_max_len']
114 |     else:
115 |         cms_text_length = opt['att_max_len']
116 | 
117 |     from model.DecoderRNN import DecoderRNN
118 |     from model.S2VT_EncoderRNN import EncoderRNN
119 |     from model.S2VTAttModel import S2VTAttModel
120 | 
121 |     encoder = EncoderRNN(2048, 512, 0, 0.2, rnn_cell='gru')
122 | 
123 |     decoder = DecoderRNN(dataset.get_cap_vocab_size(), opt['cap_max_len'], 512, 512)
124 | 
125 |     cms_decoder = DecoderRNN(dataset.get_cms_vocab_size(), cms_text_length, 512, 512)
126 | 
127 |     model = S2VTAttModel(encoder, decoder, cms_decoder)
128 | 
129 | 
130 |     model.eval()
131 |     model_parameters = filter(lambda p: p.requires_grad, model.parameters())
132 |     params = sum([np.prod(p.size()) for p in model_parameters])
133 |     print(params)
134 | 
135 |     model.load_state_dict(torch.load(opt['load_checkpoint']))
136 | 
137 |     if opt['cuda']:
138 |         model = model.cuda()
139 | 
140 |     test(dataloader, model, opt, dataset.get_cap_vocab(), dataset.get_cms_vocab())
141 | 
142 | 
143 | if __name__ == '__main__':
144 |     opt = parse_opt()
145 |     opt = vars(opt)
146 |     opt['captions'] = json.load(open(opt['caption_json']))
147 |     opt['batch_size'] = 30
148 | 
149 |     main(opt)
150 | 


--------------------------------------------------------------------------------
/others/test_transformer.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import json
  3 | import torch
  4 | from opts import *
  5 | import numpy as np
  6 | import nltk
  7 | from utils.utils import *
  8 | from pycocoevalcap.bleu.bleu import Bleu
  9 | from pycocoevalcap.rouge.rouge import Rouge
 10 | from pycocoevalcap.cider.cider import Cider
 11 | from pycocoevalcap.meteor.meteor import Meteor
 12 | from model.TransformerModel import Model
 13 | from torch.utils.data import DataLoader
 14 | from model.transformer.Constants import *
 15 | from utils.gt_caps_dataloader import VideoDataset
 16 | from model.transformer.cap2cms_Translator_transformer import translate_batch
 17 | 
 18 | # sys.path.append("./pycocoevalcap/")
 19 | 
 20 | 
 21 | def test(loader, model, opt, cap_vocab, cms_vocab):
 22 |     res = {}
 23 |     gts = {}
 24 |     eval_id = 0
 25 | 
 26 |     total_cms = set()
 27 |     ppl_scores = []
 28 | 
 29 |     for batch_id, raw_data in enumerate(loader):
 30 |         if opt['cuda']: torch.cuda.synchronize()
 31 | 
 32 |         # iterate each video within the batch
 33 |         for iterate_id in range(len(raw_data)):
 34 |             fc_feats = raw_data[iterate_id][0]['fc_feats'].unsqueeze(0)
 35 |             video_ids = raw_data[iterate_id][0]['video_ids']
 36 |             cap_labels = raw_data[iterate_id][0]['cap_labels']
 37 | 
 38 |             if opt['cms'] == 'int':
 39 |                 cms_list = raw_data[iterate_id][1]
 40 |             elif opt['cms'] == 'eff':
 41 |                 cms_list = raw_data[iterate_id][2]
 42 |             else:
 43 |                 cms_list = raw_data[iterate_id][3]
 44 | 
 45 |             if opt['cuda']:
 46 |                 # cms_list = cms_list.cuda()
 47 |                 cap_labels = cap_labels.cuda()
 48 |                 fc_feats = fc_feats.cuda()
 49 | 
 50 |             # repeat the fc features for num_cap times
 51 |             fc_feats = fc_feats.repeat(len(cap_labels), 1, 1)
 52 | 
 53 |             # iterate through all captions per video
 54 |             with torch.no_grad():
 55 | 
 56 |                 # Note, currently we used BEAM search to decode the captions, while greedy strategy
 57 |                 #  should yield close or even better results.
 58 |                 cms_batch_hyp = translate_batch(model, fc_feats, cap_labels, opt)
 59 | 
 60 |             for random_id in range(cap_labels.shape[0]):
 61 |                 # Print out the predicted sentences and GT
 62 |                 if EOS in cms_batch_hyp[random_id][0]:
 63 |                     stop_id = cms_batch_hyp[random_id][0].index(EOS)
 64 |                 else:
 65 |                     stop_id = -1
 66 | 
 67 |                 cms = list_to_sentence([cms_vocab[str(widx)] for widx in
 68 |                                         cms_batch_hyp[random_id][0][: stop_id] if widx != 0])
 69 |                 cap_gt = list_to_sentence([cap_vocab[str(word.cpu().numpy())] for word in
 70 |                                            cap_labels[random_id, 1:] if word != 0][0:-1])
 71 | 
 72 |                 print(video_ids, '\n', 'Predicted CMS: ', cms)
 73 |                 print('GT CMS Caption: ', cap_gt)
 74 |                 print('GT CMS Knowledge: ', cms_list[random_id].split(';')[1:])
 75 |                 print('\n')
 76 |                 print(batch_id * opt['batch_size'], ' out of ', '3010')
 77 | 
 78 |                 # Save for evaluation
 79 |                 cmses = cms_list[random_id].split(';')[1:]
 80 |                 res[eval_id] = [cms]
 81 |                 gts[eval_id] = cmses
 82 | 
 83 |                 eval_id += 1
 84 | 
 85 | 
 86 |                 ppl_corpus = ''
 87 |                 for c in cmses:
 88 |                     total_cms.add(c.lower())
 89 |                     ppl_corpus += ' ' + c.lower()
 90 |                 tokens = nltk.word_tokenize(ppl_corpus)
 91 |                 unigram_model = unigram(tokens)
 92 |                 ppl_scores.append(perplexity(cms.lower(), unigram_model))
 93 | 
 94 |     # Compute PPL score
 95 |     print('Perplexity score: ', sum(ppl_scores)/len(ppl_scores))
 96 | 
 97 |     avg_bleu_score, bleu_scores = Bleu(4).compute_score(gts, res)
 98 |     avg_cider_score, cider_scores = Cider().compute_score(gts, res)
 99 |     avg_meteor_score, meteor_scores = Meteor().compute_score(gts, res)
100 |     avg_rouge_score, rouge_scores = Rouge().compute_score(gts, res)
101 |     print('C, M, R, B:', avg_cider_score, avg_meteor_score, avg_rouge_score, avg_bleu_score)
102 | 
103 | 
104 | def main(opt):
105 |     dataset = VideoDataset(opt, 'test')
106 |     dataloader = DataLoader(dataset, collate_fn=test_collate_fn, batch_size=opt['batch_size'],
107 |                             shuffle=False)
108 |     opt['cms_vocab_size'] = dataset.get_cms_vocab_size()
109 |     opt['cap_vocab_size'] = dataset.get_cap_vocab_size()
110 | 
111 |     if opt['cms'] == 'int':
112 |         cms_text_length = opt['int_max_len']
113 |     elif opt['cms'] == 'eff':
114 |         cms_text_length = opt['eff_max_len']
115 |     else:
116 |         cms_text_length = opt['att_max_len']
117 | 
118 |     model = Model(
119 |         dataset.get_cap_vocab_size(),
120 |         dataset.get_cms_vocab_size(),
121 |         cap_max_seq=opt['cap_max_len'],
122 |         cms_max_seq=cms_text_length,
123 |         tgt_emb_prj_weight_sharing=True,
124 |         vis_emb=opt['dim_vis_feat'],
125 |         rnn_layers=opt['rnn_layer'],
126 |         d_k=opt['dim_head'],
127 |         d_v=opt['dim_head'],
128 |         d_model=opt['dim_model'],
129 |         d_word_vec=opt['dim_word'],
130 |         d_inner=opt['dim_inner'],
131 |         n_layers=opt['num_layer'],
132 |         n_head=opt['num_head'],
133 |         dropout=opt['dropout'])
134 | 
135 |     if len(opt['load_checkpoint']) != 0:
136 |         state_dict = torch.load(opt['load_checkpoint'])
137 |         # for name, param in model.state_dict().items():
138 |         #     print(name, param.size())
139 |         #
140 |         # print('=================')
141 |         # print(state_dict.keys())
142 |         model.load_state_dict(state_dict)
143 | 
144 |     if opt['cuda']:
145 |         model = model.cuda()
146 | 
147 |     model.eval()
148 |     model_parameters = filter(lambda p: p.requires_grad, model.parameters())
149 |     params = sum([np.prod(p.size()) for p in model_parameters])
150 |     print(params)
151 |     test(dataloader, model, opt, dataset.get_cap_vocab(), dataset.get_cms_vocab())
152 | 
153 | 
154 | if __name__ == '__main__':
155 |     opt = parse_opt()
156 |     opt = vars(opt)
157 |     opt['captions'] = json.load(open(opt['caption_json']))
158 |     opt['batch_size'] = 30
159 | 
160 |     main(opt)


--------------------------------------------------------------------------------
/others/train_RNN.py:
--------------------------------------------------------------------------------
  1 | ''' Training Scropt for V2C captioning task. '''
  2 | 
  3 | __author__ = 'Jacob Zhiyuan Fang'
  4 | 
  5 | import os
  6 | import numpy as np
  7 | from opts import *
  8 | from utils.utils import *
  9 | import torch.optim as optim
 10 | from model.Model import Model
 11 | from torch.utils.data import DataLoader
 12 | from utils.dataloader import VideoDataset
 13 | from model.transformer.Optim import ScheduledOptim
 14 | 
 15 | 
 16 | def train(loader, model, optimizer, opt, cap_vocab, cms_vocab):
 17 | 
 18 |     model.train()
 19 | 
 20 |     for epoch in range(opt['epochs']):
 21 |         iteration = 0
 22 | 
 23 |         for data in loader:
 24 |             torch.cuda.synchronize()
 25 | 
 26 |             if opt['cms'] == 'int':
 27 |                 cms_labels = data['int_labels']
 28 |             elif opt['cms'] == 'eff':
 29 |                 cms_labels = data['eff_labels']
 30 |             else:
 31 |                 cms_labels = data['att_labels']
 32 | 
 33 |             if opt['cuda']:
 34 |                 fc_feats = data['fc_feats'].cuda()
 35 |                 cap_labels = data['cap_labels'].cuda()
 36 |                 cms_labels = cms_labels.cuda()
 37 | 
 38 |             optimizer.zero_grad()
 39 | 
 40 |             # cap_probs, cms_probs = model(fc_feats, cap_labels, cap_pos, cms_labels, cms_pos)
 41 |             cap_probs, _, cms_probs, _ = model(fc_feats, cap_labels, cms_labels)
 42 | 
 43 |             # note: currently we just used most naive cross-entropy as training objective,
 44 |             # advanced loss func. like SELF-CRIT, different loss weights or stronger video feature
 45 |             # may lead performance boost, however is not the goal of this work.
 46 |             cap_loss, cap_n_correct = cal_performance(cap_probs.view(-1, cap_probs.shape[-1]),
 47 |                                                       cap_labels[:, 1:], smoothing=True)
 48 |             cms_loss, cms_n_correct = cal_performance(cms_probs.view(-1, cms_probs.shape[-1]),
 49 |                                                       cms_labels[:, 1:], smoothing=True)
 50 | 
 51 |             # compute the token prediction Acc.
 52 |             non_pad_mask = cap_labels[:, 1:].ne(Constants.PAD)
 53 |             n_word = non_pad_mask.sum().item()
 54 |             cms_non_pad_mask = cms_labels[:, 1:].ne(Constants.PAD)
 55 |             cms_n_word = cms_non_pad_mask.sum().item()
 56 |             cap_loss /= n_word
 57 |             cms_loss /= n_word
 58 | 
 59 |             loss = cms_loss + cap_loss
 60 | 
 61 |             loss.backward()
 62 |             optimizer.step_and_update_lr()
 63 |             torch.nn.utils.clip_grad_norm_(filter(lambda p: p.requires_grad, model.parameters()), 1)
 64 | 
 65 |             # update parameters
 66 |             cap_train_loss = cap_loss.item()
 67 |             cms_train_loss = cms_loss.item()
 68 | 
 69 |             # multi-gpu case, not necessary in newer PyTorch version or on single GPU.
 70 |             if opt['cuda']: torch.cuda.synchronize()
 71 | 
 72 |             iteration += 1
 73 | 
 74 |             if iteration % opt['print_loss_every'] ==0:
 75 |                 print('iter %d (epoch %d), cap_train_loss = %.6f, cms_train_loss = %.6f,'
 76 |                       ' current step = %d, current lr = %.3E, cap_acc = %.3f, cms_acc = %.3f'
 77 |                       % (iteration, epoch, cap_train_loss, cms_train_loss, optimizer.n_current_steps,
 78 |                          optimizer._optimizer.param_groups[0]['lr'],
 79 |                          cap_n_correct/n_word, cms_n_correct/cms_n_word))
 80 | 
 81 |                 # show the intermediate generations
 82 |                 if opt['show_predict']:
 83 |                     cap_pr, cap_gt = show_prediction(cap_probs, cap_labels[:, :-1], cap_vocab, caption=True)
 84 |                     cms_pr, cms_gt = show_prediction(cms_probs, cms_labels[:, :-1], cms_vocab, caption=False)
 85 |                     print(' \n')
 86 | 
 87 |                     with open(opt['info_path'], 'a') as f:
 88 |                         f.write('model_%d, cap_loss: %.6f, cms_loss: %.6f\n'
 89 |                                 % (epoch, cap_train_loss, cms_train_loss))
 90 |                         f.write('\n %s \n %s' % (cap_pr, cap_gt))
 91 |                         f.write('\n %s \n %s' % (cms_pr, cms_gt))
 92 |                         f.write('\n')
 93 | 
 94 |         if epoch % opt['save_checkpoint_every'] == 0:
 95 | 
 96 |             # save the checkpoint
 97 |             model_path = os.path.join(opt['output_dir'],
 98 |                                       'CMS_CAP_MODEL_{}_lr_{}_BS_{}_Layer_{}_ATTHEAD_{}_HID_{}_RNNLayer_{}_epoch_{}.pth'
 99 |                                       .format(opt['cms'], opt['init_lr'], opt['batch_size'], opt['num_layer'],
100 |                                               opt['num_head'], opt['dim_model'], opt['rnn_layer'], epoch))
101 | 
102 |             torch.save(model.state_dict(), model_path)
103 | 
104 |             print('model saved to %s' % model_path)
105 |             with open(opt['model_info_path'], 'a') as f:
106 |                 f.write('model_%d, cap_loss: %.6f, cms_loss: %.6f\n'
107 |                         % (epoch, cap_train_loss/n_word, cms_train_loss/n_word))
108 | 
109 | 
110 | def main(opt):
111 | 
112 |     # load and define dataloader
113 |     dataset = VideoDataset(opt, 'train')
114 |     dataloader = DataLoader(dataset, batch_size=opt['batch_size'], shuffle=True)
115 | 
116 |     opt['cms_vocab_size'] = dataset.get_cms_vocab_size()
117 |     opt['cap_vocab_size'] = dataset.get_cap_vocab_size()
118 | 
119 |     if opt['cms'] == 'int':
120 |         cms_text_length = opt['int_max_len']
121 |     elif opt['cms'] == 'eff':
122 |         cms_text_length = opt['eff_max_len']
123 |     else:
124 |         cms_text_length = opt['att_max_len']
125 | 
126 |     # model initialization.
127 |     from model.S2VTModel import S2VTModel
128 |     model = S2VTModel(
129 |                         dataset.get_cap_vocab_size(),
130 |                         dataset.get_cms_vocab_size(),
131 |                         opt['cap_max_len'],
132 |                         cms_text_length,
133 |                         opt["dim_model"],
134 |                         opt["dim_word"],
135 |                         opt['dim_vis_feat'],
136 |                         n_layers=opt['rnn_layer'])
137 | 
138 |     # number of parameters
139 |     model_parameters = filter(lambda p: p.requires_grad, model.parameters())
140 |     params = sum([np.prod(p.size()) for p in model_parameters])
141 |     print('number of learnable parameters are {}'.format(params))
142 | 
143 |     if opt['cuda']: model = model.cuda()
144 | 
145 |     # resume from previous checkpoint if indicated
146 |     if opt['load_checkpoint'] and opt['resume']:
147 |         cap_state_dict = torch.load(opt['load_checkpoint'])
148 |         model_dict = model.state_dict()
149 |         model_dict.update(cap_state_dict)
150 |         model.load_state_dict(model_dict)
151 | 
152 |     optimizer = ScheduledOptim(optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
153 |                                           betas=(0.9, 0.98), eps=1e-09), 512, opt['warm_up_steps'])
154 | 
155 |     # note: though we set the init learning rate as np.power(d_model, -0.5),
156 |     # grid search indicates different LR may improve the results.
157 |     opt['init_lr'] = round(optimizer.init_lr, 3)
158 | 
159 |     # create checkpoint output directory
160 |     dir = os.path.join(opt['checkpoint_path'], 'S2VT_CMS_CAP_MODEL_{}_lr_{}_BS_{}_Layer_{}_ATTHEAD_{}_HID_{}_RNNLayer_{}'
161 |                        .format(opt['cms'], opt['init_lr'], opt['batch_size'], opt['num_layer'],
162 |                                opt['num_head'], opt['dim_model'], opt['rnn_layer']))
163 | 
164 |     if not os.path.exists(dir): os.makedirs(dir)
165 | 
166 |     # save the model snapshot to local
167 |     info_path = os.path.join(dir, 'iteration_info_log.log')
168 |     print('model architecture saved to {} \n {}'.format(info_path, str(model)))
169 |     with open(info_path, 'a') as f:
170 |         f.write(str(model))
171 |         f.write('\n')
172 |         f.write(str(params))
173 |         f.write('\n')
174 | 
175 |     # log file directory
176 |     opt['output_dir'] = dir
177 |     opt['info_path'] = info_path
178 |     opt['model_info_path'] = os.path.join(opt['output_dir'], 'checkpoint_loss_log.log')
179 | 
180 |     train(dataloader, model, optimizer, opt, dataset.get_cap_vocab(), dataset.get_cms_vocab())
181 | 
182 | if __name__ == '__main__':
183 |     opt = parse_opt()
184 |     opt = vars(opt)
185 |     main(opt)


--------------------------------------------------------------------------------
/others/train_transformer.py:
--------------------------------------------------------------------------------
  1 | ''' Training Scropt for V2C captioning task. '''
  2 | 
  3 | __author__ = 'Jacob Zhiyuan Fang'
  4 | 
  5 | import os
  6 | import numpy as np
  7 | from opts import *
  8 | from utils.utils import *
  9 | import torch.optim as optim
 10 | from model.TransformerModel import Model
 11 | from torch.utils.data import DataLoader
 12 | from utils.dataloader import VideoDataset
 13 | from model.transformer.Optim import ScheduledOptim
 14 | 
 15 | 
 16 | def train(loader, model, optimizer, opt, cap_vocab, cms_vocab):
 17 | 
 18 |     model.train()
 19 | 
 20 |     for epoch in range(opt['epochs']):
 21 |         iteration = 0
 22 | 
 23 |         for data in loader:
 24 |             torch.cuda.synchronize()
 25 | 
 26 |             if opt['cms'] == 'int':
 27 |                 cms_labels = data['int_labels']
 28 |             elif opt['cms'] == 'eff':
 29 |                 cms_labels = data['eff_labels']
 30 |             else:
 31 |                 cms_labels = data['att_labels']
 32 | 
 33 |             if opt['cuda']:
 34 |                 fc_feats = data['fc_feats'].cuda()
 35 |                 cap_labels = data['cap_labels'].cuda()
 36 |                 cms_labels = cms_labels.cuda()
 37 |             else:
 38 |                 fc_feats = data['fc_feats']
 39 |                 cap_labels = data['cap_labels']
 40 |                 cms_labels = cms_labels.cuda()
 41 | 
 42 |             optimizer.zero_grad()
 43 | 
 44 |             cap_pos = pos_emb_generation(cap_labels)
 45 |             cms_pos = pos_emb_generation(cms_labels)
 46 | 
 47 |             cap_probs, cms_probs = model(fc_feats, cap_labels, cap_pos, cms_labels, cms_pos)
 48 | 
 49 |             # note: currently we just used most naive cross-entropy as training objective,
 50 |             # advanced loss func. like SELF-CRIT, different loss weights or stronger video feature
 51 |             # may lead performance boost, however is not the goal of this work.
 52 |             cap_loss, cap_n_correct = cal_performance(cap_probs, cap_labels[:, 1:], smoothing=True)
 53 |             cms_loss, cms_n_correct = cal_performance(cms_probs, cms_labels[:, 1:], smoothing=True)
 54 | 
 55 |             # compute the token prediction Acc.
 56 |             non_pad_mask = cap_labels[:, 1:].ne(Constants.PAD)
 57 |             n_word = non_pad_mask.sum().item()
 58 |             cms_non_pad_mask = cms_labels[:, 1:].ne(Constants.PAD)
 59 |             cms_n_word = cms_non_pad_mask.sum().item()
 60 |             cap_loss /= n_word
 61 |             cms_loss /= n_word
 62 | 
 63 |             loss = cms_loss + cap_loss
 64 | 
 65 |             loss.backward()
 66 |             optimizer.step_and_update_lr()
 67 |             torch.nn.utils.clip_grad_norm_(filter(lambda p: p.requires_grad, model.parameters()), 1)
 68 | 
 69 |             # update parameters
 70 |             cap_train_loss = cap_loss.item()
 71 |             cms_train_loss = cms_loss.item()
 72 | 
 73 |             # multi-gpu case, not necessary in newer PyTorch version or on single GPU.
 74 |             if opt['cuda']: torch.cuda.synchronize()
 75 | 
 76 |             iteration += 1
 77 | 
 78 |             if iteration % opt['print_loss_every'] ==0:
 79 |                 print('iter %d (epoch %d), cap_train_loss = %.6f, cms_train_loss = %.6f,'
 80 |                       ' current step = %d, current lr = %.3E, cap_acc = %.3f, cms_acc = %.3f'
 81 |                       % (iteration, epoch, cap_train_loss, cms_train_loss, optimizer.n_current_steps,
 82 |                          optimizer._optimizer.param_groups[0]['lr'],
 83 |                          cap_n_correct/n_word, cms_n_correct/cms_n_word))
 84 | 
 85 |                 # show the intermediate generations
 86 |                 if opt['show_predict']:
 87 |                     cap_pr, cap_gt = show_prediction(cap_probs, cap_labels[:, :-1], cap_vocab, caption=True)
 88 |                     cms_pr, cms_gt = show_prediction(cms_probs, cms_labels[:, :-1], cms_vocab, caption=False)
 89 |                     print(' \n')
 90 | 
 91 |                     with open(opt['info_path'], 'a') as f:
 92 |                         f.write('model_%d, cap_loss: %.6f, cms_loss: %.6f\n'
 93 |                                 % (epoch, cap_train_loss, cms_train_loss))
 94 |                         f.write('\n %s \n %s' % (cap_pr, cap_gt))
 95 |                         f.write('\n %s \n %s' % (cms_pr, cms_gt))
 96 |                         f.write('\n')
 97 | 
 98 |         if epoch % opt['save_checkpoint_every'] == 0:
 99 | 
100 |             # save the checkpoint
101 |             model_path = os.path.join(opt['output_dir'],
102 |                                       'Transformer_CMS_CAP_MODEL_{}_lr_{}_BS_{}_Layer_{}_ATTHEAD_{}_HID_{}_RNNLayer_{}_epoch_{}.pth'
103 |                                       .format(opt['cms'], opt['init_lr'], opt['batch_size'], opt['num_layer'],
104 |                                               opt['num_head'], opt['dim_model'], opt['rnn_layer'], epoch))
105 | 
106 |             torch.save(model.state_dict(), model_path)
107 | 
108 |             print('model saved to %s' % model_path)
109 |             with open(opt['model_info_path'], 'a') as f:
110 |                 f.write('model_%d, cap_loss: %.6f, cms_loss: %.6f\n'
111 |                         % (epoch, cap_train_loss/n_word, cms_train_loss/n_word))
112 | 
113 | 
114 | def main(opt):
115 | 
116 |     # load and define dataloader
117 |     dataset = VideoDataset(opt, 'train')
118 |     dataloader = DataLoader(dataset, batch_size=opt['batch_size'], shuffle=True)
119 | 
120 |     opt['cms_vocab_size'] = dataset.get_cms_vocab_size()
121 |     opt['cap_vocab_size'] = dataset.get_cap_vocab_size()
122 | 
123 |     if opt['cms'] == 'int':
124 |         cms_text_length = opt['int_max_len']
125 |     elif opt['cms'] == 'eff':
126 |         cms_text_length = opt['eff_max_len']
127 |     else:
128 |         cms_text_length = opt['att_max_len']
129 | 
130 |     # model initialization.
131 |     model = Model(
132 |         dataset.get_cap_vocab_size(),
133 |         dataset.get_cms_vocab_size(),
134 |         cap_max_seq=opt['cap_max_len'],
135 |         cms_max_seq=cms_text_length,
136 |         tgt_emb_prj_weight_sharing=True,
137 |         vis_emb=opt['dim_vis_feat'],
138 |         rnn_layers=opt['rnn_layer'],
139 |         d_k=opt['dim_head'],
140 |         d_v=opt['dim_head'],
141 |         d_model=opt['dim_model'],
142 |         d_word_vec=opt['dim_word'],
143 |         d_inner=opt['dim_inner'],
144 |         n_layers=opt['num_layer'],
145 |         n_head=opt['num_head'],
146 |         dropout=opt['dropout'])
147 | 
148 |     # number of parameters
149 |     model_parameters = filter(lambda p: p.requires_grad, model.parameters())
150 |     params = sum([np.prod(p.size()) for p in model_parameters])
151 |     print('number of learnable parameters are {}'.format(params))
152 | 
153 |     if opt['cuda']: model = model.cuda()
154 | 
155 |     # resume from previous checkpoint if indicated
156 |     if opt['load_checkpoint'] and opt['resume']:
157 |         cap_state_dict = torch.load(opt['load_checkpoint'])
158 |         model_dict = model.state_dict()
159 |         model_dict.update(cap_state_dict)
160 |         model.load_state_dict(model_dict)
161 | 
162 |     optimizer = ScheduledOptim(optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
163 |                                           betas=(0.9, 0.98), eps=1e-09), 512, opt['warm_up_steps'])
164 | 
165 |     # note: though we set the init learning rate as np.power(d_model, -0.5),
166 |     # grid search indicates different LR may improve the results.
167 |     opt['init_lr'] = round(optimizer.init_lr, 3)
168 | 
169 |     # create checkpoint output directory
170 |     dir = os.path.join(opt['checkpoint_path'], 'CMS_CAP_MODEL_{}_lr_{}_BS_{}_Layer_{}_ATTHEAD_{}_HID_{}_RNNLayer_{}'
171 |                        .format(opt['cms'], opt['init_lr'], opt['batch_size'], opt['num_layer'],
172 |                                opt['num_head'], opt['dim_model'], opt['rnn_layer']))
173 | 
174 |     if not os.path.exists(dir): os.makedirs(dir)
175 | 
176 |     # save the model snapshot to local
177 |     info_path = os.path.join(dir, 'iteration_info_log.log')
178 |     print('model architecture saved to {} \n {}'.format(info_path, str(model)))
179 |     with open(info_path, 'a') as f:
180 |         f.write(str(model))
181 |         f.write('\n')
182 |         f.write(str(params))
183 |         f.write('\n')
184 | 
185 |     # log file directory
186 |     opt['output_dir'] = dir
187 |     opt['info_path'] = info_path
188 |     opt['model_info_path'] = os.path.join(opt['output_dir'], 'checkpoint_loss_log.log')
189 | 
190 |     train(dataloader, model, optimizer, opt, dataset.get_cap_vocab(), dataset.get_cms_vocab())
191 | 
192 | if __name__ == '__main__':
193 |     opt = parse_opt()
194 |     opt = vars(opt)
195 |     main(opt)


--------------------------------------------------------------------------------
/pictures/arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jacobswan1/Video2Commonsense/4dcef76360a29702fd90b7030a39a123da6db19e/pictures/arch.png


--------------------------------------------------------------------------------
/pictures/v2c.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jacobswan1/Video2Commonsense/4dcef76360a29702fd90b7030a39a123da6db19e/pictures/v2c.png


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import json
  3 | import torch
  4 | from opts import *
  5 | import numpy as np
  6 | import nltk
  7 | from utils.utils import *
  8 | from pycocoevalcap.bleu.bleu import Bleu
  9 | from pycocoevalcap.rouge.rouge import Rouge
 10 | from pycocoevalcap.cider.cider import Cider
 11 | from pycocoevalcap.meteor.meteor import Meteor
 12 | from model.Model import Model
 13 | from torch.utils.data import DataLoader
 14 | from model.transformer.Constants import *
 15 | from utils.gt_caps_dataloader import VideoDataset
 16 | from model.transformer.cap2cms_Translator import translate_batch
 17 | 
 18 | # sys.path.append("./pycocoevalcap/")
 19 | 
 20 | 
 21 | def test(loader, model, opt, cap_vocab, cms_vocab):
 22 |     res = {}
 23 |     gts = {}
 24 |     eval_id = 0
 25 | 
 26 |     total_cms = set()
 27 |     ppl_scores = []
 28 | 
 29 |     for batch_id, raw_data in enumerate(loader):
 30 |         if opt['cuda']: torch.cuda.synchronize()
 31 | 
 32 |         # iterate each video within the batch
 33 |         for iterate_id in range(len(raw_data)):
 34 |             fc_feats = raw_data[iterate_id][0]['fc_feats'].unsqueeze(0)
 35 |             video_ids = raw_data[iterate_id][0]['video_ids']
 36 |             cap_labels = raw_data[iterate_id][0]['cap_labels']
 37 | 
 38 |             if opt['cms'] == 'int':
 39 |                 cms_list = raw_data[iterate_id][1]
 40 |             elif opt['cms'] == 'eff':
 41 |                 cms_list = raw_data[iterate_id][2]
 42 |             else:
 43 |                 cms_list = raw_data[iterate_id][3]
 44 | 
 45 |             if opt['cuda']:
 46 |                 # cms_list = cms_list.cuda()
 47 |                 cap_labels = cap_labels.cuda()
 48 |                 fc_feats = fc_feats.cuda()
 49 | 
 50 |             # repeat the fc features for num_cap times
 51 |             fc_feats = fc_feats.repeat(len(cap_labels), 1, 1)
 52 | 
 53 |             # iterate through all captions per video
 54 |             with torch.no_grad():
 55 | 
 56 |                 # Note, currently we used BEAM search to decode the captions, while greedy strategy
 57 |                 #  should yield close or even better results.
 58 |                 cms_batch_hyp = translate_batch(model, fc_feats, cap_labels, opt)
 59 | 
 60 |             for random_id in range(cap_labels.shape[0]):
 61 |                 # Print out the predicted sentences and GT
 62 |                 if EOS in cms_batch_hyp[random_id][0]:
 63 |                     stop_id = cms_batch_hyp[random_id][0].index(EOS)
 64 |                 else:
 65 |                     stop_id = -1
 66 | 
 67 |                 cms = list_to_sentence([cms_vocab[str(widx)] for widx in
 68 |                                         cms_batch_hyp[random_id][0][: stop_id] if widx != 0])
 69 |                 cap_gt = list_to_sentence([cap_vocab[str(word.cpu().numpy())] for word in
 70 |                                            cap_labels[random_id, 1:] if word != 0][0:-1])
 71 | 
 72 |                 print(video_ids, '\n', 'Predicted CMS: ', cms)
 73 |                 print('GT CMS Caption: ', cap_gt)
 74 |                 print('GT CMS Knowledge: ', cms_list[random_id].split(';')[1:])
 75 |                 print('\n')
 76 |                 print(batch_id * opt['batch_size'], ' out of ', '3010')
 77 | 
 78 |                 # Save for evaluation
 79 |                 cmses = cms_list[random_id].split(';')[1:]
 80 |                 res[eval_id] = [cms]
 81 |                 gts[eval_id] = cmses
 82 | 
 83 |                 eval_id += 1
 84 | 
 85 |                 # Note! It is controversial whether PPL score reflect the quality of CMS as its using the corpus token
 86 |                 # probability. It is unclear which corpus (either total CMS corpus or the only 5 GT CMS, which is narrow)
 87 |                 # best reflects the results. Thus we remove it from our official results. Score in here is just for
 88 |                 # comparisons, where we used the only 5 GT annotations as corpus base.
 89 |                 ppl_corpus = ''
 90 |                 for c in cmses:
 91 |                     total_cms.add(c.lower())
 92 |                     ppl_corpus += ' ' + c.lower()
 93 |                 tokens = nltk.word_tokenize(ppl_corpus)
 94 |                 unigram_model = unigram(tokens)
 95 |                 ppl_scores.append(perplexity(cms.lower(), unigram_model))
 96 | 
 97 |     # Compute PPL score
 98 |     print('Perplexity score: ', sum(ppl_scores)/len(ppl_scores))
 99 | 
100 |     avg_bleu_score, bleu_scores = Bleu(4).compute_score(gts, res)
101 |     avg_cider_score, cider_scores = Cider().compute_score(gts, res)
102 |     avg_meteor_score, meteor_scores = Meteor().compute_score(gts, res)
103 |     avg_rouge_score, rouge_scores = Rouge().compute_score(gts, res)
104 |     print('C, M, R, B:', avg_cider_score, avg_meteor_score, avg_rouge_score, avg_bleu_score)
105 | 
106 | 
107 | def main(opt):
108 |     dataset = VideoDataset(opt, 'test')
109 |     dataloader = DataLoader(dataset, collate_fn=test_collate_fn, batch_size=opt['batch_size'],
110 |                             shuffle=False)
111 |     opt['cms_vocab_size'] = dataset.get_cms_vocab_size()
112 |     opt['cap_vocab_size'] = dataset.get_cap_vocab_size()
113 | 
114 |     if opt['cms'] == 'int':
115 |         cms_text_length = opt['int_max_len']
116 |     elif opt['cms'] == 'eff':
117 |         cms_text_length = opt['eff_max_len']
118 |     else:
119 |         cms_text_length = opt['att_max_len']
120 | 
121 |     model = Model(
122 |         dataset.get_cap_vocab_size(),
123 |         dataset.get_cms_vocab_size(),
124 |         cap_max_seq=opt['cap_max_len'],
125 |         cms_max_seq=cms_text_length,
126 |         tgt_emb_prj_weight_sharing=True,
127 |         vis_emb=opt['dim_vis_feat'],
128 |         rnn_layers=opt['rnn_layer'],
129 |         d_k=opt['dim_head'],
130 |         d_v=opt['dim_head'],
131 |         d_model=opt['dim_model'],
132 |         d_word_vec=opt['dim_word'],
133 |         d_inner=opt['dim_inner'],
134 |         n_layers=opt['num_layer'],
135 |         n_head=opt['num_head'],
136 |         dropout=opt['dropout'])
137 | 
138 |     if len(opt['load_checkpoint']) != 0:
139 |         state_dict = torch.load(opt['load_checkpoint'])
140 |         # for name, param in model.state_dict().items():
141 |         #     print(name, param.size())
142 |         #
143 |         # print('=================')
144 |         # print(state_dict.keys())
145 |         model.load_state_dict(state_dict)
146 | 
147 |     if opt['cuda']:
148 |         model = model.cuda()
149 | 
150 |     model.eval()
151 |     model_parameters = filter(lambda p: p.requires_grad, model.parameters())
152 |     params = sum([np.prod(p.size()) for p in model_parameters])
153 |     print(params)
154 |     test(dataloader, model, opt, dataset.get_cap_vocab(), dataset.get_cms_vocab())
155 | 
156 | 
157 | if __name__ == '__main__':
158 |     opt = parse_opt()
159 |     opt = vars(opt)
160 |     opt['captions'] = json.load(open(opt['caption_json']))
161 |     opt['batch_size'] = 30
162 | 
163 |     main(opt)


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
  1 | ''' Training Scropt for V2C captioning task. '''
  2 | 
  3 | __author__ = 'Jacob Zhiyuan Fang'
  4 | 
  5 | import os
  6 | import numpy as np
  7 | from opts import *
  8 | from utils.utils import *
  9 | import torch.optim as optim
 10 | from model.Model import Model
 11 | from torch.utils.data import DataLoader
 12 | from utils.dataloader import VideoDataset
 13 | from model.transformer.Optim import ScheduledOptim
 14 | 
 15 | 
 16 | def train(loader, model, optimizer, opt, cap_vocab, cms_vocab):
 17 | 
 18 |     model.train()
 19 | 
 20 |     for epoch in range(opt['epochs']):
 21 |         iteration = 0
 22 | 
 23 |         for data in loader:
 24 |             torch.cuda.synchronize()
 25 | 
 26 |             if opt['cms'] == 'int':
 27 |                 cms_labels = data['int_labels']
 28 |             elif opt['cms'] == 'eff':
 29 |                 cms_labels = data['eff_labels']
 30 |             else:
 31 |                 cms_labels = data['att_labels']
 32 | 
 33 |             if opt['cuda']:
 34 |                 fc_feats = data['fc_feats'].cuda()
 35 |                 cap_labels = data['cap_labels'].cuda()
 36 |                 cms_labels = cms_labels.cuda()
 37 |             else:
 38 |                 fc_feats = data['fc_feats']
 39 |                 cap_labels = data['cap_labels']
 40 |                 cms_labels = cms_labels.cuda()
 41 | 
 42 |             optimizer.zero_grad()
 43 | 
 44 |             cap_pos = pos_emb_generation(cap_labels)
 45 |             cms_pos = pos_emb_generation(cms_labels)
 46 | 
 47 |             cap_probs, cms_probs = model(fc_feats, cap_labels, cap_pos, cms_labels, cms_pos)
 48 | 
 49 |             # note: currently we just used most naive cross-entropy as training objective,
 50 |             # advanced loss func. like SELF-CRIT, different loss weights or stronger video feature
 51 |             # may lead performance boost, however is not the goal of this work.
 52 |             cap_loss, cap_n_correct = cal_performance(cap_probs, cap_labels[:, 1:], smoothing=True)
 53 |             cms_loss, cms_n_correct = cal_performance(cms_probs, cms_labels[:, 1:], smoothing=True)
 54 | 
 55 |             # compute the token prediction Acc.
 56 |             non_pad_mask = cap_labels[:, 1:].ne(Constants.PAD)
 57 |             n_word = non_pad_mask.sum().item()
 58 |             cms_non_pad_mask = cms_labels[:, 1:].ne(Constants.PAD)
 59 |             cms_n_word = cms_non_pad_mask.sum().item()
 60 |             cap_loss /= n_word
 61 |             cms_loss /= n_word
 62 | 
 63 |             loss = cms_loss + cap_loss
 64 | 
 65 |             loss.backward()
 66 |             optimizer.step_and_update_lr()
 67 |             torch.nn.utils.clip_grad_norm_(filter(lambda p: p.requires_grad, model.parameters()), 1)
 68 | 
 69 |             # update parameters
 70 |             cap_train_loss = cap_loss.item()
 71 |             cms_train_loss = cms_loss.item()
 72 | 
 73 |             # multi-gpu case, not necessary in newer PyTorch version or on single GPU.
 74 |             if opt['cuda']: torch.cuda.synchronize()
 75 | 
 76 |             iteration += 1
 77 | 
 78 |             if iteration % opt['print_loss_every'] ==0:
 79 |                 print('iter %d (epoch %d), cap_train_loss = %.6f, cms_train_loss = %.6f,'
 80 |                       ' current step = %d, current lr = %.3E, cap_acc = %.3f, cms_acc = %.3f'
 81 |                       % (iteration, epoch, cap_train_loss, cms_train_loss, optimizer.n_current_steps,
 82 |                          optimizer._optimizer.param_groups[0]['lr'],
 83 |                          cap_n_correct/n_word, cms_n_correct/cms_n_word))
 84 | 
 85 |                 # show the intermediate generations
 86 |                 if opt['show_predict']:
 87 |                     cap_pr, cap_gt = show_prediction(cap_probs, cap_labels[:, :-1], cap_vocab, caption=True)
 88 |                     cms_pr, cms_gt = show_prediction(cms_probs, cms_labels[:, :-1], cms_vocab, caption=False)
 89 |                     print(' \n')
 90 | 
 91 |                     with open(opt['info_path'], 'a') as f:
 92 |                         f.write('model_%d, cap_loss: %.6f, cms_loss: %.6f\n'% (epoch, cap_train_loss, cms_train_loss))
 93 |                         f.write('\n %s \n %s' % (cap_pr, cap_gt))
 94 |                         f.write('\n %s \n %s' % (cms_pr, cms_gt))
 95 |                         f.write('\n')
 96 | 
 97 |         if epoch % opt['save_checkpoint_every'] == 0:
 98 | 
 99 |             # save the checkpoint
100 |             model_path = os.path.join(opt['output_dir'],
101 |                                       'CMS_CAP_MODEL_INT_lr_{}_BS_{}_Layer_{}_ATTHEAD_{}_HID_{}_RNNLayer_{}_epoch_{}.pth'
102 |                                       .format(opt['init_lr'], opt['batch_size'], opt['num_layer'],
103 |                                               opt['num_head'], opt['dim_model'], opt['rnn_layer'], epoch))
104 | 
105 |             torch.save(model.state_dict(), model_path)
106 | 
107 |             print('model saved to %s' % model_path)
108 |             with open(opt['model_info_path'], 'a') as f:
109 |                 f.write('model_%d, cap_loss: %.6f, cms_loss: %.6f\n'
110 |                         % (epoch, cap_train_loss/n_word, cms_train_loss/n_word))
111 | 
112 | 
113 | def main(opt):
114 | 
115 |     # load and define dataloader
116 |     dataset = VideoDataset(opt, 'train')
117 |     dataloader = DataLoader(dataset, batch_size=opt['batch_size'], shuffle=True)
118 | 
119 |     opt['cms_vocab_size'] = dataset.get_cms_vocab_size()
120 |     opt['cap_vocab_size'] = dataset.get_cap_vocab_size()
121 | 
122 |     if opt['cms'] == 'int':
123 |         cms_text_length = opt['int_max_len']
124 |     elif opt['cms'] == 'eff':
125 |         cms_text_length = opt['eff_max_len']
126 |     else:
127 |         cms_text_length = opt['att_max_len']
128 | 
129 |     # model initialization.
130 |     model = Model(
131 |         dataset.get_cap_vocab_size(),
132 |         dataset.get_cms_vocab_size(),
133 |         cap_max_seq=opt['cap_max_len'],
134 |         cms_max_seq=cms_text_length,
135 |         tgt_emb_prj_weight_sharing=True,
136 |         vis_emb=opt['dim_vis_feat'],
137 |         rnn_layers=opt['rnn_layer'],
138 |         d_k=opt['dim_head'],
139 |         d_v=opt['dim_head'],
140 |         d_model=opt['dim_model'],
141 |         d_word_vec=opt['dim_word'],
142 |         d_inner=opt['dim_inner'],
143 |         n_layers=opt['num_layer'],
144 |         n_head=opt['num_head'],
145 |         dropout=opt['dropout'])
146 | 
147 |     # number of parameters
148 |     model_parameters = filter(lambda p: p.requires_grad, model.parameters())
149 |     params = sum([np.prod(p.size()) for p in model_parameters])
150 |     print('number of learnable parameters are {}'.format(params))
151 | 
152 |     if opt['cuda']: model = model.cuda()
153 | 
154 |     # resume from previous checkpoint if indicated
155 |     if opt['load_checkpoint'] and opt['resume']:
156 |         cap_state_dict = torch.load(opt['load_checkpoint'])
157 |         model_dict = model.state_dict()
158 |         model_dict.update(cap_state_dict)
159 |         model.load_state_dict(model_dict)
160 | 
161 |     optimizer = ScheduledOptim(optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
162 |                                           betas=(0.9, 0.98), eps=1e-09), 512, opt['warm_up_steps'])
163 | 
164 |     # note: though we set the init learning rate as np.power(d_model, -0.5),
165 |     # grid search indicates different LR may improve the results.
166 |     opt['init_lr'] = round(optimizer.init_lr, 3)
167 | 
168 |     # create checkpoint output directory
169 |     dir = os.path.join(opt['checkpoint_path'], 'CMS_CAP_MODEL_INT_lr_{}_BS_{}_Layer_{}_ATTHEAD_{}_HID_{}_RNNLayer_{}'
170 |                        .format(opt['init_lr'], opt['batch_size'], opt['num_layer'],
171 |                                opt['num_head'], opt['dim_model'], opt['rnn_layer']))
172 | 
173 |     if not os.path.exists(dir): os.makedirs(dir)
174 | 
175 |     # save the model snapshot to local
176 |     info_path = os.path.join(dir, 'iteration_info_log.log')
177 |     print('model architecture saved to {} \n {}'.format(info_path, str(model)))
178 |     with open(info_path, 'a') as f:
179 |         f.write(str(model))
180 |         f.write('\n')
181 |         f.write(str(params))
182 |         f.write('\n')
183 | 
184 |     # log file directory
185 |     opt['output_dir'] = dir
186 |     opt['info_path'] = info_path
187 |     opt['model_info_path'] = os.path.join(opt['output_dir'], 'checkpoint_loss_log.log')
188 | 
189 |     train(dataloader, model, optimizer, opt, dataset.get_cap_vocab(), dataset.get_cms_vocab())
190 | 
191 | if __name__ == '__main__':
192 |     opt = parse_opt()
193 |     opt = vars(opt)
194 |     main(opt)
195 | 


--------------------------------------------------------------------------------
/utils/allinone_dataloader.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | import torch
  4 | import random
  5 | import numpy as np
  6 | from torch.utils.data import Dataset
  7 | 
  8 | 
  9 | class VideoDataset(Dataset):
 10 | 
 11 |     def get_cms_vocab_size(self):
 12 |         return len(self.get_cms_vocab())
 13 | 
 14 |     def get_cap_vocab_size(self):
 15 |         return len(self.get_cap_vocab())
 16 | 
 17 |     def get_cms_vocab(self):
 18 |         return self.cms_ix_to_word
 19 | 
 20 |     def get_cap_vocab(self):
 21 |         return self.cap_ix_to_word
 22 | 
 23 |     def get_seq_length(self):
 24 |         return self.seq_length
 25 | 
 26 |     def __init__(self, opt, mode='train'):
 27 |         super(VideoDataset, self).__init__()
 28 |         self.mode = mode
 29 | 
 30 |         self.captions = json.load(open(opt['caption_json']))
 31 |         cms_info = json.load(open(opt['info_json']))
 32 |         self.cms_ix_to_word = cms_info['ix_to_word']
 33 |         self.cms_word_to_ix = cms_info['word_to_ix']
 34 |         self.splits = cms_info['videos']
 35 | 
 36 |         # Load caption dictionary
 37 |         cap_info = json.load(open(opt['cap_info_json']))
 38 |         self.cap_ix_to_word = cap_info['ix_to_word']
 39 |         self.cap_word_to_ix = cap_info['word_to_ix']
 40 | 
 41 |         print('Caption vocab size is ', len(self.cap_ix_to_word))
 42 |         print('CMS vocab size is ', len(self.cms_ix_to_word))
 43 |         print('number of train videos: ', len(self.splits['train']))
 44 |         print('number of test videos: ', len(self.splits['test']))
 45 |         print('number of val videos: ', len(self.splits['val']))
 46 | 
 47 |         self.feats_dir = opt['feats_dir']
 48 |         print('load feats from %s' % self.feats_dir)
 49 | 
 50 |         self.cap_max_len = opt['cap_max_len']
 51 |         self.int_max_len = opt['int_max_len']
 52 |         self.eff_max_len = opt['eff_max_len']
 53 |         self.att_max_len = opt['att_max_len']
 54 |         print('max sequence length of caption is', self.cap_max_len)
 55 |         print('max sequence length of intention is', self.int_max_len)
 56 |         print('max sequence length of effect is', self.eff_max_len)
 57 |         print('max sequence length of attribute is', self.att_max_len)
 58 | 
 59 |     def __getitem__(self, ix=False):
 60 |         if not ix:
 61 |             if self.mode == 'train':
 62 |                 ix = random.choice(self.splits['train'])
 63 |             elif self.mode == 'test':
 64 |                 ix = self.splits['test'][ix]
 65 |         
 66 |         fc_feat = []
 67 |         for dir in self.feats_dir:
 68 |             fc_feat.append(np.load(os.path.join(dir, 'video%i.npy' % ix)))
 69 |         fc_feat = np.concatenate(fc_feat, axis=1)
 70 | 
 71 |         total_length = self.int_max_len + self.cap_max_len + self.eff_max_len + self.att_max_len
 72 |         cap_mask = np.zeros(total_length)
 73 |         cap_gts = np.zeros(total_length)
 74 | 
 75 |         idx = 'video%i' % ix
 76 |         if idx not in self.captions.keys():
 77 |             raw_data = self.captions[random.choice(list(self.captions.keys()))]
 78 |         else:
 79 |             raw_data = self.captions[idx]
 80 | 
 81 |         cap_ix = random.randint(0, len(raw_data) - 1)   # Random pick out one caption
 82 | 
 83 |         caption = raw_data[cap_ix]['final_caption']
 84 |         intentions = raw_data[cap_ix]['intention']
 85 |         intention = intentions[random.randint(0, len(intentions)-1)][1]
 86 | 
 87 |         effects = raw_data[cap_ix]['effect']
 88 |         effect = effects[random.randint(0, len(effects)-1)][1]
 89 | 
 90 |         attributes = raw_data[cap_ix]['attribute']
 91 |         attribute = attributes[random.randint(0, len(attributes)-1)][1]
 92 | 
 93 |         allinone_caption = intention[:-1] + ['<eos>'] + caption[1:-1] + ['<eos>'] + \
 94 |                            effect[1:-1] + ['<eos>'] + attribute[1:]
 95 | 
 96 |         if len(allinone_caption) > total_length:
 97 |             allinone_caption = allinone_caption[:total_length]
 98 |             allinone_caption[-1] = '<eos>'
 99 | 
100 |         for j, w in enumerate(allinone_caption):
101 |             cap_gts[j] = self.cap_word_to_ix.get(w, '1')
102 | 
103 |         non_zero = (cap_gts == 0).nonzero()
104 |         if len(non_zero[0]) != 0: cap_mask[:int(non_zero[0][0])] = 1
105 |         else: cap_mask += 1
106 | 
107 |         data = {}
108 |         data['fc_feats'] = torch.from_numpy(fc_feat).type(torch.FloatTensor)
109 |         data['cap_labels'] = torch.from_numpy(cap_gts).type(torch.LongTensor)
110 |         data['cap_masks'] = torch.from_numpy(cap_mask).type(torch.FloatTensor)
111 | 
112 |         data['video_ids'] = 'video%i' % ix
113 |         return data
114 | 
115 |     def __len__(self):
116 |         return len(self.splits[self.mode])
117 | 


--------------------------------------------------------------------------------
/utils/cocoeval.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Wrapper for evaluation on CIDEr, ROUGE_L, METEOR and Bleu_N
  3 | using coco-caption repo https://github.com/tylin/coco-caption
  4 | 
  5 | class COCOScorer is taken from https://github.com/yaoli/arctic-capgen-vid
  6 | '''
  7 | 
  8 | import json
  9 | import os
 10 | import sys
 11 | sys.path.append('coco-caption')
 12 | 
 13 | from pycocoevalcap.bleu.bleu import Bleu
 14 | from pycocoevalcap.rouge.rouge import Rouge
 15 | from pycocoevalcap.cider.cider import Cider
 16 | from pycocoevalcap.meteor.meteor import Meteor
 17 | from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
 18 | # Define a context manager to suppress stdout and stderr.
 19 | 
 20 | 
 21 | class suppress_stdout_stderr:
 22 |     '''
 23 |     A context manager for doing a "deep suppression" of stdout and stderr in
 24 |     Python, i.e. will suppress all print, even if the print originates in a
 25 |     compiled C/Fortran sub-function.
 26 |        This will not suppress raised exceptions, since exceptions are printed
 27 |     to stderr just before a script exits, and after the context manager has
 28 |     exited (at least, I think that is why it lets exceptions through).
 29 | 
 30 |     '''
 31 | 
 32 |     def __init__(self):
 33 |         # Open a pair of null files
 34 |         self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)]
 35 |         # Save the actual stdout (1) and stderr (2) file descriptors.
 36 |         self.save_fds = (os.dup(1), os.dup(2))
 37 | 
 38 |     def __enter__(self):
 39 |         # Assign the null pointers to stdout and stderr.
 40 |         os.dup2(self.null_fds[0], 1)
 41 |         os.dup2(self.null_fds[1], 2)
 42 | 
 43 |     def __exit__(self, *_):
 44 |         # Re-assign the real stdout/stderr back to (1) and (2)
 45 |         os.dup2(self.save_fds[0], 1)
 46 |         os.dup2(self.save_fds[1], 2)
 47 |         # Close the null files
 48 |         os.close(self.null_fds[0])
 49 |         os.close(self.null_fds[1])
 50 | 
 51 | 
 52 | class COCOScorer(object):
 53 |     def __init__(self):
 54 |         print('init COCO-EVAL scorer')
 55 | 
 56 |     def score(self, GT, RES, IDs):
 57 |         self.eval = {}
 58 |         self.imgToEval = {}
 59 |         gts = {}
 60 |         res = {}
 61 |         for ID in IDs:
 62 |             #            print ID
 63 |             gts[ID] = GT[ID]
 64 |             res[ID] = RES[ID]
 65 |         print('tokenization...')
 66 |         tokenizer = PTBTokenizer()
 67 |         gts = tokenizer.tokenize(gts)
 68 |         res = tokenizer.tokenize(res)
 69 | 
 70 |         # =================================================
 71 |         # Set up scorers
 72 |         # =================================================
 73 |         print('setting up scorers...')
 74 |         scorers = [
 75 |             (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
 76 |             (Meteor(),"METEOR"),
 77 |             (Rouge(), "ROUGE_L"),
 78 |             (Cider(), "CIDEr"),
 79 |             #(Spice(), "SPICE")
 80 |         ]
 81 | 
 82 |         # =================================================
 83 |         # Compute scores
 84 |         # =================================================
 85 |         eval = {}
 86 |         for scorer, method in scorers:
 87 |             print('computing %s score...' % (scorer.method()))
 88 |             score, scores = scorer.compute_score(gts, res)
 89 |             if type(method) == list:
 90 |                 for sc, scs, m in zip(score, scores, method):
 91 |                     self.setEval(sc, m)
 92 |                     self.setImgToEvalImgs(scs, IDs, m)
 93 |                     print("%s: %0.3f" % (m, sc))
 94 |             else:
 95 |                 self.setEval(score, method)
 96 |                 self.setImgToEvalImgs(scores, IDs, method)
 97 |                 print("%s: %0.3f" % (method, score))
 98 | 
 99 |         # for metric, score in self.eval.items():
100 |         #    print '%s: %.3f'%(metric, score)
101 |         return self.eval
102 | 
103 |     def setEval(self, score, method):
104 |         self.eval[method] = score
105 | 
106 |     def setImgToEvalImgs(self, scores, imgIds, method):
107 |         for imgId, score in zip(imgIds, scores):
108 |             if imgId not in self.imgToEval:
109 |                 self.imgToEval[imgId] = {}
110 |                 self.imgToEval[imgId]["image_id"] = imgId
111 |             self.imgToEval[imgId][method] = score
112 | 
113 | 
114 | def score(ref, sample):
115 |     # ref and sample are both dict
116 |     scorers = [
117 |         (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
118 |         (Rouge(), "ROUGE_L"),
119 |         (Cider(), "CIDEr")
120 |     ]
121 |     final_scores = {}
122 |     for scorer, method in scorers:
123 |         print('computing %s score with COCO-EVAL...' % (scorer.method()))
124 |         score, scores = scorer.compute_score(ref, sample)
125 |         if type(score) == list:
126 |             for m, s in zip(method, score):
127 |                 final_scores[m] = s
128 |         else:
129 |             final_scores[method] = score
130 |     return final_scores
131 | 
132 | 
133 | def test_cocoscorer():
134 |     '''gts = {
135 |         184321:[
136 |         {u'image_id': 184321, u'id': 352188, u'caption': u'A train traveling down-tracks next to lights.'},
137 |         {u'image_id': 184321, u'id': 356043, u'caption': u"A blue and silver train next to train's station and trees."},
138 |         {u'image_id': 184321, u'id': 356382, u'caption': u'A blue train is next to a sidewalk on the rails.'},
139 |         {u'image_id': 184321, u'id': 361110, u'caption': u'A passenger train pulls into a train station.'},
140 |         {u'image_id': 184321, u'id': 362544, u'caption': u'A train coming down the tracks arriving at a station.'}],
141 |         81922: [
142 |         {u'image_id': 81922, u'id': 86779, u'caption': u'A large jetliner flying over a traffic filled street.'},
143 |         {u'image_id': 81922, u'id': 90172, u'caption': u'An airplane flies low in the sky over a city street. '},
144 |         {u'image_id': 81922, u'id': 91615, u'caption': u'An airplane flies over a street with many cars.'},
145 |         {u'image_id': 81922, u'id': 92689, u'caption': u'An airplane comes in to land over a road full of cars'},
146 |         {u'image_id': 81922, u'id': 823814, u'caption': u'The plane is flying over top of the cars'}]
147 |         }
148 | 
149 |     samples = {
150 |         184321: [{u'image_id': 184321, 'id': 111, u'caption': u'train traveling down a track in front of a road'}],
151 |         81922: [{u'image_id': 81922, 'id': 219, u'caption': u'plane is flying through the sky'}],
152 |         }
153 |     '''
154 |     gts = {
155 |         '184321': [
156 |             {u'image_id': '184321', u'cap_id': 0, u'caption': u'A train traveling down tracks next to lights.',
157 |              'tokenized': 'a train traveling down tracks next to lights'},
158 |             {u'image_id': '184321', u'cap_id': 1, u'caption': u'A train coming down the tracks arriving at a station.',
159 |              'tokenized': 'a train coming down the tracks arriving at a station'}],
160 |         '81922': [
161 |             {u'image_id': '81922', u'cap_id': 0, u'caption': u'A large jetliner flying over a traffic filled street.',
162 |              'tokenized': 'a large jetliner flying over a traffic filled street'},
163 |             {u'image_id': '81922', u'cap_id': 1, u'caption': u'The plane is flying over top of the cars',
164 |              'tokenized': 'the plan is flying over top of the cars'}, ]
165 |     }
166 | 
167 |     samples = {
168 |         '184321': [{u'image_id': '184321', u'caption': u'train traveling down a track in front of a road'}],
169 |         '81922': [{u'image_id': '81922', u'caption': u'plane is flying through the sky'}],
170 |     }
171 |     IDs = ['184321', '81922']
172 |     scorer = COCOScorer()
173 |     scorer.score(gts, samples, IDs)
174 | 
175 | 
176 | if __name__ == '__main__':
177 |     test_cocoscorer()
178 | 


--------------------------------------------------------------------------------
/utils/dataloader.py:
--------------------------------------------------------------------------------
  1 | ''' Customized dataloader for V2C dataset. '''
  2 | 
  3 | __author__ = 'Jacob Zhiyuan Fang'
  4 | 
  5 | import os
  6 | import json
  7 | import torch
  8 | import random
  9 | import numpy as np
 10 | from torch.utils.data import Dataset
 11 | 
 12 | 
 13 | class VideoDataset(Dataset):
 14 | 
 15 |     def tensorize_float(self, obj):
 16 |         return torch.from_numpy(obj).type(torch.FloatTensor)
 17 | 
 18 |     def tensorize_long(self, obj):
 19 |         return torch.from_numpy(obj).type(torch.LongTensor)
 20 | 
 21 |     def get_cms_vocab_size(self):
 22 |         return len(self.get_cms_vocab())
 23 | 
 24 |     def get_cap_vocab_size(self):
 25 |         return len(self.get_cap_vocab())
 26 | 
 27 |     def get_cms_vocab(self):
 28 |         return self.cms_ix_to_word
 29 | 
 30 |     def get_cap_vocab(self):
 31 |         return self.cap_ix_to_word
 32 | 
 33 |     def get_seq_length(self):
 34 |         return self.seq_length
 35 | 
 36 |     def __init__(self, opt, mode='train'):
 37 |         super(VideoDataset, self).__init__()
 38 |         self.mode = mode
 39 | 
 40 |         self.captions = json.load(open(opt['caption_json']))
 41 |         cms_info = json.load(open(opt['info_json']))
 42 |         self.cms_ix_to_word = cms_info['ix_to_word']
 43 |         self.cms_word_to_ix = cms_info['word_to_ix']
 44 |         self.splits = cms_info['videos']
 45 | 
 46 |         # Load caption dictionary
 47 |         cap_info = json.load(open(opt['cap_info_json']))
 48 |         self.cap_ix_to_word = cap_info['ix_to_word']
 49 |         self.cap_word_to_ix = cap_info['word_to_ix']
 50 | 
 51 |         print('Caption vocab size is ', len(self.cap_ix_to_word))
 52 |         print('CMS vocab size is ', len(self.cms_ix_to_word))
 53 |         print('number of train videos: ', len(self.splits['train']))
 54 |         print('number of test videos: ', len(self.splits['test']))
 55 |         print('number of val videos: ', len(self.splits['val']))
 56 | 
 57 |         self.feats_dir = opt['feats_dir']
 58 |         print('load feats from %s' % self.feats_dir)
 59 | 
 60 |         self.cap_max_len = opt['cap_max_len']
 61 |         self.int_max_len = opt['int_max_len']
 62 |         self.eff_max_len = opt['eff_max_len']
 63 |         self.att_max_len = opt['att_max_len']
 64 | 
 65 |         print('max sequence length of caption is', self.cap_max_len)
 66 |         print('max sequence length of intention is', self.int_max_len)
 67 |         print('max sequence length of effect is', self.eff_max_len)
 68 |         print('max sequence length of attribute is', self.att_max_len)
 69 | 
 70 |     def __getitem__(self, ix=False):
 71 |         # if not ix:
 72 |         #     if self.mode == 'train':
 73 |         #         ix = random.choice(self.splits['train'])
 74 |         #
 75 |         #     elif self.mode == 'test':
 76 |         #         ix = self.splits['test'][ix]
 77 | 
 78 |         if self.mode == 'train':
 79 |             ix = self.splits['train'][ix]
 80 | 
 81 |         elif self.mode == 'test':
 82 |             ix = self.splits['test'][ix]
 83 | 
 84 |         # Load the visual features
 85 |         fc_feat = []
 86 |         for dir in self.feats_dir:
 87 |             fc_feat.append(np.load(os.path.join(dir, 'video%i.npy' % ix)))
 88 | 
 89 |         fc_feat = np.concatenate(fc_feat, axis=1)
 90 | 
 91 |         # Placeholder for returning parameters
 92 |         cap_mask = np.zeros(self.cap_max_len)
 93 |         int_mask = np.zeros(self.int_max_len)
 94 |         eff_mask = np.zeros(self.eff_max_len)
 95 |         att_mask = np.zeros(self.att_max_len)
 96 | 
 97 |         cap_gts = np.zeros(self.cap_max_len)
 98 |         int_gts = np.zeros(self.int_max_len)
 99 |         eff_gts = np.zeros(self.eff_max_len)
100 |         att_gts = np.zeros(self.att_max_len)
101 | 
102 |         if 'video%i' % ix not in self.captions:
103 |             print(ix in self.splits['train'])
104 | 
105 |         assert 'video%i' % ix in self.captions
106 |         raw_data = self.captions['video%i' % ix]
107 | 
108 |         # Random pick out one caption in Training mode
109 |         cap_ix = random.randint(0, len(raw_data) - 1)
110 | 
111 |         # Pop out Cap, Int, Eff and Att
112 |         caption = self.captions['video%i' % ix][cap_ix]['final_caption']
113 | 
114 |         intentions = self.captions['video%i' % ix][cap_ix]['intention']
115 |         intention = intentions[random.randint(0, len(intentions)-1)][1]
116 | 
117 |         effects = self.captions['video%i' % ix][cap_ix]['effect']
118 |         effect = effects[random.randint(0, len(effects)-1)][1]
119 | 
120 |         attributes = self.captions['video%i' % ix][cap_ix]['attribute']
121 |         attribute = attributes[random.randint(0, len(attributes)-1)][1]
122 | 
123 |         # Trunk the tokens if it exceed the maximum limitation
124 |         if len(caption) > self.cap_max_len:
125 |             caption = caption[:self.cap_max_len]
126 |             caption[-1] = '<eos>'
127 | 
128 |         if len(effect) > self.eff_max_len:
129 |             effect = effect[:self.eff_max_len]
130 |             effect[-1] = '<eos>'
131 | 
132 |         if len(attribute) > self.att_max_len:
133 |             attribute = attribute[:self.att_max_len]
134 |             attribute[-1] = '<eos>'
135 | 
136 |         # Tokenize it
137 |         for j, w in enumerate(caption):
138 |             cap_gts[j] = self.cap_word_to_ix.get(w, '1')
139 | 
140 |         for j, w in enumerate(intention):
141 |             int_gts[j] = self.cms_word_to_ix.get(w, '1')
142 | 
143 |         for j, w in enumerate(effect):
144 |             eff_gts[j] = self.cms_word_to_ix.get(w, '1')
145 | 
146 |         for j, w in enumerate(attribute):
147 |             att_gts[j] = self.cms_word_to_ix.get(w, '1')
148 | 
149 |         # Mask out additional positions
150 |         non_zero = (cap_gts == 0).nonzero()
151 |         if len(non_zero[0]) != 0: cap_mask[:int(non_zero[0][0])] = 1
152 |         else: cap_mask += 1
153 | 
154 |         non_zero = (int_gts == 0).nonzero()
155 |         if len(non_zero[0]) != 0: int_mask[:int(non_zero[0][0])] = 1
156 |         else: int_mask += 1
157 | 
158 |         non_zero = (eff_gts == 0).nonzero()
159 |         if len(non_zero[0]) != 0: eff_mask[:int(non_zero[0][0])] = 1
160 |         else: eff_mask += 1
161 | 
162 |         non_zero = (att_gts == 0).nonzero()
163 |         if len(non_zero[0]) != 0: att_mask[:int(non_zero[0][0])] = 1
164 |         else: att_mask += 1
165 | 
166 |         # Convert to Tensors
167 |         data = {}
168 |         data['fc_feats'] = self.tensorize_float(fc_feat)
169 |         data['cap_labels'] = self.tensorize_long(cap_gts)
170 |         data['cap_masks'] = self.tensorize_float(cap_mask)
171 |         data['int_labels'] = self.tensorize_long(int_gts)
172 |         data['int_masks'] = self.tensorize_float(int_mask)
173 |         data['eff_labels'] = self.tensorize_long(eff_gts)
174 |         data['eff_masks'] = self.tensorize_float(eff_mask)
175 |         data['att_labels'] = self.tensorize_long(att_gts)
176 |         data['att_masks'] = self.tensorize_float(att_mask)
177 |         data['video_ids'] = 'video%i' % ix
178 | 
179 |         return data
180 | 
181 |     def __len__(self):
182 |         return len(self.splits[self.mode])
183 | 


--------------------------------------------------------------------------------
/utils/gt_cap_dataloader.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | import torch
  4 | import random
  5 | import numpy as np
  6 | from torch.utils.data import Dataset
  7 | 
  8 | 
  9 | class VideoDataset(Dataset):
 10 | 
 11 |     def get_cms_vocab_size(self):
 12 |         return len(self.get_cms_vocab())
 13 | 
 14 |     def get_cap_vocab_size(self):
 15 |         return len(self.get_cap_vocab())
 16 | 
 17 |     def get_cms_vocab(self):
 18 |         return self.cms_ix_to_word
 19 | 
 20 |     def get_cap_vocab(self):
 21 |         return self.cap_ix_to_word
 22 | 
 23 |     def get_seq_length(self):
 24 |         return self.seq_length
 25 | 
 26 |     def __init__(self, opt, mode='train'):
 27 |         super(VideoDataset, self).__init__()
 28 |         self.mode = mode
 29 | 
 30 |         self.captions = json.load(open(opt['caption_json']))
 31 |         cms_info = json.load(open(opt['info_json']))
 32 |         self.cms_ix_to_word = cms_info['ix_to_word']
 33 |         self.cms_word_to_ix = cms_info['word_to_ix']
 34 |         self.splits = cms_info['videos']
 35 | 
 36 |         # Load caption dictionary
 37 |         cap_info = json.load(open(opt['cap_info_json']))
 38 |         self.cap_ix_to_word = cap_info['ix_to_word']
 39 |         self.cap_word_to_ix = cap_info['word_to_ix']
 40 | 
 41 |         print('Caption vocab size is ', len(self.cap_ix_to_word))
 42 |         print('CMS vocab size is ', len(self.cms_ix_to_word))
 43 |         print('number of train videos: ', len(self.splits['train']))
 44 |         print('number of test videos: ', len(self.splits['test']))
 45 |         print('number of val videos: ', len(self.splits['val']))
 46 | 
 47 |         self.feats_dir = opt['feats_dir']
 48 |         print('load feats from %s' % self.feats_dir)
 49 | 
 50 |         self.cap_max_len = opt['cap_max_len']
 51 |         self.int_max_len = opt['int_max_len']
 52 |         self.eff_max_len = opt['eff_max_len']
 53 |         self.att_max_len = opt['att_max_len']
 54 |         print('max sequence length of caption is', self.cap_max_len)
 55 |         print('max sequence length of intention is', self.int_max_len)
 56 |         print('max sequence length of effect is', self.eff_max_len)
 57 |         print('max sequence length of attribute is', self.att_max_len)
 58 | 
 59 |     def __getitem__(self, ix):
 60 | 
 61 |         if self.mode == 'train':
 62 |             ix = random.choice(self.splits['train'])
 63 |         elif self.mode == 'test':
 64 |             ix = self.splits['test'][ix]
 65 |         
 66 |         fc_feat = []
 67 |         for dir in self.feats_dir:
 68 |             fc_feat.append(np.load(os.path.join(dir, 'video%i.npy' % ix)))
 69 |         fc_feat = np.concatenate(fc_feat, axis=1)
 70 | 
 71 |         cap_mask = np.zeros(self.cap_max_len)
 72 |         int_mask = np.zeros(self.int_max_len)
 73 |         eff_mask = np.zeros(self.eff_max_len)
 74 |         att_mask = np.zeros(self.att_max_len)
 75 | 
 76 |         # cap_gts = np.zeros((10, self.cap_max_len))
 77 |         cap_gts = np.zeros((1, self.cap_max_len))
 78 |         int_gts = np.zeros(self.int_max_len)
 79 |         eff_gts = np.zeros(self.eff_max_len)
 80 |         att_gts = np.zeros(self.att_max_len)
 81 | 
 82 |         raw_data = self.captions['video%i' % ix]
 83 | 
 84 |         cap_ix = random.randint(0, len(raw_data) - 1)   # Random pick out one caption
 85 | 
 86 |         caption = raw_data[cap_ix]['final_caption']
 87 | 
 88 |         intentions = raw_data[cap_ix]['intention']
 89 |         intention = intentions[random.randint(0, len(intentions)-1)][1]
 90 | 
 91 |         effects = raw_data[cap_ix]['effect']
 92 |         effect = effects[random.randint(0, len(effects)-1)][1]
 93 | 
 94 |         attributes = raw_data[cap_ix]['attribute']
 95 |         attribute = attributes[random.randint(0, len(attributes)-1)][1]
 96 | 
 97 |         # Load all intentions again for eval
 98 |         # intentions = [item['intention'][0] for item in raw_data]
 99 |         # effects = [item['effect'][0] for item in raw_data]
100 |         # attributes = [item['attribute'][0] for item in raw_data]
101 | 
102 |         if len(caption) > self.cap_max_len:
103 |             caption = caption[:self.cap_max_len]
104 |             caption[-1] = '<eos>'
105 |         if len(effect) > self.eff_max_len:
106 |             effect = effect[:self.eff_max_len]
107 |             effect[-1] = '<eos>'
108 |         if len(attribute) > self.att_max_len:
109 |             attribute = attribute[:self.att_max_len]
110 |             attribute[-1] = '<eos>'
111 | 
112 |         # Load all 10 gt captions
113 |         # for i in range(10):
114 |         #     _ = len(raw_data)
115 |         #     caption = raw_data[i%_]['final_caption']
116 |         #     for j, w in enumerate(caption[0:28]):
117 |         #         cap_gts[i, j] = self.cap_word_to_ix.get(w, '1')
118 | 
119 |         # Load one random gt captions
120 |         for j, w in enumerate(caption[0:28]):
121 |             cap_gts[0, j] = self.cap_word_to_ix.get(w, '1')
122 | 
123 |         for j, w in enumerate(intention):
124 |             int_gts[j] = self.cms_word_to_ix.get(w, '1')
125 |         for j, w in enumerate(effect):
126 |             eff_gts[j] = self.cms_word_to_ix.get(w, '1')
127 |         for j, w in enumerate(attribute):
128 |             att_gts[j] = self.cms_word_to_ix.get(w, '1')
129 | 
130 |         non_zero = (cap_gts == 0).nonzero()
131 |         if len(non_zero[0]) != 0: cap_mask[:int(non_zero[0][0])] = 1
132 |         else: cap_mask += 1
133 | 
134 |         non_zero = (int_gts == 0).nonzero()
135 |         if len(non_zero[0]) != 0: int_mask[:int(non_zero[0][0])] = 1
136 |         else: int_mask += 1
137 | 
138 |         non_zero = (eff_gts == 0).nonzero()
139 |         if len(non_zero[0]) != 0: eff_mask[:int(non_zero[0][0])] = 1
140 |         else: eff_mask += 1
141 | 
142 |         non_zero = (att_gts == 0).nonzero()
143 |         if len(non_zero[0]) != 0: att_mask[:int(non_zero[0][0])] = 1
144 |         else: att_mask += 1
145 | 
146 |         data = {}
147 |         data['fc_feats'] = torch.from_numpy(fc_feat).type(torch.FloatTensor)
148 |         data['cap_labels'] = torch.from_numpy(cap_gts).type(torch.LongTensor)
149 |         data['cap_masks'] = torch.from_numpy(cap_mask).type(torch.FloatTensor)
150 |         data['int_labels'] = torch.from_numpy(int_gts).type(torch.LongTensor)
151 |         data['int_masks'] = torch.from_numpy(int_mask).type(torch.FloatTensor)
152 |         data['eff_labels'] = torch.from_numpy(eff_gts).type(torch.LongTensor)
153 |         data['eff_masks'] = torch.from_numpy(eff_mask).type(torch.FloatTensor)
154 |         data['att_labels'] = torch.from_numpy(att_gts).type(torch.LongTensor)
155 |         data['att_masks'] = torch.from_numpy(att_mask).type(torch.FloatTensor)
156 | 
157 |         data['video_ids'] = 'video%i' % ix
158 | 
159 |         # Concatenate all CMS
160 |         int_str = ''
161 |         for _ in intentions:
162 |             int_str += ';' + _[0]
163 |         att_str = ''
164 |         for _ in attributes:
165 |             att_str += ';' + _[0]
166 |         eff_str = ''
167 |         for _ in effects:
168 |             eff_str += ';' + _[0]
169 |         return data, int_str, att_str, eff_str
170 | 
171 |     def __len__(self):
172 |         return len(self.splits[self.mode])
173 | 


--------------------------------------------------------------------------------
/utils/gt_caps_dataloader.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | import torch
  4 | import random
  5 | import numpy as np
  6 | from torch.utils.data import Dataset
  7 | 
  8 | 
  9 | class VideoDataset(Dataset):
 10 | 
 11 |     def get_cms_vocab_size(self):
 12 |         return len(self.get_cms_vocab())
 13 | 
 14 |     def get_cap_vocab_size(self):
 15 |         return len(self.get_cap_vocab())
 16 | 
 17 |     def get_cms_vocab(self):
 18 |         return self.cms_ix_to_word
 19 | 
 20 |     def get_cap_vocab(self):
 21 |         return self.cap_ix_to_word
 22 | 
 23 |     def get_seq_length(self):
 24 |         return self.seq_length
 25 | 
 26 |     def __init__(self, opt, mode='train'):
 27 |         super(VideoDataset, self).__init__()
 28 |         self.mode = mode
 29 | 
 30 |         self.captions = json.load(open(opt['caption_json']))
 31 |         cms_info = json.load(open(opt['info_json']))
 32 |         self.cms_ix_to_word = cms_info['ix_to_word']
 33 |         self.cms_word_to_ix = cms_info['word_to_ix']
 34 |         self.splits = cms_info['videos']
 35 | 
 36 |         # Load caption dictionary
 37 |         cap_info = json.load(open(opt['cap_info_json']))
 38 |         self.cap_ix_to_word = cap_info['ix_to_word']
 39 |         self.cap_word_to_ix = cap_info['word_to_ix']
 40 | 
 41 |         print('Caption vocab size is ', len(self.cap_ix_to_word))
 42 |         print('CMS vocab size is ', len(self.cms_ix_to_word))
 43 |         print('number of train videos: ', len(self.splits['train']))
 44 |         print('number of test videos: ', len(self.splits['test']))
 45 |         print('number of val videos: ', len(self.splits['val']))
 46 | 
 47 |         self.feats_dir = opt['feats_dir']
 48 |         self.cap_max_len = opt['cap_max_len']
 49 | 
 50 |         print('load feats from %s' % self.feats_dir)
 51 |         print('max sequence length of caption is', self.cap_max_len)
 52 | 
 53 |     def __getitem__(self, ix):
 54 | 
 55 |         if self.mode == 'train':
 56 |             ix = random.choice(self.splits['train'])
 57 |         elif self.mode == 'test':
 58 |             ix = self.splits['test'][ix]
 59 |         
 60 |         fc_feat = []
 61 |         for dir in self.feats_dir:
 62 |             fc_feat.append(np.load(os.path.join(dir, 'video%i.npy' % ix)))
 63 |         fc_feat = np.concatenate(fc_feat, axis=1)
 64 | 
 65 |         raw_data = self.captions['video%i' % ix]
 66 |         num_cap = len(raw_data)
 67 |         cap_mask = np.zeros((num_cap, self.cap_max_len))
 68 |         cap_gts = np.zeros((num_cap, self.cap_max_len))
 69 |         int_list, eff_list, att_list = [], [], []
 70 | 
 71 |         # Load all num_cap gt captions
 72 |         for cap_ix in range(num_cap):
 73 |             caption = raw_data[cap_ix % len(raw_data)]['final_caption']
 74 | 
 75 |             if len(caption) > self.cap_max_len:
 76 |                 caption = caption[:self.cap_max_len]
 77 |                 caption[-1] = '<eos>'
 78 | 
 79 |             for j, w in enumerate(caption[0: self.cap_max_len]):
 80 |                 cap_gts[cap_ix, j] = self.cap_word_to_ix.get(w, '1')
 81 | 
 82 |             intentions, effects, attributes =  raw_data[cap_ix]['intention'], raw_data[cap_ix]['effect'],\
 83 |                                                raw_data[cap_ix]['attribute']
 84 | 
 85 |             # Concatenate all CMS
 86 |             int_str, att_str, eff_str = '', '', ''
 87 |             for int, eff, att in zip(intentions, effects, attributes):
 88 |                 int_str += ';' + int[0]
 89 |                 eff_str += ';' + eff[0]
 90 |                 att_str += ';' + att[0]
 91 | 
 92 |             int_list.append(int_str)
 93 |             eff_list.append(eff_str)
 94 |             att_list.append(att_str)
 95 | 
 96 |         # Insert mask
 97 |         cap_mask[(cap_gts != 0)] = 1
 98 | 
 99 |         data = {}
100 |         data['fc_feats'] = torch.from_numpy(fc_feat).type(torch.FloatTensor)
101 |         data['cap_labels'] = torch.from_numpy(cap_gts).type(torch.LongTensor)
102 |         data['cap_masks'] = torch.from_numpy(cap_mask).type(torch.FloatTensor)
103 |         data['video_ids'] = 'video%i' % ix
104 | 
105 |         return data, int_list, eff_list, att_list
106 | 
107 |     def __len__(self):
108 |         return len(self.splits[self.mode])
109 | 
110 | 


--------------------------------------------------------------------------------
/utils/gt_human_cap_dataloader.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | import torch
  4 | import random
  5 | import numpy as np
  6 | from torch.utils.data import Dataset
  7 | 
  8 | 
  9 | class VideoDataset(Dataset):
 10 | 
 11 |     def get_cms_vocab_size(self):
 12 |         return len(self.get_cms_vocab())
 13 | 
 14 |     def get_cap_vocab_size(self):
 15 |         return len(self.get_cap_vocab())
 16 | 
 17 |     def get_cms_vocab(self):
 18 |         return self.cms_ix_to_word
 19 | 
 20 |     def get_cap_vocab(self):
 21 |         return self.cap_ix_to_word
 22 | 
 23 |     def get_seq_length(self):
 24 |         return self.seq_length
 25 | 
 26 |     def __init__(self, opt, mode='train'):
 27 |         super(VideoDataset, self).__init__()
 28 |         self.mode = mode
 29 | 
 30 |         self.captions = json.load(open(opt['caption_json']))
 31 |         cms_info = json.load(open(opt['info_json']))
 32 |         self.cms_ix_to_word = cms_info['ix_to_word']
 33 |         self.cms_word_to_ix = cms_info['word_to_ix']
 34 |         self.splits = cms_info['videos']
 35 | 
 36 |         # Load caption dictionary
 37 |         cap_info = json.load(open(opt['cap_info_json']))
 38 |         self.cap_ix_to_word = cap_info['ix_to_word']
 39 |         self.cap_word_to_ix = cap_info['word_to_ix']
 40 | 
 41 |         print('Caption vocab size is ', len(self.cap_ix_to_word))
 42 |         print('CMS vocab size is ', len(self.cms_ix_to_word))
 43 |         print('number of train videos: ', len(self.splits['train']))
 44 |         print('number of test videos: ', len(self.splits['test']))
 45 |         print('number of val videos: ', len(self.splits['val']))
 46 | 
 47 |         self.feats_dir = opt['feats_dir']
 48 |         print('load feats from %s' % self.feats_dir)
 49 | 
 50 |         self.cap_max_len = opt['cap_max_len']
 51 |         self.int_max_len = opt['int_max_len']
 52 |         self.eff_max_len = opt['eff_max_len']
 53 |         self.att_max_len = opt['att_max_len']
 54 |         print('max sequence length of caption is', self.cap_max_len)
 55 |         print('max sequence length of intention is', self.int_max_len)
 56 |         print('max sequence length of effect is', self.eff_max_len)
 57 |         print('max sequence length of attribute is', self.att_max_len)
 58 | 
 59 |     def __getitem__(self, ix):
 60 | 
 61 |         if self.mode == 'train':
 62 |             ix = self.captions.keys()
 63 |         elif self.mode == 'test':
 64 |             ix = list(self.captions.keys())[ix]
 65 | 
 66 |         fc_feat = []
 67 |         for dir in self.feats_dir:
 68 |             fc_feat.append(np.load(os.path.join(dir, ix+'.npy')))
 69 |         fc_feat = np.concatenate(fc_feat, axis=1)
 70 | 
 71 |         cap_mask = np.zeros(self.cap_max_len)
 72 |         int_mask = np.zeros(self.int_max_len)
 73 |         eff_mask = np.zeros(self.eff_max_len)
 74 |         att_mask = np.zeros(self.att_max_len)
 75 | 
 76 |         # cap_gts = np.zeros((10, self.cap_max_len))
 77 |         cap_gts = np.zeros((1, self.cap_max_len))
 78 |         int_gts = np.zeros(self.int_max_len)
 79 |         eff_gts = np.zeros(self.eff_max_len)
 80 |         att_gts = np.zeros(self.att_max_len)
 81 | 
 82 |         raw_data = self.captions[ix]
 83 | 
 84 |         cap_ix = random.randint(0, len(raw_data) - 1)   # Random pick out one caption
 85 | 
 86 |         caption = raw_data[cap_ix]['final_caption']
 87 | 
 88 |         intentions = raw_data[cap_ix]['intention']
 89 |         intention = intentions[random.randint(0, len(intentions)-1)][1]
 90 | 
 91 |         effects = raw_data[cap_ix]['effect']
 92 |         effect = effects[random.randint(0, len(effects)-1)][1]
 93 | 
 94 |         attributes = raw_data[cap_ix]['attribute']
 95 |         attribute = attributes[random.randint(0, len(attributes)-1)][1]
 96 | 
 97 |         # Load all intentions again for eval
 98 |         # intentions = [item['intention'][0] for item in raw_data]
 99 |         # effects = [item['effect'][0] for item in raw_data]
100 |         # attributes = [item['attribute'][0] for item in raw_data]
101 | 
102 |         if len(caption) > self.cap_max_len:
103 |             caption = caption[:self.cap_max_len]
104 |             caption[-1] = '<eos>'
105 |         if len(effect) > self.eff_max_len:
106 |             effect = effect[:self.eff_max_len]
107 |             effect[-1] = '<eos>'
108 |         if len(attribute) > self.att_max_len:
109 |             attribute = attribute[:self.att_max_len]
110 |             attribute[-1] = '<eos>'
111 | 
112 |         # Load all 10 gt captions
113 |         # for i in range(10):
114 |         #     _ = len(raw_data)
115 |         #     caption = raw_data[i%_]['final_caption']
116 |         #     for j, w in enumerate(caption[0:28]):
117 |         #         cap_gts[i, j] = self.cap_word_to_ix.get(w, '1')
118 | 
119 |         # Load one random gt captions
120 |         for j, w in enumerate(caption[0:28]):
121 |             cap_gts[0, j] = self.cap_word_to_ix.get(w, '1')
122 | 
123 |         for j, w in enumerate(intention):
124 |             int_gts[j] = self.cms_word_to_ix.get(w, '1')
125 |         for j, w in enumerate(effect):
126 |             eff_gts[j] = self.cms_word_to_ix.get(w, '1')
127 |         for j, w in enumerate(attribute):
128 |             att_gts[j] = self.cms_word_to_ix.get(w, '1')
129 | 
130 |         non_zero = (cap_gts == 0).nonzero()
131 |         if len(non_zero[0]) != 0: cap_mask[:int(non_zero[0][0])] = 1
132 |         else: cap_mask += 1
133 | 
134 |         non_zero = (int_gts == 0).nonzero()
135 |         if len(non_zero[0]) != 0: int_mask[:int(non_zero[0][0])] = 1
136 |         else: int_mask += 1
137 | 
138 |         non_zero = (eff_gts == 0).nonzero()
139 |         if len(non_zero[0]) != 0: eff_mask[:int(non_zero[0][0])] = 1
140 |         else: eff_mask += 1
141 | 
142 |         non_zero = (att_gts == 0).nonzero()
143 |         if len(non_zero[0]) != 0: att_mask[:int(non_zero[0][0])] = 1
144 |         else: att_mask += 1
145 | 
146 |         data = {}
147 |         data['fc_feats'] = torch.from_numpy(fc_feat).type(torch.FloatTensor)
148 |         data['cap_labels'] = torch.from_numpy(cap_gts).type(torch.LongTensor)
149 |         data['cap_masks'] = torch.from_numpy(cap_mask).type(torch.FloatTensor)
150 |         data['int_labels'] = torch.from_numpy(int_gts).type(torch.LongTensor)
151 |         data['int_masks'] = torch.from_numpy(int_mask).type(torch.FloatTensor)
152 |         data['eff_labels'] = torch.from_numpy(eff_gts).type(torch.LongTensor)
153 |         data['eff_masks'] = torch.from_numpy(eff_mask).type(torch.FloatTensor)
154 |         data['att_labels'] = torch.from_numpy(att_gts).type(torch.LongTensor)
155 |         data['att_masks'] = torch.from_numpy(att_mask).type(torch.FloatTensor)
156 | 
157 |         data['video_ids'] = ix
158 | 
159 |         # Concatenate all CMS
160 |         int_str = ''
161 |         for _ in intentions:
162 |             int_str += ';' + _[0]
163 |         att_str = ''
164 |         for _ in attributes:
165 |             att_str += ';' + _[0]
166 |         eff_str = ''
167 |         for _ in effects:
168 |             eff_str += ';' + _[0]
169 |         return data, int_str, att_str, eff_str
170 | 
171 |     def __len__(self):
172 |         return len(list(self.captions.keys()))
173 | 


--------------------------------------------------------------------------------
/utils/prepro_feats.py:
--------------------------------------------------------------------------------
  1 | import shutil
  2 | import subprocess
  3 | import glob
  4 | from tqdm import tqdm
  5 | import numpy as np
  6 | import os
  7 | import argparse
  8 | 
  9 | import torch
 10 | from torch import nn
 11 | import torch.nn.functional as F
 12 | import pretrainedmodels
 13 | from pretrainedmodels import utils
 14 | 
 15 | C, H, W = 3, 224, 224
 16 | 
 17 | 
 18 | def extract_frames(video, dst):
 19 |     with open(os.devnull, "w") as ffmpeg_log:
 20 |         if os.path.exists(dst):
 21 |             print(" cleanup: " + dst + "/")
 22 |             shutil.rmtree(dst)
 23 |         os.makedirs(dst)
 24 |         video_to_frames_command = ["ffmpeg",
 25 |                                    # (optional) overwrite output file if it exists
 26 |                                    '-y',
 27 |                                    '-i', video,  # input file
 28 |                                    '-vf', "scale=400:300",  # input file
 29 |                                    '-qscale:v', "2",  # quality for JPEG
 30 |                                    '{0}/%06d.jpg'.format(dst)]
 31 |         subprocess.call(video_to_frames_command,
 32 |                         stdout=ffmpeg_log, stderr=ffmpeg_log)
 33 | 
 34 | 
 35 | def extract_feats(params, model, load_image_fn):
 36 |     global C, H, W
 37 |     model.eval()
 38 | 
 39 |     dir_fc = params['output_dir']
 40 |     if not os.path.isdir(dir_fc):
 41 |         os.mkdir(dir_fc)
 42 |     print("save video feats to %s" % (dir_fc))
 43 |     video_list = glob.glob(os.path.join(params['video_path'], '*.mp4'))
 44 |     for video in tqdm(video_list):
 45 |         video_id = video.split("/")[-1].split(".")[0]
 46 |         dst = params['model'] + '_' + video_id
 47 |         extract_frames(video, dst)
 48 | 
 49 |         image_list = sorted(glob.glob(os.path.join(dst, '*.jpg')))
 50 |         samples = np.round(np.linspace(
 51 |             0, len(image_list) - 1, params['n_frame_steps']))
 52 |         image_list = [image_list[int(sample)] for sample in samples]
 53 |         images = torch.zeros((len(image_list), C, H, W))
 54 |         for iImg in range(len(image_list)):
 55 |             img = load_image_fn(image_list[iImg])
 56 |             images[iImg] = img
 57 |         with torch.no_grad():
 58 |             fc_feats = model(images.cuda()).squeeze()
 59 |         img_feats = fc_feats.cpu().numpy()
 60 |         # Save the inception features
 61 |         outfile = os.path.join(dir_fc, video_id + '.npy')
 62 |         np.save(outfile, img_feats)
 63 |         # cleanup
 64 |         shutil.rmtree(dst)
 65 | 
 66 | 
 67 | if __name__ == '__main__':
 68 |     parser = argparse.ArgumentParser()
 69 |     parser.add_argument("--gpu", dest='gpu', type=str, default='0',
 70 |                         help='Set CUDA_VISIBLE_DEVICES environment variable, optional')
 71 |     parser.add_argument("--output_dir", dest='output_dir', type=str,
 72 |                         default='data/feats/resnet152', help='directory to store features')
 73 |     parser.add_argument("--n_frame_steps", dest='n_frame_steps', type=int, default=40,
 74 |                         help='how many frames to sampler per video')
 75 | 
 76 |     parser.add_argument("--video_path", dest='video_path', type=str,
 77 |                         default='data/test-video', help='path to video dataset')
 78 |     parser.add_argument("--model", dest="model", type=str, default='resnet152',
 79 |                         help='the CNN model you want to use to extract_feats')
 80 |     
 81 |     args = parser.parse_args()
 82 |     os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
 83 |     params = vars(args)
 84 |     if params['model'] == 'inception_v3':
 85 |         C, H, W = 3, 299, 299
 86 |         model = pretrainedmodels.inceptionv3(pretrained='imagenet')
 87 |         load_image_fn = utils.LoadTransformImage(model)
 88 | 
 89 |     elif params['model'] == 'resnet152':
 90 |         C, H, W = 3, 224, 224
 91 |         model = pretrainedmodels.resnet152(pretrained='imagenet')
 92 |         load_image_fn = utils.LoadTransformImage(model)
 93 | 
 94 |     elif params['model'] == 'inception_v4':
 95 |         C, H, W = 3, 299, 299
 96 |         model = pretrainedmodels.inceptionv4(
 97 |             num_classes=1000, pretrained='imagenet')
 98 |         load_image_fn = utils.LoadTransformImage(model)
 99 | 
100 |     else:
101 |         print("doesn't support %s" % (params['model']))
102 | 
103 |     model.last_linear = utils.Identity()
104 |     model = nn.DataParallel(model)
105 |     
106 |     model = model.cuda()
107 |     extract_feats(params, model, load_image_fn)
108 | 


--------------------------------------------------------------------------------
/utils/prepro_ngrams.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import argparse
  3 | import pickle as pkl
  4 | from collections import defaultdict
  5 | 
  6 | 
  7 | def precook(s, n=4):
  8 |     """
  9 |     Takes a string as input and returns an object that can be given to
 10 |     either cook_refs or cook_test. This is optional: cook_refs and cook_test
 11 |     can take string arguments as well.
 12 |     :param s: string : sentence to be converted into ngrams
 13 |     :param n: int    : number of ngrams for which representation is calculated
 14 |     :return: term frequency vector for occuring ngrams
 15 |     """
 16 |     words = s.split()
 17 |     counts = defaultdict(int)
 18 |     for k in range(1, n+1):
 19 |         for i in range(len(words)-k+1):
 20 |             ngram = tuple(words[i:i+k])
 21 |             counts[ngram] += 1
 22 |     return counts
 23 | 
 24 | 
 25 | def cook_refs(refs, n=4):  # lhuang: oracle will call with "average"
 26 |     '''Takes a list of reference sentences for a single segment
 27 |     and returns an object that encapsulates everything that BLEU
 28 |     needs to know about them.
 29 |     :param refs: list of string : reference sentences for some image
 30 |     :param n: int : number of ngrams for which (ngram) representation is calculated
 31 |     :return: result (list of dict)
 32 |     '''
 33 |     return [precook(ref, n) for ref in refs]
 34 | 
 35 | 
 36 | def create_crefs(refs):
 37 |     crefs = []
 38 |     for ref in refs:
 39 |         # ref is a list of 5 captions
 40 |         crefs.append(cook_refs(ref))
 41 |     return crefs
 42 | 
 43 | 
 44 | def compute_doc_freq(crefs):
 45 |     '''
 46 |     Compute term frequency for reference data.
 47 |     This will be used to compute idf (inverse document frequency later)
 48 |     The term frequency is stored in the object
 49 |     :return: None
 50 |     '''
 51 |     document_frequency = defaultdict(float)
 52 |     for refs in crefs:
 53 |         # refs, k ref captions of one image
 54 |         for ngram in set([ngram for ref in refs for (ngram, count) in ref.items()]):
 55 |             document_frequency[ngram] += 1
 56 |       # maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
 57 |     return document_frequency
 58 | 
 59 | 
 60 | def build_dict(vids, wtoi):
 61 |     refs_words = []
 62 |     refs_idxs = []
 63 |     count_vids = 0
 64 |     for vid in vids:
 65 |         ref_words = []
 66 |         ref_idxs = []
 67 |         for cap in vids[vid]['final_captions']:
 68 |             tmp_tokens = cap
 69 |             tmp_tokens = [_ if _ in wtoi else '<UNK>' for _ in tmp_tokens]
 70 |             ref_words.append(' '.join(tmp_tokens))
 71 |             ref_idxs.append(' '.join([str(wtoi[_]) for _ in tmp_tokens]))
 72 |         refs_words.append(ref_words)
 73 |         refs_idxs.append(ref_idxs)
 74 |         count_vids += 1
 75 |     ngram_words = compute_doc_freq(create_crefs(refs_words))
 76 |     ngram_idxs = compute_doc_freq(create_crefs(refs_idxs))
 77 |     return ngram_words, ngram_idxs, count_vids
 78 | 
 79 | 
 80 | def main(params):
 81 |     vids = json.load(open(params['caption_json']))
 82 |     wtoi = json.load(open(params['info_json']))['word_to_ix']
 83 | 
 84 |     ngram_words, ngram_idxs, ref_len = build_dict(vids, wtoi)
 85 | 
 86 |     pkl.dump({'document_frequency': ngram_words, 'ref_len': ref_len}, open(
 87 |         params['output_pkl']+'-words.p', 'wb'))
 88 |     pkl.dump({'document_frequency': ngram_idxs, 'ref_len': ref_len}, open(
 89 |         params['output_pkl']+'-idxs.p', 'wb'))
 90 | 
 91 | if __name__ == "__main__":
 92 | 
 93 |     parser = argparse.ArgumentParser()
 94 | 
 95 |     # input json
 96 |     parser.add_argument('--caption_json', default='data/caption.json',
 97 |                         help='input json file to containing video captions')
 98 |     parser.add_argument('--info_json', default='data/info.json', help='vocab info json file')
 99 |     parser.add_argument('--output_pkl', default='data/msr-all', help='output pickle file')
100 |     args = parser.parse_args()
101 |     params = vars(args)  # convert to ordinary dict
102 | 
103 |     main(params)
104 | 


--------------------------------------------------------------------------------
/utils/prepro_vocab.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import json
 3 | import argparse
 4 | import numpy as np
 5 | 
 6 | 
 7 | def build_vocab(vids, params):
 8 |     count_thr = params['word_count_threshold']
 9 |     # count up the number of words
10 |     counts = {}
11 |     for vid, caps in vids.items():
12 |         for cap in caps['captions']:
13 |             ws = re.sub(r'[.!,;?]', ' ', cap).split()
14 |             for w in ws:
15 |                 counts[w] = counts.get(w, 0) + 1
16 |     # cw = sorted([(count, w) for w, count in counts.items()], reverse=True)
17 |     total_words = sum(counts.values())
18 |     bad_words = [w for w, n in counts.items() if n <= count_thr]
19 |     vocab = [w for w, n in counts.items() if n > count_thr]
20 |     bad_count = sum(counts[w] for w in bad_words)
21 |     print('number of bad words: %d/%d = %.2f%%' %
22 |           (len(bad_words), len(counts), len(bad_words) * 100.0 / len(counts)))
23 |     print('number of words in vocab would be %d' % (len(vocab), ))
24 |     print('number of UNKs: %d/%d = %.2f%%' %
25 |           (bad_count, total_words, bad_count * 100.0 / total_words))
26 |     # lets now produce the final annotations
27 |     if bad_count > 0:
28 |         # additional special UNK token we will use below to map infrequent words to
29 |         print('inserting the special UNK token')
30 |         vocab.append('<UNK>')
31 |     for vid, caps in vids.items():
32 |         caps = caps['captions']
33 |         vids[vid]['final_captions'] = []
34 |         for cap in caps:
35 |             ws = re.sub(r'[.!,;?]', ' ', cap).split()
36 |             caption = [
37 |                 '<sos>'] + [w if counts.get(w, 0) > count_thr else '<UNK>' for w in ws] + ['<eos>']
38 |             vids[vid]['final_captions'].append(caption)
39 |     return vocab
40 | 
41 | 
42 | def main(params):
43 |     videos = json.load(open(params['input_json'], 'r'))['sentences']
44 |     video_caption = {}
45 |     for i in videos:
46 |         if i['video_id'] not in video_caption.keys():
47 |             video_caption[i['video_id']] = {'captions': []}
48 |         video_caption[i['video_id']]['captions'].append(i['caption'])
49 |     # create the vocab
50 |     vocab = build_vocab(video_caption, params)
51 |     itow = {i + 2: w for i, w in enumerate(vocab)}
52 |     wtoi = {w: i + 2 for i, w in enumerate(vocab)}  # inverse table
53 |     wtoi['<eos>'] = 0
54 |     itow[0] = '<eos>'
55 |     wtoi['<sos>'] = 1
56 |     itow[1] = '<sos>'
57 | 
58 |     out = {}
59 |     out['ix_to_word'] = itow
60 |     out['word_to_ix'] = wtoi
61 |     out['videos'] = {'train': [], 'val': [], 'test': []}
62 |     videos = json.load(open(params['input_json'], 'r'))['videos']
63 |     for i in videos:
64 |         out['videos'][i['split']].append(int(i['id']))
65 |     json.dump(out, open(params['info_json'], 'w'))
66 |     json.dump(video_caption, open(params['caption_json'], 'w'))
67 | 
68 | 
69 | if __name__ == "__main__":
70 |     parser = argparse.ArgumentParser()
71 | 
72 |     # input json
73 |     parser.add_argument('--input_json', type=str, default='data/videodatainfo_2017.json',
74 |                         help='msr_vtt videoinfo json')
75 |     parser.add_argument('--info_json', default='data/info.json',
76 |                         help='info about iw2word and word2ix')
77 |     parser.add_argument('--caption_json', default='data/caption.json', help='caption json file')
78 | 
79 | 
80 |     parser.add_argument('--word_count_threshold', default=1, type=int,
81 |                         help='only words that occur more than this number of times will be put in vocab')
82 | 
83 |     args = parser.parse_args()
84 |     params = vars(args)  # convert to ordinary dict
85 |     main(params)
86 | 


--------------------------------------------------------------------------------
/utils/rouge.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #
  3 | # File Name : rouge.py
  4 | #
  5 | # Description : Computes ROUGE-L metric as described by Lin and Hovey (2004)
  6 | #
  7 | # Creation Date : 2015-01-07 06:03
  8 | # Author : Ramakrishna Vedantam <vrama91@vt.edu>
  9 | 
 10 | import numpy as np
 11 | import pdb
 12 | 
 13 | 
 14 | def my_lcs(string, sub):
 15 |     """
 16 |     Calculates longest common subsequence for a pair of tokenized strings
 17 |     :param string : list of str : tokens from a string split using whitespace
 18 |     :param sub : list of str : shorter string, also split using whitespace
 19 |     :returns: length (list of int): length of the longest common subsequence between the two strings
 20 | 
 21 |     Note: my_lcs only gives length of the longest common subsequence, not the actual LCS
 22 |     """
 23 |     if(len(string)< len(sub)):
 24 |         sub, string = string, sub
 25 | 
 26 |     lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)]
 27 | 
 28 |     for j in range(1,len(sub)+1):
 29 |         for i in range(1,len(string)+1):
 30 |             if(string[i-1] == sub[j-1]):
 31 |                 lengths[i][j] = lengths[i-1][j-1] + 1
 32 |             else:
 33 |                 lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1])
 34 | 
 35 |     return lengths[len(string)][len(sub)]
 36 | 
 37 | 
 38 | class Rouge():
 39 |     '''
 40 |     Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set
 41 | 
 42 |     '''
 43 |     def __init__(self):
 44 |         # vrama91: updated the value below based on discussion with Hovey
 45 |         self.beta = 1.2
 46 | 
 47 |     def calc_score(self, candidate, refs):
 48 |         """
 49 |         Compute ROUGE-L score given one candidate and references for an image
 50 |         :param candidate: str : candidate sentence to be evaluated
 51 |         :param refs: list of str : COCO reference sentences for the particular image to be evaluated
 52 |         :returns score: int (ROUGE-L score for the candidate evaluated against references)
 53 |         """
 54 |         assert(len(candidate)==1)
 55 |         assert(len(refs)>0)
 56 |         prec = []
 57 |         rec = []
 58 | 
 59 |         # split into tokens
 60 |         token_c = candidate[0].split(" ")
 61 | 
 62 |         for reference in refs:
 63 |             # split into tokens
 64 |             token_r = reference.split(" ")
 65 |             # compute the longest common subsequence
 66 |             lcs = my_lcs(token_r, token_c)
 67 |             prec.append(lcs/float(len(token_c)))
 68 |             rec.append(lcs/float(len(token_r)))
 69 | 
 70 |         prec_max = max(prec)
 71 |         rec_max = max(rec)
 72 | 
 73 |         if(prec_max!=0 and rec_max !=0):
 74 |             score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max)
 75 |         else:
 76 |             score = 0.0
 77 |         return score
 78 | 
 79 |     def compute_score(self, gts, res):
 80 |         """
 81 |         Computes Rouge-L score given a set of reference and candidate sentences for the dataset
 82 |         Invoked by evaluate_captions.py
 83 |         :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values
 84 |         :param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values
 85 |         :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images)
 86 |         """
 87 |         assert(gts.keys() == res.keys())
 88 |         imgIds = gts.keys()
 89 | 
 90 |         score = []
 91 |         for id in imgIds:
 92 |             hypo = res[id]
 93 |             ref  = gts[id]
 94 | 
 95 |             score.append(self.calc_score(hypo, ref))
 96 | 
 97 |             # Sanity check.
 98 |             assert(type(hypo) is list)
 99 |             assert(len(hypo) == 1)
100 |             assert(type(ref) is list)
101 |             assert(len(ref) > 0)
102 | 
103 |         average_score = np.mean(np.array(score))
104 |         return average_score, np.array(score)
105 | 
106 |     def method(self):
107 |         return "Rouge"
108 | 


--------------------------------------------------------------------------------
/utils/utils.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import collections
  3 | import torch.nn as nn
  4 | import torch.nn.functional as F
  5 | import model.transformer.Constants as Constants
  6 | 
  7 | # Construct the uni-gram language model
  8 | def unigram(tokens):
  9 |     model = collections.defaultdict(lambda: 0.01)
 10 |     for f in tokens:
 11 |         try:
 12 |             model[f] += 1
 13 |         except KeyError:
 14 |             model [f] = 1
 15 |             continue
 16 |     N = float(sum(model.values()))
 17 |     for word in model:
 18 |         model[word] /= N
 19 |     return model
 20 | 
 21 | 
 22 | # Computes perplexity of the UniGram model on a test-set
 23 | def perplexity(testset, model):
 24 |     testset = testset.split()
 25 |     ppl = 1
 26 |     N = 0
 27 |     for word in testset:
 28 |         N += 1
 29 |         ppl *= 1/model[word]
 30 |     ppl = pow(ppl, 1/float(N))
 31 |     return ppl
 32 | 
 33 | 
 34 | # Mean Pool Out the word2vec features of sentences.
 35 | def mean_pool_vec(sentence, wordmodel):
 36 |     vector = np.zeros(50)
 37 |     vector += np.mean([wordmodel[ele] for ele in sentence.split(' ') if ele in wordmodel.keys()], 0)
 38 |     return vector
 39 | 
 40 | 
 41 | def test_collate_fn(batch):
 42 |     '''
 43 |     :param batch: input batch data
 44 |     :return: aligned features
 45 |     '''
 46 | 
 47 |     return batch
 48 | 
 49 | 
 50 | def pos_emb_generation(visual_feats):
 51 |     '''
 52 |         Generate the position embedding input for Transformers.
 53 |     '''
 54 |     seq = list(range(1, visual_feats.shape[1] + 1))
 55 |     src_pos = torch.tensor([seq] * visual_feats.shape[0]).cuda()
 56 |     return src_pos
 57 | 
 58 | 
 59 | def list_to_sentence(list):
 60 |     sentence = ''
 61 |     for element in list:
 62 |         sentence += ' ' + element
 63 |     return sentence
 64 | 
 65 | 
 66 | class LanguageModelCriterion(nn.Module):
 67 | 
 68 |     def __init__(self):
 69 |         super(LanguageModelCriterion, self).__init__()
 70 |         # self.loss_fn = nn.NLLLoss(reduce=False)
 71 |         self.loss_fn = nn.CrossEntropyLoss()
 72 | 
 73 |     def forward(self, logits, target, mask):
 74 |         """
 75 |         logits: shape of (N, seq_len, vocab_size)
 76 |         target: shape of (N, seq_len)
 77 |         mask: shape of (N, seq_len)
 78 |         """
 79 |         # truncate to the same size
 80 |         batch_size = target.shape[0]
 81 |         target = target[:, :logits.shape[1]]
 82 |         mask = mask[:, :logits.shape[1]]
 83 |         target = target.contiguous().view(-1)
 84 |         mask = mask.contiguous().view(-1)
 85 |         loss = self.loss_fn(logits, target)
 86 |         output = torch.sum(loss * mask) / batch_size
 87 |         return output
 88 | 
 89 | 
 90 | def cal_loss(pred, gold, smoothing):
 91 |     ''' Calculate cross entropy loss, apply label smoothing if needed. '''
 92 | 
 93 |     gold = gold.contiguous().view(-1)
 94 | 
 95 |     if smoothing:
 96 |         eps = 0.1
 97 |         n_class = pred.size(1)
 98 | 
 99 |         one_hot = torch.zeros_like(pred).scatter(1, gold.view(-1, 1), 1)
100 |         one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1)
101 |         log_prb = F.log_softmax(pred, dim=1)
102 | 
103 |         non_pad_mask = gold.ne(Constants.PAD)
104 |         loss = -(one_hot * log_prb).sum(dim=1)
105 |         loss = loss.masked_select(non_pad_mask).sum()  # average later
106 |     else:
107 |         loss = F.cross_entropy(pred, gold, ignore_index=Constants.PAD, reduction='sum')
108 | 
109 |     return loss
110 | 
111 | 
112 | def cal_performance(pred, gold, smoothing=False):
113 |     ''' Apply label smoothing if needed '''
114 | 
115 |     loss = cal_loss(pred, gold, smoothing)
116 | 
117 |     pred = pred.max(1)[1]
118 |     gold = gold.contiguous().view(-1)
119 |     non_pad_mask = gold.ne(Constants.PAD)
120 |     n_correct = pred.eq(gold)
121 |     n_correct = n_correct.masked_select(non_pad_mask).sum().item()
122 |     return loss, n_correct
123 | 
124 | 
125 | def pos_emb_generation(word_labels):
126 |     '''
127 |         Generate the position embedding input for Transformers.
128 |     '''
129 | 
130 |     seq = list(range(1, word_labels.shape[1] + 1))
131 |     tgt_pos = torch.tensor([seq] * word_labels.shape[0]).cuda()
132 |     binary_mask = (word_labels != 0).long()
133 | 
134 |     return tgt_pos*binary_mask
135 | 
136 | 
137 | def show_prediction(seq_probs, labels, vocab, caption=True):
138 |     '''
139 |         :return: predicted words and GT words.
140 |     '''
141 |     # Print out the predicted sentences and GT
142 |     _ = seq_probs.view(labels.shape[0], labels.shape[1], -1)[0]
143 |     pred_idx = torch.argmax(_, 1)
144 |     # print(' \n')
145 |     if caption:
146 |         print('Caption: ')
147 |     else:
148 |         print('CMS: ')
149 | 
150 |     pr = 'Generation: ', list_to_sentence([vocab[str(widx.cpu().numpy())] for widx in pred_idx if widx != 0])
151 |     gt = 'GT: ', list_to_sentence([vocab[str(word.cpu().numpy())] for word in labels[0] if word != 0])
152 |     print(pr)
153 |     print(gt)
154 |     return pr, gt
155 | 
156 | 


--------------------------------------------------------------------------------