├── README.md ├── data ├── model ├── Allinone_Model.py ├── Attention.py ├── Decoder.py ├── DecoderRNN.py ├── EncoderRNN.py ├── Model.py ├── S2VTAttModel.py ├── S2VTModel.py ├── S2VT_EncoderRNN.py ├── TransformerDecoderModel.py ├── TransformerModel.py ├── __init__.py └── transformer │ ├── Beam.py │ ├── Constants.py │ ├── Layers.py │ ├── Modules.py │ ├── Optim.py │ ├── SubLayers.py │ ├── Transformers.py │ ├── Translator.py │ ├── __init__.py │ ├── cap2cms_Translator.py │ ├── cap2cms_Translator_transformer.py │ ├── cap2cms_Translator_transformerDecoder.py │ └── cap_cms_Translator.py ├── opts.py ├── others ├── generation.py ├── test_RNN.py ├── test_attention_Video2text.py ├── test_transformer.py ├── train_RNN.py └── train_transformer.py ├── pictures ├── arch.png └── v2c.png ├── test.py ├── train.py └── utils ├── allinone_dataloader.py ├── cocoeval.py ├── dataloader.py ├── gt_cap_dataloader.py ├── gt_caps_dataloader.py ├── gt_human_cap_dataloader.py ├── prepro_feats.py ├── prepro_ngrams.py ├── prepro_vocab.py ├── rouge.py └── utils.py /data: -------------------------------------------------------------------------------- 1 | /media/drive1/Data/MSR-VTT -------------------------------------------------------------------------------- /model/Allinone_Model.py: -------------------------------------------------------------------------------- 1 | ''' Define the Transformer model ''' 2 | from utils.utils import * 3 | from model.Decoder import Decoder 4 | from model.EncoderRNN import EncoderRNN 5 | 6 | __author__ = 'Jacob Zhiyuan Fang' 7 | 8 | 9 | class Model(nn.Module): 10 | ''' A sequence to sequence model with attention mechanism. ''' 11 | 12 | def __init__( 13 | self, 14 | n_cap_vocab, cap_max_seq, vis_emb=2048, 15 | d_word_vec=512, d_model=512, d_inner=2048, 16 | n_layers=6, n_head=8, d_k=64, d_v=64, dropout=0.1, 17 | tgt_emb_prj_weight_sharing=True): 18 | 19 | super().__init__() 20 | 21 | self.encoder = EncoderRNN(vis_emb, d_model, bidirectional=0) 22 | # self.encoder = nn.Linear(vis_emb, d_model) 23 | 24 | self.decoder = Decoder( 25 | n_tgt_vocab=n_cap_vocab, len_max_seq=cap_max_seq, 26 | d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner, 27 | n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v, 28 | dropout=dropout) 29 | 30 | self.cap_word_prj = nn.Linear(d_model, n_cap_vocab, bias=False) 31 | nn.init.xavier_normal_(self.cap_word_prj.weight) 32 | 33 | assert d_model == d_word_vec, \ 34 | 'To facilitate the residual connections, ' \ 35 | 'the dimensions of all module outputs shall be the same.' 36 | 37 | if tgt_emb_prj_weight_sharing: 38 | # Share the weight matrix between target word embedding & the final logit dense layer 39 | self.cap_word_prj.weight = self.decoder.tgt_word_emb.weight 40 | self.x_logit_scale = (d_model ** -0.5) 41 | else: 42 | self.x_logit_scale = 1. 
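# -- Editorial aside (illustrative sketch, not part of this repository) --
# The weight sharing above works because nn.Embedding(vocab, d_model) and
# nn.Linear(d_model, vocab, bias=False) both store a (vocab, d_model) weight
# matrix, so a single Parameter can serve both roles; the (d_model ** -0.5)
# factor keeps the tied logits on a comparable scale. Names below are made up.
import torch
import torch.nn as nn

vocab, d_model = 1000, 512
emb = nn.Embedding(vocab, d_model, padding_idx=0)
proj = nn.Linear(d_model, vocab, bias=False)
proj.weight = emb.weight                    # the two modules now share one Parameter
hidden = torch.randn(2, 7, d_model)         # (batch, seq_len, d_model)
logits = proj(hidden) * (d_model ** -0.5)   # (batch, seq_len, vocab)
assert proj.weight.data_ptr() == emb.weight.data_ptr()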
43 | 44 | def forward(self, vis_feat, tgt_seq, tgt_pos): 45 | 46 | tgt_seq, tgt_pos = tgt_seq[:, :-1], tgt_pos[:, :-1] 47 | 48 | enc_output, *_ = self.encoder(vis_feat) 49 | dec_output, *_ = self.decoder(tgt_seq, tgt_pos, vis_feat, enc_output) 50 | seq_logit = self.cap_word_prj(dec_output) * self.x_logit_scale 51 | 52 | return seq_logit.view(-1, seq_logit.size(2)) 53 | 54 | -------------------------------------------------------------------------------- /model/Attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class Attention(nn.Module): 7 | """ 8 | Applies an attention mechanism on the output features from the decoder. 9 | """ 10 | 11 | def __init__(self, dim): 12 | super(Attention, self).__init__() 13 | self.dim = dim 14 | self.linear1 = nn.Linear(dim * 2, dim) 15 | self.linear2 = nn.Linear(dim, 1, bias=False) 16 | #self._init_hidden() 17 | 18 | def _init_hidden(self): 19 | nn.init.xavier_normal_(self.linear1.weight) 20 | nn.init.xavier_normal_(self.linear2.weight) 21 | 22 | def forward(self, hidden_state, encoder_outputs): 23 | """ 24 | Arguments: 25 | hidden_state {Variable} -- batch_size x dim 26 | encoder_outputs {Variable} -- batch_size x seq_len x dim 27 | 28 | Returns: 29 | Variable -- context vector of size batch_size x dim 30 | """ 31 | batch_size, seq_len, _ = encoder_outputs.size() 32 | hidden_state = hidden_state.unsqueeze(1).repeat(1, seq_len, 1) 33 | inputs = torch.cat((encoder_outputs, hidden_state), 34 | 2).view(-1, self.dim * 2) 35 | o = self.linear2(F.tanh(self.linear1(inputs))) 36 | e = o.view(batch_size, seq_len) 37 | alpha = F.softmax(e, dim=1) 38 | context = torch.bmm(alpha.unsqueeze(1), encoder_outputs).squeeze(1) 39 | return context 40 | -------------------------------------------------------------------------------- /model/Decoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from .transformer import Constants as Constants 5 | from .transformer.Layers import DecoderLayer 6 | 7 | 8 | def get_non_pad_mask(seq): 9 | assert seq.dim() == 2 10 | return seq.ne(Constants.PAD).type(torch.float).unsqueeze(-1).cuda() 11 | 12 | 13 | def get_attn_key_pad_mask(seq_k, seq_q): 14 | ''' For masking out the padding part of key sequence. ''' 15 | 16 | # Expand to fit the shape of key query attention matrix. 17 | len_q = seq_q.size(1) 18 | padding_mask = seq_k.eq(Constants.PAD) 19 | padding_mask = padding_mask.unsqueeze(1).expand(-1, len_q, -1) # b x lq x lk 20 | 21 | return padding_mask.cuda() 22 | 23 | 24 | def get_subsequent_mask(seq): 25 | ''' For masking out the subsequent info. 
''' 26 | 27 | sz_b, len_s = seq.size() 28 | subsequent_mask = torch.triu( 29 | torch.ones((len_s, len_s), device=seq.device, dtype=torch.uint8), diagonal=1) 30 | subsequent_mask = subsequent_mask.unsqueeze(0).expand(sz_b, -1, -1) # b x ls x ls 31 | 32 | return subsequent_mask.cuda() 33 | 34 | 35 | def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): 36 | ''' Sinusoid position encoding table ''' 37 | 38 | def cal_angle(position, hid_idx): 39 | return position / np.power(10000, 2 * (hid_idx // 2) / d_hid) 40 | 41 | def get_posi_angle_vec(position): 42 | return [cal_angle(position, hid_j) for hid_j in range(d_hid)] 43 | 44 | sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)]) 45 | 46 | sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i 47 | sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 48 | 49 | if padding_idx is not None: 50 | # zero vector for padding dimension 51 | sinusoid_table[padding_idx] = 0. 52 | 53 | return torch.FloatTensor(sinusoid_table) 54 | 55 | 56 | class Decoder(nn.Module): 57 | ''' A decoder model with self attention mechanism. ''' 58 | 59 | def __init__( 60 | self, n_tgt_vocab, len_max_seq, d_word_vec, 61 | n_layers, n_head, d_k, d_v, 62 | d_model, d_inner, dropout=0.1): 63 | 64 | super().__init__() 65 | n_position = len_max_seq + 1 66 | 67 | self.tgt_word_emb = nn.Embedding( 68 | n_tgt_vocab, d_word_vec, padding_idx=Constants.PAD) 69 | 70 | self.position_enc = nn.Embedding.from_pretrained( 71 | get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0), 72 | freeze=True) 73 | 74 | self.layer_stack = nn.ModuleList([ 75 | DecoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout) 76 | for _ in range(n_layers)]) 77 | 78 | def forward(self, tgt_seq, tgt_pos, src_seq, enc_output, return_attns=False): 79 | 80 | dec_slf_attn_list, dec_enc_attn_list = [], [] 81 | 82 | # -- Prepare masks 83 | non_pad_mask = get_non_pad_mask(tgt_seq) 84 | 85 | slf_attn_mask_subseq = get_subsequent_mask(tgt_seq) 86 | slf_attn_mask_keypad = get_attn_key_pad_mask(seq_k=tgt_seq, seq_q=tgt_seq) 87 | slf_attn_mask = (slf_attn_mask_keypad + slf_attn_mask_subseq).gt(0) 88 | 89 | src_tmp = torch.ones(src_seq.shape[0], src_seq.shape[1]).cuda() 90 | dec_enc_attn_mask = get_attn_key_pad_mask(seq_k=src_tmp, seq_q=tgt_seq) 91 | 92 | # -- Forward 93 | dec_output = self.tgt_word_emb(tgt_seq) + self.position_enc(tgt_pos) 94 | 95 | for dec_layer in self.layer_stack: 96 | dec_output, dec_slf_attn, dec_enc_attn = dec_layer( 97 | dec_output, enc_output, 98 | non_pad_mask=non_pad_mask, 99 | slf_attn_mask=slf_attn_mask, 100 | dec_enc_attn_mask=dec_enc_attn_mask) 101 | 102 | if return_attns: 103 | dec_slf_attn_list += [dec_slf_attn] 104 | dec_enc_attn_list += [dec_enc_attn] 105 | 106 | if return_attns: 107 | return dec_output, dec_slf_attn_list, dec_enc_attn_list 108 | return dec_output, 109 | -------------------------------------------------------------------------------- /model/DecoderRNN.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from .Attention import Attention 8 | 9 | 10 | class DecoderRNN(nn.Module): 11 | """ 12 | Provides functionality for decoding in a seq2seq framework, with an option for attention. 
13 | Args: 14 | vocab_size (int): size of the vocabulary 15 | max_len (int): a maximum allowed length for the sequence to be processed 16 | dim_hidden (int): the number of features in the hidden state `h` 17 | n_layers (int, optional): number of recurrent layers (default: 1) 18 | rnn_cell (str, optional): type of RNN cell (default: gru) 19 | bidirectional (bool, optional): if the encoder is bidirectional (default False) 20 | input_dropout_p (float, optional): dropout probability for the input sequence (default: 0) 21 | rnn_dropout_p (float, optional): dropout probability for the output sequence (default: 0) 22 | 23 | """ 24 | 25 | def __init__(self, 26 | vocab_size, 27 | max_len, 28 | dim_hidden, 29 | dim_word, 30 | n_layers=1, 31 | rnn_cell='gru', 32 | bidirectional=False, 33 | input_dropout_p=0.1, 34 | rnn_dropout_p=0.1): 35 | super(DecoderRNN, self).__init__() 36 | 37 | self.bidirectional_encoder = bidirectional 38 | 39 | self.dim_output = vocab_size 40 | self.dim_hidden = dim_hidden * 2 if bidirectional else dim_hidden 41 | self.dim_word = dim_word 42 | self.max_length = max_len 43 | self.sos_id = 1 44 | self.eos_id = 0 45 | self.input_dropout = nn.Dropout(input_dropout_p) 46 | self.embedding = nn.Embedding(self.dim_output, dim_word) 47 | self.attention = Attention(self.dim_hidden) 48 | if rnn_cell.lower() == 'lstm': 49 | self.rnn_cell = nn.LSTM 50 | elif rnn_cell.lower() == 'gru': 51 | self.rnn_cell = nn.GRU 52 | self.rnn = self.rnn_cell( 53 | self.dim_hidden + dim_word, 54 | self.dim_hidden, 55 | n_layers, 56 | batch_first=True, 57 | dropout=rnn_dropout_p) 58 | 59 | self.out = nn.Linear(self.dim_hidden, self.dim_output) 60 | 61 | self._init_weights() 62 | 63 | def forward(self, 64 | encoder_outputs, 65 | encoder_hidden, 66 | targets=None, 67 | mode='train', 68 | opt={}): 69 | """ 70 | 71 | Inputs: inputs, encoder_hidden, encoder_outputs, function, teacher_forcing_ratio 72 | - **encoder_hidden** (num_layers * num_directions, batch_size, dim_hidden): tensor containing the features in the 73 | hidden state `h` of encoder. Used as the initial hidden state of the decoder. (default `None`) 74 | - **encoder_outputs** (batch, seq_len, dim_hidden * num_directions): (default is `None`). 75 | - **targets** (batch, max_length): targets labels of the ground truth sentences 76 | 77 | Outputs: seq_probs, 78 | - **seq_logprobs** (batch_size, max_length, vocab_size): tensors containing the outputs of the decoding function. 
79 | - **seq_preds** (batch_size, max_length): predicted symbols 80 | """ 81 | sample_max = 1 82 | beam_size = 1 83 | temperature = 1.0 84 | 85 | batch_size, _, _ = encoder_outputs.size() 86 | decoder_hidden = self._init_rnn_state(encoder_hidden) 87 | 88 | seq_logprobs = [] 89 | seq_preds = [] 90 | final_encoding = [] 91 | self.rnn.flatten_parameters() 92 | if mode == 'train': 93 | # use targets as rnn inputs 94 | targets_emb = self.embedding(targets) 95 | for i in range(self.max_length - 1): 96 | current_words = targets_emb[:, i, :] 97 | context = self.attention(decoder_hidden.squeeze(0), encoder_outputs) 98 | decoder_input = torch.cat([current_words, context], dim=1) 99 | decoder_input = self.input_dropout(decoder_input).unsqueeze(1) 100 | decoder_output, decoder_hidden = self.rnn( 101 | decoder_input, decoder_hidden) 102 | final_encoding.append(decoder_output) 103 | logprobs = F.log_softmax( 104 | self.out(decoder_output.squeeze(1)), dim=1) 105 | seq_logprobs.append(logprobs.unsqueeze(1)) 106 | final_encoding = torch.cat(final_encoding, 1) 107 | seq_logprobs = torch.cat(seq_logprobs, 1) 108 | 109 | elif mode == 'inference': 110 | if beam_size > 1: 111 | return self.sample_beam(encoder_outputs, decoder_hidden, opt) 112 | 113 | for t in range(self.max_length - 1): 114 | context = self.attention( 115 | decoder_hidden.squeeze(0), encoder_outputs) 116 | 117 | if t == 0: # input 118 | it = torch.LongTensor([self.sos_id] * batch_size).cuda() 119 | elif sample_max: 120 | sampleLogprobs, it = torch.max(logprobs, 1) 121 | seq_logprobs.append(sampleLogprobs.view(-1, 1)) 122 | it = it.view(-1).long() 123 | 124 | else: 125 | # sample according to distribuition 126 | if temperature == 1.0: 127 | prob_prev = torch.exp(logprobs) 128 | else: 129 | # scale logprobs by temperature 130 | prob_prev = torch.exp(torch.div(logprobs, temperature)) 131 | it = torch.multinomial(prob_prev, 1).cuda() 132 | sampleLogprobs = logprobs.gather(1, it) 133 | seq_logprobs.append(sampleLogprobs.view(-1, 1)) 134 | it = it.view(-1).long() 135 | 136 | seq_preds.append(it.view(-1, 1)) 137 | 138 | xt = self.embedding(it) 139 | decoder_input = torch.cat([xt, context], dim=1) 140 | decoder_input = self.input_dropout(decoder_input).unsqueeze(1) 141 | decoder_output, decoder_hidden = self.rnn( 142 | decoder_input, decoder_hidden) 143 | final_encoding.append(decoder_output) 144 | logprobs = F.log_softmax( 145 | self.out(decoder_output.squeeze(1)), dim=1) 146 | 147 | seq_logprobs = torch.cat(seq_logprobs, 1) 148 | seq_preds = torch.cat(seq_preds[1:], 1) 149 | final_encoding = torch.cat(final_encoding, 1) 150 | return seq_logprobs, seq_preds, final_encoding, decoder_hidden 151 | 152 | def _init_weights(self): 153 | """ init the weight of some layers 154 | """ 155 | nn.init.xavier_normal_(self.out.weight) 156 | 157 | def _init_rnn_state(self, encoder_hidden): 158 | """ Initialize the encoder hidden state. """ 159 | if encoder_hidden is None: 160 | return None 161 | if isinstance(encoder_hidden, tuple): 162 | encoder_hidden = tuple( 163 | [self._cat_directions(h) for h in encoder_hidden]) 164 | else: 165 | encoder_hidden = self._cat_directions(encoder_hidden) 166 | return encoder_hidden 167 | 168 | def _cat_directions(self, h): 169 | """ If the encoder is bidirectional, do the following transformation. 
170 | (#directions * #layers, #batch, dim_hidden) -> (#layers, #batch, #directions * dim_hidden) 171 | """ 172 | if self.bidirectional_encoder: 173 | h = torch.cat([h[0:h.size(0):2], h[1:h.size(0):2]], 2) 174 | return h 175 | -------------------------------------------------------------------------------- /model/EncoderRNN.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | class EncoderRNN(nn.Module): 5 | def __init__(self, dim_vid, dim_hidden, input_dropout_p=0.2, rnn_dropout_p=0.5, 6 | n_layers=1, bidirectional=False, rnn_cell='gru'): 7 | """ 8 | 9 | Args: 10 | hidden_dim (int): dim of hidden state of rnn 11 | input_dropout_p (int): dropout probability for the input sequence 12 | dropout_p (float): dropout probability for the output sequence 13 | n_layers (int): number of rnn layers 14 | rnn_cell (str): type of RNN cell ('LSTM'/'GRU') 15 | """ 16 | super(EncoderRNN, self).__init__() 17 | self.dim_vid = dim_vid 18 | self.dim_hidden = dim_hidden 19 | self.input_dropout_p = input_dropout_p 20 | self.rnn_dropout_p = rnn_dropout_p 21 | self.n_layers = n_layers 22 | self.bidirectional = bidirectional 23 | self.rnn_cell = rnn_cell 24 | 25 | self.vid2hid = nn.Linear(dim_vid, dim_hidden) 26 | self.input_dropout = nn.Dropout(input_dropout_p) 27 | 28 | if rnn_cell.lower() == 'lstm': 29 | self.rnn_cell = nn.LSTM 30 | elif rnn_cell.lower() == 'gru': 31 | self.rnn_cell = nn.GRU 32 | 33 | self.rnn = self.rnn_cell(dim_hidden, dim_hidden, n_layers, batch_first=True, 34 | dropout=self.rnn_dropout_p) 35 | 36 | self._init_hidden() 37 | 38 | def _init_hidden(self): 39 | nn.init.xavier_normal_(self.vid2hid.weight) 40 | 41 | def forward(self, vid_feats): 42 | """ 43 | Applies a multi-layer RNN to an input sequence. 44 | Args: 45 | input_var (batch, seq_len): tensor containing the features of the input sequence. 46 | input_lengths (list of int, optional): A list that contains the lengths of sequences 47 | in the mini-batch 48 | Returns: output, hidden 49 | - **output** (batch, seq_len, hidden_size): variable containing the encoded features of the input sequence 50 | - **hidden** (num_layers * num_directions, batch, hidden_size): variable containing the features in the hidden state h 51 | """ 52 | batch_size, seq_len, dim_vid = vid_feats.size() 53 | vid_feats = self.vid2hid(vid_feats.view(-1, dim_vid)) 54 | vid_feats = self.input_dropout(vid_feats) 55 | vid_feats = vid_feats.view(batch_size, seq_len, self.dim_hidden) 56 | self.rnn.flatten_parameters() 57 | state1 = None 58 | output, hidden = self.rnn(vid_feats, state1) 59 | return output, hidden 60 | 61 | -------------------------------------------------------------------------------- /model/Model.py: -------------------------------------------------------------------------------- 1 | ''' Define the Transformer model ''' 2 | from utils.utils import * 3 | from model.Decoder import Decoder 4 | from model.EncoderRNN import EncoderRNN 5 | 6 | __author__ = 'Jacob Zhiyuan Fang' 7 | 8 | 9 | class Model(nn.Module): 10 | ''' A sequence to sequence model with attention mechanism. ''' 11 | 12 | def __init__( 13 | self, 14 | n_cap_vocab, n_cms_vocab, cap_max_seq, cms_max_seq, vis_emb=2048, 15 | d_word_vec=512, d_model=512, d_inner=2048, n_layers=6, rnn_layers=1, 16 | n_head=8, d_k=64, d_v=64, dropout=0.1, tgt_emb_prj_weight_sharing=True): 17 | 18 | super().__init__() 19 | 20 | # set RNN layers at 1 or 2 yield better performance. 
21 | self.encoder = EncoderRNN(vis_emb, d_model, n_layers=rnn_layers, 22 | bidirectional=0) 23 | 24 | self.decoder = Decoder( 25 | n_tgt_vocab=n_cap_vocab, len_max_seq=cap_max_seq, 26 | d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner, 27 | n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v, 28 | dropout=dropout) 29 | 30 | self.cms_decoder = Decoder( 31 | n_tgt_vocab=n_cms_vocab, len_max_seq=cms_max_seq, 32 | d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner, 33 | n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v, 34 | dropout=dropout) 35 | 36 | self.cap_word_prj = nn.Linear(d_model, n_cap_vocab, bias=False) 37 | self.cms_word_prj = nn.Linear(d_model, n_cms_vocab, bias=False) 38 | 39 | nn.init.xavier_normal_(self.cap_word_prj.weight) 40 | nn.init.xavier_normal_(self.cms_word_prj.weight) 41 | 42 | assert d_model == d_word_vec, \ 43 | 'To facilitate the residual connections, ' \ 44 | 'the dimensions of all module outputs shall be the same.' 45 | 46 | if tgt_emb_prj_weight_sharing: 47 | # Share the weight matrix between target word embedding & the final logit dense layer 48 | self.cap_word_prj.weight = self.decoder.tgt_word_emb.weight 49 | self.cms_word_prj.weight = self.cms_decoder.tgt_word_emb.weight 50 | self.x_logit_scale = (d_model ** -0.5) 51 | else: 52 | self.x_logit_scale = 1. 53 | 54 | def forward(self, vis_feat, tgt_seq, tgt_pos, cms_seq, cms_pos): 55 | 56 | tgt_seq, tgt_pos = tgt_seq[:, :-1], tgt_pos[:, :-1] 57 | cms_seq, cms_pos = cms_seq[:, :-1], cms_pos[:, :-1] 58 | 59 | enc_output, *_ = self.encoder(vis_feat) 60 | dec_output, *_ = self.decoder(tgt_seq, tgt_pos, vis_feat, enc_output) 61 | seq_logit = self.cap_word_prj(dec_output) * self.x_logit_scale 62 | 63 | # Concatenate visual and caption encoding 64 | cat_output = torch.cat((enc_output, dec_output), 1) 65 | 66 | cms_dec_output, *_ = self.cms_decoder(cms_seq, cms_pos, cat_output, cat_output) 67 | cms_logit = self.cms_word_prj(cms_dec_output) * self.x_logit_scale 68 | 69 | return seq_logit.view(-1, seq_logit.size(2)), cms_logit.view(-1, cms_logit.size(2)) 70 | 71 | -------------------------------------------------------------------------------- /model/S2VTAttModel.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | 4 | class S2VTAttModel(nn.Module): 5 | def __init__(self, encoder, decoder, cms_decoder): 6 | """ 7 | 8 | Args: 9 | encoder (nn.Module): Encoder rnn 10 | decoder (nn.Module): Decoder rnn 11 | """ 12 | super(S2VTAttModel, self).__init__() 13 | self.encoder = encoder 14 | self.decoder = decoder 15 | self.cms_decoder = cms_decoder 16 | 17 | def forward(self, vid_feats, cap_labels=None, cms_labels=None, mode='train', opt={}): 18 | """ 19 | 20 | Args: 21 | vid_feats (Variable): video feats of shape [batch_size, seq_len, dim_vid] 22 | target_variable (None, optional): ground truth labels 23 | 24 | Returns: 25 | seq_prob: Variable of shape [batch_size, max_len-1, vocab_size] 26 | seq_preds: [] or Variable of shape [batch_size, max_len-1] 27 | """ 28 | encoder_outputs, encoder_hidden = self.encoder(vid_feats) 29 | # seq_prob, _, cap_encoding, cap_hidden = self.decoder(encoder_outputs,encoder_hidden, cap_labels, 'train', opt) 30 | _, seq_prob, cap_encoding, cap_hidden = self.decoder(encoder_outputs, encoder_hidden, 31 | None, 'inference', opt) 32 | 33 | cat_encoding = torch.cat((encoder_outputs, cap_encoding), 1) 34 | if mode == 'test': 35 | _, cms_seq_prob, _, _ = self.cms_decoder(cat_encoding, cap_hidden, targets=None, 
mode='inference', opt=opt) 36 | else: 37 | cms_seq_prob, _, _, _ = self.cms_decoder(cat_encoding, cap_hidden, cms_labels, mode='train', opt=opt) 38 | return seq_prob, cms_seq_prob 39 | -------------------------------------------------------------------------------- /model/S2VTModel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | 6 | 7 | class S2VTModel(nn.Module): 8 | def __init__(self, vocab_size, cms_vocab_size, max_len, cms_max_len, dim_hidden, dim_word, dim_vid=2048, sos_id=2, eos_id=3, 9 | n_layers=1, rnn_cell='gru', rnn_dropout_p=0.2): 10 | super(S2VTModel, self).__init__() 11 | 12 | if rnn_cell.lower() == 'lstm': 13 | self.rnn_cell = nn.LSTM 14 | elif rnn_cell.lower() == 'gru': 15 | self.rnn_cell = nn.GRU 16 | 17 | self.rnn1 = self.rnn_cell(dim_vid, dim_hidden, n_layers, 18 | batch_first=True, dropout=rnn_dropout_p) 19 | 20 | self.rnn2 = self.rnn_cell(dim_hidden + dim_word, dim_hidden, n_layers, 21 | batch_first=True, dropout=rnn_dropout_p) 22 | 23 | self.rnn3 = self.rnn_cell(dim_hidden + dim_word, dim_hidden, n_layers, 24 | batch_first=True, dropout=rnn_dropout_p) 25 | 26 | self.dim_vid = dim_vid 27 | self.dim_output = vocab_size 28 | self.cms_dim_output = cms_vocab_size 29 | self.dim_hidden = dim_hidden 30 | self.dim_word = dim_word 31 | self.max_length = max_len 32 | self.cms_max_length = cms_max_len 33 | self.sos_id = sos_id 34 | self.eos_id = eos_id 35 | self.embedding = nn.Embedding(self.dim_output, self.dim_word) 36 | 37 | self.out = nn.Linear(self.dim_hidden, self.dim_output) 38 | self.cms_out = nn.Linear(self.dim_hidden, self.cms_dim_output) 39 | 40 | def forward(self, vid_feats, target_variable=None, cms_target_variable=None, mode='train', opt={}): 41 | batch_size, n_frames, _ = vid_feats.shape 42 | 43 | padding_words = torch.zeros((batch_size, n_frames, self.dim_word)).cuda() 44 | padding_frames = torch.zeros((batch_size, 1, self.dim_vid)).cuda() 45 | state1 = None 46 | state2 = None 47 | 48 | output1, state1 = self.rnn1(vid_feats, state1) 49 | input2 = torch.cat((output1, padding_words), dim=2) 50 | output2, state2 = self.rnn2(input2, state2) 51 | 52 | seq_probs = [] 53 | seq_preds = [] 54 | cms_seq_probs = [] 55 | cms_seq_preds = [] 56 | if mode == 'train': 57 | for i in range(self.max_length - 1): 58 | # doesn't input to the network 59 | current_words = self.embedding(target_variable[:, i]) 60 | self.rnn1.flatten_parameters() 61 | self.rnn2.flatten_parameters() 62 | output1, state1 = self.rnn1(padding_frames, state1) 63 | input2 = torch.cat( 64 | (output1, current_words.unsqueeze(1)), dim=2) 65 | output2, state2 = self.rnn2(input2, state2) 66 | logits = self.out(output2.squeeze(1)) 67 | logits = F.log_softmax(logits, dim=1) 68 | seq_probs.append(logits.unsqueeze(1)) 69 | seq_probs = torch.cat(seq_probs, 1) 70 | 71 | # CMS decoding training 72 | state3 = state2 73 | for i in range(self.cms_max_length - 1): 74 | # doesn't input to the network 75 | current_words = self.embedding(cms_target_variable[:, i]) 76 | self.rnn3.flatten_parameters() 77 | input3 = torch.cat( 78 | (output2, current_words.unsqueeze(1)), dim=2) 79 | 80 | output3, state3 = self.rnn3(input3, state3) 81 | logits = self.cms_out(output3.squeeze(1)) 82 | logits = F.log_softmax(logits, dim=1) 83 | cms_seq_probs.append(logits.unsqueeze(1)) 84 | cms_seq_probs = torch.cat(cms_seq_probs, 1) 85 | 86 | else: 87 | for i in range(self.max_length - 1): 88 | # 
doesn't input to the network 89 | current_words = self.embedding(target_variable[:, i]) 90 | self.rnn1.flatten_parameters() 91 | self.rnn2.flatten_parameters() 92 | output1, state1 = self.rnn1(padding_frames, state1) 93 | input2 = torch.cat( 94 | (output1, current_words.unsqueeze(1)), dim=2) 95 | output2, state2 = self.rnn2(input2, state2) 96 | logits = self.out(output2.squeeze(1)) 97 | logits = F.log_softmax(logits, dim=1) 98 | seq_probs.append(logits.unsqueeze(1)) 99 | seq_probs = torch.cat(seq_probs, 1) 100 | 101 | state3 = state2 102 | current_words = self.embedding( 103 | Variable(torch.LongTensor([self.sos_id] * batch_size)).cuda()) 104 | for i in range(self.cms_max_length - 1): 105 | # current_words = self.embedding(cms_target_variable[:, i]) 106 | self.rnn3.flatten_parameters() 107 | input3 = torch.cat((output2, current_words.unsqueeze(1)), dim=2) 108 | output3, state3 = self.rnn3(input3, state3) 109 | 110 | logits = self.cms_out(output3.squeeze(1)) 111 | logits = F.log_softmax(logits, dim=1) 112 | cms_seq_probs.append(logits.unsqueeze(1)) 113 | 114 | _, preds = torch.max(logits, 1) 115 | current_words = self.embedding(preds) 116 | cms_seq_preds.append(preds.unsqueeze(1)) 117 | 118 | cms_seq_probs = torch.cat(cms_seq_probs, 1) 119 | cms_seq_preds = torch.cat(cms_seq_preds, 1) 120 | return seq_probs, seq_preds, cms_seq_probs, cms_seq_preds -------------------------------------------------------------------------------- /model/S2VT_EncoderRNN.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | class EncoderRNN(nn.Module): 5 | def __init__(self, dim_vid, dim_hidden, input_dropout_p=0.2, rnn_dropout_p=0.5, 6 | n_layers=1, bidirectional=False, rnn_cell='gru'): 7 | """ 8 | Args: 9 | hidden_dim (int): dim of hidden state of rnn 10 | input_dropout_p (int): dropout probability for the input sequence 11 | dropout_p (float): dropout probability for the output sequence 12 | n_layers (int): number of rnn layers 13 | rnn_cell (str): type of RNN cell ('LSTM'/'GRU') 14 | """ 15 | super(EncoderRNN, self).__init__() 16 | self.dim_vid = dim_vid 17 | self.dim_hidden = dim_hidden 18 | self.input_dropout_p = input_dropout_p 19 | self.rnn_dropout_p = rnn_dropout_p 20 | self.n_layers = n_layers 21 | self.bidirectional = bidirectional 22 | self.rnn_cell = rnn_cell 23 | 24 | self.vid2hid = nn.Linear(dim_vid, dim_hidden) 25 | self.input_dropout = nn.Dropout(input_dropout_p) 26 | 27 | if rnn_cell.lower() == 'lstm': 28 | self.rnn_cell = nn.LSTM 29 | elif rnn_cell.lower() == 'gru': 30 | self.rnn_cell = nn.GRU 31 | 32 | self.rnn = self.rnn_cell(dim_hidden, dim_hidden, n_layers, batch_first=True, 33 | dropout=self.rnn_dropout_p) 34 | 35 | self._init_hidden() 36 | 37 | def _init_hidden(self): 38 | nn.init.xavier_normal_(self.vid2hid.weight) 39 | 40 | def forward(self, vid_feats): 41 | """ 42 | Applies a multi-layer RNN to an input sequence. 43 | Args: 44 | input_var (batch, seq_len): tensor containing the features of the input sequence. 
45 | input_lengths (list of int, optional): A list that contains the lengths of sequences 46 | in the mini-batch 47 | Returns: output, hidden 48 | - **output** (batch, seq_len, hidden_size): variable containing the encoded features of the input sequence 49 | - **hidden** (num_layers * num_directions, batch, hidden_size): variable containing the features in the hidden state h 50 | """ 51 | batch_size, seq_len, dim_vid = vid_feats.size() 52 | vid_feats = self.vid2hid(vid_feats.view(-1, dim_vid)) 53 | vid_feats = self.input_dropout(vid_feats) 54 | vid_feats = vid_feats.view(batch_size, seq_len, self.dim_hidden) 55 | self.rnn.flatten_parameters() 56 | state1 = None 57 | output, hidden = self.rnn(vid_feats, state1) 58 | return output, hidden 59 | -------------------------------------------------------------------------------- /model/TransformerDecoderModel.py: -------------------------------------------------------------------------------- 1 | ''' Define the Transformer model ''' 2 | import numpy as np 3 | from utils.utils import * 4 | from model.Decoder import Decoder 5 | from model.transformer.Layers import EncoderLayer 6 | 7 | __author__ = 'Yu-Hsiang Huang' 8 | __AugmentedBy__ = 'Jacob Zhiyuan Fang' 9 | 10 | 11 | class Model(nn.Module): 12 | ''' A sequence to sequence model with attention mechanism. ''' 13 | 14 | def __init__( 15 | self, 16 | n_cap_vocab, n_cms_vocab, cap_max_seq, cms_max_seq, vis_emb=2048, 17 | d_word_vec=512, d_model=512, d_inner=2048, n_layers=6, rnn_layers=1, 18 | n_head=8, d_k=64, d_v=64, dropout=0.1, tgt_emb_prj_weight_sharing=True): 19 | 20 | super().__init__() 21 | 22 | # set RNN layers at 1 or 2 yield better performance. 23 | self.vis_emb = nn.Linear(vis_emb, d_model) 24 | 25 | self.decoder = Decoder( 26 | n_tgt_vocab=n_cap_vocab, len_max_seq=cap_max_seq, 27 | d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner, 28 | n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v, 29 | dropout=dropout) 30 | 31 | self.cms_decoder = Decoder( 32 | n_tgt_vocab=n_cms_vocab, len_max_seq=cms_max_seq, 33 | d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner, 34 | n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v, 35 | dropout=dropout) 36 | 37 | self.cap_word_prj = nn.Linear(d_model, n_cap_vocab, bias=False) 38 | self.cms_word_prj = nn.Linear(d_model, n_cms_vocab, bias=False) 39 | 40 | nn.init.xavier_normal_(self.cap_word_prj.weight) 41 | nn.init.xavier_normal_(self.cms_word_prj.weight) 42 | 43 | assert d_model == d_word_vec, \ 44 | 'To facilitate the residual connections, ' \ 45 | 'the dimensions of all module outputs shall be the same.' 46 | 47 | if tgt_emb_prj_weight_sharing: 48 | # Share the weight matrix between target word embedding & the final logit dense layer 49 | self.cap_word_prj.weight = self.decoder.tgt_word_emb.weight 50 | self.cms_word_prj.weight = self.cms_decoder.tgt_word_emb.weight 51 | self.x_logit_scale = (d_model ** -0.5) 52 | else: 53 | self.x_logit_scale = 1. 
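# -- Editorial aside (hedged sketch, not part of this repository) --
# The forward() below feeds tgt_seq[:, :-1] to the decoder and returns logits
# flattened to (batch * (len - 1), vocab), so the natural target is the
# right-shifted sequence tgt_seq[:, 1:] with PAD positions ignored. The
# project's actual criterion lives in the training scripts and may differ;
# caption_xent below is a hypothetical helper.
import torch.nn.functional as F

def caption_xent(seq_logit, tgt_seq, pad_id=0):
    # seq_logit: (batch * (len - 1), vocab) as returned by forward()
    # tgt_seq:   (batch, len) integer labels, PAD = pad_id
    gold = tgt_seq[:, 1:].contiguous().view(-1)
    return F.cross_entropy(seq_logit, gold, ignore_index=pad_id)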
54 | 55 | def forward(self, vis_feat, tgt_seq, tgt_pos, cms_seq, cms_pos): 56 | enc_output = self.vis_emb(vis_feat) 57 | tgt_seq, tgt_pos = tgt_seq[:, :-1], tgt_pos[:, :-1] 58 | cms_seq, cms_pos = cms_seq[:, :-1], cms_pos[:, :-1] 59 | 60 | dec_output, *_ = self.decoder(tgt_seq, tgt_pos, vis_feat, enc_output) 61 | seq_logit = self.cap_word_prj(dec_output) * self.x_logit_scale 62 | 63 | # Concatenate visual and caption encoding 64 | cat_output = torch.cat((enc_output, dec_output), 1) 65 | 66 | cms_dec_output, *_ = self.cms_decoder(cms_seq, cms_pos, cat_output, cat_output) 67 | cms_logit = self.cms_word_prj(cms_dec_output) * self.x_logit_scale 68 | 69 | return seq_logit.view(-1, seq_logit.size(2)), cms_logit.view(-1, cms_logit.size(2)) 70 | 71 | -------------------------------------------------------------------------------- /model/TransformerModel.py: -------------------------------------------------------------------------------- 1 | ''' Define the Transformer model ''' 2 | import numpy as np 3 | from utils.utils import * 4 | from model.Decoder import Decoder 5 | from model.transformer.Layers import EncoderLayer 6 | 7 | __author__ = 'Yu-Hsiang Huang' 8 | __AugmentedBy__ = 'Jacob Zhiyuan Fang' 9 | 10 | 11 | def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): 12 | ''' Sinusoid position encoding table ''' 13 | 14 | def cal_angle(position, hid_idx): 15 | return position / np.power(10000, 2 * (hid_idx // 2) / d_hid) 16 | 17 | def get_posi_angle_vec(position): 18 | return [cal_angle(position, hid_j) for hid_j in range(d_hid)] 19 | 20 | sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)]) 21 | 22 | sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i 23 | sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 24 | 25 | if padding_idx is not None: 26 | # zero vector for padding dimension 27 | sinusoid_table[padding_idx] = 0. 28 | 29 | return torch.FloatTensor(sinusoid_table) 30 | 31 | 32 | def get_non_pad_mask(seq): 33 | assert seq.dim() == 2 34 | return seq.ne(Constants.PAD).type(torch.float).unsqueeze(-1).cuda() 35 | 36 | 37 | def get_attn_key_pad_mask(seq_k, seq_q): 38 | ''' For masking out the padding part of key sequence. ''' 39 | 40 | # Expand to fit the shape of key query attention matrix. 41 | len_q = seq_q.size(1) 42 | padding_mask = seq_k.eq(Constants.PAD) 43 | padding_mask = padding_mask.unsqueeze(1).expand(-1, len_q, -1) # b x lq x lk 44 | 45 | return padding_mask.cuda() 46 | 47 | class Encoder(nn.Module): 48 | ''' A encoder model with self attention mechanism. 
''' 49 | 50 | def __init__( 51 | self, 52 | len_max_seq, d_word_vec, 53 | n_layers, n_head, d_k, d_v, 54 | d_model, d_inner, dropout=0.1): 55 | 56 | super().__init__() 57 | 58 | n_position = len_max_seq + 1 59 | 60 | self.position_enc = nn.Embedding.from_pretrained( 61 | get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0), freeze=True) 62 | 63 | self.layer_stack = nn.ModuleList([ 64 | EncoderLayer(d_word_vec, d_inner, n_head, d_k, d_v, dropout=dropout) 65 | for _ in range(n_layers)]) 66 | 67 | def forward(self, src_emb, src_pos, return_attns=False): 68 | 69 | enc_slf_attn_list = [] 70 | 71 | # -- Prepare masks 72 | _ = torch.rand(src_emb.shape[0], src_emb.shape[1]) 73 | slf_attn_mask = get_attn_key_pad_mask(seq_k=_, seq_q=_) 74 | non_pad_mask = get_non_pad_mask(_) 75 | 76 | # -- Forward 77 | enc_output = src_emb + self.position_enc(src_pos) 78 | 79 | for enc_layer in self.layer_stack: 80 | enc_output, enc_slf_attn = enc_layer(enc_output, non_pad_mask=non_pad_mask, 81 | slf_attn_mask=slf_attn_mask) 82 | if return_attns: 83 | enc_slf_attn_list += [enc_slf_attn] 84 | 85 | if return_attns: 86 | return enc_output, enc_slf_attn_list 87 | return enc_output, 88 | 89 | class Model(nn.Module): 90 | ''' A sequence to sequence model with attention mechanism. ''' 91 | 92 | def __init__( 93 | self, 94 | n_cap_vocab, n_cms_vocab, cap_max_seq, cms_max_seq, vis_emb=2048, 95 | d_word_vec=512, d_model=512, d_inner=2048, n_layers=6, rnn_layers=1, 96 | n_head=8, d_k=64, d_v=64, dropout=0.1, tgt_emb_prj_weight_sharing=True): 97 | 98 | super().__init__() 99 | 100 | # set RNN layers at 1 or 2 yield better performance. 101 | self.vis_emb = nn.Linear(vis_emb, d_model) 102 | self.encoder = Encoder(40, d_model, rnn_layers, n_head, d_k, d_v, 103 | d_model, d_inner, dropout=0.1) 104 | 105 | self.decoder = Decoder( 106 | n_tgt_vocab=n_cap_vocab, len_max_seq=cap_max_seq, 107 | d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner, 108 | n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v, 109 | dropout=dropout) 110 | 111 | self.cms_decoder = Decoder( 112 | n_tgt_vocab=n_cms_vocab, len_max_seq=cms_max_seq, 113 | d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner, 114 | n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v, 115 | dropout=dropout) 116 | 117 | self.cap_word_prj = nn.Linear(d_model, n_cap_vocab, bias=False) 118 | self.cms_word_prj = nn.Linear(d_model, n_cms_vocab, bias=False) 119 | 120 | nn.init.xavier_normal_(self.cap_word_prj.weight) 121 | nn.init.xavier_normal_(self.cms_word_prj.weight) 122 | 123 | assert d_model == d_word_vec, \ 124 | 'To facilitate the residual connections, ' \ 125 | 'the dimensions of all module outputs shall be the same.' 126 | 127 | if tgt_emb_prj_weight_sharing: 128 | # Share the weight matrix between target word embedding & the final logit dense layer 129 | self.cap_word_prj.weight = self.decoder.tgt_word_emb.weight 130 | self.cms_word_prj.weight = self.cms_decoder.tgt_word_emb.weight 131 | self.x_logit_scale = (d_model ** -0.5) 132 | else: 133 | self.x_logit_scale = 1. 
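# -- Editorial aside (illustrative sketch, not part of this repository) --
# forward() below expects tgt_pos/cms_pos to hold 1-based positions that are
# zeroed at PAD tokens, so padded steps index row 0 of the frozen sinusoid
# table, which is the all-zero padding vector. make_positions is a hypothetical
# helper mirroring pos_emb_generation in model/transformer/Translator.py.
import torch

def make_positions(labels, pad_id=0):
    pos = torch.arange(1, labels.size(1) + 1, device=labels.device)
    pos = pos.unsqueeze(0).expand(labels.size(0), -1)
    return pos * labels.ne(pad_id).long()

labels = torch.tensor([[2, 45, 9, 3, 0, 0]])  # <bos> w1 w2 <eos> <pad> <pad>
print(make_positions(labels))                 # tensor([[1, 2, 3, 4, 0, 0]])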
134 | 135 | def forward(self, vis_feat, tgt_seq, tgt_pos, cms_seq, cms_pos): 136 | vis_feat = self.vis_emb(vis_feat) 137 | tgt_seq, tgt_pos = tgt_seq[:, :-1], tgt_pos[:, :-1] 138 | cms_seq, cms_pos = cms_seq[:, :-1], cms_pos[:, :-1] 139 | 140 | vis_pos = torch.tensor(list(range(0, 40))).cuda().unsqueeze(0).repeat(vis_feat.shape[0], 1) 141 | enc_output, *_ = self.encoder(vis_feat, vis_pos) 142 | dec_output, *_ = self.decoder(tgt_seq, tgt_pos, vis_feat, enc_output) 143 | seq_logit = self.cap_word_prj(dec_output) * self.x_logit_scale 144 | 145 | # Concatenate visual and caption encoding 146 | cat_output = torch.cat((enc_output, dec_output), 1) 147 | 148 | cms_dec_output, *_ = self.cms_decoder(cms_seq, cms_pos, cat_output, cat_output) 149 | cms_logit = self.cms_word_prj(cms_dec_output) * self.x_logit_scale 150 | 151 | return seq_logit.view(-1, seq_logit.size(2)), cms_logit.view(-1, cms_logit.size(2)) 152 | 153 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jacobswan1/Video2Commonsense/4dcef76360a29702fd90b7030a39a123da6db19e/model/__init__.py -------------------------------------------------------------------------------- /model/transformer/Beam.py: -------------------------------------------------------------------------------- 1 | """ Manage beam search info structure. 2 | 3 | Heavily borrowed from OpenNMT-py. 4 | For code in OpenNMT-py, please check the following link: 5 | https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/Beam.py 6 | """ 7 | 8 | import torch 9 | from ..transformer import Constants as Constants 10 | # import Constants as Constants 11 | 12 | 13 | class Beam(): 14 | ''' Beam search ''' 15 | 16 | def __init__(self, size, device=False): 17 | 18 | self.size = size 19 | self._done = False 20 | 21 | # The score for each translation on the beam. 22 | self.scores = torch.zeros((size,), dtype=torch.float, device=device) 23 | self.all_scores = [] 24 | 25 | # The backpointers at each time-step. 26 | self.prev_ks = [] 27 | 28 | # The outputs at each time-step. 29 | self.next_ys = [torch.full((size,), Constants.PAD, dtype=torch.long, device=device)] 30 | self.next_ys[0][0] = Constants.BOS 31 | 32 | def get_current_state(self): 33 | "Get the outputs for the current timestep." 34 | return self.get_tentative_hypothesis() 35 | 36 | def get_current_origin(self): 37 | "Get the backpointers for the current timestep." 38 | return self.prev_ks[-1] 39 | 40 | @property 41 | def done(self): 42 | return self._done 43 | 44 | def advance(self, word_prob): 45 | "Update beam status and check if finished or not." 46 | num_words = word_prob.size(1) 47 | 48 | # Sum the previous scores. 
49 | if len(self.prev_ks) > 0: 50 | beam_lk = word_prob + self.scores.unsqueeze(1).expand_as(word_prob) 51 | else: 52 | beam_lk = word_prob[0] 53 | 54 | flat_beam_lk = beam_lk.view(-1) 55 | 56 | best_scores, best_scores_id = flat_beam_lk.topk(self.size, 0, True, True) # 1st sort 57 | best_scores, best_scores_id = flat_beam_lk.topk(self.size, 0, True, True) # 2nd sort 58 | 59 | self.all_scores.append(self.scores) 60 | self.scores = best_scores 61 | 62 | # bestScoresId is flattened as a (beam x word) array, 63 | # so we need to calculate which word and beam each score came from 64 | prev_k = best_scores_id / num_words 65 | self.prev_ks.append(prev_k) 66 | self.next_ys.append(best_scores_id - prev_k * num_words) 67 | 68 | # End condition is when top-of-beam is EOS. 69 | if self.next_ys[-1][0].item() == Constants.EOS: 70 | self._done = True 71 | self.all_scores.append(self.scores) 72 | 73 | return self._done 74 | 75 | def sort_scores(self): 76 | "Sort the scores." 77 | return torch.sort(self.scores, 0, True) 78 | 79 | def get_the_best_score_and_idx(self): 80 | "Get the score of the best in the beam." 81 | scores, ids = self.sort_scores() 82 | return scores[1], ids[1] 83 | 84 | def get_tentative_hypothesis(self): 85 | "Get the decoded sequence for the current timestep." 86 | 87 | if len(self.next_ys) == 1: 88 | dec_seq = self.next_ys[0].unsqueeze(1) 89 | else: 90 | _, keys = self.sort_scores() 91 | hyps = [self.get_hypothesis(k) for k in keys] 92 | hyps = [[Constants.BOS] + h for h in hyps] 93 | dec_seq = torch.LongTensor(hyps) 94 | 95 | return dec_seq 96 | 97 | def get_hypothesis(self, k): 98 | """ Walk back to construct the full hypothesis. """ 99 | hyp = [] 100 | for j in range(len(self.prev_ks) - 1, -1, -1): 101 | hyp.append(self.next_ys[j+1][k]) 102 | k = self.prev_ks[j][k] 103 | 104 | return list(map(lambda x: x.item(), hyp[::-1])) 105 | -------------------------------------------------------------------------------- /model/transformer/Constants.py: -------------------------------------------------------------------------------- 1 | 2 | PAD = 0 3 | UNK = 1 4 | BOS = 2 5 | EOS = 3 6 | SEP = 4 7 | 8 | CAP_PAD = 0 9 | CAP_UNK = 1 10 | CAP_BOS = 2 11 | CAP_EOS = 3 12 | 13 | # PAD_WORD = '' 14 | # UNK_WORD = '' 15 | # BOS_WORD = '' 16 | # EOS_WORD = '' 17 | -------------------------------------------------------------------------------- /model/transformer/Layers.py: -------------------------------------------------------------------------------- 1 | ''' Define the Layers ''' 2 | import torch.nn as nn 3 | from model.transformer.SubLayers import MultiHeadAttention, PositionwiseFeedForward 4 | 5 | __author__ = 'Yu-Hsiang Huang' 6 | __RevisedBy__ = 'Jacob Zhiyuan Fang' 7 | 8 | 9 | class EncoderLayer(nn.Module): 10 | ''' Compose with two layers ''' 11 | 12 | def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1): 13 | super(EncoderLayer, self).__init__() 14 | self.slf_attn = MultiHeadAttention( 15 | n_head, d_model, d_k, d_v, dropout=dropout) 16 | self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout) 17 | 18 | def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None): 19 | enc_output, enc_slf_attn = self.slf_attn( 20 | enc_input, enc_input, enc_input, mask=slf_attn_mask) 21 | enc_output *= non_pad_mask 22 | 23 | enc_output = self.pos_ffn(enc_output) 24 | enc_output *= non_pad_mask 25 | 26 | return enc_output, enc_slf_attn 27 | 28 | 29 | class DecoderLayer(nn.Module): 30 | ''' Compose with three layers ''' 31 | 32 | def __init__(self, d_model, d_inner, 
n_head, d_k, d_v, dropout=0.1): 33 | super(DecoderLayer, self).__init__() 34 | self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout) 35 | self.enc_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout) 36 | self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout) 37 | 38 | def forward(self, dec_input, enc_output, non_pad_mask=None, slf_attn_mask=None, dec_enc_attn_mask=None): 39 | dec_output, dec_slf_attn = self.slf_attn( 40 | dec_input, dec_input, dec_input, mask=slf_attn_mask) 41 | dec_output *= non_pad_mask 42 | 43 | dec_output, dec_enc_attn = self.enc_attn( 44 | dec_output, enc_output, enc_output, mask=dec_enc_attn_mask) 45 | dec_output *= non_pad_mask 46 | 47 | dec_output = self.pos_ffn(dec_output) 48 | dec_output *= non_pad_mask 49 | 50 | return dec_output, dec_slf_attn, dec_enc_attn 51 | -------------------------------------------------------------------------------- /model/transformer/Modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | __author__ = "Yu-Hsiang Huang" 6 | 7 | class ScaledDotProductAttention(nn.Module): 8 | ''' Scaled Dot-Product Attention ''' 9 | 10 | def __init__(self, temperature, attn_dropout=0.1): 11 | super().__init__() 12 | self.temperature = temperature 13 | self.dropout = nn.Dropout(attn_dropout) 14 | self.softmax = nn.Softmax(dim=2) 15 | 16 | def forward(self, q, k, v, mask=None): 17 | 18 | attn = torch.bmm(q, k.transpose(1, 2)) 19 | attn = attn / self.temperature 20 | 21 | if mask is not None: 22 | attn = attn.masked_fill(mask, -np.inf) 23 | 24 | attn = self.softmax(attn) 25 | attn = self.dropout(attn) 26 | output = torch.bmm(attn, v) 27 | 28 | return output, attn 29 | -------------------------------------------------------------------------------- /model/transformer/Optim.py: -------------------------------------------------------------------------------- 1 | '''A wrapper class for optimizer ''' 2 | import numpy as np 3 | 4 | 5 | class ScheduledOptim(): 6 | '''A simple wrapper class for learning rate scheduling''' 7 | 8 | def __init__(self, optimizer, d_model, n_warmup_steps): 9 | self._optimizer = optimizer 10 | self.n_warmup_steps = n_warmup_steps 11 | self.n_current_steps = 0 12 | self.init_lr = np.power(d_model, -0.5) 13 | 14 | def step_and_update_lr(self): 15 | "Step with the inner optimizer" 16 | self._update_learning_rate() 17 | self._optimizer.step() 18 | 19 | def zero_grad(self): 20 | "Zero out the gradients by the inner optimizer" 21 | self._optimizer.zero_grad() 22 | 23 | def _get_lr_scale(self): 24 | return np.min([ 25 | np.power(self.n_current_steps, -0.8), 26 | np.power(self.n_warmup_steps, -1.8) * self.n_current_steps]) 27 | 28 | def _update_learning_rate(self): 29 | ''' Learning rate scheduling per step ''' 30 | 31 | self.n_current_steps += 1 32 | lr = self.init_lr * self._get_lr_scale() 33 | 34 | for param_group in self._optimizer.param_groups: 35 | param_group['lr'] = lr 36 | 37 | -------------------------------------------------------------------------------- /model/transformer/SubLayers.py: -------------------------------------------------------------------------------- 1 | ''' Define the sublayers in encoder/decoder layer ''' 2 | import numpy as np 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from model.transformer.Modules import ScaledDotProductAttention 6 | 7 | __author__ = "Yu-Hsiang Huang" 8 | __RevisedBy__ = 'Jacob Zhiyuan Fang' 9 | 10 | 11 
| class MultiHeadAttention(nn.Module): 12 | ''' Multi-Head Attention module ''' 13 | 14 | def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1): 15 | super().__init__() 16 | 17 | self.n_head = n_head 18 | self.d_k = d_k 19 | self.d_v = d_v 20 | 21 | self.w_qs = nn.Linear(d_model, n_head * d_k) 22 | self.w_ks = nn.Linear(d_model, n_head * d_k) 23 | self.w_vs = nn.Linear(d_model, n_head * d_v) 24 | nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k))) 25 | nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k))) 26 | nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v))) 27 | 28 | self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5)) 29 | self.layer_norm = nn.LayerNorm(d_model) 30 | 31 | self.fc = nn.Linear(n_head * d_v, d_model) 32 | nn.init.xavier_normal_(self.fc.weight) 33 | self.dropout = nn.Dropout(dropout) 34 | 35 | def forward(self, q, k, v, mask=None): 36 | d_k, d_v, n_head = self.d_k, self.d_v, self.n_head 37 | 38 | sz_b, len_q, _ = q.size() 39 | sz_b, len_k, _ = k.size() 40 | sz_b, len_v, _ = v.size() 41 | 42 | residual = q 43 | 44 | q = self.w_qs(q).view(sz_b, len_q, n_head, d_k) 45 | k = self.w_ks(k).view(sz_b, len_k, n_head, d_k) 46 | v = self.w_vs(v).view(sz_b, len_v, n_head, d_v) 47 | 48 | q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) # (n*b) x lq x dk 49 | k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k) # (n*b) x lk x dk 50 | v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v) # (n*b) x lv x dv 51 | 52 | if mask is not None: 53 | mask = mask.repeat(n_head, 1, 1) # 54 | 55 | output, attn = self.attention(q, k, v, mask=mask) 56 | 57 | output = output.view(n_head, sz_b, len_q, d_v) 58 | output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1) # b x lq x (n*dv) 59 | 60 | output = self.dropout(self.fc(output)) 61 | output = self.layer_norm(output + residual) 62 | 63 | return output, attn 64 | 65 | 66 | class PositionwiseFeedForward(nn.Module): 67 | ''' A two-feed-forward-layer module ''' 68 | 69 | def __init__(self, d_in, d_hid, dropout=0.1): 70 | super().__init__() 71 | self.w_1 = nn.Conv1d(d_in, d_hid, 1) # position-wise 72 | self.w_2 = nn.Conv1d(d_hid, d_in, 1) # position-wise 73 | self.layer_norm = nn.LayerNorm(d_in) 74 | self.dropout = nn.Dropout(dropout) 75 | 76 | def forward(self, x): 77 | residual = x 78 | output = x.transpose(1, 2) 79 | output = self.w_2(F.relu(self.w_1(output))) 80 | output = output.transpose(1, 2) 81 | output = self.dropout(output) 82 | output = self.layer_norm(output + residual) 83 | return output 84 | -------------------------------------------------------------------------------- /model/transformer/Transformers.py: -------------------------------------------------------------------------------- 1 | ''' Define the Transformer model ''' 2 | import torch 3 | import numpy as np 4 | import torch.nn as nn 5 | import model.transformer.Constants as Constants 6 | from model.transformer.Layers import EncoderLayer, DecoderLayer 7 | 8 | __author__ = 'Yu-Hsiang Huang' 9 | __AugmentedBy__ = 'Jacob Zhiyuan Fang' 10 | 11 | 12 | def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None): 13 | ''' Sinusoid position encoding table ''' 14 | 15 | def cal_angle(position, hid_idx): 16 | return position / np.power(10000, 2 * (hid_idx // 2) / d_hid) 17 | 18 | def get_posi_angle_vec(position): 19 | return [cal_angle(position, hid_j) for hid_j in range(d_hid)] 20 | 21 | sinusoid_table = np.array([get_posi_angle_vec(pos_i) for 
pos_i in range(n_position)]) 22 | 23 | sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i 24 | sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 25 | 26 | if padding_idx is not None: 27 | # zero vector for padding dimension 28 | sinusoid_table[padding_idx] = 0. 29 | 30 | return torch.FloatTensor(sinusoid_table) 31 | 32 | 33 | def get_non_pad_mask(seq): 34 | assert seq.dim() == 2 35 | return seq.ne(Constants.PAD).type(torch.float).unsqueeze(-1).cuda() 36 | 37 | 38 | def get_attn_key_pad_mask(seq_k, seq_q): 39 | ''' For masking out the padding part of key sequence. ''' 40 | 41 | # Expand to fit the shape of key query attention matrix. 42 | len_q = seq_q.size(1) 43 | padding_mask = seq_k.eq(Constants.PAD) 44 | padding_mask = padding_mask.unsqueeze(1).expand(-1, len_q, -1) # b x lq x lk 45 | 46 | return padding_mask.cuda() 47 | 48 | 49 | def get_subsequent_mask(seq): 50 | ''' For masking out the subsequent info. ''' 51 | 52 | sz_b, len_s = seq.size() 53 | subsequent_mask = torch.triu( 54 | torch.ones((len_s, len_s), device=seq.device, dtype=torch.uint8), diagonal=1) 55 | subsequent_mask = subsequent_mask.unsqueeze(0).expand(sz_b, -1, -1) # b x ls x ls 56 | 57 | return subsequent_mask.cuda() 58 | 59 | 60 | class Encoder(nn.Module): 61 | ''' A encoder model with self attention mechanism. ''' 62 | 63 | def __init__( 64 | self, 65 | len_max_seq, d_word_vec, 66 | n_layers, n_head, d_k, d_v, 67 | d_model, d_inner, dropout=0.1): 68 | 69 | super().__init__() 70 | 71 | n_position = len_max_seq + 1 72 | 73 | self.position_enc = nn.Embedding.from_pretrained( 74 | get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0), freeze=True) 75 | 76 | self.layer_stack = nn.ModuleList([ 77 | EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout) 78 | for _ in range(n_layers)]) 79 | 80 | def forward(self, src_emb, src_pos, return_attns=False): 81 | 82 | enc_slf_attn_list = [] 83 | 84 | # -- Prepare masks 85 | _ = torch.rand(src_emb.shape[0], src_emb.shape[1]) 86 | slf_attn_mask = get_attn_key_pad_mask(seq_k=_, seq_q=_) 87 | non_pad_mask = get_non_pad_mask(_) 88 | 89 | # -- Forward 90 | enc_output = src_emb + self.position_enc(src_pos) 91 | 92 | for enc_layer in self.layer_stack: 93 | enc_output, enc_slf_attn = enc_layer(enc_output, non_pad_mask=non_pad_mask, slf_attn_mask=slf_attn_mask) 94 | if return_attns: 95 | enc_slf_attn_list += [enc_slf_attn] 96 | 97 | if return_attns: 98 | return enc_output, enc_slf_attn_list 99 | return enc_output, 100 | 101 | 102 | class Decoder(nn.Module): 103 | ''' A decoder model with self attention mechanism. 
''' 104 | 105 | def __init__( 106 | self, n_tgt_vocab, len_max_seq, d_word_vec, 107 | n_layers, n_head, d_k, d_v, 108 | d_model, d_inner, dropout=0.1): 109 | 110 | super().__init__() 111 | n_position = len_max_seq + 1 112 | 113 | self.tgt_word_emb = nn.Embedding( 114 | n_tgt_vocab, d_word_vec, padding_idx=Constants.PAD) 115 | 116 | self.position_enc = nn.Embedding.from_pretrained( 117 | get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0), 118 | freeze=True) 119 | 120 | self.layer_stack = nn.ModuleList([ 121 | DecoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout) 122 | for _ in range(n_layers)]) 123 | 124 | def forward(self, tgt_seq, tgt_pos, src_seq, enc_output, return_attns=False): 125 | 126 | dec_slf_attn_list, dec_enc_attn_list = [], [] 127 | 128 | # -- Prepare masks 129 | non_pad_mask = get_non_pad_mask(tgt_seq) 130 | 131 | slf_attn_mask_subseq = get_subsequent_mask(tgt_seq) 132 | slf_attn_mask_keypad = get_attn_key_pad_mask(seq_k=tgt_seq, seq_q=tgt_seq) 133 | slf_attn_mask = (slf_attn_mask_keypad + slf_attn_mask_subseq).gt(0) 134 | 135 | src_tmp = torch.ones(src_seq.shape[0], src_seq.shape[1]).cuda() 136 | dec_enc_attn_mask = get_attn_key_pad_mask(seq_k=src_tmp, seq_q=tgt_seq) 137 | 138 | # -- Forward 139 | dec_output = self.tgt_word_emb(tgt_seq) + self.position_enc(tgt_pos) 140 | 141 | for dec_layer in self.layer_stack: 142 | dec_output, dec_slf_attn, dec_enc_attn = dec_layer( 143 | dec_output, enc_output, 144 | non_pad_mask=non_pad_mask, 145 | slf_attn_mask=slf_attn_mask, 146 | dec_enc_attn_mask=dec_enc_attn_mask) 147 | 148 | if return_attns: 149 | dec_slf_attn_list += [dec_slf_attn] 150 | dec_enc_attn_list += [dec_enc_attn] 151 | 152 | if return_attns: 153 | return dec_output, dec_slf_attn_list, dec_enc_attn_list 154 | return dec_output, 155 | 156 | 157 | class Transformer(nn.Module): 158 | ''' A sequence to sequence model with attention mechanism. ''' 159 | 160 | def __init__( 161 | self, 162 | n_tgt_vocab, len_max_seq, vis_emb=2048, 163 | d_word_vec=512, d_model=512, d_inner=2048, 164 | n_layers=6, n_head=8, d_k=64, d_v=64, dropout=0.1, 165 | tgt_emb_prj_weight_sharing=True): 166 | 167 | super().__init__() 168 | 169 | self.vis_emb = nn.Linear(vis_emb, d_model) 170 | 171 | self.encoder = Encoder( 172 | len_max_seq=40, 173 | d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner, 174 | n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v, 175 | dropout=dropout) 176 | 177 | self.decoder = Decoder( 178 | n_tgt_vocab=n_tgt_vocab, len_max_seq=len_max_seq, 179 | d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner, 180 | n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v, 181 | dropout=dropout) 182 | 183 | self.tgt_word_prj = nn.Linear(d_model, n_tgt_vocab, bias=False) 184 | nn.init.xavier_normal_(self.tgt_word_prj.weight) 185 | 186 | assert d_model == d_word_vec, \ 187 | 'To facilitate the residual connections, ' \ 188 | 'the dimensions of all module outputs shall be the same.' 189 | 190 | if tgt_emb_prj_weight_sharing: 191 | # Share the weight matrix between target word embedding & the final logit dense layer 192 | self.tgt_word_prj.weight = self.decoder.tgt_word_emb.weight 193 | self.x_logit_scale = (d_model ** -0.5) 194 | else: 195 | self.x_logit_scale = 1. 
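# -- Editorial aside (quick standalone check, not part of this repository) --
# A sanity check of get_sinusoid_encoding_table defined earlier in this file
# (the import path assumes the package layout shown in the tree above): row 0
# is the all-zero padding vector, and even/odd columns hold sin/cos values at
# geometrically spaced frequencies, so no positional parameters are learned.
from model.transformer.Transformers import get_sinusoid_encoding_table

table = get_sinusoid_encoding_table(n_position=41, d_hid=512, padding_idx=0)
print(table.shape)           # torch.Size([41, 512])
print(table[0].abs().sum())  # tensor(0.)  -> padding row is all zeros
print(table[1, 0].item())    # sin(1 / 10000**0) = sin(1) ~= 0.8415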
196 | 197 | def forward(self, src_emb, src_pos, tgt_seq, tgt_pos): 198 | 199 | src_emb = self.vis_emb(src_emb) 200 | tgt_seq, tgt_pos = tgt_seq[:, :-1], tgt_pos[:, :-1] 201 | 202 | enc_output, *_ = self.encoder(src_emb, src_pos) 203 | dec_output, *_ = self.decoder(tgt_seq, tgt_pos, src_emb, enc_output) 204 | seq_logit = self.tgt_word_prj(dec_output) * self.x_logit_scale 205 | 206 | return seq_logit.view(-1, seq_logit.size(2)) 207 | 208 | -------------------------------------------------------------------------------- /model/transformer/Translator.py: -------------------------------------------------------------------------------- 1 | ''' This module will handle the text generation with beam search. ''' 2 | import numpy as np 3 | from utils.utils import * 4 | import torch.nn.functional as F 5 | from model.transformer.Beam import Beam 6 | 7 | __author__ = 'Jacob Zhiyuan Fang' 8 | 9 | 10 | def pos_emb_generation(word_labels): 11 | ''' 12 | Generate the position embedding input for Transformers. 13 | ''' 14 | 15 | seq = list(range(1, word_labels.shape[1] + 1)) 16 | tgt_pos = torch.tensor([seq] * word_labels.shape[0]).cuda() 17 | binary_mask = (word_labels != 0).long() 18 | 19 | return tgt_pos*binary_mask 20 | 21 | 22 | def translate_batch(model, src_emb, opt): 23 | ''' Translation work in one batch ''' 24 | 25 | def get_inst_idx_to_tensor_position_map(inst_idx_list): 26 | ''' Indicate the position of an instance in a tensor. ''' 27 | return {inst_idx: tensor_position for tensor_position, inst_idx in enumerate(inst_idx_list)} 28 | 29 | def collect_active_part(beamed_tensor, curr_active_inst_idx, n_prev_active_inst, n_bm): 30 | ''' Collect tensor parts associated to active instances. ''' 31 | 32 | _, *d_hs = beamed_tensor.size() 33 | n_curr_active_inst = len(curr_active_inst_idx) 34 | new_shape = (n_curr_active_inst * n_bm, *d_hs) 35 | 36 | beamed_tensor = beamed_tensor.view(n_prev_active_inst, -1) 37 | beamed_tensor = beamed_tensor.index_select(0, curr_active_inst_idx) 38 | beamed_tensor = beamed_tensor.view(*new_shape) 39 | 40 | return beamed_tensor 41 | 42 | def collate_active_info(src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list): 43 | # Sentences which are still active are collected, 44 | # so the decoder will not run on completed sentences. 
45 | n_prev_active_inst = len(inst_idx_to_position_map) 46 | active_inst_idx = [inst_idx_to_position_map[k] for k in active_inst_idx_list] 47 | active_inst_idx = torch.LongTensor(active_inst_idx).cuda() 48 | 49 | active_src_seq = collect_active_part(src_seq, active_inst_idx, n_prev_active_inst, n_bm) 50 | active_src_enc = collect_active_part(src_enc, active_inst_idx, n_prev_active_inst, n_bm) 51 | active_inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list) 52 | 53 | return active_src_seq, active_src_enc, active_inst_idx_to_position_map 54 | 55 | def beam_decode_step(inst_dec_beams, len_dec_seq, src_seq, enc_output, inst_idx_to_position_map, n_bm, mode): 56 | ''' Decode and update beam status, and then return active beam idx ''' 57 | 58 | def prepare_beam_dec_seq(inst_dec_beams, len_dec_seq): 59 | dec_partial_seq = [b.get_current_state() for b in inst_dec_beams if not b.done] 60 | dec_partial_seq = torch.stack(dec_partial_seq).cuda() 61 | dec_partial_seq = dec_partial_seq.view(-1, len_dec_seq) 62 | return dec_partial_seq 63 | 64 | def prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm): 65 | dec_partial_pos = torch.arange(1, len_dec_seq + 1, dtype=torch.long).cuda() 66 | dec_partial_pos = dec_partial_pos.unsqueeze(0).repeat(n_active_inst * n_bm, 1) 67 | return dec_partial_pos 68 | 69 | def predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm): 70 | if mode == 'cap': 71 | dec_output, dec_slf_attn_list, dec_enc_attn_list = model.decoder\ 72 | (dec_seq, dec_pos, src_seq, enc_output, return_attns=True) 73 | # print(dec_enc_attn_list[-1][0]) 74 | dec_output = dec_output[:, -1, :] # Pick the last step: (bh * bm) * d_h 75 | word_prob = F.log_softmax(model.cap_word_prj(dec_output), dim=1) 76 | word_prob = word_prob.view(n_active_inst, n_bm, -1) 77 | 78 | elif mode == 'int': 79 | dec_output, dec_slf_attn_list, dec_enc_attn_list = model.cms_decoder\ 80 | (dec_seq, dec_pos, src_seq, enc_output, return_attns=True) 81 | # print(dec_enc_attn_list[-1][0]) 82 | dec_output = dec_output[:, -1, :] # Pick the last step: (bh * bm) * d_h 83 | word_prob = F.log_softmax(model.cms_word_prj(dec_output), dim=1) 84 | word_prob = word_prob.view(n_active_inst, n_bm, -1) 85 | return word_prob 86 | 87 | def collect_active_inst_idx_list(inst_beams, word_prob, inst_idx_to_position_map): 88 | active_inst_idx_list = [] 89 | for inst_idx, inst_position in inst_idx_to_position_map.items(): 90 | is_inst_complete = inst_beams[inst_idx].advance(word_prob[inst_position]) 91 | if not is_inst_complete: 92 | active_inst_idx_list += [inst_idx] 93 | 94 | return active_inst_idx_list 95 | 96 | n_active_inst = len(inst_idx_to_position_map) 97 | 98 | dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq) 99 | dec_pos = prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm) 100 | word_prob = predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm) 101 | 102 | # Update the beam with predicted word prob information and collect incomplete instances 103 | active_inst_idx_list = collect_active_inst_idx_list(inst_dec_beams, word_prob, inst_idx_to_position_map) 104 | 105 | return active_inst_idx_list 106 | 107 | def collect_hypothesis_and_scores(inst_dec_beams, n_best): 108 | all_hyp, all_scores = [], [] 109 | for inst_idx in range(len(inst_dec_beams)): 110 | scores, tail_idxs = inst_dec_beams[inst_idx].sort_scores() 111 | all_scores += [scores[:n_best]] 112 | 113 | hyps = [inst_dec_beams[inst_idx].get_hypothesis(i) for i in tail_idxs[:n_best]] 114 | all_hyp += [hyps] 115 | 
return all_hyp, all_scores 116 | 117 | with torch.no_grad(): 118 | # Encode 119 | src_seq = src_emb.cuda() 120 | src_enc, *_ = model.encoder(src_seq) 121 | video_encoding = src_enc 122 | 123 | # Repeat data for beam search 124 | n_bm = 1 125 | n_inst, len_s, d_h = src_enc.size() 126 | src_enc = src_enc.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, d_h) 127 | src_seq = src_seq.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, -1) 128 | 129 | # Prepare beams 130 | inst_dec_beams = [Beam(n_bm, device='cuda') for _ in range(n_inst)] 131 | 132 | # Bookkeeping for active or not 133 | active_inst_idx_list = list(range(n_inst)) 134 | inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list) 135 | 136 | # <---------------------------------------------Decode CAP ----------------------------------------------------> 137 | for len_dec_seq in range(1, 28 + 1): 138 | 139 | active_inst_idx_list = beam_decode_step( 140 | inst_dec_beams, len_dec_seq, src_seq, src_enc, inst_idx_to_position_map, n_bm, mode='cap') 141 | 142 | if not active_inst_idx_list: 143 | break # all instances have finished their path to 144 | 145 | src_seq, src_enc, inst_idx_to_position_map = collate_active_info( 146 | src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list) 147 | 148 | batch_hyp, batch_scores = collect_hypothesis_and_scores(inst_dec_beams, n_bm) 149 | 150 | # <---------------------------------------------Decode CMS ----------------------------------------------------> 151 | cms_batch_hyp = [] 152 | for cap_idx in range(n_bm): 153 | [_[0].insert(0, 2) for _ in batch_hyp] # Start with symbol 154 | dec_seq = np.zeros((opt['batch_size'], opt['cap_max_len'])) 155 | for idx, seq in enumerate(batch_hyp): 156 | dec_seq[idx, :len(seq[cap_idx])] = seq[cap_idx] 157 | dec_seq = torch.as_tensor(dec_seq).cuda().long() 158 | dec_pos = pos_emb_generation(dec_seq).long() 159 | dec_output_, *_ = model.decoder(dec_seq[:, :-1], dec_pos[:, :-1], src_emb.cuda(), video_encoding) 160 | 161 | # Concatenate visual-captioning encodings 162 | cat_encoding = torch.cat((video_encoding, dec_output_), 1) 163 | 164 | # Repeat data for beam search for CMS 165 | n_inst, len_s, d_h = cat_encoding.size() 166 | src_enc = cat_encoding.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, d_h) 167 | src_seq = cat_encoding.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, -1) 168 | 169 | # Prepare beams 170 | inst_dec_beams = [Beam(n_bm, device='cuda') for _ in range(n_inst)] 171 | 172 | # Bookkeeping for active or not 173 | active_inst_idx_list = list(range(n_inst)) 174 | inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list) 175 | 176 | for len_dec_seq in range(1, opt['eff_max_len'] - 1): 177 | 178 | active_inst_idx_list = beam_decode_step( 179 | inst_dec_beams, len_dec_seq, src_seq, src_enc, inst_idx_to_position_map, n_bm, mode='int') 180 | 181 | if not active_inst_idx_list: 182 | break # all instances have finished their path to 183 | 184 | src_seq, src_enc, inst_idx_to_position_map = collate_active_info( 185 | src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list) 186 | 187 | cms_hyp, cms_batch_scores = collect_hypothesis_and_scores(inst_dec_beams, 3) 188 | cms_batch_hyp.append(cms_hyp) 189 | # only return the top-1 cms beam searched result 190 | return batch_hyp, cms_batch_hyp[0] 191 | 192 | 193 | 194 | -------------------------------------------------------------------------------- /model/transformer/__init__.py: -------------------------------------------------------------------------------- 
1 | import model.transformer.Constants 2 | import model.transformer.Modules 3 | import model.transformer.Layers 4 | import model.transformer.SubLayers 5 | import model.Model 6 | import model.transformer.Translator 7 | import model.transformer.Beam 8 | import model.transformer.Optim 9 | 10 | # __all__ = [ 11 | # model.transformer.Constants, model.transformer.Modules, model.transformer.Layers, 12 | # model.transformer.SubLayers, model.transformer.Models, model.transformer.Optim, 13 | # model.transformer.Translator, model.transformer.Beam] 14 | -------------------------------------------------------------------------------- /model/transformer/cap2cms_Translator.py: -------------------------------------------------------------------------------- 1 | ''' This module will handle the text generation with beam search. ''' 2 | import numpy as np 3 | from utils.utils import * 4 | import torch.nn.functional as F 5 | from model.transformer.Beam import Beam 6 | 7 | __author__ = 'Jacob Zhiyuan Fang' 8 | 9 | 10 | def pos_emb_generation(word_labels): 11 | ''' 12 | Generate the position embedding input for Transformers. 13 | ''' 14 | 15 | seq = list(range(1, word_labels.shape[1] + 1)) 16 | tgt_pos = torch.tensor([seq] * word_labels.shape[0]).cuda() 17 | binary_mask = (word_labels != 0).long() 18 | 19 | return tgt_pos*binary_mask 20 | 21 | 22 | def translate_batch(model, src_emb, cap_label, opt): 23 | ''' Translation work in one batch ''' 24 | 25 | def get_inst_idx_to_tensor_position_map(inst_idx_list): 26 | ''' Indicate the position of an instance in a tensor. ''' 27 | return {inst_idx: tensor_position for tensor_position, inst_idx in enumerate(inst_idx_list)} 28 | 29 | def collect_active_part(beamed_tensor, curr_active_inst_idx, n_prev_active_inst, n_bm): 30 | ''' Collect tensor parts associated to active instances. ''' 31 | 32 | _, *d_hs = beamed_tensor.size() 33 | n_curr_active_inst = len(curr_active_inst_idx) 34 | new_shape = (n_curr_active_inst * n_bm, *d_hs) 35 | 36 | beamed_tensor = beamed_tensor.view(n_prev_active_inst, -1) 37 | beamed_tensor = beamed_tensor.index_select(0, curr_active_inst_idx) 38 | beamed_tensor = beamed_tensor.view(*new_shape) 39 | 40 | return beamed_tensor 41 | 42 | def collate_active_info(src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list): 43 | # Sentences which are still active are collected, 44 | # so the decoder will not run on completed sentences. 
45 | n_prev_active_inst = len(inst_idx_to_position_map) 46 | active_inst_idx = [inst_idx_to_position_map[k] for k in active_inst_idx_list] 47 | active_inst_idx = torch.LongTensor(active_inst_idx).cuda() 48 | 49 | active_src_seq = collect_active_part(src_seq, active_inst_idx, n_prev_active_inst, n_bm) 50 | active_src_enc = collect_active_part(src_enc, active_inst_idx, n_prev_active_inst, n_bm) 51 | active_inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list) 52 | 53 | return active_src_seq, active_src_enc, active_inst_idx_to_position_map 54 | 55 | def beam_decode_step(inst_dec_beams, len_dec_seq, src_seq, enc_output, inst_idx_to_position_map, n_bm, mode): 56 | ''' Decode and update beam status, and then return active beam idx ''' 57 | 58 | def prepare_beam_dec_seq(inst_dec_beams, len_dec_seq): 59 | dec_partial_seq = [b.get_current_state() for b in inst_dec_beams if not b.done] 60 | dec_partial_seq = torch.stack(dec_partial_seq).cuda() 61 | dec_partial_seq = dec_partial_seq.view(-1, len_dec_seq) 62 | return dec_partial_seq 63 | 64 | def prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm): 65 | dec_partial_pos = torch.arange(1, len_dec_seq + 1, dtype=torch.long).cuda() 66 | dec_partial_pos = dec_partial_pos.unsqueeze(0).repeat(n_active_inst * n_bm, 1) 67 | return dec_partial_pos 68 | 69 | def predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm): 70 | if mode == 'cap': 71 | dec_output, *_ = model.decoder(dec_seq, dec_pos, src_seq, enc_output) 72 | dec_output = dec_output[:, -1, :] # Pick the last step: (bh * bm) * d_h 73 | word_prob = F.log_softmax(model.cap_word_prj(dec_output), dim=1) 74 | word_prob = word_prob.view(n_active_inst, n_bm, -1) 75 | 76 | elif mode == 'int': 77 | dec_output, *_ = model.cms_decoder(dec_seq, dec_pos, src_seq, enc_output) 78 | dec_output = dec_output[:, -1, :] # Pick the last step: (bh * bm) * d_h 79 | word_prob = F.log_softmax(model.cms_word_prj(dec_output), dim=1) 80 | word_prob = word_prob.view(n_active_inst, n_bm, -1) 81 | return word_prob 82 | 83 | def collect_active_inst_idx_list(inst_beams, word_prob, inst_idx_to_position_map): 84 | active_inst_idx_list = [] 85 | for inst_idx, inst_position in inst_idx_to_position_map.items(): 86 | is_inst_complete = inst_beams[inst_idx].advance(word_prob[inst_position]) 87 | if not is_inst_complete: 88 | active_inst_idx_list += [inst_idx] 89 | 90 | return active_inst_idx_list 91 | 92 | n_active_inst = len(inst_idx_to_position_map) 93 | 94 | dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq) 95 | dec_pos = prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm) 96 | word_prob = predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm) 97 | 98 | # Update the beam with predicted word prob information and collect incomplete instances 99 | active_inst_idx_list = collect_active_inst_idx_list(inst_dec_beams, word_prob, inst_idx_to_position_map) 100 | 101 | return active_inst_idx_list 102 | 103 | def collect_hypothesis_and_scores(inst_dec_beams, n_best): 104 | all_hyp, all_scores = [], [] 105 | for inst_idx in range(len(inst_dec_beams)): 106 | scores, tail_idxs = inst_dec_beams[inst_idx].sort_scores() 107 | all_scores += [scores[:n_best]] 108 | 109 | hyps = [inst_dec_beams[inst_idx].get_hypothesis(i) for i in tail_idxs[:n_best]] 110 | all_hyp += [hyps] 111 | return all_hyp, all_scores 112 | 113 | with torch.no_grad(): 114 | # <--------------------------------------------- Decode Video -------------------------------------------------> 115 | src_seq = 
src_emb.cuda() 116 | src_enc, *_ = model.encoder(src_seq) 117 | video_encoding = src_enc 118 | 119 | # <--------------------------------------------- Decode CAP ---------------------------------------------------> 120 | cap_pos = pos_emb_generation(cap_label) 121 | cap_label, cap_pos = cap_label[:, :-1], cap_pos[:, :-1] 122 | 123 | cap_dec_output, *_ = model.decoder(cap_label, cap_pos, video_encoding, video_encoding) 124 | 125 | # Concatenate visual and caption encoding 126 | cat_encoding = torch.cat((video_encoding, cap_dec_output), 1) 127 | # cat_encoding = cap_dec_output 128 | 129 | # <--------------------------------------------- Decode CMS ---------------------------------------------------> 130 | # Repeat data for beam search for CMS 131 | n_bm = 2 132 | n_inst, len_s, d_h = cat_encoding.size() 133 | src_enc = cat_encoding.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, d_h) 134 | src_seq = cat_encoding.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, -1) 135 | 136 | # Prepare beams 137 | inst_dec_beams = [Beam(n_bm, device='cuda') for _ in range(n_inst)] 138 | 139 | # Bookkeeping for active or not 140 | active_inst_idx_list = list(range(n_inst)) 141 | inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list) 142 | 143 | for len_dec_seq in range(1, opt['eff_max_len'] - 1): 144 | 145 | active_inst_idx_list = beam_decode_step( 146 | inst_dec_beams, len_dec_seq, src_seq, src_enc, inst_idx_to_position_map, n_bm, mode='int') 147 | 148 | if not active_inst_idx_list: 149 | break # all instances have finished their path to 150 | 151 | src_seq, src_enc, inst_idx_to_position_map = collate_active_info( 152 | src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list) 153 | 154 | cms_batch_hyp, cms_batch_scores = collect_hypothesis_and_scores(inst_dec_beams, 1) 155 | 156 | return cms_batch_hyp 157 | 158 | 159 | 160 | -------------------------------------------------------------------------------- /model/transformer/cap2cms_Translator_transformer.py: -------------------------------------------------------------------------------- 1 | ''' This module will handle the text generation with beam search. ''' 2 | import numpy as np 3 | from utils.utils import * 4 | import torch.nn.functional as F 5 | from model.transformer.Beam import Beam 6 | 7 | __author__ = 'Jacob Zhiyuan Fang' 8 | 9 | 10 | def pos_emb_generation(word_labels): 11 | ''' 12 | Generate the position embedding input for Transformers. 13 | ''' 14 | 15 | seq = list(range(1, word_labels.shape[1] + 1)) 16 | tgt_pos = torch.tensor([seq] * word_labels.shape[0]).cuda() 17 | binary_mask = (word_labels != 0).long() 18 | 19 | return tgt_pos*binary_mask 20 | 21 | 22 | def translate_batch(model, src_emb, cap_label, opt): 23 | ''' Translation work in one batch ''' 24 | 25 | def get_inst_idx_to_tensor_position_map(inst_idx_list): 26 | ''' Indicate the position of an instance in a tensor. ''' 27 | return {inst_idx: tensor_position for tensor_position, inst_idx in enumerate(inst_idx_list)} 28 | 29 | def collect_active_part(beamed_tensor, curr_active_inst_idx, n_prev_active_inst, n_bm): 30 | ''' Collect tensor parts associated to active instances. 
''' 31 | 32 | _, *d_hs = beamed_tensor.size() 33 | n_curr_active_inst = len(curr_active_inst_idx) 34 | new_shape = (n_curr_active_inst * n_bm, *d_hs) 35 | 36 | beamed_tensor = beamed_tensor.view(n_prev_active_inst, -1) 37 | beamed_tensor = beamed_tensor.index_select(0, curr_active_inst_idx) 38 | beamed_tensor = beamed_tensor.view(*new_shape) 39 | 40 | return beamed_tensor 41 | 42 | def collate_active_info(src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list): 43 | # Sentences which are still active are collected, 44 | # so the decoder will not run on completed sentences. 45 | n_prev_active_inst = len(inst_idx_to_position_map) 46 | active_inst_idx = [inst_idx_to_position_map[k] for k in active_inst_idx_list] 47 | active_inst_idx = torch.LongTensor(active_inst_idx).cuda() 48 | 49 | active_src_seq = collect_active_part(src_seq, active_inst_idx, n_prev_active_inst, n_bm) 50 | active_src_enc = collect_active_part(src_enc, active_inst_idx, n_prev_active_inst, n_bm) 51 | active_inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list) 52 | 53 | return active_src_seq, active_src_enc, active_inst_idx_to_position_map 54 | 55 | def beam_decode_step(inst_dec_beams, len_dec_seq, src_seq, enc_output, inst_idx_to_position_map, n_bm, mode): 56 | ''' Decode and update beam status, and then return active beam idx ''' 57 | 58 | def prepare_beam_dec_seq(inst_dec_beams, len_dec_seq): 59 | dec_partial_seq = [b.get_current_state() for b in inst_dec_beams if not b.done] 60 | dec_partial_seq = torch.stack(dec_partial_seq).cuda() 61 | dec_partial_seq = dec_partial_seq.view(-1, len_dec_seq) 62 | return dec_partial_seq 63 | 64 | def prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm): 65 | dec_partial_pos = torch.arange(1, len_dec_seq + 1, dtype=torch.long).cuda() 66 | dec_partial_pos = dec_partial_pos.unsqueeze(0).repeat(n_active_inst * n_bm, 1) 67 | return dec_partial_pos 68 | 69 | def predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm): 70 | if mode == 'cap': 71 | dec_output, *_ = model.decoder(dec_seq, dec_pos, src_seq, enc_output) 72 | dec_output = dec_output[:, -1, :] # Pick the last step: (bh * bm) * d_h 73 | word_prob = F.log_softmax(model.cap_word_prj(dec_output), dim=1) 74 | word_prob = word_prob.view(n_active_inst, n_bm, -1) 75 | 76 | elif mode == 'int': 77 | dec_output, *_ = model.cms_decoder(dec_seq, dec_pos, src_seq, enc_output) 78 | dec_output = dec_output[:, -1, :] # Pick the last step: (bh * bm) * d_h 79 | word_prob = F.log_softmax(model.cms_word_prj(dec_output), dim=1) 80 | word_prob = word_prob.view(n_active_inst, n_bm, -1) 81 | return word_prob 82 | 83 | def collect_active_inst_idx_list(inst_beams, word_prob, inst_idx_to_position_map): 84 | active_inst_idx_list = [] 85 | for inst_idx, inst_position in inst_idx_to_position_map.items(): 86 | is_inst_complete = inst_beams[inst_idx].advance(word_prob[inst_position]) 87 | if not is_inst_complete: 88 | active_inst_idx_list += [inst_idx] 89 | 90 | return active_inst_idx_list 91 | 92 | n_active_inst = len(inst_idx_to_position_map) 93 | 94 | dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq) 95 | dec_pos = prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm) 96 | word_prob = predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm) 97 | 98 | # Update the beam with predicted word prob information and collect incomplete instances 99 | active_inst_idx_list = collect_active_inst_idx_list(inst_dec_beams, word_prob, inst_idx_to_position_map) 100 | 101 | return 
active_inst_idx_list 102 | 103 | def collect_hypothesis_and_scores(inst_dec_beams, n_best): 104 | all_hyp, all_scores = [], [] 105 | for inst_idx in range(len(inst_dec_beams)): 106 | scores, tail_idxs = inst_dec_beams[inst_idx].sort_scores() 107 | all_scores += [scores[:n_best]] 108 | 109 | hyps = [inst_dec_beams[inst_idx].get_hypothesis(i) for i in tail_idxs[:n_best]] 110 | all_hyp += [hyps] 111 | return all_hyp, all_scores 112 | 113 | with torch.no_grad(): 114 | # <--------------------------------------------- Decode Video -------------------------------------------------> 115 | src_seq = src_emb.cuda() 116 | 117 | src_seq = model.vis_emb(src_seq) 118 | vis_pos = torch.tensor(list(range(0, 40))).cuda().unsqueeze(0).repeat(src_seq.shape[0], 1) 119 | video_encoding, *_ = model.encoder(src_seq, vis_pos) 120 | 121 | # <--------------------------------------------- Decode CAP ---------------------------------------------------> 122 | cap_pos = pos_emb_generation(cap_label) 123 | cap_label, cap_pos = cap_label[:, :-1], cap_pos[:, :-1] 124 | 125 | cap_dec_output, *_ = model.decoder(cap_label, cap_pos, video_encoding, video_encoding) 126 | 127 | # Concatenate visual and caption encoding 128 | cat_encoding = torch.cat((video_encoding, cap_dec_output), 1) 129 | # cat_encoding = cap_dec_output 130 | 131 | # <--------------------------------------------- Decode CMS ---------------------------------------------------> 132 | # Repeat data for beam search for CMS 133 | n_bm = 2 134 | n_inst, len_s, d_h = cat_encoding.size() 135 | src_enc = cat_encoding.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, d_h) 136 | src_seq = cat_encoding.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, -1) 137 | 138 | # Prepare beams 139 | inst_dec_beams = [Beam(n_bm, device='cuda') for _ in range(n_inst)] 140 | 141 | # Bookkeeping for active or not 142 | active_inst_idx_list = list(range(n_inst)) 143 | inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list) 144 | 145 | for len_dec_seq in range(1, opt['eff_max_len'] - 1): 146 | 147 | active_inst_idx_list = beam_decode_step( 148 | inst_dec_beams, len_dec_seq, src_seq, src_enc, inst_idx_to_position_map, n_bm, mode='int') 149 | 150 | if not active_inst_idx_list: 151 | break # all instances have finished their path to 152 | 153 | src_seq, src_enc, inst_idx_to_position_map = collate_active_info( 154 | src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list) 155 | 156 | cms_batch_hyp, cms_batch_scores = collect_hypothesis_and_scores(inst_dec_beams, 1) 157 | 158 | return cms_batch_hyp 159 | 160 | 161 | 162 | -------------------------------------------------------------------------------- /model/transformer/cap2cms_Translator_transformerDecoder.py: -------------------------------------------------------------------------------- 1 | ''' This module will handle the text generation with beam search. ''' 2 | import numpy as np 3 | from utils.utils import * 4 | import torch.nn.functional as F 5 | from model.transformer.Beam import Beam 6 | 7 | __author__ = 'Jacob Zhiyuan Fang' 8 | 9 | 10 | def pos_emb_generation(word_labels): 11 | ''' 12 | Generate the position embedding input for Transformers. 
13 | ''' 14 | 15 | seq = list(range(1, word_labels.shape[1] + 1)) 16 | tgt_pos = torch.tensor([seq] * word_labels.shape[0]).cuda() 17 | binary_mask = (word_labels != 0).long() 18 | 19 | return tgt_pos*binary_mask 20 | 21 | 22 | def translate_batch(model, src_emb, cap_label, opt): 23 | ''' Translation work in one batch ''' 24 | 25 | def get_inst_idx_to_tensor_position_map(inst_idx_list): 26 | ''' Indicate the position of an instance in a tensor. ''' 27 | return {inst_idx: tensor_position for tensor_position, inst_idx in enumerate(inst_idx_list)} 28 | 29 | def collect_active_part(beamed_tensor, curr_active_inst_idx, n_prev_active_inst, n_bm): 30 | ''' Collect tensor parts associated to active instances. ''' 31 | 32 | _, *d_hs = beamed_tensor.size() 33 | n_curr_active_inst = len(curr_active_inst_idx) 34 | new_shape = (n_curr_active_inst * n_bm, *d_hs) 35 | 36 | beamed_tensor = beamed_tensor.view(n_prev_active_inst, -1) 37 | beamed_tensor = beamed_tensor.index_select(0, curr_active_inst_idx) 38 | beamed_tensor = beamed_tensor.view(*new_shape) 39 | 40 | return beamed_tensor 41 | 42 | def collate_active_info(src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list): 43 | # Sentences which are still active are collected, 44 | # so the decoder will not run on completed sentences. 45 | n_prev_active_inst = len(inst_idx_to_position_map) 46 | active_inst_idx = [inst_idx_to_position_map[k] for k in active_inst_idx_list] 47 | active_inst_idx = torch.LongTensor(active_inst_idx).cuda() 48 | 49 | active_src_seq = collect_active_part(src_seq, active_inst_idx, n_prev_active_inst, n_bm) 50 | active_src_enc = collect_active_part(src_enc, active_inst_idx, n_prev_active_inst, n_bm) 51 | active_inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list) 52 | 53 | return active_src_seq, active_src_enc, active_inst_idx_to_position_map 54 | 55 | def beam_decode_step(inst_dec_beams, len_dec_seq, src_seq, enc_output, inst_idx_to_position_map, n_bm, mode): 56 | ''' Decode and update beam status, and then return active beam idx ''' 57 | 58 | def prepare_beam_dec_seq(inst_dec_beams, len_dec_seq): 59 | dec_partial_seq = [b.get_current_state() for b in inst_dec_beams if not b.done] 60 | dec_partial_seq = torch.stack(dec_partial_seq).cuda() 61 | dec_partial_seq = dec_partial_seq.view(-1, len_dec_seq) 62 | return dec_partial_seq 63 | 64 | def prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm): 65 | dec_partial_pos = torch.arange(1, len_dec_seq + 1, dtype=torch.long).cuda() 66 | dec_partial_pos = dec_partial_pos.unsqueeze(0).repeat(n_active_inst * n_bm, 1) 67 | return dec_partial_pos 68 | 69 | def predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm): 70 | if mode == 'cap': 71 | dec_output, *_ = model.decoder(dec_seq, dec_pos, src_seq, enc_output) 72 | dec_output = dec_output[:, -1, :] # Pick the last step: (bh * bm) * d_h 73 | word_prob = F.log_softmax(model.cap_word_prj(dec_output), dim=1) 74 | word_prob = word_prob.view(n_active_inst, n_bm, -1) 75 | 76 | elif mode == 'int': 77 | dec_output, *_ = model.cms_decoder(dec_seq, dec_pos, src_seq, enc_output) 78 | dec_output = dec_output[:, -1, :] # Pick the last step: (bh * bm) * d_h 79 | word_prob = F.log_softmax(model.cms_word_prj(dec_output), dim=1) 80 | word_prob = word_prob.view(n_active_inst, n_bm, -1) 81 | return word_prob 82 | 83 | def collect_active_inst_idx_list(inst_beams, word_prob, inst_idx_to_position_map): 84 | active_inst_idx_list = [] 85 | for inst_idx, inst_position in inst_idx_to_position_map.items(): 86 
| is_inst_complete = inst_beams[inst_idx].advance(word_prob[inst_position]) 87 | if not is_inst_complete: 88 | active_inst_idx_list += [inst_idx] 89 | 90 | return active_inst_idx_list 91 | 92 | n_active_inst = len(inst_idx_to_position_map) 93 | 94 | dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq) 95 | dec_pos = prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm) 96 | word_prob = predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm) 97 | 98 | # Update the beam with predicted word prob information and collect incomplete instances 99 | active_inst_idx_list = collect_active_inst_idx_list(inst_dec_beams, word_prob, inst_idx_to_position_map) 100 | 101 | return active_inst_idx_list 102 | 103 | def collect_hypothesis_and_scores(inst_dec_beams, n_best): 104 | all_hyp, all_scores = [], [] 105 | for inst_idx in range(len(inst_dec_beams)): 106 | scores, tail_idxs = inst_dec_beams[inst_idx].sort_scores() 107 | all_scores += [scores[:n_best]] 108 | 109 | hyps = [inst_dec_beams[inst_idx].get_hypothesis(i) for i in tail_idxs[:n_best]] 110 | all_hyp += [hyps] 111 | return all_hyp, all_scores 112 | 113 | with torch.no_grad(): 114 | # <--------------------------------------------- Decode Video -------------------------------------------------> 115 | src_seq = src_emb.cuda() 116 | video_encoding = model.vis_emb(src_seq) 117 | 118 | # <--------------------------------------------- Decode CAP ---------------------------------------------------> 119 | cap_pos = pos_emb_generation(cap_label) 120 | cap_label, cap_pos = cap_label[:, :-1], cap_pos[:, :-1] 121 | 122 | cap_dec_output, *_ = model.decoder(cap_label, cap_pos, video_encoding, video_encoding) 123 | 124 | # Concatenate visual and caption encoding 125 | cat_encoding = torch.cat((video_encoding, cap_dec_output), 1) 126 | # cat_encoding = cap_dec_output 127 | 128 | # <--------------------------------------------- Decode CMS ---------------------------------------------------> 129 | # Repeat data for beam search for CMS 130 | n_bm = 2 131 | n_inst, len_s, d_h = cat_encoding.size() 132 | src_enc = cat_encoding.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, d_h) 133 | src_seq = cat_encoding.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, -1) 134 | 135 | # Prepare beams 136 | inst_dec_beams = [Beam(n_bm, device='cuda') for _ in range(n_inst)] 137 | 138 | # Bookkeeping for active or not 139 | active_inst_idx_list = list(range(n_inst)) 140 | inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list) 141 | 142 | for len_dec_seq in range(1, opt['eff_max_len'] - 1): 143 | 144 | active_inst_idx_list = beam_decode_step( 145 | inst_dec_beams, len_dec_seq, src_seq, src_enc, inst_idx_to_position_map, n_bm, mode='int') 146 | 147 | if not active_inst_idx_list: 148 | break # all instances have finished their path to 149 | 150 | src_seq, src_enc, inst_idx_to_position_map = collate_active_info( 151 | src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list) 152 | 153 | cms_batch_hyp, cms_batch_scores = collect_hypothesis_and_scores(inst_dec_beams, 1) 154 | 155 | return cms_batch_hyp 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /model/transformer/cap_cms_Translator.py: -------------------------------------------------------------------------------- 1 | ''' This module will handle the text generation with beam search. 
''' 2 | import numpy as np 3 | from utils.utils import * 4 | import torch.nn.functional as F 5 | from model.transformer.Beam import Beam 6 | 7 | __author__ = 'Jacob Zhiyuan Fang' 8 | 9 | 10 | def pos_emb_generation(word_labels): 11 | ''' 12 | Generate the position embedding input for Transformers. 13 | ''' 14 | 15 | seq = list(range(1, word_labels.shape[1] + 1)) 16 | tgt_pos = torch.tensor([seq] * word_labels.shape[0]).cuda() 17 | binary_mask = (word_labels != 0).long() 18 | 19 | return tgt_pos*binary_mask 20 | 21 | 22 | def translate_batch(model, src_emb, cap_label, opt): 23 | ''' Translation work in one batch ''' 24 | 25 | def get_inst_idx_to_tensor_position_map(inst_idx_list): 26 | ''' Indicate the position of an instance in a tensor. ''' 27 | return {inst_idx: tensor_position for tensor_position, inst_idx in enumerate(inst_idx_list)} 28 | 29 | def collect_active_part(beamed_tensor, curr_active_inst_idx, n_prev_active_inst, n_bm): 30 | ''' Collect tensor parts associated to active instances. ''' 31 | 32 | _, *d_hs = beamed_tensor.size() 33 | n_curr_active_inst = len(curr_active_inst_idx) 34 | new_shape = (n_curr_active_inst * n_bm, *d_hs) 35 | 36 | beamed_tensor = beamed_tensor.view(n_prev_active_inst, -1) 37 | beamed_tensor = beamed_tensor.index_select(0, curr_active_inst_idx) 38 | beamed_tensor = beamed_tensor.view(*new_shape) 39 | 40 | return beamed_tensor 41 | 42 | def collate_active_info(src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list): 43 | # Sentences which are still active are collected, 44 | # so the decoder will not run on completed sentences. 45 | n_prev_active_inst = len(inst_idx_to_position_map) 46 | active_inst_idx = [inst_idx_to_position_map[k] for k in active_inst_idx_list] 47 | active_inst_idx = torch.LongTensor(active_inst_idx).cuda() 48 | 49 | active_src_seq = collect_active_part(src_seq, active_inst_idx, n_prev_active_inst, n_bm) 50 | active_src_enc = collect_active_part(src_enc, active_inst_idx, n_prev_active_inst, n_bm) 51 | active_inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list) 52 | 53 | return active_src_seq, active_src_enc, active_inst_idx_to_position_map 54 | 55 | def beam_decode_step(inst_dec_beams, len_dec_seq, src_seq, enc_output, inst_idx_to_position_map, n_bm, mode): 56 | ''' Decode and update beam status, and then return active beam idx ''' 57 | 58 | def prepare_beam_dec_seq(inst_dec_beams, len_dec_seq): 59 | dec_partial_seq = [b.get_current_state() for b in inst_dec_beams if not b.done] 60 | dec_partial_seq = torch.stack(dec_partial_seq).cuda() 61 | dec_partial_seq = dec_partial_seq.view(-1, len_dec_seq) 62 | return dec_partial_seq 63 | 64 | def prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm): 65 | dec_partial_pos = torch.arange(1, len_dec_seq + 1, dtype=torch.long).cuda() 66 | dec_partial_pos = dec_partial_pos.unsqueeze(0).repeat(n_active_inst * n_bm, 1) 67 | return dec_partial_pos 68 | 69 | def predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm): 70 | if mode == 'cap': 71 | dec_output, *_ = model.decoder(dec_seq, dec_pos, src_seq, enc_output) 72 | dec_output = dec_output[:, -1, :] # Pick the last step: (bh * bm) * d_h 73 | word_prob = F.log_softmax(model.cap_word_prj(dec_output), dim=1) 74 | word_prob = word_prob.view(n_active_inst, n_bm, -1) 75 | 76 | elif mode == 'int': 77 | dec_output, *_ = model.cms_decoder(dec_seq, dec_pos, src_seq, enc_output) 78 | dec_output = dec_output[:, -1, :] # Pick the last step: (bh * bm) * d_h 79 | word_prob = 
F.log_softmax(model.cms_word_prj(dec_output), dim=1) 80 | word_prob = word_prob.view(n_active_inst, n_bm, -1) 81 | return word_prob 82 | 83 | def collect_active_inst_idx_list(inst_beams, word_prob, inst_idx_to_position_map): 84 | active_inst_idx_list = [] 85 | for inst_idx, inst_position in inst_idx_to_position_map.items(): 86 | is_inst_complete = inst_beams[inst_idx].advance(word_prob[inst_position]) 87 | if not is_inst_complete: 88 | active_inst_idx_list += [inst_idx] 89 | 90 | return active_inst_idx_list 91 | 92 | n_active_inst = len(inst_idx_to_position_map) 93 | 94 | dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq) 95 | dec_pos = prepare_beam_dec_pos(len_dec_seq, n_active_inst, n_bm) 96 | word_prob = predict_word(dec_seq, dec_pos, src_seq, enc_output, n_active_inst, n_bm) 97 | 98 | # Update the beam with predicted word prob information and collect incomplete instances 99 | active_inst_idx_list = collect_active_inst_idx_list(inst_dec_beams, word_prob, inst_idx_to_position_map) 100 | 101 | return active_inst_idx_list 102 | 103 | def collect_hypothesis_and_scores(inst_dec_beams, n_best): 104 | all_hyp, all_scores = [], [] 105 | for inst_idx in range(len(inst_dec_beams)): 106 | scores, tail_idxs = inst_dec_beams[inst_idx].sort_scores() 107 | all_scores += [scores[:n_best]] 108 | 109 | hyps = [inst_dec_beams[inst_idx].get_hypothesis(i) for i in tail_idxs[:n_best]] 110 | all_hyp += [hyps] 111 | return all_hyp, all_scores 112 | 113 | with torch.no_grad(): 114 | # Encode 115 | src_seq = src_emb.cuda() 116 | src_enc, *_ = model.encoder(src_seq) 117 | video_encoding = src_enc 118 | 119 | # <---------------------------------------------Decode CAP ----------------------------------------------------> 120 | cap_pos = pos_emb_generation(cap_label) 121 | cap_label, cap_pos = cap_label[:, :-1], cap_pos[:, :-1] 122 | 123 | cap_dec_output, *_ = model.decoder(cap_label, cap_pos, video_encoding, video_encoding) 124 | 125 | # Concatenate visual and caption encoding 126 | cat_encoding = torch.cat((video_encoding, cap_dec_output), 1) 127 | 128 | # <---------------------------------------------Decode CMS ----------------------------------------------------> 129 | # Repeat data for beam search for CMS 130 | n_bm = 2 131 | n_inst, len_s, d_h = cat_encoding.size() 132 | src_enc = cat_encoding.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, d_h) 133 | src_seq = cat_encoding.repeat(1, n_bm, 1).view(n_inst * n_bm, len_s, -1) 134 | 135 | # Prepare beams 136 | inst_dec_beams = [Beam(n_bm, device='cuda') for _ in range(n_inst)] 137 | 138 | # Bookkeeping for active or not 139 | active_inst_idx_list = list(range(n_inst)) 140 | inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(active_inst_idx_list) 141 | 142 | for len_dec_seq in range(1, opt['eff_max_len'] - 1): 143 | 144 | active_inst_idx_list = beam_decode_step( 145 | inst_dec_beams, len_dec_seq, src_seq, src_enc, inst_idx_to_position_map, n_bm, mode='int') 146 | 147 | if not active_inst_idx_list: 148 | break # all instances have finished their path to 149 | 150 | src_seq, src_enc, inst_idx_to_position_map = collate_active_info( 151 | src_seq, src_enc, inst_idx_to_position_map, active_inst_idx_list) 152 | 153 | cms_batch_hyp, cms_batch_scores = collect_hypothesis_and_scores(inst_dec_beams, 1) 154 | 155 | return cms_batch_hyp 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /opts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 
| def parse_opt(): 5 | parser = argparse.ArgumentParser() 6 | 7 | # Data input settings 8 | parser.add_argument( 9 | '--info_json', 10 | type=str, 11 | default='data/v2c_info.json', 12 | help='path to the json file containing additional info and vocab') 13 | 14 | parser.add_argument( 15 | '--cap_info_json', 16 | type=str, 17 | default='data/msrvtt_new_info.json', 18 | help='path to the json file containing additional info and vocab') 19 | 20 | parser.add_argument( 21 | '--caption_json', 22 | type=str, 23 | # raw dataset: V2C_MSR-VTT_caption.json; 24 | # Human dataset: train_cvpr_humanRank_V2C_caption.json 25 | default='data/V2C_MSR-VTT_caption.json', 26 | help='path to the processed video caption json') 27 | 28 | parser.add_argument( 29 | '--feats_dir', 30 | nargs='*', 31 | type=str, 32 | default=['data/feats/resnet152/'], 33 | help='path to the directory containing the preprocessed fc feats') 34 | 35 | # Model settings 36 | parser.add_argument( 37 | "--cap_max_len", 38 | type=int, 39 | default=28, 40 | help='max length of captions(containing , )') 41 | 42 | parser.add_argument( 43 | "--int_max_len", 44 | type=int, 45 | default=21, 46 | help='max length of captions(containing , )') 47 | 48 | parser.add_argument( 49 | "--eff_max_len", 50 | type=int, 51 | default=26, 52 | help='max length of captions(containing , )') 53 | 54 | parser.add_argument( 55 | "--att_max_len", 56 | type=int, 57 | default=8, 58 | help='max length of captions(containing , )') 59 | 60 | parser.add_argument( 61 | '--input_dropout_p', 62 | type=float, 63 | default=0.2, 64 | help='strength of dropout in the Language Model RNN') 65 | 66 | parser.add_argument( 67 | '--dropout', 68 | type=float, 69 | default=0.1, 70 | help='Dropout rate for Transformer') 71 | 72 | parser.add_argument( 73 | '--dim_word', 74 | type=int, 75 | default=512, 76 | help='the encoding size of each token in the vocabulary, and the video.') 77 | 78 | parser.add_argument( 79 | '--dim_model', 80 | type=int, 81 | default=512, 82 | help='size of the rnn hidden layer') 83 | 84 | parser.add_argument( 85 | '--dim_vis_feat', 86 | type=int, 87 | default=2048, 88 | help='dim of features of video frames') 89 | 90 | # 12-12 8 6 91 | parser.add_argument( 92 | '--num_head', 93 | type=int, 94 | default=8, 95 | help='Numbers of head in transformers.') 96 | 97 | parser.add_argument( 98 | '--num_layer', 99 | type=int, 100 | default=6, 101 | help='Numbers of layers in transformers.') 102 | 103 | parser.add_argument( 104 | '--rnn_layer', 105 | type=int, 106 | default=1, 107 | help='Numbers of layers in Video Encoder, RNN.') 108 | 109 | parser.add_argument( 110 | '--dim_head', 111 | type=int, 112 | default=64, 113 | help='Dimension of the attention head.') 114 | 115 | parser.add_argument( 116 | '--dim_inner', 117 | type=int, 118 | default=1024, 119 | help='Dimension of inner feature in Encoder/Decoder.') 120 | 121 | # Optimization: General 122 | parser.add_argument( 123 | '--epochs', 124 | type=int, 125 | default=100, 126 | help='number of epochs') 127 | 128 | parser.add_argument( 129 | '--warm_up_steps', 130 | type=int, 131 | default=5000, 132 | help='Warm up steps.') 133 | 134 | parser.add_argument( 135 | '--batch_size', 136 | type=int, 137 | default=64, 138 | help='minibatch size') 139 | 140 | parser.add_argument( 141 | '--save_checkpoint_every', 142 | type=int, 143 | default=10, 144 | help='how often to save a model checkpoint (in epoch)?') 145 | 146 | parser.add_argument( 147 | '--print_loss_every', 148 | type=int, 149 | default=20, 150 | help='how often to print the 
loss information (in iterations)?') 151 | 152 | parser.add_argument( 153 | '--checkpoint_path', 154 | type=str, 155 | default='save', 156 | help='directory to store check pointed models') 157 | 158 | parser.add_argument( 159 | '--load_checkpoint', 160 | type=str, 161 | default='', 162 | help='directory to load check pointed models') 163 | 164 | parser.add_argument( 165 | '--gpu', 166 | type=str, 167 | default='0', 168 | help='gpu device number') 169 | 170 | # other setting 171 | parser.add_argument( 172 | '--show_predict', 173 | action='store_true', 174 | help='whether to display intermediate generations during training/inference') 175 | 176 | parser.add_argument( 177 | '--cuda', 178 | action='store_true', 179 | help='Use CUDA for training.') 180 | 181 | parser.add_argument( 182 | '--resume', 183 | action='store_true', 184 | help='Resume from a midway checkpoint.') 185 | 186 | parser.add_argument( 187 | '--cms', 188 | choices=['int', 'eff', 'att'], 189 | default='eff', 190 | help='Type of Commonsense Knowledge.') 191 | 192 | args = parser.parse_args() 193 | 194 | return args 195 | -------------------------------------------------------------------------------- /others/generation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | import random 4 | import numpy as np 5 | from opts import * 6 | from model.Model import Model 7 | from pycocoevalcap.bleu.bleu import Bleu 8 | from pycocoevalcap.rouge.rouge import Rouge 9 | from pycocoevalcap.cider.cider import Cider 10 | from pycocoevalcap.meteor.meteor import Meteor 11 | from torch.utils.data import DataLoader 12 | from utils.dataloader import VideoDataset 13 | from model.transformer.Constants import * 14 | from nltk.translate.bleu_score import corpus_bleu 15 | from model.transformer.Translator import translate_batch 16 | 17 | import sys 18 | sys.path.append("utils/pycocoevalcap/") 19 | 20 | 21 | def pos_emb_generation(visual_feats): 22 | ''' 23 | Generate the position embedding input for Transformers. 
24 | ''' 25 | seq = list(range(1, visual_feats.shape[1] + 1)) 26 | src_pos = torch.tensor([seq] * visual_feats.shape[0]).cuda() 27 | return src_pos 28 | 29 | 30 | def list_to_sentence(list): 31 | sentence = '' 32 | for element in list: 33 | sentence += ' ' + element 34 | return sentence 35 | 36 | 37 | def test(loader, model, opt, cap_vocab, cms_vocab): 38 | bleu_scores = [] 39 | write_to_txt = [] 40 | 41 | gts = [] 42 | res = [] 43 | for batch_id, data in enumerate(loader): 44 | 45 | fc_feats = data['fc_feats'].cuda() 46 | cap_labels = data['cap_labels'].cuda() 47 | video_ids = data['video_ids'] 48 | 49 | with torch.no_grad(): 50 | # Beam Search Starts From Here 51 | try: 52 | batch_hyp, cms_batch_hyp = translate_batch(model, fc_feats, opt) 53 | except: 54 | continue 55 | 56 | # Stack all GTs captions 57 | references = [] 58 | for video in video_ids: 59 | video_caps = [] 60 | for cap in opt['captions'][video]: 61 | for _ in cap['attribute']: 62 | video_caps.append(cap['final_caption'][1:-1] + _[1][1:-1]) 63 | references.append(video_caps) 64 | 65 | # Stack all Predicted Captions 66 | hypotheses = [] 67 | for cms_predict, predict in zip(cms_batch_hyp, batch_hyp): 68 | _ = [] 69 | if CAP_EOS in predict[0]: 70 | sep_id = predict[0].index(CAP_EOS) 71 | else: 72 | sep_id = -1 73 | for word in predict[0][1: sep_id]: 74 | _.append(cap_vocab[str(word)]) 75 | 76 | if CAP_EOS in cms_predict[0]: 77 | sep_id = cms_predict[0].index(CAP_EOS) 78 | else: 79 | sep_id = -1 80 | for word in cms_predict[0][0: sep_id]: 81 | _.append(cms_vocab[str(word)]) 82 | hypotheses.append(_) 83 | 84 | # Print out the predicted sentences and GT 85 | for random_id in range(5): 86 | if 0 in batch_hyp[random_id][0]: 87 | stop_idx = batch_hyp[random_id][0].index(EOS) 88 | else: 89 | stop_idx = -1 90 | 91 | video_id = video_ids[random_id] 92 | cap = list_to_sentence([cap_vocab[str(widx)] for widx in batch_hyp[random_id][0][1: stop_idx] if widx != 0]) 93 | cms = list_to_sentence([cms_vocab[str(widx)] for widx in cms_batch_hyp[random_id][0][: -1] if widx != 0]) 94 | cap_gt = list_to_sentence([cap_vocab[str(word.cpu().numpy())] for word in cap_labels[random_id, 1:] if word != 0][0:-1]) 95 | _ = str(video_id + ',' + cap + ',' + cms + ',' + cap_gt) 96 | write_to_txt.append(_) 97 | print('Generated Caption:', cap, ' ', 'Generated CMS:', cms) 98 | print('GT Caption:', cap_gt) 99 | print('\n') 100 | print(batch_id, ' ', batch_id * opt['batch_size'], ' out of ', '3010') 101 | 102 | # Compute the BLEU-4 score 103 | bleu_1 = corpus_bleu(references, hypotheses, weights=[1, 0, 0, 0]) 104 | bleu_2 = corpus_bleu(references, hypotheses, weights=[0.5, 0.5, 0, 0]) 105 | bleu_3 = corpus_bleu(references, hypotheses, weights=[0.333, 0.333, 0.333, 0]) 106 | bleu_4 = corpus_bleu(references, hypotheses, weights=[0.25, 0.25, 0.25, 0.25]) 107 | bleu_scores.append([bleu_1, bleu_2, bleu_3, bleu_4]) 108 | 109 | print("Bleu scores 1-4:", np.mean(np.asarray(bleu_scores), 0)) 110 | 111 | 112 | def main(opt): 113 | dataset = VideoDataset(opt, 'test') 114 | dataloader = DataLoader(dataset, batch_size=opt['batch_size'], shuffle=False) 115 | opt['cms_vocab_size'] = dataset.get_cms_vocab_size() 116 | opt['cap_vocab_size'] = dataset.get_cap_vocab_size() 117 | 118 | if opt['cms'] == 'int': 119 | cms_text_length = opt['int_max_len'] 120 | elif opt['cms'] == 'eff': 121 | cms_text_length = opt['eff_max_len'] 122 | else: 123 | cms_text_length = opt['att_max_len'] 124 | 125 | model = Model( 126 | dataset.get_cap_vocab_size(), 127 | dataset.get_cms_vocab_size(), 128 | 
cap_max_seq=opt['cap_max_len'], 129 | cms_max_seq=cms_text_length, 130 | tgt_emb_prj_weight_sharing=True, 131 | vis_emb=opt['dim_vis_feat'], 132 | rnn_layers=opt['rnn_layer'], 133 | d_k=opt['dim_head'], 134 | d_v=opt['dim_head'], 135 | d_model=opt['dim_model'], 136 | d_word_vec=opt['dim_word'], 137 | d_inner=opt['dim_inner'], 138 | n_layers=opt['num_layer'], 139 | n_head=opt['num_head'], 140 | dropout=opt['dropout']) 141 | 142 | if len(opt['load_checkpoint']) != 0: 143 | state_dict = torch.load(opt['load_checkpoint']) 144 | model.load_state_dict(state_dict) 145 | 146 | model = model.cuda() 147 | model.eval() 148 | test(dataloader, model, opt, dataset.get_cap_vocab(), dataset.get_cms_vocab()) 149 | 150 | 151 | if __name__ == '__main__': 152 | opt = parse_opt() 153 | opt = vars(opt) 154 | opt['captions'] = json.load(open(opt['caption_json'])) 155 | opt['batch_size'] = 30 156 | main(opt) -------------------------------------------------------------------------------- /others/test_RNN.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import torch 4 | from opts import * 5 | import numpy as np 6 | import nltk 7 | from utils.utils import * 8 | from pycocoevalcap.bleu.bleu import Bleu 9 | from pycocoevalcap.rouge.rouge import Rouge 10 | from pycocoevalcap.cider.cider import Cider 11 | from pycocoevalcap.meteor.meteor import Meteor 12 | from model.Model import Model 13 | from torch.utils.data import DataLoader 14 | from model.transformer.Constants import * 15 | from utils.gt_caps_dataloader import VideoDataset 16 | from model.transformer.cap2cms_Translator import translate_batch 17 | 18 | 19 | def test(loader, model, opt, cap_vocab, cms_vocab): 20 | res = {} 21 | gts = {} 22 | eval_id = 0 23 | 24 | total_cms = set() 25 | ppl_scores = [] 26 | 27 | for batch_id, raw_data in enumerate(loader): 28 | if opt['cuda']: torch.cuda.synchronize() 29 | 30 | # iterate each video within the batch 31 | for iterate_id in range(len(raw_data)): 32 | fc_feats = raw_data[iterate_id][0]['fc_feats'].unsqueeze(0) 33 | video_ids = raw_data[iterate_id][0]['video_ids'] 34 | cap_labels = raw_data[iterate_id][0]['cap_labels'] 35 | 36 | if opt['cms'] == 'int': 37 | cms_list = raw_data[iterate_id][1] 38 | elif opt['cms'] == 'eff': 39 | cms_list = raw_data[iterate_id][2] 40 | else: 41 | cms_list = raw_data[iterate_id][3] 42 | 43 | if opt['cuda']: 44 | # cms_list = cms_list.cuda() 45 | cap_labels = cap_labels.cuda() 46 | fc_feats = fc_feats.cuda() 47 | 48 | # repeat the fc features for num_cap times 49 | fc_feats = fc_feats.repeat(len(cap_labels), 1, 1) 50 | 51 | # iterate through all captions per video 52 | with torch.no_grad(): 53 | 54 | # Note, currently we used BEAM search to decode the captions, while greedy strategy 55 | # should yield close or even better results. 
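# For reference, a greedy decode would take the argmax token at each step rather
# than keeping n_bm beam hypotheses; an illustrative per-step sketch with
# hypothetical names (not the code path used below):
#   word_prob = F.log_softmax(logits[:, -1, :], dim=-1)   # scores for the last step
#   next_word = word_prob.argmax(dim=-1, keepdim=True)    # best token per instance
#   dec_seq = torch.cat([dec_seq, next_word], dim=1)      # append, stop at EOS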
56 | # cms_batch_hyp = translate_batch(model, fc_feats, cap_labels, opt) 57 | _, _, _, cms_batch_hyp = model(fc_feats, target_variable=cap_labels, cms_target_variable=None, 58 | mode='test') 59 | 60 | for random_id in range(cap_labels.shape[0]): 61 | # Print out the predicted sentences and GT 62 | if EOS in cms_batch_hyp[random_id]: 63 | stop_id = list(cms_batch_hyp[random_id]).index(EOS) 64 | else: 65 | stop_id = -1 66 | 67 | cms = list_to_sentence([cms_vocab[str(widx.detach().cpu().numpy())] for widx in 68 | cms_batch_hyp[random_id][: stop_id] if widx != 0]) 69 | cap_gt = list_to_sentence([cap_vocab[str(word.cpu().numpy())] for word in 70 | cap_labels[random_id, 1:] if word != 0][0:-1]) 71 | 72 | print(video_ids, '\n', 'Predicted CMS: ', cms) 73 | print('GT CMS Caption: ', cap_gt) 74 | print('GT CMS Knowledge: ', cms_list[random_id].split(';')[1:]) 75 | print('\n') 76 | print(batch_id * opt['batch_size'], ' out of ', '3010') 77 | 78 | # Save for evaluation 79 | cmses = cms_list[random_id].split(';')[1:] 80 | res[eval_id] = [cms] 81 | gts[eval_id] = cmses 82 | 83 | eval_id += 1 84 | 85 | ppl_corpus = '' 86 | for c in cmses: 87 | total_cms.add(c.lower()) 88 | ppl_corpus += ' ' + c.lower() 89 | tokens = nltk.word_tokenize(ppl_corpus) 90 | unigram_model = unigram(tokens) 91 | ppl_scores.append(perplexity(cms.lower(), unigram_model)) 92 | 93 | # Compute PPL score 94 | print('Perplexity score: ', sum(ppl_scores)/len(ppl_scores)) 95 | 96 | avg_bleu_score, bleu_scores = Bleu(4).compute_score(gts, res) 97 | avg_cider_score, cider_scores = Cider().compute_score(gts, res) 98 | avg_meteor_score, meteor_scores = Meteor().compute_score(gts, res) 99 | avg_rouge_score, rouge_scores = Rouge().compute_score(gts, res) 100 | print('C, M, R, B:', avg_cider_score, avg_meteor_score, avg_rouge_score, avg_bleu_score) 101 | 102 | 103 | def main(opt): 104 | dataset = VideoDataset(opt, 'test') 105 | dataloader = DataLoader(dataset, collate_fn=test_collate_fn, batch_size=opt['batch_size'], 106 | shuffle=False) 107 | opt['cms_vocab_size'] = dataset.get_cms_vocab_size() 108 | opt['cap_vocab_size'] = dataset.get_cap_vocab_size() 109 | 110 | if opt['cms'] == 'int': 111 | cms_text_length = opt['int_max_len'] 112 | elif opt['cms'] == 'eff': 113 | cms_text_length = opt['eff_max_len'] 114 | else: 115 | cms_text_length = opt['att_max_len'] 116 | 117 | from model.S2VTModel import S2VTModel 118 | model = S2VTModel(dataset.get_cap_vocab_size(), 119 | dataset.get_cms_vocab_size(), 120 | opt['cap_max_len'], 121 | cms_text_length, 122 | opt["dim_model"], 123 | opt["dim_word"], 124 | opt['dim_vis_feat'], 125 | n_layers=opt['rnn_layer']) 126 | 127 | model_parameters = filter(lambda p: p.requires_grad, model.parameters()) 128 | params = sum([np.prod(p.size()) for p in model_parameters]) 129 | print(params) 130 | 131 | if len(opt['load_checkpoint']) != 0: 132 | state_dict = torch.load(opt['load_checkpoint']) 133 | model.load_state_dict(state_dict) 134 | 135 | if opt['cuda']: 136 | model = model.cuda() 137 | 138 | 139 | model.eval() 140 | 141 | test(dataloader, model, opt, dataset.get_cap_vocab(), dataset.get_cms_vocab()) 142 | 143 | 144 | if __name__ == '__main__': 145 | opt = parse_opt() 146 | opt = vars(opt) 147 | opt['captions'] = json.load(open(opt['caption_json'])) 148 | opt['batch_size'] = 30 149 | 150 | main(opt) 151 | -------------------------------------------------------------------------------- /others/test_attention_Video2text.py: -------------------------------------------------------------------------------- 1 | import 
sys 2 | import json 3 | import torch 4 | from opts import * 5 | import numpy as np 6 | import nltk 7 | from utils.utils import * 8 | from pycocoevalcap.bleu.bleu import Bleu 9 | from pycocoevalcap.rouge.rouge import Rouge 10 | from pycocoevalcap.cider.cider import Cider 11 | from pycocoevalcap.meteor.meteor import Meteor 12 | from torch.utils.data import DataLoader 13 | from model.transformer.Constants import * 14 | from utils.gt_caps_dataloader import VideoDataset 15 | 16 | # sys.path.append("./pycocoevalcap/") 17 | 18 | 19 | def test(loader, model, opt, cap_vocab, cms_vocab): 20 | res = {} 21 | gts = {} 22 | eval_id = 0 23 | 24 | total_cms = set() 25 | ppl_scores = [] 26 | 27 | for batch_id, raw_data in enumerate(loader): 28 | if opt['cuda']: torch.cuda.synchronize() 29 | 30 | # iterate each video within the batch 31 | for iterate_id in range(len(raw_data)): 32 | fc_feats = raw_data[iterate_id][0]['fc_feats'].unsqueeze(0) 33 | video_ids = raw_data[iterate_id][0]['video_ids'] 34 | cap_labels = raw_data[iterate_id][0]['cap_labels'] 35 | 36 | if opt['cms'] == 'int': 37 | cms_list = raw_data[iterate_id][1] 38 | elif opt['cms'] == 'eff': 39 | cms_list = raw_data[iterate_id][2] 40 | else: 41 | cms_list = raw_data[iterate_id][3] 42 | 43 | if opt['cuda']: 44 | # cms_list = cms_list.cuda() 45 | cap_labels = cap_labels.cuda() 46 | fc_feats = fc_feats.cuda() 47 | 48 | # repeat the fc features for num_cap times 49 | fc_feats = fc_feats.repeat(len(cap_labels), 1, 1) 50 | 51 | # iterate through all captions per video 52 | with torch.no_grad(): 53 | 54 | # Note, currently we used BEAM search to decode the captions, while greedy strategy 55 | # should yield close or even better results. 56 | 57 | # Beam Search Starts From Here 58 | _, cms_batch_hyp = model(fc_feats, cap_labels=cap_labels, cms_labels=None, mode='test') 59 | 60 | for random_id in range(cap_labels.shape[0]): 61 | # Print out the predicted sentences and GT 62 | if EOS in cms_batch_hyp[random_id]: 63 | stop_id = list(cms_batch_hyp[random_id]).index(EOS) 64 | else: 65 | stop_id = -1 66 | 67 | cms = list_to_sentence([cms_vocab[str(widx.detach().cpu().numpy())] for widx in 68 | cms_batch_hyp[random_id][: stop_id] if widx != 0]) 69 | cap_gt = list_to_sentence([cap_vocab[str(word.cpu().numpy())] for word in 70 | cap_labels[random_id, 1:] if word != 0][0:-1]) 71 | 72 | print(video_ids, '\n', 'Predicted CMS: ', cms) 73 | print('GT Caption: ', cap_gt) 74 | print('GT CMS Knowledge: ', cms_list[random_id].split(';')[1:]) 75 | print('\n') 76 | print(batch_id * opt['batch_size'], ' out of ', '3010') 77 | 78 | # Save for evaluation 79 | cmses = cms_list[random_id].split(';')[1:] 80 | res[eval_id] = [cms] 81 | gts[eval_id] = cmses 82 | 83 | eval_id += 1 84 | 85 | ppl_corpus = '' 86 | for c in cmses: 87 | total_cms.add(c.lower()) 88 | ppl_corpus += ' ' + c.lower() 89 | tokens = nltk.word_tokenize(ppl_corpus) 90 | unigram_model = unigram(tokens) 91 | ppl_scores.append(perplexity(cms.lower(), unigram_model)) 92 | 93 | # Compute PPL score 94 | print('Perplexity score: ', sum(ppl_scores)/len(ppl_scores)) 95 | 96 | avg_bleu_score, bleu_scores = Bleu(4).compute_score(gts, res) 97 | avg_cider_score, cider_scores = Cider().compute_score(gts, res) 98 | avg_meteor_score, meteor_scores = Meteor().compute_score(gts, res) 99 | avg_rouge_score, rouge_scores = Rouge().compute_score(gts, res) 100 | print('C, M, R, B:', avg_cider_score, avg_meteor_score, avg_rouge_score, avg_bleu_score) 101 | 102 | 103 | def main(opt): 104 | dataset = VideoDataset(opt, 'test') 105 | 
dataloader = DataLoader(dataset, collate_fn=test_collate_fn, batch_size=opt['batch_size'], 106 | shuffle=False) 107 | opt['cms_vocab_size'] = dataset.get_cms_vocab_size() 108 | opt['cap_vocab_size'] = dataset.get_cap_vocab_size() 109 | 110 | if opt['cms'] == 'int': 111 | cms_text_length = opt['int_max_len'] 112 | elif opt['cms'] == 'eff': 113 | cms_text_length = opt['eff_max_len'] 114 | else: 115 | cms_text_length = opt['att_max_len'] 116 | 117 | from model.DecoderRNN import DecoderRNN 118 | from model.S2VT_EncoderRNN import EncoderRNN 119 | from model.S2VTAttModel import S2VTAttModel 120 | 121 | encoder = EncoderRNN(2048, 512, 0, 0.2, rnn_cell='gru') 122 | 123 | decoder = DecoderRNN(dataset.get_cap_vocab_size(), opt['cap_max_len'], 512, 512) 124 | 125 | cms_decoder = DecoderRNN(dataset.get_cms_vocab_size(), cms_text_length, 512, 512) 126 | 127 | model = S2VTAttModel(encoder, decoder, cms_decoder) 128 | 129 | 130 | model.eval() 131 | model_parameters = filter(lambda p: p.requires_grad, model.parameters()) 132 | params = sum([np.prod(p.size()) for p in model_parameters]) 133 | print(params) 134 | 135 | model.load_state_dict(torch.load(opt['load_checkpoint'])) 136 | 137 | if opt['cuda']: 138 | model = model.cuda() 139 | 140 | test(dataloader, model, opt, dataset.get_cap_vocab(), dataset.get_cms_vocab()) 141 | 142 | 143 | if __name__ == '__main__': 144 | opt = parse_opt() 145 | opt = vars(opt) 146 | opt['captions'] = json.load(open(opt['caption_json'])) 147 | opt['batch_size'] = 30 148 | 149 | main(opt) 150 | -------------------------------------------------------------------------------- /others/test_transformer.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import torch 4 | from opts import * 5 | import numpy as np 6 | import nltk 7 | from utils.utils import * 8 | from pycocoevalcap.bleu.bleu import Bleu 9 | from pycocoevalcap.rouge.rouge import Rouge 10 | from pycocoevalcap.cider.cider import Cider 11 | from pycocoevalcap.meteor.meteor import Meteor 12 | from model.TransformerModel import Model 13 | from torch.utils.data import DataLoader 14 | from model.transformer.Constants import * 15 | from utils.gt_caps_dataloader import VideoDataset 16 | from model.transformer.cap2cms_Translator_transformer import translate_batch 17 | 18 | # sys.path.append("./pycocoevalcap/") 19 | 20 | 21 | def test(loader, model, opt, cap_vocab, cms_vocab): 22 | res = {} 23 | gts = {} 24 | eval_id = 0 25 | 26 | total_cms = set() 27 | ppl_scores = [] 28 | 29 | for batch_id, raw_data in enumerate(loader): 30 | if opt['cuda']: torch.cuda.synchronize() 31 | 32 | # iterate each video within the batch 33 | for iterate_id in range(len(raw_data)): 34 | fc_feats = raw_data[iterate_id][0]['fc_feats'].unsqueeze(0) 35 | video_ids = raw_data[iterate_id][0]['video_ids'] 36 | cap_labels = raw_data[iterate_id][0]['cap_labels'] 37 | 38 | if opt['cms'] == 'int': 39 | cms_list = raw_data[iterate_id][1] 40 | elif opt['cms'] == 'eff': 41 | cms_list = raw_data[iterate_id][2] 42 | else: 43 | cms_list = raw_data[iterate_id][3] 44 | 45 | if opt['cuda']: 46 | # cms_list = cms_list.cuda() 47 | cap_labels = cap_labels.cuda() 48 | fc_feats = fc_feats.cuda() 49 | 50 | # repeat the fc features for num_cap times 51 | fc_feats = fc_feats.repeat(len(cap_labels), 1, 1) 52 | 53 | # iterate through all captions per video 54 | with torch.no_grad(): 55 | 56 | # Note, currently we used BEAM search to decode the captions, while greedy strategy 57 | # should yield close or even better 
results. 58 | cms_batch_hyp = translate_batch(model, fc_feats, cap_labels, opt) 59 | 60 | for random_id in range(cap_labels.shape[0]): 61 | # Print out the predicted sentences and GT 62 | if EOS in cms_batch_hyp[random_id][0]: 63 | stop_id = cms_batch_hyp[random_id][0].index(EOS) 64 | else: 65 | stop_id = -1 66 | 67 | cms = list_to_sentence([cms_vocab[str(widx)] for widx in 68 | cms_batch_hyp[random_id][0][: stop_id] if widx != 0]) 69 | cap_gt = list_to_sentence([cap_vocab[str(word.cpu().numpy())] for word in 70 | cap_labels[random_id, 1:] if word != 0][0:-1]) 71 | 72 | print(video_ids, '\n', 'Predicted CMS: ', cms) 73 | print('GT CMS Caption: ', cap_gt) 74 | print('GT CMS Knowledge: ', cms_list[random_id].split(';')[1:]) 75 | print('\n') 76 | print(batch_id * opt['batch_size'], ' out of ', '3010') 77 | 78 | # Save for evaluation 79 | cmses = cms_list[random_id].split(';')[1:] 80 | res[eval_id] = [cms] 81 | gts[eval_id] = cmses 82 | 83 | eval_id += 1 84 | 85 | 86 | ppl_corpus = '' 87 | for c in cmses: 88 | total_cms.add(c.lower()) 89 | ppl_corpus += ' ' + c.lower() 90 | tokens = nltk.word_tokenize(ppl_corpus) 91 | unigram_model = unigram(tokens) 92 | ppl_scores.append(perplexity(cms.lower(), unigram_model)) 93 | 94 | # Compute PPL score 95 | print('Perplexity score: ', sum(ppl_scores)/len(ppl_scores)) 96 | 97 | avg_bleu_score, bleu_scores = Bleu(4).compute_score(gts, res) 98 | avg_cider_score, cider_scores = Cider().compute_score(gts, res) 99 | avg_meteor_score, meteor_scores = Meteor().compute_score(gts, res) 100 | avg_rouge_score, rouge_scores = Rouge().compute_score(gts, res) 101 | print('C, M, R, B:', avg_cider_score, avg_meteor_score, avg_rouge_score, avg_bleu_score) 102 | 103 | 104 | def main(opt): 105 | dataset = VideoDataset(opt, 'test') 106 | dataloader = DataLoader(dataset, collate_fn=test_collate_fn, batch_size=opt['batch_size'], 107 | shuffle=False) 108 | opt['cms_vocab_size'] = dataset.get_cms_vocab_size() 109 | opt['cap_vocab_size'] = dataset.get_cap_vocab_size() 110 | 111 | if opt['cms'] == 'int': 112 | cms_text_length = opt['int_max_len'] 113 | elif opt['cms'] == 'eff': 114 | cms_text_length = opt['eff_max_len'] 115 | else: 116 | cms_text_length = opt['att_max_len'] 117 | 118 | model = Model( 119 | dataset.get_cap_vocab_size(), 120 | dataset.get_cms_vocab_size(), 121 | cap_max_seq=opt['cap_max_len'], 122 | cms_max_seq=cms_text_length, 123 | tgt_emb_prj_weight_sharing=True, 124 | vis_emb=opt['dim_vis_feat'], 125 | rnn_layers=opt['rnn_layer'], 126 | d_k=opt['dim_head'], 127 | d_v=opt['dim_head'], 128 | d_model=opt['dim_model'], 129 | d_word_vec=opt['dim_word'], 130 | d_inner=opt['dim_inner'], 131 | n_layers=opt['num_layer'], 132 | n_head=opt['num_head'], 133 | dropout=opt['dropout']) 134 | 135 | if len(opt['load_checkpoint']) != 0: 136 | state_dict = torch.load(opt['load_checkpoint']) 137 | # for name, param in model.state_dict().items(): 138 | # print(name, param.size()) 139 | # 140 | # print('=================') 141 | # print(state_dict.keys()) 142 | model.load_state_dict(state_dict) 143 | 144 | if opt['cuda']: 145 | model = model.cuda() 146 | 147 | model.eval() 148 | model_parameters = filter(lambda p: p.requires_grad, model.parameters()) 149 | params = sum([np.prod(p.size()) for p in model_parameters]) 150 | print(params) 151 | test(dataloader, model, opt, dataset.get_cap_vocab(), dataset.get_cms_vocab()) 152 | 153 | 154 | if __name__ == '__main__': 155 | opt = parse_opt() 156 | opt = vars(opt) 157 | opt['captions'] = json.load(open(opt['caption_json'])) 158 | 
opt['batch_size'] = 30 159 | 160 | main(opt) -------------------------------------------------------------------------------- /others/train_RNN.py: -------------------------------------------------------------------------------- 1 | ''' Training Scropt for V2C captioning task. ''' 2 | 3 | __author__ = 'Jacob Zhiyuan Fang' 4 | 5 | import os 6 | import numpy as np 7 | from opts import * 8 | from utils.utils import * 9 | import torch.optim as optim 10 | from model.Model import Model 11 | from torch.utils.data import DataLoader 12 | from utils.dataloader import VideoDataset 13 | from model.transformer.Optim import ScheduledOptim 14 | 15 | 16 | def train(loader, model, optimizer, opt, cap_vocab, cms_vocab): 17 | 18 | model.train() 19 | 20 | for epoch in range(opt['epochs']): 21 | iteration = 0 22 | 23 | for data in loader: 24 | torch.cuda.synchronize() 25 | 26 | if opt['cms'] == 'int': 27 | cms_labels = data['int_labels'] 28 | elif opt['cms'] == 'eff': 29 | cms_labels = data['eff_labels'] 30 | else: 31 | cms_labels = data['att_labels'] 32 | 33 | if opt['cuda']: 34 | fc_feats = data['fc_feats'].cuda() 35 | cap_labels = data['cap_labels'].cuda() 36 | cms_labels = cms_labels.cuda() 37 | 38 | optimizer.zero_grad() 39 | 40 | # cap_probs, cms_probs = model(fc_feats, cap_labels, cap_pos, cms_labels, cms_pos) 41 | cap_probs, _, cms_probs, _ = model(fc_feats, cap_labels, cms_labels) 42 | 43 | # note: currently we just used most naive cross-entropy as training objective, 44 | # advanced loss func. like SELF-CRIT, different loss weights or stronger video feature 45 | # may lead performance boost, however is not the goal of this work. 46 | cap_loss, cap_n_correct = cal_performance(cap_probs.view(-1, cap_probs.shape[-1]), 47 | cap_labels[:, 1:], smoothing=True) 48 | cms_loss, cms_n_correct = cal_performance(cms_probs.view(-1, cms_probs.shape[-1]), 49 | cms_labels[:, 1:], smoothing=True) 50 | 51 | # compute the token prediction Acc. 52 | non_pad_mask = cap_labels[:, 1:].ne(Constants.PAD) 53 | n_word = non_pad_mask.sum().item() 54 | cms_non_pad_mask = cms_labels[:, 1:].ne(Constants.PAD) 55 | cms_n_word = cms_non_pad_mask.sum().item() 56 | cap_loss /= n_word 57 | cms_loss /= n_word 58 | 59 | loss = cms_loss + cap_loss 60 | 61 | loss.backward() 62 | optimizer.step_and_update_lr() 63 | torch.nn.utils.clip_grad_norm_(filter(lambda p: p.requires_grad, model.parameters()), 1) 64 | 65 | # update parameters 66 | cap_train_loss = cap_loss.item() 67 | cms_train_loss = cms_loss.item() 68 | 69 | # multi-gpu case, not necessary in newer PyTorch version or on single GPU. 
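            # torch.cuda.synchronize() only makes the host wait until all queued CUDA kernels
            # have finished, so the per-iteration losses logged below reflect completed GPU
            # work; it does not change the computation and is skipped on CPU-only runs.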
70 | if opt['cuda']: torch.cuda.synchronize() 71 | 72 | iteration += 1 73 | 74 | if iteration % opt['print_loss_every'] ==0: 75 | print('iter %d (epoch %d), cap_train_loss = %.6f, cms_train_loss = %.6f,' 76 | ' current step = %d, current lr = %.3E, cap_acc = %.3f, cms_acc = %.3f' 77 | % (iteration, epoch, cap_train_loss, cms_train_loss, optimizer.n_current_steps, 78 | optimizer._optimizer.param_groups[0]['lr'], 79 | cap_n_correct/n_word, cms_n_correct/cms_n_word)) 80 | 81 | # show the intermediate generations 82 | if opt['show_predict']: 83 | cap_pr, cap_gt = show_prediction(cap_probs, cap_labels[:, :-1], cap_vocab, caption=True) 84 | cms_pr, cms_gt = show_prediction(cms_probs, cms_labels[:, :-1], cms_vocab, caption=False) 85 | print(' \n') 86 | 87 | with open(opt['info_path'], 'a') as f: 88 | f.write('model_%d, cap_loss: %.6f, cms_loss: %.6f\n' 89 | % (epoch, cap_train_loss, cms_train_loss)) 90 | f.write('\n %s \n %s' % (cap_pr, cap_gt)) 91 | f.write('\n %s \n %s' % (cms_pr, cms_gt)) 92 | f.write('\n') 93 | 94 | if epoch % opt['save_checkpoint_every'] == 0: 95 | 96 | # save the checkpoint 97 | model_path = os.path.join(opt['output_dir'], 98 | 'CMS_CAP_MODEL_{}_lr_{}_BS_{}_Layer_{}_ATTHEAD_{}_HID_{}_RNNLayer_{}_epoch_{}.pth' 99 | .format(opt['cms'], opt['init_lr'], opt['batch_size'], opt['num_layer'], 100 | opt['num_head'], opt['dim_model'], opt['rnn_layer'], epoch)) 101 | 102 | torch.save(model.state_dict(), model_path) 103 | 104 | print('model saved to %s' % model_path) 105 | with open(opt['model_info_path'], 'a') as f: 106 | f.write('model_%d, cap_loss: %.6f, cms_loss: %.6f\n' 107 | % (epoch, cap_train_loss/n_word, cms_train_loss/n_word)) 108 | 109 | 110 | def main(opt): 111 | 112 | # load and define dataloader 113 | dataset = VideoDataset(opt, 'train') 114 | dataloader = DataLoader(dataset, batch_size=opt['batch_size'], shuffle=True) 115 | 116 | opt['cms_vocab_size'] = dataset.get_cms_vocab_size() 117 | opt['cap_vocab_size'] = dataset.get_cap_vocab_size() 118 | 119 | if opt['cms'] == 'int': 120 | cms_text_length = opt['int_max_len'] 121 | elif opt['cms'] == 'eff': 122 | cms_text_length = opt['eff_max_len'] 123 | else: 124 | cms_text_length = opt['att_max_len'] 125 | 126 | # model initialization. 127 | from model.S2VTModel import S2VTModel 128 | model = S2VTModel( 129 | dataset.get_cap_vocab_size(), 130 | dataset.get_cms_vocab_size(), 131 | opt['cap_max_len'], 132 | cms_text_length, 133 | opt["dim_model"], 134 | opt["dim_word"], 135 | opt['dim_vis_feat'], 136 | n_layers=opt['rnn_layer']) 137 | 138 | # number of parameters 139 | model_parameters = filter(lambda p: p.requires_grad, model.parameters()) 140 | params = sum([np.prod(p.size()) for p in model_parameters]) 141 | print('number of learnable parameters are {}'.format(params)) 142 | 143 | if opt['cuda']: model = model.cuda() 144 | 145 | # resume from previous checkpoint if indicated 146 | if opt['load_checkpoint'] and opt['resume']: 147 | cap_state_dict = torch.load(opt['load_checkpoint']) 148 | model_dict = model.state_dict() 149 | model_dict.update(cap_state_dict) 150 | model.load_state_dict(model_dict) 151 | 152 | optimizer = ScheduledOptim(optim.Adam(filter(lambda x: x.requires_grad, model.parameters()), 153 | betas=(0.9, 0.98), eps=1e-09), 512, opt['warm_up_steps']) 154 | 155 | # note: though we set the init learning rate as np.power(d_model, -0.5), 156 | # grid search indicates different LR may improve the results. 
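    # A rough sketch of the assumed schedule (ScheduledOptim is expected to implement the
    # standard "Noam" warm-up; see model/transformer/Optim.py for the authoritative code):
    #     lr(step) = d_model ** -0.5 * min(step ** -0.5, step * warm_up_steps ** -1.5)
    # e.g. with d_model = 512 and warm_up_steps = 4000 the peak learning rate is about
    # 512 ** -0.5 * 4000 ** -0.5, i.e. roughly 7.0e-4, reached at the end of warm-up.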
157 | opt['init_lr'] = round(optimizer.init_lr, 3) 158 | 159 | # create checkpoint output directory 160 | dir = os.path.join(opt['checkpoint_path'], 'S2VT_CMS_CAP_MODEL_{}_lr_{}_BS_{}_Layer_{}_ATTHEAD_{}_HID_{}_RNNLayer_{}' 161 | .format(opt['cms'], opt['init_lr'], opt['batch_size'], opt['num_layer'], 162 | opt['num_head'], opt['dim_model'], opt['rnn_layer'])) 163 | 164 | if not os.path.exists(dir): os.makedirs(dir) 165 | 166 | # save the model snapshot to local 167 | info_path = os.path.join(dir, 'iteration_info_log.log') 168 | print('model architecture saved to {} \n {}'.format(info_path, str(model))) 169 | with open(info_path, 'a') as f: 170 | f.write(str(model)) 171 | f.write('\n') 172 | f.write(str(params)) 173 | f.write('\n') 174 | 175 | # log file directory 176 | opt['output_dir'] = dir 177 | opt['info_path'] = info_path 178 | opt['model_info_path'] = os.path.join(opt['output_dir'], 'checkpoint_loss_log.log') 179 | 180 | train(dataloader, model, optimizer, opt, dataset.get_cap_vocab(), dataset.get_cms_vocab()) 181 | 182 | if __name__ == '__main__': 183 | opt = parse_opt() 184 | opt = vars(opt) 185 | main(opt) -------------------------------------------------------------------------------- /others/train_transformer.py: -------------------------------------------------------------------------------- 1 | ''' Training Scropt for V2C captioning task. ''' 2 | 3 | __author__ = 'Jacob Zhiyuan Fang' 4 | 5 | import os 6 | import numpy as np 7 | from opts import * 8 | from utils.utils import * 9 | import torch.optim as optim 10 | from model.TransformerModel import Model 11 | from torch.utils.data import DataLoader 12 | from utils.dataloader import VideoDataset 13 | from model.transformer.Optim import ScheduledOptim 14 | 15 | 16 | def train(loader, model, optimizer, opt, cap_vocab, cms_vocab): 17 | 18 | model.train() 19 | 20 | for epoch in range(opt['epochs']): 21 | iteration = 0 22 | 23 | for data in loader: 24 | torch.cuda.synchronize() 25 | 26 | if opt['cms'] == 'int': 27 | cms_labels = data['int_labels'] 28 | elif opt['cms'] == 'eff': 29 | cms_labels = data['eff_labels'] 30 | else: 31 | cms_labels = data['att_labels'] 32 | 33 | if opt['cuda']: 34 | fc_feats = data['fc_feats'].cuda() 35 | cap_labels = data['cap_labels'].cuda() 36 | cms_labels = cms_labels.cuda() 37 | else: 38 | fc_feats = data['fc_feats'] 39 | cap_labels = data['cap_labels'] 40 | cms_labels = cms_labels.cuda() 41 | 42 | optimizer.zero_grad() 43 | 44 | cap_pos = pos_emb_generation(cap_labels) 45 | cms_pos = pos_emb_generation(cms_labels) 46 | 47 | cap_probs, cms_probs = model(fc_feats, cap_labels, cap_pos, cms_labels, cms_pos) 48 | 49 | # note: currently we just used most naive cross-entropy as training objective, 50 | # advanced loss func. like SELF-CRIT, different loss weights or stronger video feature 51 | # may lead performance boost, however is not the goal of this work. 52 | cap_loss, cap_n_correct = cal_performance(cap_probs, cap_labels[:, 1:], smoothing=True) 53 | cms_loss, cms_n_correct = cal_performance(cms_probs, cms_labels[:, 1:], smoothing=True) 54 | 55 | # compute the token prediction Acc. 
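            # cal_performance (from utils.utils) is assumed to return the summed cross-entropy
            # loss (label-smoothed when smoothing=True) plus the number of correctly predicted
            # non-<pad> tokens, so the accuracies printed below are n_correct / n_word with
            # n_word counting only non-<pad> target positions.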
56 | non_pad_mask = cap_labels[:, 1:].ne(Constants.PAD) 57 | n_word = non_pad_mask.sum().item() 58 | cms_non_pad_mask = cms_labels[:, 1:].ne(Constants.PAD) 59 | cms_n_word = cms_non_pad_mask.sum().item() 60 | cap_loss /= n_word 61 | cms_loss /= n_word 62 | 63 | loss = cms_loss + cap_loss 64 | 65 | loss.backward() 66 | optimizer.step_and_update_lr() 67 | torch.nn.utils.clip_grad_norm_(filter(lambda p: p.requires_grad, model.parameters()), 1) 68 | 69 | # update parameters 70 | cap_train_loss = cap_loss.item() 71 | cms_train_loss = cms_loss.item() 72 | 73 | # multi-gpu case, not necessary in newer PyTorch version or on single GPU. 74 | if opt['cuda']: torch.cuda.synchronize() 75 | 76 | iteration += 1 77 | 78 | if iteration % opt['print_loss_every'] ==0: 79 | print('iter %d (epoch %d), cap_train_loss = %.6f, cms_train_loss = %.6f,' 80 | ' current step = %d, current lr = %.3E, cap_acc = %.3f, cms_acc = %.3f' 81 | % (iteration, epoch, cap_train_loss, cms_train_loss, optimizer.n_current_steps, 82 | optimizer._optimizer.param_groups[0]['lr'], 83 | cap_n_correct/n_word, cms_n_correct/cms_n_word)) 84 | 85 | # show the intermediate generations 86 | if opt['show_predict']: 87 | cap_pr, cap_gt = show_prediction(cap_probs, cap_labels[:, :-1], cap_vocab, caption=True) 88 | cms_pr, cms_gt = show_prediction(cms_probs, cms_labels[:, :-1], cms_vocab, caption=False) 89 | print(' \n') 90 | 91 | with open(opt['info_path'], 'a') as f: 92 | f.write('model_%d, cap_loss: %.6f, cms_loss: %.6f\n' 93 | % (epoch, cap_train_loss, cms_train_loss)) 94 | f.write('\n %s \n %s' % (cap_pr, cap_gt)) 95 | f.write('\n %s \n %s' % (cms_pr, cms_gt)) 96 | f.write('\n') 97 | 98 | if epoch % opt['save_checkpoint_every'] == 0: 99 | 100 | # save the checkpoint 101 | model_path = os.path.join(opt['output_dir'], 102 | 'Transformer_CMS_CAP_MODEL_{}_lr_{}_BS_{}_Layer_{}_ATTHEAD_{}_HID_{}_RNNLayer_{}_epoch_{}.pth' 103 | .format(opt['cms'], opt['init_lr'], opt['batch_size'], opt['num_layer'], 104 | opt['num_head'], opt['dim_model'], opt['rnn_layer'], epoch)) 105 | 106 | torch.save(model.state_dict(), model_path) 107 | 108 | print('model saved to %s' % model_path) 109 | with open(opt['model_info_path'], 'a') as f: 110 | f.write('model_%d, cap_loss: %.6f, cms_loss: %.6f\n' 111 | % (epoch, cap_train_loss/n_word, cms_train_loss/n_word)) 112 | 113 | 114 | def main(opt): 115 | 116 | # load and define dataloader 117 | dataset = VideoDataset(opt, 'train') 118 | dataloader = DataLoader(dataset, batch_size=opt['batch_size'], shuffle=True) 119 | 120 | opt['cms_vocab_size'] = dataset.get_cms_vocab_size() 121 | opt['cap_vocab_size'] = dataset.get_cap_vocab_size() 122 | 123 | if opt['cms'] == 'int': 124 | cms_text_length = opt['int_max_len'] 125 | elif opt['cms'] == 'eff': 126 | cms_text_length = opt['eff_max_len'] 127 | else: 128 | cms_text_length = opt['att_max_len'] 129 | 130 | # model initialization. 
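    # A short gloss of the arguments below (see model/TransformerModel.py for the authoritative
    # definitions): d_model is the hidden size, d_word_vec the word-embedding size, d_inner the
    # position-wise feed-forward size, d_k/d_v the per-head key/value dimensions, and
    # n_layers/n_head the number of decoder layers and attention heads.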
131 | model = Model( 132 | dataset.get_cap_vocab_size(), 133 | dataset.get_cms_vocab_size(), 134 | cap_max_seq=opt['cap_max_len'], 135 | cms_max_seq=cms_text_length, 136 | tgt_emb_prj_weight_sharing=True, 137 | vis_emb=opt['dim_vis_feat'], 138 | rnn_layers=opt['rnn_layer'], 139 | d_k=opt['dim_head'], 140 | d_v=opt['dim_head'], 141 | d_model=opt['dim_model'], 142 | d_word_vec=opt['dim_word'], 143 | d_inner=opt['dim_inner'], 144 | n_layers=opt['num_layer'], 145 | n_head=opt['num_head'], 146 | dropout=opt['dropout']) 147 | 148 | # number of parameters 149 | model_parameters = filter(lambda p: p.requires_grad, model.parameters()) 150 | params = sum([np.prod(p.size()) for p in model_parameters]) 151 | print('number of learnable parameters are {}'.format(params)) 152 | 153 | if opt['cuda']: model = model.cuda() 154 | 155 | # resume from previous checkpoint if indicated 156 | if opt['load_checkpoint'] and opt['resume']: 157 | cap_state_dict = torch.load(opt['load_checkpoint']) 158 | model_dict = model.state_dict() 159 | model_dict.update(cap_state_dict) 160 | model.load_state_dict(model_dict) 161 | 162 | optimizer = ScheduledOptim(optim.Adam(filter(lambda x: x.requires_grad, model.parameters()), 163 | betas=(0.9, 0.98), eps=1e-09), 512, opt['warm_up_steps']) 164 | 165 | # note: though we set the init learning rate as np.power(d_model, -0.5), 166 | # grid search indicates different LR may improve the results. 167 | opt['init_lr'] = round(optimizer.init_lr, 3) 168 | 169 | # create checkpoint output directory 170 | dir = os.path.join(opt['checkpoint_path'], 'CMS_CAP_MODEL_{}_lr_{}_BS_{}_Layer_{}_ATTHEAD_{}_HID_{}_RNNLayer_{}' 171 | .format(opt['cms'], opt['init_lr'], opt['batch_size'], opt['num_layer'], 172 | opt['num_head'], opt['dim_model'], opt['rnn_layer'])) 173 | 174 | if not os.path.exists(dir): os.makedirs(dir) 175 | 176 | # save the model snapshot to local 177 | info_path = os.path.join(dir, 'iteration_info_log.log') 178 | print('model architecture saved to {} \n {}'.format(info_path, str(model))) 179 | with open(info_path, 'a') as f: 180 | f.write(str(model)) 181 | f.write('\n') 182 | f.write(str(params)) 183 | f.write('\n') 184 | 185 | # log file directory 186 | opt['output_dir'] = dir 187 | opt['info_path'] = info_path 188 | opt['model_info_path'] = os.path.join(opt['output_dir'], 'checkpoint_loss_log.log') 189 | 190 | train(dataloader, model, optimizer, opt, dataset.get_cap_vocab(), dataset.get_cms_vocab()) 191 | 192 | if __name__ == '__main__': 193 | opt = parse_opt() 194 | opt = vars(opt) 195 | main(opt) -------------------------------------------------------------------------------- /pictures/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jacobswan1/Video2Commonsense/4dcef76360a29702fd90b7030a39a123da6db19e/pictures/arch.png -------------------------------------------------------------------------------- /pictures/v2c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jacobswan1/Video2Commonsense/4dcef76360a29702fd90b7030a39a123da6db19e/pictures/v2c.png -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import torch 4 | from opts import * 5 | import numpy as np 6 | import nltk 7 | from utils.utils import * 8 | from pycocoevalcap.bleu.bleu import Bleu 9 | from pycocoevalcap.rouge.rouge 
import Rouge 10 | from pycocoevalcap.cider.cider import Cider 11 | from pycocoevalcap.meteor.meteor import Meteor 12 | from model.Model import Model 13 | from torch.utils.data import DataLoader 14 | from model.transformer.Constants import * 15 | from utils.gt_caps_dataloader import VideoDataset 16 | from model.transformer.cap2cms_Translator import translate_batch 17 | 18 | # sys.path.append("./pycocoevalcap/") 19 | 20 | 21 | def test(loader, model, opt, cap_vocab, cms_vocab): 22 | res = {} 23 | gts = {} 24 | eval_id = 0 25 | 26 | total_cms = set() 27 | ppl_scores = [] 28 | 29 | for batch_id, raw_data in enumerate(loader): 30 | if opt['cuda']: torch.cuda.synchronize() 31 | 32 | # iterate each video within the batch 33 | for iterate_id in range(len(raw_data)): 34 | fc_feats = raw_data[iterate_id][0]['fc_feats'].unsqueeze(0) 35 | video_ids = raw_data[iterate_id][0]['video_ids'] 36 | cap_labels = raw_data[iterate_id][0]['cap_labels'] 37 | 38 | if opt['cms'] == 'int': 39 | cms_list = raw_data[iterate_id][1] 40 | elif opt['cms'] == 'eff': 41 | cms_list = raw_data[iterate_id][2] 42 | else: 43 | cms_list = raw_data[iterate_id][3] 44 | 45 | if opt['cuda']: 46 | # cms_list = cms_list.cuda() 47 | cap_labels = cap_labels.cuda() 48 | fc_feats = fc_feats.cuda() 49 | 50 | # repeat the fc features for num_cap times 51 | fc_feats = fc_feats.repeat(len(cap_labels), 1, 1) 52 | 53 | # iterate through all captions per video 54 | with torch.no_grad(): 55 | 56 | # Note, currently we used BEAM search to decode the captions, while greedy strategy 57 | # should yield close or even better results. 58 | cms_batch_hyp = translate_batch(model, fc_feats, cap_labels, opt) 59 | 60 | for random_id in range(cap_labels.shape[0]): 61 | # Print out the predicted sentences and GT 62 | if EOS in cms_batch_hyp[random_id][0]: 63 | stop_id = cms_batch_hyp[random_id][0].index(EOS) 64 | else: 65 | stop_id = -1 66 | 67 | cms = list_to_sentence([cms_vocab[str(widx)] for widx in 68 | cms_batch_hyp[random_id][0][: stop_id] if widx != 0]) 69 | cap_gt = list_to_sentence([cap_vocab[str(word.cpu().numpy())] for word in 70 | cap_labels[random_id, 1:] if word != 0][0:-1]) 71 | 72 | print(video_ids, '\n', 'Predicted CMS: ', cms) 73 | print('GT CMS Caption: ', cap_gt) 74 | print('GT CMS Knowledge: ', cms_list[random_id].split(';')[1:]) 75 | print('\n') 76 | print(batch_id * opt['batch_size'], ' out of ', '3010') 77 | 78 | # Save for evaluation 79 | cmses = cms_list[random_id].split(';')[1:] 80 | res[eval_id] = [cms] 81 | gts[eval_id] = cmses 82 | 83 | eval_id += 1 84 | 85 | # Note! It is controversial whether PPL score reflect the quality of CMS as its using the corpus token 86 | # probability. It is unclear which corpus (either total CMS corpus or the only 5 GT CMS, which is narrow) 87 | # best reflects the results. Thus we remove it from our official results. Score in here is just for 88 | # comparisons, where we used the only 5 GT annotations as corpus base. 
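                    # unigram() and perplexity() come from utils.utils; they are assumed to fit a
                    # unigram frequency model p(w) on the tokenized GT corpus and to score the
                    # generated sentence roughly as PPL(s) = exp(-(1/N) * sum_i log p(w_i)),
                    # where N is the sentence length, so lower is better.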
89 | ppl_corpus = '' 90 | for c in cmses: 91 | total_cms.add(c.lower()) 92 | ppl_corpus += ' ' + c.lower() 93 | tokens = nltk.word_tokenize(ppl_corpus) 94 | unigram_model = unigram(tokens) 95 | ppl_scores.append(perplexity(cms.lower(), unigram_model)) 96 | 97 | # Compute PPL score 98 | print('Perplexity score: ', sum(ppl_scores)/len(ppl_scores)) 99 | 100 | avg_bleu_score, bleu_scores = Bleu(4).compute_score(gts, res) 101 | avg_cider_score, cider_scores = Cider().compute_score(gts, res) 102 | avg_meteor_score, meteor_scores = Meteor().compute_score(gts, res) 103 | avg_rouge_score, rouge_scores = Rouge().compute_score(gts, res) 104 | print('C, M, R, B:', avg_cider_score, avg_meteor_score, avg_rouge_score, avg_bleu_score) 105 | 106 | 107 | def main(opt): 108 | dataset = VideoDataset(opt, 'test') 109 | dataloader = DataLoader(dataset, collate_fn=test_collate_fn, batch_size=opt['batch_size'], 110 | shuffle=False) 111 | opt['cms_vocab_size'] = dataset.get_cms_vocab_size() 112 | opt['cap_vocab_size'] = dataset.get_cap_vocab_size() 113 | 114 | if opt['cms'] == 'int': 115 | cms_text_length = opt['int_max_len'] 116 | elif opt['cms'] == 'eff': 117 | cms_text_length = opt['eff_max_len'] 118 | else: 119 | cms_text_length = opt['att_max_len'] 120 | 121 | model = Model( 122 | dataset.get_cap_vocab_size(), 123 | dataset.get_cms_vocab_size(), 124 | cap_max_seq=opt['cap_max_len'], 125 | cms_max_seq=cms_text_length, 126 | tgt_emb_prj_weight_sharing=True, 127 | vis_emb=opt['dim_vis_feat'], 128 | rnn_layers=opt['rnn_layer'], 129 | d_k=opt['dim_head'], 130 | d_v=opt['dim_head'], 131 | d_model=opt['dim_model'], 132 | d_word_vec=opt['dim_word'], 133 | d_inner=opt['dim_inner'], 134 | n_layers=opt['num_layer'], 135 | n_head=opt['num_head'], 136 | dropout=opt['dropout']) 137 | 138 | if len(opt['load_checkpoint']) != 0: 139 | state_dict = torch.load(opt['load_checkpoint']) 140 | # for name, param in model.state_dict().items(): 141 | # print(name, param.size()) 142 | # 143 | # print('=================') 144 | # print(state_dict.keys()) 145 | model.load_state_dict(state_dict) 146 | 147 | if opt['cuda']: 148 | model = model.cuda() 149 | 150 | model.eval() 151 | model_parameters = filter(lambda p: p.requires_grad, model.parameters()) 152 | params = sum([np.prod(p.size()) for p in model_parameters]) 153 | print(params) 154 | test(dataloader, model, opt, dataset.get_cap_vocab(), dataset.get_cms_vocab()) 155 | 156 | 157 | if __name__ == '__main__': 158 | opt = parse_opt() 159 | opt = vars(opt) 160 | opt['captions'] = json.load(open(opt['caption_json'])) 161 | opt['batch_size'] = 30 162 | 163 | main(opt) -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | ''' Training Scropt for V2C captioning task. 
''' 2 | 3 | __author__ = 'Jacob Zhiyuan Fang' 4 | 5 | import os 6 | import numpy as np 7 | from opts import * 8 | from utils.utils import * 9 | import torch.optim as optim 10 | from model.Model import Model 11 | from torch.utils.data import DataLoader 12 | from utils.dataloader import VideoDataset 13 | from model.transformer.Optim import ScheduledOptim 14 | 15 | 16 | def train(loader, model, optimizer, opt, cap_vocab, cms_vocab): 17 | 18 | model.train() 19 | 20 | for epoch in range(opt['epochs']): 21 | iteration = 0 22 | 23 | for data in loader: 24 | torch.cuda.synchronize() 25 | 26 | if opt['cms'] == 'int': 27 | cms_labels = data['int_labels'] 28 | elif opt['cms'] == 'eff': 29 | cms_labels = data['eff_labels'] 30 | else: 31 | cms_labels = data['att_labels'] 32 | 33 | if opt['cuda']: 34 | fc_feats = data['fc_feats'].cuda() 35 | cap_labels = data['cap_labels'].cuda() 36 | cms_labels = cms_labels.cuda() 37 | else: 38 | fc_feats = data['fc_feats'] 39 | cap_labels = data['cap_labels'] 40 | cms_labels = cms_labels.cuda() 41 | 42 | optimizer.zero_grad() 43 | 44 | cap_pos = pos_emb_generation(cap_labels) 45 | cms_pos = pos_emb_generation(cms_labels) 46 | 47 | cap_probs, cms_probs = model(fc_feats, cap_labels, cap_pos, cms_labels, cms_pos) 48 | 49 | # note: currently we just used most naive cross-entropy as training objective, 50 | # advanced loss func. like SELF-CRIT, different loss weights or stronger video feature 51 | # may lead performance boost, however is not the goal of this work. 52 | cap_loss, cap_n_correct = cal_performance(cap_probs, cap_labels[:, 1:], smoothing=True) 53 | cms_loss, cms_n_correct = cal_performance(cms_probs, cms_labels[:, 1:], smoothing=True) 54 | 55 | # compute the token prediction Acc. 56 | non_pad_mask = cap_labels[:, 1:].ne(Constants.PAD) 57 | n_word = non_pad_mask.sum().item() 58 | cms_non_pad_mask = cms_labels[:, 1:].ne(Constants.PAD) 59 | cms_n_word = cms_non_pad_mask.sum().item() 60 | cap_loss /= n_word 61 | cms_loss /= n_word 62 | 63 | loss = cms_loss + cap_loss 64 | 65 | loss.backward() 66 | optimizer.step_and_update_lr() 67 | torch.nn.utils.clip_grad_norm_(filter(lambda p: p.requires_grad, model.parameters()), 1) 68 | 69 | # update parameters 70 | cap_train_loss = cap_loss.item() 71 | cms_train_loss = cms_loss.item() 72 | 73 | # multi-gpu case, not necessary in newer PyTorch version or on single GPU. 
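            # (relatedly, the CPU fallback branch near the top of this loop still calls
            # cms_labels.cuda(), which will raise on a machine without CUDA; the labels
            # should stay on the CPU in that branch.)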
74 | if opt['cuda']: torch.cuda.synchronize() 75 | 76 | iteration += 1 77 | 78 | if iteration % opt['print_loss_every'] ==0: 79 | print('iter %d (epoch %d), cap_train_loss = %.6f, cms_train_loss = %.6f,' 80 | ' current step = %d, current lr = %.3E, cap_acc = %.3f, cms_acc = %.3f' 81 | % (iteration, epoch, cap_train_loss, cms_train_loss, optimizer.n_current_steps, 82 | optimizer._optimizer.param_groups[0]['lr'], 83 | cap_n_correct/n_word, cms_n_correct/cms_n_word)) 84 | 85 | # show the intermediate generations 86 | if opt['show_predict']: 87 | cap_pr, cap_gt = show_prediction(cap_probs, cap_labels[:, :-1], cap_vocab, caption=True) 88 | cms_pr, cms_gt = show_prediction(cms_probs, cms_labels[:, :-1], cms_vocab, caption=False) 89 | print(' \n') 90 | 91 | with open(opt['info_path'], 'a') as f: 92 | f.write('model_%d, cap_loss: %.6f, cms_loss: %.6f\n'% (epoch, cap_train_loss, cms_train_loss)) 93 | f.write('\n %s \n %s' % (cap_pr, cap_gt)) 94 | f.write('\n %s \n %s' % (cms_pr, cms_gt)) 95 | f.write('\n') 96 | 97 | if epoch % opt['save_checkpoint_every'] == 0: 98 | 99 | # save the checkpoint 100 | model_path = os.path.join(opt['output_dir'], 101 | 'CMS_CAP_MODEL_INT_lr_{}_BS_{}_Layer_{}_ATTHEAD_{}_HID_{}_RNNLayer_{}_epoch_{}.pth' 102 | .format(opt['init_lr'], opt['batch_size'], opt['num_layer'], 103 | opt['num_head'], opt['dim_model'], opt['rnn_layer'], epoch)) 104 | 105 | torch.save(model.state_dict(), model_path) 106 | 107 | print('model saved to %s' % model_path) 108 | with open(opt['model_info_path'], 'a') as f: 109 | f.write('model_%d, cap_loss: %.6f, cms_loss: %.6f\n' 110 | % (epoch, cap_train_loss/n_word, cms_train_loss/n_word)) 111 | 112 | 113 | def main(opt): 114 | 115 | # load and define dataloader 116 | dataset = VideoDataset(opt, 'train') 117 | dataloader = DataLoader(dataset, batch_size=opt['batch_size'], shuffle=True) 118 | 119 | opt['cms_vocab_size'] = dataset.get_cms_vocab_size() 120 | opt['cap_vocab_size'] = dataset.get_cap_vocab_size() 121 | 122 | if opt['cms'] == 'int': 123 | cms_text_length = opt['int_max_len'] 124 | elif opt['cms'] == 'eff': 125 | cms_text_length = opt['eff_max_len'] 126 | else: 127 | cms_text_length = opt['att_max_len'] 128 | 129 | # model initialization. 
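    # tgt_emb_prj_weight_sharing=True presumably ties the target word embedding to the
    # pre-softmax projection, a standard trick that saves parameters and often helps on
    # small vocabularies; see model/Model.py for the exact behaviour.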
130 | model = Model( 131 | dataset.get_cap_vocab_size(), 132 | dataset.get_cms_vocab_size(), 133 | cap_max_seq=opt['cap_max_len'], 134 | cms_max_seq=cms_text_length, 135 | tgt_emb_prj_weight_sharing=True, 136 | vis_emb=opt['dim_vis_feat'], 137 | rnn_layers=opt['rnn_layer'], 138 | d_k=opt['dim_head'], 139 | d_v=opt['dim_head'], 140 | d_model=opt['dim_model'], 141 | d_word_vec=opt['dim_word'], 142 | d_inner=opt['dim_inner'], 143 | n_layers=opt['num_layer'], 144 | n_head=opt['num_head'], 145 | dropout=opt['dropout']) 146 | 147 | # number of parameters 148 | model_parameters = filter(lambda p: p.requires_grad, model.parameters()) 149 | params = sum([np.prod(p.size()) for p in model_parameters]) 150 | print('number of learnable parameters are {}'.format(params)) 151 | 152 | if opt['cuda']: model = model.cuda() 153 | 154 | # resume from previous checkpoint if indicated 155 | if opt['load_checkpoint'] and opt['resume']: 156 | cap_state_dict = torch.load(opt['load_checkpoint']) 157 | model_dict = model.state_dict() 158 | model_dict.update(cap_state_dict) 159 | model.load_state_dict(model_dict) 160 | 161 | optimizer = ScheduledOptim(optim.Adam(filter(lambda x: x.requires_grad, model.parameters()), 162 | betas=(0.9, 0.98), eps=1e-09), 512, opt['warm_up_steps']) 163 | 164 | # note: though we set the init learning rate as np.power(d_model, -0.5), 165 | # grid search indicates different LR may improve the results. 166 | opt['init_lr'] = round(optimizer.init_lr, 3) 167 | 168 | # create checkpoint output directory 169 | dir = os.path.join(opt['checkpoint_path'], 'CMS_CAP_MODEL_INT_lr_{}_BS_{}_Layer_{}_ATTHEAD_{}_HID_{}_RNNLayer_{}' 170 | .format(opt['init_lr'], opt['batch_size'], opt['num_layer'], 171 | opt['num_head'], opt['dim_model'], opt['rnn_layer'])) 172 | 173 | if not os.path.exists(dir): os.makedirs(dir) 174 | 175 | # save the model snapshot to local 176 | info_path = os.path.join(dir, 'iteration_info_log.log') 177 | print('model architecture saved to {} \n {}'.format(info_path, str(model))) 178 | with open(info_path, 'a') as f: 179 | f.write(str(model)) 180 | f.write('\n') 181 | f.write(str(params)) 182 | f.write('\n') 183 | 184 | # log file directory 185 | opt['output_dir'] = dir 186 | opt['info_path'] = info_path 187 | opt['model_info_path'] = os.path.join(opt['output_dir'], 'checkpoint_loss_log.log') 188 | 189 | train(dataloader, model, optimizer, opt, dataset.get_cap_vocab(), dataset.get_cms_vocab()) 190 | 191 | if __name__ == '__main__': 192 | opt = parse_opt() 193 | opt = vars(opt) 194 | main(opt) 195 | -------------------------------------------------------------------------------- /utils/allinone_dataloader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | import random 5 | import numpy as np 6 | from torch.utils.data import Dataset 7 | 8 | 9 | class VideoDataset(Dataset): 10 | 11 | def get_cms_vocab_size(self): 12 | return len(self.get_cms_vocab()) 13 | 14 | def get_cap_vocab_size(self): 15 | return len(self.get_cap_vocab()) 16 | 17 | def get_cms_vocab(self): 18 | return self.cms_ix_to_word 19 | 20 | def get_cap_vocab(self): 21 | return self.cap_ix_to_word 22 | 23 | def get_seq_length(self): 24 | return self.seq_length 25 | 26 | def __init__(self, opt, mode='train'): 27 | super(VideoDataset, self).__init__() 28 | self.mode = mode 29 | 30 | self.captions = json.load(open(opt['caption_json'])) 31 | cms_info = json.load(open(opt['info_json'])) 32 | self.cms_ix_to_word = cms_info['ix_to_word'] 33 | 
self.cms_word_to_ix = cms_info['word_to_ix'] 34 | self.splits = cms_info['videos'] 35 | 36 | # Load caption dictionary 37 | cap_info = json.load(open(opt['cap_info_json'])) 38 | self.cap_ix_to_word = cap_info['ix_to_word'] 39 | self.cap_word_to_ix = cap_info['word_to_ix'] 40 | 41 | print('Caption vocab size is ', len(self.cap_ix_to_word)) 42 | print('CMS vocab size is ', len(self.cms_ix_to_word)) 43 | print('number of train videos: ', len(self.splits['train'])) 44 | print('number of test videos: ', len(self.splits['test'])) 45 | print('number of val videos: ', len(self.splits['val'])) 46 | 47 | self.feats_dir = opt['feats_dir'] 48 | print('load feats from %s' % self.feats_dir) 49 | 50 | self.cap_max_len = opt['cap_max_len'] 51 | self.int_max_len = opt['int_max_len'] 52 | self.eff_max_len = opt['eff_max_len'] 53 | self.att_max_len = opt['att_max_len'] 54 | print('max sequence length of caption is', self.cap_max_len) 55 | print('max sequence length of intention is', self.int_max_len) 56 | print('max sequence length of effect is', self.eff_max_len) 57 | print('max sequence length of attribute is', self.att_max_len) 58 | 59 | def __getitem__(self, ix=False): 60 | if not ix: 61 | if self.mode == 'train': 62 | ix = random.choice(self.splits['train']) 63 | elif self.mode == 'test': 64 | ix = self.splits['test'][ix] 65 | 66 | fc_feat = [] 67 | for dir in self.feats_dir: 68 | fc_feat.append(np.load(os.path.join(dir, 'video%i.npy' % ix))) 69 | fc_feat = np.concatenate(fc_feat, axis=1) 70 | 71 | total_length = self.int_max_len + self.cap_max_len + self.eff_max_len + self.att_max_len 72 | cap_mask = np.zeros(total_length) 73 | cap_gts = np.zeros(total_length) 74 | 75 | idx = 'video%i' % ix 76 | if idx not in self.captions.keys(): 77 | raw_data = self.captions[random.choice(list(self.captions.keys()))] 78 | else: 79 | raw_data = self.captions[idx] 80 | 81 | cap_ix = random.randint(0, len(raw_data) - 1) # Random pick out one caption 82 | 83 | caption = raw_data[cap_ix]['final_caption'] 84 | intentions = raw_data[cap_ix]['intention'] 85 | intention = intentions[random.randint(0, len(intentions)-1)][1] 86 | 87 | effects = raw_data[cap_ix]['effect'] 88 | effect = effects[random.randint(0, len(effects)-1)][1] 89 | 90 | attributes = raw_data[cap_ix]['attribute'] 91 | attribute = attributes[random.randint(0, len(attributes)-1)][1] 92 | 93 | allinone_caption = intention[:-1] + [''] + caption[1:-1] + [''] + \ 94 | effect[1:-1] + [''] + attribute[1:] 95 | 96 | if len(allinone_caption) > total_length: 97 | allinone_caption = allinone_caption[:total_length] 98 | allinone_caption[-1] = '' 99 | 100 | for j, w in enumerate(allinone_caption): 101 | cap_gts[j] = self.cap_word_to_ix.get(w, '1') 102 | 103 | non_zero = (cap_gts == 0).nonzero() 104 | if len(non_zero[0]) != 0: cap_mask[:int(non_zero[0][0])] = 1 105 | else: cap_mask += 1 106 | 107 | data = {} 108 | data['fc_feats'] = torch.from_numpy(fc_feat).type(torch.FloatTensor) 109 | data['cap_labels'] = torch.from_numpy(cap_gts).type(torch.LongTensor) 110 | data['cap_masks'] = torch.from_numpy(cap_mask).type(torch.FloatTensor) 111 | 112 | data['video_ids'] = 'video%i' % ix 113 | return data 114 | 115 | def __len__(self): 116 | return len(self.splits[self.mode]) 117 | -------------------------------------------------------------------------------- /utils/cocoeval.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Wrapper for evaluation on CIDEr, ROUGE_L, METEOR and Bleu_N 3 | using coco-caption repo 
https://github.com/tylin/coco-caption 4 | 5 | class COCOScorer is taken from https://github.com/yaoli/arctic-capgen-vid 6 | ''' 7 | 8 | import json 9 | import os 10 | import sys 11 | sys.path.append('coco-caption') 12 | 13 | from pycocoevalcap.bleu.bleu import Bleu 14 | from pycocoevalcap.rouge.rouge import Rouge 15 | from pycocoevalcap.cider.cider import Cider 16 | from pycocoevalcap.meteor.meteor import Meteor 17 | from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer 18 | # Define a context manager to suppress stdout and stderr. 19 | 20 | 21 | class suppress_stdout_stderr: 22 | ''' 23 | A context manager for doing a "deep suppression" of stdout and stderr in 24 | Python, i.e. will suppress all print, even if the print originates in a 25 | compiled C/Fortran sub-function. 26 | This will not suppress raised exceptions, since exceptions are printed 27 | to stderr just before a script exits, and after the context manager has 28 | exited (at least, I think that is why it lets exceptions through). 29 | 30 | ''' 31 | 32 | def __init__(self): 33 | # Open a pair of null files 34 | self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)] 35 | # Save the actual stdout (1) and stderr (2) file descriptors. 36 | self.save_fds = (os.dup(1), os.dup(2)) 37 | 38 | def __enter__(self): 39 | # Assign the null pointers to stdout and stderr. 40 | os.dup2(self.null_fds[0], 1) 41 | os.dup2(self.null_fds[1], 2) 42 | 43 | def __exit__(self, *_): 44 | # Re-assign the real stdout/stderr back to (1) and (2) 45 | os.dup2(self.save_fds[0], 1) 46 | os.dup2(self.save_fds[1], 2) 47 | # Close the null files 48 | os.close(self.null_fds[0]) 49 | os.close(self.null_fds[1]) 50 | 51 | 52 | class COCOScorer(object): 53 | def __init__(self): 54 | print('init COCO-EVAL scorer') 55 | 56 | def score(self, GT, RES, IDs): 57 | self.eval = {} 58 | self.imgToEval = {} 59 | gts = {} 60 | res = {} 61 | for ID in IDs: 62 | # print ID 63 | gts[ID] = GT[ID] 64 | res[ID] = RES[ID] 65 | print('tokenization...') 66 | tokenizer = PTBTokenizer() 67 | gts = tokenizer.tokenize(gts) 68 | res = tokenizer.tokenize(res) 69 | 70 | # ================================================= 71 | # Set up scorers 72 | # ================================================= 73 | print('setting up scorers...') 74 | scorers = [ 75 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 76 | (Meteor(),"METEOR"), 77 | (Rouge(), "ROUGE_L"), 78 | (Cider(), "CIDEr"), 79 | #(Spice(), "SPICE") 80 | ] 81 | 82 | # ================================================= 83 | # Compute scores 84 | # ================================================= 85 | eval = {} 86 | for scorer, method in scorers: 87 | print('computing %s score...' 
% (scorer.method())) 88 | score, scores = scorer.compute_score(gts, res) 89 | if type(method) == list: 90 | for sc, scs, m in zip(score, scores, method): 91 | self.setEval(sc, m) 92 | self.setImgToEvalImgs(scs, IDs, m) 93 | print("%s: %0.3f" % (m, sc)) 94 | else: 95 | self.setEval(score, method) 96 | self.setImgToEvalImgs(scores, IDs, method) 97 | print("%s: %0.3f" % (method, score)) 98 | 99 | # for metric, score in self.eval.items(): 100 | # print '%s: %.3f'%(metric, score) 101 | return self.eval 102 | 103 | def setEval(self, score, method): 104 | self.eval[method] = score 105 | 106 | def setImgToEvalImgs(self, scores, imgIds, method): 107 | for imgId, score in zip(imgIds, scores): 108 | if imgId not in self.imgToEval: 109 | self.imgToEval[imgId] = {} 110 | self.imgToEval[imgId]["image_id"] = imgId 111 | self.imgToEval[imgId][method] = score 112 | 113 | 114 | def score(ref, sample): 115 | # ref and sample are both dict 116 | scorers = [ 117 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 118 | (Rouge(), "ROUGE_L"), 119 | (Cider(), "CIDEr") 120 | ] 121 | final_scores = {} 122 | for scorer, method in scorers: 123 | print('computing %s score with COCO-EVAL...' % (scorer.method())) 124 | score, scores = scorer.compute_score(ref, sample) 125 | if type(score) == list: 126 | for m, s in zip(method, score): 127 | final_scores[m] = s 128 | else: 129 | final_scores[method] = score 130 | return final_scores 131 | 132 | 133 | def test_cocoscorer(): 134 | '''gts = { 135 | 184321:[ 136 | {u'image_id': 184321, u'id': 352188, u'caption': u'A train traveling down-tracks next to lights.'}, 137 | {u'image_id': 184321, u'id': 356043, u'caption': u"A blue and silver train next to train's station and trees."}, 138 | {u'image_id': 184321, u'id': 356382, u'caption': u'A blue train is next to a sidewalk on the rails.'}, 139 | {u'image_id': 184321, u'id': 361110, u'caption': u'A passenger train pulls into a train station.'}, 140 | {u'image_id': 184321, u'id': 362544, u'caption': u'A train coming down the tracks arriving at a station.'}], 141 | 81922: [ 142 | {u'image_id': 81922, u'id': 86779, u'caption': u'A large jetliner flying over a traffic filled street.'}, 143 | {u'image_id': 81922, u'id': 90172, u'caption': u'An airplane flies low in the sky over a city street. 
'}, 144 | {u'image_id': 81922, u'id': 91615, u'caption': u'An airplane flies over a street with many cars.'}, 145 | {u'image_id': 81922, u'id': 92689, u'caption': u'An airplane comes in to land over a road full of cars'}, 146 | {u'image_id': 81922, u'id': 823814, u'caption': u'The plane is flying over top of the cars'}] 147 | } 148 | 149 | samples = { 150 | 184321: [{u'image_id': 184321, 'id': 111, u'caption': u'train traveling down a track in front of a road'}], 151 | 81922: [{u'image_id': 81922, 'id': 219, u'caption': u'plane is flying through the sky'}], 152 | } 153 | ''' 154 | gts = { 155 | '184321': [ 156 | {u'image_id': '184321', u'cap_id': 0, u'caption': u'A train traveling down tracks next to lights.', 157 | 'tokenized': 'a train traveling down tracks next to lights'}, 158 | {u'image_id': '184321', u'cap_id': 1, u'caption': u'A train coming down the tracks arriving at a station.', 159 | 'tokenized': 'a train coming down the tracks arriving at a station'}], 160 | '81922': [ 161 | {u'image_id': '81922', u'cap_id': 0, u'caption': u'A large jetliner flying over a traffic filled street.', 162 | 'tokenized': 'a large jetliner flying over a traffic filled street'}, 163 | {u'image_id': '81922', u'cap_id': 1, u'caption': u'The plane is flying over top of the cars', 164 | 'tokenized': 'the plan is flying over top of the cars'}, ] 165 | } 166 | 167 | samples = { 168 | '184321': [{u'image_id': '184321', u'caption': u'train traveling down a track in front of a road'}], 169 | '81922': [{u'image_id': '81922', u'caption': u'plane is flying through the sky'}], 170 | } 171 | IDs = ['184321', '81922'] 172 | scorer = COCOScorer() 173 | scorer.score(gts, samples, IDs) 174 | 175 | 176 | if __name__ == '__main__': 177 | test_cocoscorer() 178 | -------------------------------------------------------------------------------- /utils/dataloader.py: -------------------------------------------------------------------------------- 1 | ''' Customized dataloader for V2C dataset. 
''' 2 | 3 | __author__ = 'Jacob Zhiyuan Fang' 4 | 5 | import os 6 | import json 7 | import torch 8 | import random 9 | import numpy as np 10 | from torch.utils.data import Dataset 11 | 12 | 13 | class VideoDataset(Dataset): 14 | 15 | def tensorize_float(self, obj): 16 | return torch.from_numpy(obj).type(torch.FloatTensor) 17 | 18 | def tensorize_long(self, obj): 19 | return torch.from_numpy(obj).type(torch.LongTensor) 20 | 21 | def get_cms_vocab_size(self): 22 | return len(self.get_cms_vocab()) 23 | 24 | def get_cap_vocab_size(self): 25 | return len(self.get_cap_vocab()) 26 | 27 | def get_cms_vocab(self): 28 | return self.cms_ix_to_word 29 | 30 | def get_cap_vocab(self): 31 | return self.cap_ix_to_word 32 | 33 | def get_seq_length(self): 34 | return self.seq_length 35 | 36 | def __init__(self, opt, mode='train'): 37 | super(VideoDataset, self).__init__() 38 | self.mode = mode 39 | 40 | self.captions = json.load(open(opt['caption_json'])) 41 | cms_info = json.load(open(opt['info_json'])) 42 | self.cms_ix_to_word = cms_info['ix_to_word'] 43 | self.cms_word_to_ix = cms_info['word_to_ix'] 44 | self.splits = cms_info['videos'] 45 | 46 | # Load caption dictionary 47 | cap_info = json.load(open(opt['cap_info_json'])) 48 | self.cap_ix_to_word = cap_info['ix_to_word'] 49 | self.cap_word_to_ix = cap_info['word_to_ix'] 50 | 51 | print('Caption vocab size is ', len(self.cap_ix_to_word)) 52 | print('CMS vocab size is ', len(self.cms_ix_to_word)) 53 | print('number of train videos: ', len(self.splits['train'])) 54 | print('number of test videos: ', len(self.splits['test'])) 55 | print('number of val videos: ', len(self.splits['val'])) 56 | 57 | self.feats_dir = opt['feats_dir'] 58 | print('load feats from %s' % self.feats_dir) 59 | 60 | self.cap_max_len = opt['cap_max_len'] 61 | self.int_max_len = opt['int_max_len'] 62 | self.eff_max_len = opt['eff_max_len'] 63 | self.att_max_len = opt['att_max_len'] 64 | 65 | print('max sequence length of caption is', self.cap_max_len) 66 | print('max sequence length of intention is', self.int_max_len) 67 | print('max sequence length of effect is', self.eff_max_len) 68 | print('max sequence length of attribute is', self.att_max_len) 69 | 70 | def __getitem__(self, ix=False): 71 | # if not ix: 72 | # if self.mode == 'train': 73 | # ix = random.choice(self.splits['train']) 74 | # 75 | # elif self.mode == 'test': 76 | # ix = self.splits['test'][ix] 77 | 78 | if self.mode == 'train': 79 | ix = self.splits['train'][ix] 80 | 81 | elif self.mode == 'test': 82 | ix = self.splits['test'][ix] 83 | 84 | # Load the visual features 85 | fc_feat = [] 86 | for dir in self.feats_dir: 87 | fc_feat.append(np.load(os.path.join(dir, 'video%i.npy' % ix))) 88 | 89 | fc_feat = np.concatenate(fc_feat, axis=1) 90 | 91 | # Placeholder for returning parameters 92 | cap_mask = np.zeros(self.cap_max_len) 93 | int_mask = np.zeros(self.int_max_len) 94 | eff_mask = np.zeros(self.eff_max_len) 95 | att_mask = np.zeros(self.att_max_len) 96 | 97 | cap_gts = np.zeros(self.cap_max_len) 98 | int_gts = np.zeros(self.int_max_len) 99 | eff_gts = np.zeros(self.eff_max_len) 100 | att_gts = np.zeros(self.att_max_len) 101 | 102 | if 'video%i' % ix not in self.captions: 103 | print(ix in self.splits['train']) 104 | 105 | assert 'video%i' % ix in self.captions 106 | raw_data = self.captions['video%i' % ix] 107 | 108 | # Random pick out one caption in Training mode 109 | cap_ix = random.randint(0, len(raw_data) - 1) 110 | 111 | # Pop out Cap, Int, Eff and Att 112 | caption = self.captions['video%i' % 
ix][cap_ix]['final_caption'] 113 | 114 | intentions = self.captions['video%i' % ix][cap_ix]['intention'] 115 | intention = intentions[random.randint(0, len(intentions)-1)][1] 116 | 117 | effects = self.captions['video%i' % ix][cap_ix]['effect'] 118 | effect = effects[random.randint(0, len(effects)-1)][1] 119 | 120 | attributes = self.captions['video%i' % ix][cap_ix]['attribute'] 121 | attribute = attributes[random.randint(0, len(attributes)-1)][1] 122 | 123 | # Trunk the tokens if it exceed the maximum limitation 124 | if len(caption) > self.cap_max_len: 125 | caption = caption[:self.cap_max_len] 126 | caption[-1] = '' 127 | 128 | if len(effect) > self.eff_max_len: 129 | effect = effect[:self.eff_max_len] 130 | effect[-1] = '' 131 | 132 | if len(attribute) > self.att_max_len: 133 | attribute = attribute[:self.att_max_len] 134 | attribute[-1] = '' 135 | 136 | # Tokenize it 137 | for j, w in enumerate(caption): 138 | cap_gts[j] = self.cap_word_to_ix.get(w, '1') 139 | 140 | for j, w in enumerate(intention): 141 | int_gts[j] = self.cms_word_to_ix.get(w, '1') 142 | 143 | for j, w in enumerate(effect): 144 | eff_gts[j] = self.cms_word_to_ix.get(w, '1') 145 | 146 | for j, w in enumerate(attribute): 147 | att_gts[j] = self.cms_word_to_ix.get(w, '1') 148 | 149 | # Mask out additional positions 150 | non_zero = (cap_gts == 0).nonzero() 151 | if len(non_zero[0]) != 0: cap_mask[:int(non_zero[0][0])] = 1 152 | else: cap_mask += 1 153 | 154 | non_zero = (int_gts == 0).nonzero() 155 | if len(non_zero[0]) != 0: int_mask[:int(non_zero[0][0])] = 1 156 | else: int_mask += 1 157 | 158 | non_zero = (eff_gts == 0).nonzero() 159 | if len(non_zero[0]) != 0: eff_mask[:int(non_zero[0][0])] = 1 160 | else: eff_mask += 1 161 | 162 | non_zero = (att_gts == 0).nonzero() 163 | if len(non_zero[0]) != 0: att_mask[:int(non_zero[0][0])] = 1 164 | else: att_mask += 1 165 | 166 | # Convert to Tensors 167 | data = {} 168 | data['fc_feats'] = self.tensorize_float(fc_feat) 169 | data['cap_labels'] = self.tensorize_long(cap_gts) 170 | data['cap_masks'] = self.tensorize_float(cap_mask) 171 | data['int_labels'] = self.tensorize_long(int_gts) 172 | data['int_masks'] = self.tensorize_float(int_mask) 173 | data['eff_labels'] = self.tensorize_long(eff_gts) 174 | data['eff_masks'] = self.tensorize_float(eff_mask) 175 | data['att_labels'] = self.tensorize_long(att_gts) 176 | data['att_masks'] = self.tensorize_float(att_mask) 177 | data['video_ids'] = 'video%i' % ix 178 | 179 | return data 180 | 181 | def __len__(self): 182 | return len(self.splits[self.mode]) 183 | -------------------------------------------------------------------------------- /utils/gt_cap_dataloader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | import random 5 | import numpy as np 6 | from torch.utils.data import Dataset 7 | 8 | 9 | class VideoDataset(Dataset): 10 | 11 | def get_cms_vocab_size(self): 12 | return len(self.get_cms_vocab()) 13 | 14 | def get_cap_vocab_size(self): 15 | return len(self.get_cap_vocab()) 16 | 17 | def get_cms_vocab(self): 18 | return self.cms_ix_to_word 19 | 20 | def get_cap_vocab(self): 21 | return self.cap_ix_to_word 22 | 23 | def get_seq_length(self): 24 | return self.seq_length 25 | 26 | def __init__(self, opt, mode='train'): 27 | super(VideoDataset, self).__init__() 28 | self.mode = mode 29 | 30 | self.captions = json.load(open(opt['caption_json'])) 31 | cms_info = json.load(open(opt['info_json'])) 32 | self.cms_ix_to_word = cms_info['ix_to_word'] 
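        # ix_to_word maps stringified indices ('0', '1', ...) back to tokens and word_to_ix is
        # its inverse; the tokenisation below falls back to index '1' via .get(w, '1'), which is
        # assumed to be the unknown-word token of this vocabulary.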
33 | self.cms_word_to_ix = cms_info['word_to_ix'] 34 | self.splits = cms_info['videos'] 35 | 36 | # Load caption dictionary 37 | cap_info = json.load(open(opt['cap_info_json'])) 38 | self.cap_ix_to_word = cap_info['ix_to_word'] 39 | self.cap_word_to_ix = cap_info['word_to_ix'] 40 | 41 | print('Caption vocab size is ', len(self.cap_ix_to_word)) 42 | print('CMS vocab size is ', len(self.cms_ix_to_word)) 43 | print('number of train videos: ', len(self.splits['train'])) 44 | print('number of test videos: ', len(self.splits['test'])) 45 | print('number of val videos: ', len(self.splits['val'])) 46 | 47 | self.feats_dir = opt['feats_dir'] 48 | print('load feats from %s' % self.feats_dir) 49 | 50 | self.cap_max_len = opt['cap_max_len'] 51 | self.int_max_len = opt['int_max_len'] 52 | self.eff_max_len = opt['eff_max_len'] 53 | self.att_max_len = opt['att_max_len'] 54 | print('max sequence length of caption is', self.cap_max_len) 55 | print('max sequence length of intention is', self.int_max_len) 56 | print('max sequence length of effect is', self.eff_max_len) 57 | print('max sequence length of attribute is', self.att_max_len) 58 | 59 | def __getitem__(self, ix): 60 | 61 | if self.mode == 'train': 62 | ix = random.choice(self.splits['train']) 63 | elif self.mode == 'test': 64 | ix = self.splits['test'][ix] 65 | 66 | fc_feat = [] 67 | for dir in self.feats_dir: 68 | fc_feat.append(np.load(os.path.join(dir, 'video%i.npy' % ix))) 69 | fc_feat = np.concatenate(fc_feat, axis=1) 70 | 71 | cap_mask = np.zeros(self.cap_max_len) 72 | int_mask = np.zeros(self.int_max_len) 73 | eff_mask = np.zeros(self.eff_max_len) 74 | att_mask = np.zeros(self.att_max_len) 75 | 76 | # cap_gts = np.zeros((10, self.cap_max_len)) 77 | cap_gts = np.zeros((1, self.cap_max_len)) 78 | int_gts = np.zeros(self.int_max_len) 79 | eff_gts = np.zeros(self.eff_max_len) 80 | att_gts = np.zeros(self.att_max_len) 81 | 82 | raw_data = self.captions['video%i' % ix] 83 | 84 | cap_ix = random.randint(0, len(raw_data) - 1) # Random pick out one caption 85 | 86 | caption = raw_data[cap_ix]['final_caption'] 87 | 88 | intentions = raw_data[cap_ix]['intention'] 89 | intention = intentions[random.randint(0, len(intentions)-1)][1] 90 | 91 | effects = raw_data[cap_ix]['effect'] 92 | effect = effects[random.randint(0, len(effects)-1)][1] 93 | 94 | attributes = raw_data[cap_ix]['attribute'] 95 | attribute = attributes[random.randint(0, len(attributes)-1)][1] 96 | 97 | # Load all intentions again for eval 98 | # intentions = [item['intention'][0] for item in raw_data] 99 | # effects = [item['effect'][0] for item in raw_data] 100 | # attributes = [item['attribute'][0] for item in raw_data] 101 | 102 | if len(caption) > self.cap_max_len: 103 | caption = caption[:self.cap_max_len] 104 | caption[-1] = '' 105 | if len(effect) > self.eff_max_len: 106 | effect = effect[:self.eff_max_len] 107 | effect[-1] = '' 108 | if len(attribute) > self.att_max_len: 109 | attribute = attribute[:self.att_max_len] 110 | attribute[-1] = '' 111 | 112 | # Load all 10 gt captions 113 | # for i in range(10): 114 | # _ = len(raw_data) 115 | # caption = raw_data[i%_]['final_caption'] 116 | # for j, w in enumerate(caption[0:28]): 117 | # cap_gts[i, j] = self.cap_word_to_ix.get(w, '1') 118 | 119 | # Load one random gt captions 120 | for j, w in enumerate(caption[0:28]): 121 | cap_gts[0, j] = self.cap_word_to_ix.get(w, '1') 122 | 123 | for j, w in enumerate(intention): 124 | int_gts[j] = self.cms_word_to_ix.get(w, '1') 125 | for j, w in enumerate(effect): 126 | eff_gts[j] = 
self.cms_word_to_ix.get(w, '1') 127 | for j, w in enumerate(attribute): 128 | att_gts[j] = self.cms_word_to_ix.get(w, '1') 129 | 130 | non_zero = (cap_gts == 0).nonzero() 131 | if len(non_zero[0]) != 0: cap_mask[:int(non_zero[0][0])] = 1 132 | else: cap_mask += 1 133 | 134 | non_zero = (int_gts == 0).nonzero() 135 | if len(non_zero[0]) != 0: int_mask[:int(non_zero[0][0])] = 1 136 | else: int_mask += 1 137 | 138 | non_zero = (eff_gts == 0).nonzero() 139 | if len(non_zero[0]) != 0: eff_mask[:int(non_zero[0][0])] = 1 140 | else: eff_mask += 1 141 | 142 | non_zero = (att_gts == 0).nonzero() 143 | if len(non_zero[0]) != 0: att_mask[:int(non_zero[0][0])] = 1 144 | else: att_mask += 1 145 | 146 | data = {} 147 | data['fc_feats'] = torch.from_numpy(fc_feat).type(torch.FloatTensor) 148 | data['cap_labels'] = torch.from_numpy(cap_gts).type(torch.LongTensor) 149 | data['cap_masks'] = torch.from_numpy(cap_mask).type(torch.FloatTensor) 150 | data['int_labels'] = torch.from_numpy(int_gts).type(torch.LongTensor) 151 | data['int_masks'] = torch.from_numpy(int_mask).type(torch.FloatTensor) 152 | data['eff_labels'] = torch.from_numpy(eff_gts).type(torch.LongTensor) 153 | data['eff_masks'] = torch.from_numpy(eff_mask).type(torch.FloatTensor) 154 | data['att_labels'] = torch.from_numpy(att_gts).type(torch.LongTensor) 155 | data['att_masks'] = torch.from_numpy(att_mask).type(torch.FloatTensor) 156 | 157 | data['video_ids'] = 'video%i' % ix 158 | 159 | # Concatenate all CMS 160 | int_str = '' 161 | for _ in intentions: 162 | int_str += ';' + _[0] 163 | att_str = '' 164 | for _ in attributes: 165 | att_str += ';' + _[0] 166 | eff_str = '' 167 | for _ in effects: 168 | eff_str += ';' + _[0] 169 | return data, int_str, att_str, eff_str 170 | 171 | def __len__(self): 172 | return len(self.splits[self.mode]) 173 | -------------------------------------------------------------------------------- /utils/gt_caps_dataloader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | import random 5 | import numpy as np 6 | from torch.utils.data import Dataset 7 | 8 | 9 | class VideoDataset(Dataset): 10 | 11 | def get_cms_vocab_size(self): 12 | return len(self.get_cms_vocab()) 13 | 14 | def get_cap_vocab_size(self): 15 | return len(self.get_cap_vocab()) 16 | 17 | def get_cms_vocab(self): 18 | return self.cms_ix_to_word 19 | 20 | def get_cap_vocab(self): 21 | return self.cap_ix_to_word 22 | 23 | def get_seq_length(self): 24 | return self.seq_length 25 | 26 | def __init__(self, opt, mode='train'): 27 | super(VideoDataset, self).__init__() 28 | self.mode = mode 29 | 30 | self.captions = json.load(open(opt['caption_json'])) 31 | cms_info = json.load(open(opt['info_json'])) 32 | self.cms_ix_to_word = cms_info['ix_to_word'] 33 | self.cms_word_to_ix = cms_info['word_to_ix'] 34 | self.splits = cms_info['videos'] 35 | 36 | # Load caption dictionary 37 | cap_info = json.load(open(opt['cap_info_json'])) 38 | self.cap_ix_to_word = cap_info['ix_to_word'] 39 | self.cap_word_to_ix = cap_info['word_to_ix'] 40 | 41 | print('Caption vocab size is ', len(self.cap_ix_to_word)) 42 | print('CMS vocab size is ', len(self.cms_ix_to_word)) 43 | print('number of train videos: ', len(self.splits['train'])) 44 | print('number of test videos: ', len(self.splits['test'])) 45 | print('number of val videos: ', len(self.splits['val'])) 46 | 47 | self.feats_dir = opt['feats_dir'] 48 | self.cap_max_len = opt['cap_max_len'] 49 | 50 | print('load feats from %s' % self.feats_dir) 51 | 
print('max sequence length of caption is', self.cap_max_len) 52 | 53 | def __getitem__(self, ix): 54 | 55 | if self.mode == 'train': 56 | ix = random.choice(self.splits['train']) 57 | elif self.mode == 'test': 58 | ix = self.splits['test'][ix] 59 | 60 | fc_feat = [] 61 | for dir in self.feats_dir: 62 | fc_feat.append(np.load(os.path.join(dir, 'video%i.npy' % ix))) 63 | fc_feat = np.concatenate(fc_feat, axis=1) 64 | 65 | raw_data = self.captions['video%i' % ix] 66 | num_cap = len(raw_data) 67 | cap_mask = np.zeros((num_cap, self.cap_max_len)) 68 | cap_gts = np.zeros((num_cap, self.cap_max_len)) 69 | int_list, eff_list, att_list = [], [], [] 70 | 71 | # Load all num_cap gt captions 72 | for cap_ix in range(num_cap): 73 | caption = raw_data[cap_ix % len(raw_data)]['final_caption'] 74 | 75 | if len(caption) > self.cap_max_len: 76 | caption = caption[:self.cap_max_len] 77 | caption[-1] = '' 78 | 79 | for j, w in enumerate(caption[0: self.cap_max_len]): 80 | cap_gts[cap_ix, j] = self.cap_word_to_ix.get(w, '1') 81 | 82 | intentions, effects, attributes = raw_data[cap_ix]['intention'], raw_data[cap_ix]['effect'],\ 83 | raw_data[cap_ix]['attribute'] 84 | 85 | # Concatenate all CMS 86 | int_str, att_str, eff_str = '', '', '' 87 | for int, eff, att in zip(intentions, effects, attributes): 88 | int_str += ';' + int[0] 89 | eff_str += ';' + eff[0] 90 | att_str += ';' + att[0] 91 | 92 | int_list.append(int_str) 93 | eff_list.append(eff_str) 94 | att_list.append(att_str) 95 | 96 | # Insert mask 97 | cap_mask[(cap_gts != 0)] = 1 98 | 99 | data = {} 100 | data['fc_feats'] = torch.from_numpy(fc_feat).type(torch.FloatTensor) 101 | data['cap_labels'] = torch.from_numpy(cap_gts).type(torch.LongTensor) 102 | data['cap_masks'] = torch.from_numpy(cap_mask).type(torch.FloatTensor) 103 | data['video_ids'] = 'video%i' % ix 104 | 105 | return data, int_list, eff_list, att_list 106 | 107 | def __len__(self): 108 | return len(self.splits[self.mode]) 109 | 110 | -------------------------------------------------------------------------------- /utils/gt_human_cap_dataloader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | import random 5 | import numpy as np 6 | from torch.utils.data import Dataset 7 | 8 | 9 | class VideoDataset(Dataset): 10 | 11 | def get_cms_vocab_size(self): 12 | return len(self.get_cms_vocab()) 13 | 14 | def get_cap_vocab_size(self): 15 | return len(self.get_cap_vocab()) 16 | 17 | def get_cms_vocab(self): 18 | return self.cms_ix_to_word 19 | 20 | def get_cap_vocab(self): 21 | return self.cap_ix_to_word 22 | 23 | def get_seq_length(self): 24 | return self.seq_length 25 | 26 | def __init__(self, opt, mode='train'): 27 | super(VideoDataset, self).__init__() 28 | self.mode = mode 29 | 30 | self.captions = json.load(open(opt['caption_json'])) 31 | cms_info = json.load(open(opt['info_json'])) 32 | self.cms_ix_to_word = cms_info['ix_to_word'] 33 | self.cms_word_to_ix = cms_info['word_to_ix'] 34 | self.splits = cms_info['videos'] 35 | 36 | # Load caption dictionary 37 | cap_info = json.load(open(opt['cap_info_json'])) 38 | self.cap_ix_to_word = cap_info['ix_to_word'] 39 | self.cap_word_to_ix = cap_info['word_to_ix'] 40 | 41 | print('Caption vocab size is ', len(self.cap_ix_to_word)) 42 | print('CMS vocab size is ', len(self.cms_ix_to_word)) 43 | print('number of train videos: ', len(self.splits['train'])) 44 | print('number of test videos: ', len(self.splits['test'])) 45 | print('number of val videos: ', 
len(self.splits['val'])) 46 | 47 | self.feats_dir = opt['feats_dir'] 48 | print('load feats from %s' % self.feats_dir) 49 | 50 | self.cap_max_len = opt['cap_max_len'] 51 | self.int_max_len = opt['int_max_len'] 52 | self.eff_max_len = opt['eff_max_len'] 53 | self.att_max_len = opt['att_max_len'] 54 | print('max sequence length of caption is', self.cap_max_len) 55 | print('max sequence length of intention is', self.int_max_len) 56 | print('max sequence length of effect is', self.eff_max_len) 57 | print('max sequence length of attribute is', self.att_max_len) 58 | 59 | def __getitem__(self, ix): 60 | 61 | if self.mode == 'train': 62 | ix = self.captions.keys() 63 | elif self.mode == 'test': 64 | ix = list(self.captions.keys())[ix] 65 | 66 | fc_feat = [] 67 | for dir in self.feats_dir: 68 | fc_feat.append(np.load(os.path.join(dir, ix+'.npy'))) 69 | fc_feat = np.concatenate(fc_feat, axis=1) 70 | 71 | cap_mask = np.zeros(self.cap_max_len) 72 | int_mask = np.zeros(self.int_max_len) 73 | eff_mask = np.zeros(self.eff_max_len) 74 | att_mask = np.zeros(self.att_max_len) 75 | 76 | # cap_gts = np.zeros((10, self.cap_max_len)) 77 | cap_gts = np.zeros((1, self.cap_max_len)) 78 | int_gts = np.zeros(self.int_max_len) 79 | eff_gts = np.zeros(self.eff_max_len) 80 | att_gts = np.zeros(self.att_max_len) 81 | 82 | raw_data = self.captions[ix] 83 | 84 | cap_ix = random.randint(0, len(raw_data) - 1) # Random pick out one caption 85 | 86 | caption = raw_data[cap_ix]['final_caption'] 87 | 88 | intentions = raw_data[cap_ix]['intention'] 89 | intention = intentions[random.randint(0, len(intentions)-1)][1] 90 | 91 | effects = raw_data[cap_ix]['effect'] 92 | effect = effects[random.randint(0, len(effects)-1)][1] 93 | 94 | attributes = raw_data[cap_ix]['attribute'] 95 | attribute = attributes[random.randint(0, len(attributes)-1)][1] 96 | 97 | # Load all intentions again for eval 98 | # intentions = [item['intention'][0] for item in raw_data] 99 | # effects = [item['effect'][0] for item in raw_data] 100 | # attributes = [item['attribute'][0] for item in raw_data] 101 | 102 | if len(caption) > self.cap_max_len: 103 | caption = caption[:self.cap_max_len] 104 | caption[-1] = '' 105 | if len(effect) > self.eff_max_len: 106 | effect = effect[:self.eff_max_len] 107 | effect[-1] = '' 108 | if len(attribute) > self.att_max_len: 109 | attribute = attribute[:self.att_max_len] 110 | attribute[-1] = '' 111 | 112 | # Load all 10 gt captions 113 | # for i in range(10): 114 | # _ = len(raw_data) 115 | # caption = raw_data[i%_]['final_caption'] 116 | # for j, w in enumerate(caption[0:28]): 117 | # cap_gts[i, j] = self.cap_word_to_ix.get(w, '1') 118 | 119 | # Load one random gt captions 120 | for j, w in enumerate(caption[0:28]): 121 | cap_gts[0, j] = self.cap_word_to_ix.get(w, '1') 122 | 123 | for j, w in enumerate(intention): 124 | int_gts[j] = self.cms_word_to_ix.get(w, '1') 125 | for j, w in enumerate(effect): 126 | eff_gts[j] = self.cms_word_to_ix.get(w, '1') 127 | for j, w in enumerate(attribute): 128 | att_gts[j] = self.cms_word_to_ix.get(w, '1') 129 | 130 | non_zero = (cap_gts == 0).nonzero() 131 | if len(non_zero[0]) != 0: cap_mask[:int(non_zero[0][0])] = 1 132 | else: cap_mask += 1 133 | 134 | non_zero = (int_gts == 0).nonzero() 135 | if len(non_zero[0]) != 0: int_mask[:int(non_zero[0][0])] = 1 136 | else: int_mask += 1 137 | 138 | non_zero = (eff_gts == 0).nonzero() 139 | if len(non_zero[0]) != 0: eff_mask[:int(non_zero[0][0])] = 1 140 | else: eff_mask += 1 141 | 142 | non_zero = (att_gts == 0).nonzero() 143 | if 
len(non_zero[0]) != 0: att_mask[:int(non_zero[0][0])] = 1 144 | else: att_mask += 1 145 | 146 | data = {} 147 | data['fc_feats'] = torch.from_numpy(fc_feat).type(torch.FloatTensor) 148 | data['cap_labels'] = torch.from_numpy(cap_gts).type(torch.LongTensor) 149 | data['cap_masks'] = torch.from_numpy(cap_mask).type(torch.FloatTensor) 150 | data['int_labels'] = torch.from_numpy(int_gts).type(torch.LongTensor) 151 | data['int_masks'] = torch.from_numpy(int_mask).type(torch.FloatTensor) 152 | data['eff_labels'] = torch.from_numpy(eff_gts).type(torch.LongTensor) 153 | data['eff_masks'] = torch.from_numpy(eff_mask).type(torch.FloatTensor) 154 | data['att_labels'] = torch.from_numpy(att_gts).type(torch.LongTensor) 155 | data['att_masks'] = torch.from_numpy(att_mask).type(torch.FloatTensor) 156 | 157 | data['video_ids'] = ix 158 | 159 | # Concatenate all CMS 160 | int_str = '' 161 | for _ in intentions: 162 | int_str += ';' + _[0] 163 | att_str = '' 164 | for _ in attributes: 165 | att_str += ';' + _[0] 166 | eff_str = '' 167 | for _ in effects: 168 | eff_str += ';' + _[0] 169 | return data, int_str, att_str, eff_str 170 | 171 | def __len__(self): 172 | return len(list(self.captions.keys())) 173 | -------------------------------------------------------------------------------- /utils/prepro_feats.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import subprocess 3 | import glob 4 | from tqdm import tqdm 5 | import numpy as np 6 | import os 7 | import argparse 8 | 9 | import torch 10 | from torch import nn 11 | import torch.nn.functional as F 12 | import pretrainedmodels 13 | from pretrainedmodels import utils 14 | 15 | C, H, W = 3, 224, 224 16 | 17 | 18 | def extract_frames(video, dst): 19 | with open(os.devnull, "w") as ffmpeg_log: 20 | if os.path.exists(dst): 21 | print(" cleanup: " + dst + "/") 22 | shutil.rmtree(dst) 23 | os.makedirs(dst) 24 | video_to_frames_command = ["ffmpeg", 25 | # (optional) overwrite output file if it exists 26 | '-y', 27 | '-i', video, # input file 28 | '-vf', "scale=400:300", # input file 29 | '-qscale:v', "2", # quality for JPEG 30 | '{0}/%06d.jpg'.format(dst)] 31 | subprocess.call(video_to_frames_command, 32 | stdout=ffmpeg_log, stderr=ffmpeg_log) 33 | 34 | 35 | def extract_feats(params, model, load_image_fn): 36 | global C, H, W 37 | model.eval() 38 | 39 | dir_fc = params['output_dir'] 40 | if not os.path.isdir(dir_fc): 41 | os.mkdir(dir_fc) 42 | print("save video feats to %s" % (dir_fc)) 43 | video_list = glob.glob(os.path.join(params['video_path'], '*.mp4')) 44 | for video in tqdm(video_list): 45 | video_id = video.split("/")[-1].split(".")[0] 46 | dst = params['model'] + '_' + video_id 47 | extract_frames(video, dst) 48 | 49 | image_list = sorted(glob.glob(os.path.join(dst, '*.jpg'))) 50 | samples = np.round(np.linspace( 51 | 0, len(image_list) - 1, params['n_frame_steps'])) 52 | image_list = [image_list[int(sample)] for sample in samples] 53 | images = torch.zeros((len(image_list), C, H, W)) 54 | for iImg in range(len(image_list)): 55 | img = load_image_fn(image_list[iImg]) 56 | images[iImg] = img 57 | with torch.no_grad(): 58 | fc_feats = model(images.cuda()).squeeze() 59 | img_feats = fc_feats.cpu().numpy() 60 | # Save the inception features 61 | outfile = os.path.join(dir_fc, video_id + '.npy') 62 | np.save(outfile, img_feats) 63 | # cleanup 64 | shutil.rmtree(dst) 65 | 66 | 67 | if __name__ == '__main__': 68 | parser = argparse.ArgumentParser() 69 | parser.add_argument("--gpu", dest='gpu', type=str, 
default='0', 70 | help='Set CUDA_VISIBLE_DEVICES environment variable, optional') 71 | parser.add_argument("--output_dir", dest='output_dir', type=str, 72 | default='data/feats/resnet152', help='directory to store features') 73 | parser.add_argument("--n_frame_steps", dest='n_frame_steps', type=int, default=40, 74 | help='how many frames to sampler per video') 75 | 76 | parser.add_argument("--video_path", dest='video_path', type=str, 77 | default='data/test-video', help='path to video dataset') 78 | parser.add_argument("--model", dest="model", type=str, default='resnet152', 79 | help='the CNN model you want to use to extract_feats') 80 | 81 | args = parser.parse_args() 82 | os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu 83 | params = vars(args) 84 | if params['model'] == 'inception_v3': 85 | C, H, W = 3, 299, 299 86 | model = pretrainedmodels.inceptionv3(pretrained='imagenet') 87 | load_image_fn = utils.LoadTransformImage(model) 88 | 89 | elif params['model'] == 'resnet152': 90 | C, H, W = 3, 224, 224 91 | model = pretrainedmodels.resnet152(pretrained='imagenet') 92 | load_image_fn = utils.LoadTransformImage(model) 93 | 94 | elif params['model'] == 'inception_v4': 95 | C, H, W = 3, 299, 299 96 | model = pretrainedmodels.inceptionv4( 97 | num_classes=1000, pretrained='imagenet') 98 | load_image_fn = utils.LoadTransformImage(model) 99 | 100 | else: 101 | print("doesn't support %s" % (params['model'])) 102 | 103 | model.last_linear = utils.Identity() 104 | model = nn.DataParallel(model) 105 | 106 | model = model.cuda() 107 | extract_feats(params, model, load_image_fn) 108 | -------------------------------------------------------------------------------- /utils/prepro_ngrams.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import pickle as pkl 4 | from collections import defaultdict 5 | 6 | 7 | def precook(s, n=4): 8 | """ 9 | Takes a string as input and returns an object that can be given to 10 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 11 | can take string arguments as well. 12 | :param s: string : sentence to be converted into ngrams 13 | :param n: int : number of ngrams for which representation is calculated 14 | :return: term frequency vector for occuring ngrams 15 | """ 16 | words = s.split() 17 | counts = defaultdict(int) 18 | for k in range(1, n+1): 19 | for i in range(len(words)-k+1): 20 | ngram = tuple(words[i:i+k]) 21 | counts[ngram] += 1 22 | return counts 23 | 24 | 25 | def cook_refs(refs, n=4): # lhuang: oracle will call with "average" 26 | '''Takes a list of reference sentences for a single segment 27 | and returns an object that encapsulates everything that BLEU 28 | needs to know about them. 29 | :param refs: list of string : reference sentences for some image 30 | :param n: int : number of ngrams for which (ngram) representation is calculated 31 | :return: result (list of dict) 32 | ''' 33 | return [precook(ref, n) for ref in refs] 34 | 35 | 36 | def create_crefs(refs): 37 | crefs = [] 38 | for ref in refs: 39 | # ref is a list of 5 captions 40 | crefs.append(cook_refs(ref)) 41 | return crefs 42 | 43 | 44 | def compute_doc_freq(crefs): 45 | ''' 46 | Compute term frequency for reference data. 
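    (For illustration: if the bigram ('a', 'man') occurs in the reference captions of
    3 different videos -- no matter how many times within each video -- its document
    frequency ends up as 3.0.)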
47 | This will be used to compute idf (inverse document frequency later) 48 | The term frequency is stored in the object 49 | :return: None 50 | ''' 51 | document_frequency = defaultdict(float) 52 | for refs in crefs: 53 | # refs, k ref captions of one image 54 | for ngram in set([ngram for ref in refs for (ngram, count) in ref.items()]): 55 | document_frequency[ngram] += 1 56 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 57 | return document_frequency 58 | 59 | 60 | def build_dict(vids, wtoi): 61 | refs_words = [] 62 | refs_idxs = [] 63 | count_vids = 0 64 | for vid in vids: 65 | ref_words = [] 66 | ref_idxs = [] 67 | for cap in vids[vid]['final_captions']: 68 | tmp_tokens = cap 69 | tmp_tokens = [_ if _ in wtoi else '' for _ in tmp_tokens] 70 | ref_words.append(' '.join(tmp_tokens)) 71 | ref_idxs.append(' '.join([str(wtoi[_]) for _ in tmp_tokens])) 72 | refs_words.append(ref_words) 73 | refs_idxs.append(ref_idxs) 74 | count_vids += 1 75 | ngram_words = compute_doc_freq(create_crefs(refs_words)) 76 | ngram_idxs = compute_doc_freq(create_crefs(refs_idxs)) 77 | return ngram_words, ngram_idxs, count_vids 78 | 79 | 80 | def main(params): 81 | vids = json.load(open(params['caption_json'])) 82 | wtoi = json.load(open(params['info_json']))['word_to_ix'] 83 | 84 | ngram_words, ngram_idxs, ref_len = build_dict(vids, wtoi) 85 | 86 | pkl.dump({'document_frequency': ngram_words, 'ref_len': ref_len}, open( 87 | params['output_pkl']+'-words.p', 'wb')) 88 | pkl.dump({'document_frequency': ngram_idxs, 'ref_len': ref_len}, open( 89 | params['output_pkl']+'-idxs.p', 'wb')) 90 | 91 | if __name__ == "__main__": 92 | 93 | parser = argparse.ArgumentParser() 94 | 95 | # input json 96 | parser.add_argument('--caption_json', default='data/caption.json', 97 | help='input json file to containing video captions') 98 | parser.add_argument('--info_json', default='data/info.json', help='vocab info json file') 99 | parser.add_argument('--output_pkl', default='data/msr-all', help='output pickle file') 100 | args = parser.parse_args() 101 | params = vars(args) # convert to ordinary dict 102 | 103 | main(params) 104 | -------------------------------------------------------------------------------- /utils/prepro_vocab.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import argparse 4 | import numpy as np 5 | 6 | 7 | def build_vocab(vids, params): 8 | count_thr = params['word_count_threshold'] 9 | # count up the number of words 10 | counts = {} 11 | for vid, caps in vids.items(): 12 | for cap in caps['captions']: 13 | ws = re.sub(r'[.!,;?]', ' ', cap).split() 14 | for w in ws: 15 | counts[w] = counts.get(w, 0) + 1 16 | # cw = sorted([(count, w) for w, count in counts.items()], reverse=True) 17 | total_words = sum(counts.values()) 18 | bad_words = [w for w, n in counts.items() if n <= count_thr] 19 | vocab = [w for w, n in counts.items() if n > count_thr] 20 | bad_count = sum(counts[w] for w in bad_words) 21 | print('number of bad words: %d/%d = %.2f%%' % 22 | (len(bad_words), len(counts), len(bad_words) * 100.0 / len(counts))) 23 | print('number of words in vocab would be %d' % (len(vocab), )) 24 | print('number of UNKs: %d/%d = %.2f%%' % 25 | (bad_count, total_words, bad_count * 100.0 / total_words)) 26 | # lets now produce the final annotations 27 | if bad_count > 0: 28 | # additional special UNK token we will use below to map infrequent words to 29 | print('inserting the special UNK token') 30 | vocab.append('') 31 | for vid, caps in 
vids.items(): 32 | caps = caps['captions'] 33 | vids[vid]['final_captions'] = [] 34 | for cap in caps: 35 | ws = re.sub(r'[.!,;?]', ' ', cap).split() 36 | caption = [ 37 | ''] + [w if counts.get(w, 0) > count_thr else '' for w in ws] + [''] 38 | vids[vid]['final_captions'].append(caption) 39 | return vocab 40 | 41 | 42 | def main(params): 43 | videos = json.load(open(params['input_json'], 'r'))['sentences'] 44 | video_caption = {} 45 | for i in videos: 46 | if i['video_id'] not in video_caption.keys(): 47 | video_caption[i['video_id']] = {'captions': []} 48 | video_caption[i['video_id']]['captions'].append(i['caption']) 49 | # create the vocab 50 | vocab = build_vocab(video_caption, params) 51 | itow = {i + 2: w for i, w in enumerate(vocab)} 52 | wtoi = {w: i + 2 for i, w in enumerate(vocab)} # inverse table 53 | wtoi[''] = 0 54 | itow[0] = '' 55 | wtoi[''] = 1 56 | itow[1] = '' 57 | 58 | out = {} 59 | out['ix_to_word'] = itow 60 | out['word_to_ix'] = wtoi 61 | out['videos'] = {'train': [], 'val': [], 'test': []} 62 | videos = json.load(open(params['input_json'], 'r'))['videos'] 63 | for i in videos: 64 | out['videos'][i['split']].append(int(i['id'])) 65 | json.dump(out, open(params['info_json'], 'w')) 66 | json.dump(video_caption, open(params['caption_json'], 'w')) 67 | 68 | 69 | if __name__ == "__main__": 70 | parser = argparse.ArgumentParser() 71 | 72 | # input json 73 | parser.add_argument('--input_json', type=str, default='data/videodatainfo_2017.json', 74 | help='msr_vtt videoinfo json') 75 | parser.add_argument('--info_json', default='data/info.json', 76 | help='info about iw2word and word2ix') 77 | parser.add_argument('--caption_json', default='data/caption.json', help='caption json file') 78 | 79 | 80 | parser.add_argument('--word_count_threshold', default=1, type=int, 81 | help='only words that occur more than this number of times will be put in vocab') 82 | 83 | args = parser.parse_args() 84 | params = vars(args) # convert to ordinary dict 85 | main(params) 86 | -------------------------------------------------------------------------------- /utils/rouge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : rouge.py 4 | # 5 | # Description : Computes ROUGE-L metric as described by Lin and Hovey (2004) 6 | # 7 | # Creation Date : 2015-01-07 06:03 8 | # Author : Ramakrishna Vedantam 9 | 10 | import numpy as np 11 | import pdb 12 | 13 | 14 | def my_lcs(string, sub): 15 | """ 16 | Calculates longest common subsequence for a pair of tokenized strings 17 | :param string : list of str : tokens from a string split using whitespace 18 | :param sub : list of str : shorter string, also split using whitespace 19 | :returns: length (list of int): length of the longest common subsequence between the two strings 20 | 21 | Note: my_lcs only gives length of the longest common subsequence, not the actual LCS 22 | """ 23 | if(len(string)< len(sub)): 24 | sub, string = string, sub 25 | 26 | lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)] 27 | 28 | for j in range(1,len(sub)+1): 29 | for i in range(1,len(string)+1): 30 | if(string[i-1] == sub[j-1]): 31 | lengths[i][j] = lengths[i-1][j-1] + 1 32 | else: 33 | lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1]) 34 | 35 | return lengths[len(string)][len(sub)] 36 | 37 | 38 | class Rouge(): 39 | ''' 40 | Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set 41 | 42 | ''' 43 | def __init__(self): 44 | # 
vrama91: updated the value below based on discussion with Hovey 45 | self.beta = 1.2 46 | 47 | def calc_score(self, candidate, refs): 48 | """ 49 | Compute ROUGE-L score given one candidate and references for an image 50 | :param candidate: str : candidate sentence to be evaluated 51 | :param refs: list of str : COCO reference sentences for the particular image to be evaluated 52 | :returns score: int (ROUGE-L score for the candidate evaluated against references) 53 | """ 54 | assert(len(candidate)==1) 55 | assert(len(refs)>0) 56 | prec = [] 57 | rec = [] 58 | 59 | # split into tokens 60 | token_c = candidate[0].split(" ") 61 | 62 | for reference in refs: 63 | # split into tokens 64 | token_r = reference.split(" ") 65 | # compute the longest common subsequence 66 | lcs = my_lcs(token_r, token_c) 67 | prec.append(lcs/float(len(token_c))) 68 | rec.append(lcs/float(len(token_r))) 69 | 70 | prec_max = max(prec) 71 | rec_max = max(rec) 72 | 73 | if(prec_max!=0 and rec_max !=0): 74 | score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max) 75 | else: 76 | score = 0.0 77 | return score 78 | 79 | def compute_score(self, gts, res): 80 | """ 81 | Computes Rouge-L score given a set of reference and candidate sentences for the dataset 82 | Invoked by evaluate_captions.py 83 | :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values 84 | :param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values 85 | :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) 86 | """ 87 | assert(gts.keys() == res.keys()) 88 | imgIds = gts.keys() 89 | 90 | score = [] 91 | for id in imgIds: 92 | hypo = res[id] 93 | ref = gts[id] 94 | 95 | score.append(self.calc_score(hypo, ref)) 96 | 97 | # Sanity check. 98 | assert(type(hypo) is list) 99 | assert(len(hypo) == 1) 100 | assert(type(ref) is list) 101 | assert(len(ref) > 0) 102 | 103 | average_score = np.mean(np.array(score)) 104 | return average_score, np.array(score) 105 | 106 | def method(self): 107 | return "Rouge" 108 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import collections 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import model.transformer.Constants as Constants 6 | 7 | # Construct the uni-gram language model 8 | def unigram(tokens): 9 | model = collections.defaultdict(lambda: 0.01) 10 | for f in tokens: 11 | try: 12 | model[f] += 1 13 | except KeyError: 14 | model [f] = 1 15 | continue 16 | N = float(sum(model.values())) 17 | for word in model: 18 | model[word] /= N 19 | return model 20 | 21 | 22 | # Computes perplexity of the UniGram model on a test-set 23 | def perplexity(testset, model): 24 | testset = testset.split() 25 | ppl = 1 26 | N = 0 27 | for word in testset: 28 | N += 1 29 | ppl *= 1/model[word] 30 | ppl = pow(ppl, 1/float(N)) 31 | return ppl 32 | 33 | 34 | # Mean Pool Out the word2vec features of sentences. 
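# Note: the helper below uses numpy (np.zeros / np.mean), but numpy is not imported at
# the top of this file, so the import is added here to keep it self-contained.
import numpy as np
# Example (hypothetical `wordmodel`): mean_pool_vec('a man is cooking', glove_50d)
# returns a 50-d vector, assuming every known token maps to a 50-d embedding.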
35 | def mean_pool_vec(sentence, wordmodel): 36 | vector = np.zeros(50) 37 | vector += np.mean([wordmodel[ele] for ele in sentence.split(' ') if ele in wordmodel.keys()], 0) 38 | return vector 39 | 40 | 41 | def test_collate_fn(batch): 42 | ''' 43 | :param batch: input batch data 44 | :return: aligned features 45 | ''' 46 | 47 | return batch 48 | 49 | 50 | def pos_emb_generation(visual_feats): 51 | ''' 52 | Generate the position embedding input for Transformers. 53 | ''' 54 | seq = list(range(1, visual_feats.shape[1] + 1)) 55 | src_pos = torch.tensor([seq] * visual_feats.shape[0]).cuda() 56 | return src_pos 57 | 58 | 59 | def list_to_sentence(list): 60 | sentence = '' 61 | for element in list: 62 | sentence += ' ' + element 63 | return sentence 64 | 65 | 66 | class LanguageModelCriterion(nn.Module): 67 | 68 | def __init__(self): 69 | super(LanguageModelCriterion, self).__init__() 70 | # self.loss_fn = nn.NLLLoss(reduce=False) 71 | self.loss_fn = nn.CrossEntropyLoss() 72 | 73 | def forward(self, logits, target, mask): 74 | """ 75 | logits: shape of (N, seq_len, vocab_size) 76 | target: shape of (N, seq_len) 77 | mask: shape of (N, seq_len) 78 | """ 79 | # truncate to the same size 80 | batch_size = target.shape[0] 81 | target = target[:, :logits.shape[1]] 82 | mask = mask[:, :logits.shape[1]] 83 | target = target.contiguous().view(-1) 84 | mask = mask.contiguous().view(-1) 85 | loss = self.loss_fn(logits, target) 86 | output = torch.sum(loss * mask) / batch_size 87 | return output 88 | 89 | 90 | def cal_loss(pred, gold, smoothing): 91 | ''' Calculate cross entropy loss, apply label smoothing if needed. ''' 92 | 93 | gold = gold.contiguous().view(-1) 94 | 95 | if smoothing: 96 | eps = 0.1 97 | n_class = pred.size(1) 98 | 99 | one_hot = torch.zeros_like(pred).scatter(1, gold.view(-1, 1), 1) 100 | one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1) 101 | log_prb = F.log_softmax(pred, dim=1) 102 | 103 | non_pad_mask = gold.ne(Constants.PAD) 104 | loss = -(one_hot * log_prb).sum(dim=1) 105 | loss = loss.masked_select(non_pad_mask).sum() # average later 106 | else: 107 | loss = F.cross_entropy(pred, gold, ignore_index=Constants.PAD, reduction='sum') 108 | 109 | return loss 110 | 111 | 112 | def cal_performance(pred, gold, smoothing=False): 113 | ''' Apply label smoothing if needed ''' 114 | 115 | loss = cal_loss(pred, gold, smoothing) 116 | 117 | pred = pred.max(1)[1] 118 | gold = gold.contiguous().view(-1) 119 | non_pad_mask = gold.ne(Constants.PAD) 120 | n_correct = pred.eq(gold) 121 | n_correct = n_correct.masked_select(non_pad_mask).sum().item() 122 | return loss, n_correct 123 | 124 | 125 | def pos_emb_generation(word_labels): 126 | ''' 127 | Generate the position embedding input for Transformers. 128 | ''' 129 | 130 | seq = list(range(1, word_labels.shape[1] + 1)) 131 | tgt_pos = torch.tensor([seq] * word_labels.shape[0]).cuda() 132 | binary_mask = (word_labels != 0).long() 133 | 134 | return tgt_pos*binary_mask 135 | 136 | 137 | def show_prediction(seq_probs, labels, vocab, caption=True): 138 | ''' 139 | :return: predicted words and GT words. 
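    Expected inputs (inferred from the code below):
      seq_probs -- flattened decoder logits of shape (N * seq_len, vocab_size)
      labels    -- ground-truth word indices of shape (N, seq_len); index 0 is skipped as padding
      vocab     -- ix_to_word mapping with string keys, as loaded from the json vocab files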
140 | ''' 141 | # Print out the predicted sentences and GT 142 | probs = seq_probs.view(labels.shape[0], labels.shape[1], -1)[0]  # logits of the first sample in the batch 143 | pred_idx = torch.argmax(probs, 1)  # greedy word index per time step 144 | # print(' \n') 145 | if caption: 146 | print('Caption: ') 147 | else: 148 | print('CMS: ') 149 | 150 | pr = 'Generation: ', list_to_sentence([vocab[str(widx.cpu().numpy())] for widx in pred_idx if widx != 0]) 151 | gt = 'GT: ', list_to_sentence([vocab[str(word.cpu().numpy())] for word in labels[0] if word != 0]) 152 | print(pr) 153 | print(gt) 154 | return pr, gt 155 | 156 | --------------------------------------------------------------------------------
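A minimal usage sketch for the loss helpers in utils/utils.py (illustrative only: the batch size, sequence length, toy vocabulary size and CPU-only tensors are assumptions, not values taken from the project's training scripts):

import torch
from model.transformer import Constants
from utils.utils import cal_performance

batch, seq_len, vocab = 2, 5, 10
pred = torch.randn(batch * seq_len, vocab)        # flattened decoder logits, as cal_loss expects
gold = torch.randint(1, vocab, (batch, seq_len))  # ground-truth word indices
gold[:, -2:] = Constants.PAD                      # pretend the last two positions are padding

loss, n_correct = cal_performance(pred, gold, smoothing=True)
print(loss.item(), n_correct)                     # label-smoothed loss and correct non-pad tokens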