├── README.md
├── charades_test.csv
├── data_charades.py
├── evaluation_charades.py
├── model_charades.py
├── test_charades.py
├── vocab.py
└── vocab
    └── vocab.pkl

/README.md:
--------------------------------------------------------------------------------
### Weakly Supervised Video Moment Retrieval from Text Queries

Code to evaluate "Weakly Supervised Video Moment Retrieval from Text Queries" (Niluthpol C. Mithun, Sujoy Paul, and Amit K. Roy-Chowdhury), CVPR 2019.

### Dependencies

This code is written in Python 3. The necessary packages are:

* PyTorch (>0.4) and torchvision
* NumPy
* pycocotools
* pandas
* matplotlib
* NLTK (with the Punkt sentence tokenizer)
```python
import nltk
nltk.download('punkt')
```


### Evaluate Models

* Download models from https://drive.google.com/drive/folders/1iJLdITzcT95wDj5nF85pOZpP5GCwfPbH
* Please follow https://github.com/jiyanggao/TALL to download the Charades-STA annotations
* To evaluate on the Charades-STA dataset: `python test_charades.py` (see the snippet below if your data and model paths differ)
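
`test_charades.py` hard-codes the dataset and checkpoint locations. If yours differ, the evaluation entry point can also be called directly; a minimal sketch of such a driver script is shown below (the two paths are placeholders that you must edit):

```python
from vocab import Vocabulary  # needed so the pickled vocabulary can be unpickled
import evaluation_charades as evaluation

# Placeholder paths -- point these at your Charades-STA data root and the
# directory containing the downloaded checkpoint.
DATA_PATH = '/path/to/Text_Video_Moment/'
RUN_PATH = '/path/to/runs/'

evaluation.evalrank(RUN_PATH + "test_charades/model_best.pth.tar",
                    data_path=DATA_PATH, split="test")
```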

### Reference
If you use our code, please cite the following paper:

> @inproceedings{mithun2019weakly,
    title={Weakly supervised video moment retrieval from text queries},
    author={Mithun, Niluthpol Chowdhury and Paul, Sujoy and Roy-Chowdhury, Amit K},
    booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
    pages={11592--11601},
    year={2019}
  }

### Updates on Charades-STA performance
A small bug in our Charades-STA evaluation code (related to rounding to one decimal place) led to slightly higher scores being reported for the proposed approach in Table 1 of the CVPR paper. We fixed the bug and updated the table in the arXiv version. Note that our conclusions remain the same after the update.
The updated results of the proposed approach on the Charades-STA dataset are below. Please compare against these numbers when using Charades-STA.


| Method   | IoU=0.3, R@1 | IoU=0.3, R@5 | IoU=0.3, R@10 | IoU=0.5, R@1 | IoU=0.5, R@5 | IoU=0.5, R@10 | IoU=0.7, R@1 | IoU=0.7, R@5 | IoU=0.7, R@10 |
| :------- | -----------: | -----------: | ------------: | -----------: | -----------: | ------------: | -----------: | -----------: | ------------: |
| Proposed | 29.68 | 83.87 | 98.41 | 17.04 | 58.17 | 83.44 | 6.93 | 26.80 | 44.06 |


-- Parts of the code are borrowed from https://github.com/fartashf/vsepp

-- Contact: Niluthpol Chowdhury Mithun (nmith001@ucr.edu)
--------------------------------------------------------------------------------
/data_charades.py:
--------------------------------------------------------------------------------
import torch
import torch.utils.data as data
import torchvision.transforms as transforms
import os
import nltk
import numpy as np
import pandas
import scipy.io as sio
import skimage.measure as scikit


class Charades(data.Dataset):
    """
    Load precomputed captions and video (C3D) features
    """

    def __init__(self, data_split, dpath, vocab):
        self.vocab = vocab
        path = dpath + "/Caption/charades_" + str(data_split) + ".csv"
        df = pandas.read_csv(path)
        df_temp = pandas.read_csv(path, dtype={'ID': object})
        self.inds = df_temp['video']
        self.desc = df['description']
        self.data_split = data_split
        self.data_path = dpath

    def __getitem__(self, index):
        img_id = index
        inds = self.inds
        desc = self.desc

        video_feat_file = self.data_path + "/c3d_features/" + str(inds[index]) + ".mat"
        video_feat_mat = sio.loadmat(video_feat_file)
        video_feat = video_feat_mat['feature']
        # 128-frame segment features (mean-pool every 8 clip features)
        video_feat1 = scikit.block_reduce(video_feat, block_size=(8, 1), func=np.mean)
        # 256-frame segment features (mean-pool every 16 clip features)
        video_feat2 = scikit.block_reduce(video_feat, block_size=(16, 1), func=np.mean)
        # concatenation of the 128-frame and 256-frame segment features
        video_feat = np.concatenate((video_feat1, video_feat2), axis=0)

        image = torch.Tensor(video_feat)
        caption = desc[index]
        vocab = self.vocab

        # Convert caption (string) to word ids.
        tokens = nltk.tokenize.word_tokenize(str(caption).lower())
        caption = []
        caption.append(vocab('<start>'))
        caption.extend([vocab(token) for token in tokens])
        caption.append(vocab('<end>'))
        target = torch.Tensor(caption)
        return image, target, index, img_id

    def __len__(self):
        return len(self.desc)


def collate_fn(data):
    """Build mini-batch tensors from a list of (image, caption) tuples.
    Args:
        data: list of (image, caption) tuples.

    Returns:
        images: torch tensor of shape (batch_size, padded_video_length, feature_size).
        targets: torch tensor of shape (batch_size, padded_length).
        lengths: list; valid length for each padded caption.
71 | """ 72 | # Sort a data list by caption length 73 | data.sort(key=lambda x: len(x[1]), reverse=True) 74 | images, captions, ids, img_ids = zip(*data) 75 | 76 | # Merge images (convert tuple of 3D tensor to 4D tensor) 77 | lengths_img = [len(im) for im in images] 78 | 79 | target_images = torch.zeros(len(images), max(lengths_img), 4096) 80 | 81 | #images = torch.stack(images, 0) 82 | for i, im in enumerate(images): 83 | end = lengths_img[i] 84 | target_images[i, :end,] = im[:end,] 85 | 86 | # Merget captions (convert tuple of 1D tensor to 2D tensor) 87 | lengths = [len(cap) for cap in captions] 88 | targets = torch.zeros(len(captions), max(lengths)).long() 89 | for i, cap in enumerate(captions): 90 | end = lengths[i] 91 | targets[i, :end] = cap[:end] 92 | 93 | return target_images, targets, lengths, lengths_img, ids 94 | 95 | 96 | 97 | def get_charades_loader(data_split, dpath, vocab, opt, batch_size=100, 98 | shuffle=True, num_workers=2): 99 | """Returns torch.utils.data.DataLoader for custom coco dataset.""" 100 | dset = Charades(data_split, dpath, vocab) 101 | 102 | data_loader = torch.utils.data.DataLoader(dataset=dset, 103 | batch_size=batch_size, 104 | shuffle=shuffle, 105 | pin_memory=True, 106 | collate_fn=collate_fn) 107 | return data_loader 108 | 109 | 110 | 111 | def get_loaders(data_name, vocab, crop_size, batch_size, workers, opt): 112 | dpath = os.path.join(opt.data_path, data_name) 113 | train_loader = get_charades_loader('train', dpath, vocab, opt, 114 | batch_size, True, workers) 115 | val_loader = get_charades_loader('val', dpath, vocab, opt, 116 | batch_size, False, workers) 117 | 118 | 119 | return train_loader, val_loader 120 | 121 | 122 | def get_test_loader(split_name, data_name, vocab, crop_size, batch_size, 123 | workers, opt): 124 | dpath = os.path.join(opt.data_path, data_name) 125 | test_loader = get_charades_loader(split_name, dpath, vocab, opt, 126 | batch_size, False, workers) 127 | 128 | 129 | return test_loader 130 | 131 | 132 | -------------------------------------------------------------------------------- /evaluation_charades.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import pickle 4 | 5 | import numpy 6 | from data_charades import get_test_loader 7 | import time 8 | import numpy as np 9 | from vocab import Vocabulary 10 | import torch 11 | from model_charades import VSE, order_sim 12 | from collections import OrderedDict 13 | import pandas 14 | 15 | 16 | class AverageMeter(object): 17 | """Computes and stores the average and current value""" 18 | 19 | def __init__(self): 20 | self.reset() 21 | 22 | def reset(self): 23 | self.val = 0 24 | self.avg = 0 25 | self.sum = 0 26 | self.count = 0 27 | 28 | def update(self, val, n=1): 29 | self.val = val 30 | self.sum += val * n 31 | self.count += n 32 | self.avg = self.sum / (.0001 + self.count) 33 | 34 | def __str__(self): 35 | """String representation for logging 36 | """ 37 | # for values that should be recorded exactly e.g. 
iteration number 38 | if self.count == 0: 39 | return str(self.val) 40 | # for stats 41 | return '%.4f (%.4f)' % (self.val, self.avg) 42 | 43 | 44 | class LogCollector(object): 45 | """A collection of logging objects that can change from train to val""" 46 | 47 | def __init__(self): 48 | # to keep the order of logged variables deterministic 49 | self.meters = OrderedDict() 50 | 51 | def update(self, k, v, n=0): 52 | # create a new meter if previously not recorded 53 | if k not in self.meters: 54 | self.meters[k] = AverageMeter() 55 | self.meters[k].update(v, n) 56 | 57 | def __str__(self): 58 | """Concatenate the meters in one log line 59 | """ 60 | s = '' 61 | for i, (k, v) in enumerate(self.meters.items()): 62 | if i > 0: 63 | s += ' ' 64 | s += k + ' ' + str(v) 65 | return s 66 | 67 | def tb_log(self, tb_logger, prefix='', step=None): 68 | """Log using tensorboard 69 | """ 70 | for k, v in self.meters.iteritems(): 71 | tb_logger.log_value(prefix + k, v.val, step=step) 72 | 73 | 74 | def encode_data(model, data_loader, log_step=10, logging=print): 75 | """Encode all images and captions loadable by `data_loader` 76 | """ 77 | batch_time = AverageMeter() 78 | val_logger = LogCollector() 79 | 80 | # switch to evaluate mode 81 | model.val_start() 82 | 83 | end = time.time() 84 | 85 | # numpy array to keep all the embeddings 86 | img_embs = None 87 | cap_embs = None 88 | #attn_weights = 89 | for i, (images, captions, lengths, lengths_img, ids) in enumerate(data_loader): 90 | # make sure val logger is used 91 | model.logger = val_logger 92 | 93 | # compute the embeddings 94 | img_emb, cap_emb, attn_weight_s = model.forward_emb(images, captions, lengths, lengths_img, volatile=True) 95 | 96 | if(attn_weight_s.size(1)<10): 97 | attn_weight=torch.zeros(attn_weight_s.size(0),10,attn_weight_s.size(2)) 98 | attn_weight[:,0:attn_weight_s.size(1),:]=attn_weight_s 99 | else: 100 | attn_weight=attn_weight_s 101 | 102 | batch_length=attn_weight.size(0) 103 | attn_weight=torch.squeeze(attn_weight) 104 | 105 | # initialize the numpy arrays given the size of the embeddings 106 | if img_embs is None: 107 | img_embs = np.zeros((len(data_loader.dataset), img_emb.size(1))) 108 | cap_embs = np.zeros((len(data_loader.dataset), cap_emb.size(1))) 109 | attention_index = np.zeros((len(data_loader.dataset), 10)) 110 | rank1_ind = np.zeros((len(data_loader.dataset))) 111 | lengths_all = np.zeros((len(data_loader.dataset))) 112 | 113 | 114 | attn_index= np.zeros((batch_length, 10)) # Rank 1 to 10 115 | rank_att1= np.zeros(batch_length) 116 | temp=attn_weight.data.cpu().numpy().copy() 117 | for k in range(batch_length): 118 | att_weight=temp[k,:] 119 | sc_ind=numpy.argsort(-att_weight) 120 | rank_att1[k]=sc_ind[0] 121 | attn_index[k,:]=sc_ind[0:10] 122 | 123 | # preserve the embeddings by copying from gpu and converting to numpy 124 | img_embs[ids] = img_emb.data.cpu().numpy().copy() 125 | cap_embs[ids] = cap_emb.data.cpu().numpy().copy() 126 | attention_index[ids] = attn_index 127 | lengths_all[ids] = lengths_img 128 | rank1_ind[ids] = rank_att1 129 | 130 | # measure accuracy and record loss 131 | model.forward_loss(img_emb, cap_emb) 132 | 133 | # measure elapsed time 134 | batch_time.update(time.time() - end) 135 | end = time.time() 136 | 137 | if i % log_step == 0: 138 | logging('Test: [{0}/{1}]\t' 139 | '{e_log}\t' 140 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 141 | .format( 142 | i, len(data_loader), batch_time=batch_time, 143 | e_log=str(model.logger))) 144 | del images, captions 145 | 146 | return 
img_embs, cap_embs, attention_index, lengths_all 147 | 148 | 149 | #def cIoU_old(a,b,prec): 150 | # return np.around(1.0*(min(a[1], b[1])-max(a[0], b[0]))/(max(a[1], b[1])-min(a[0], b[0])),decimals=prec) 151 | 152 | 153 | def cIoU(pred, gt): 154 | intersection = max(0, min(pred[1], gt[1]) + 1 - max(pred[0], gt[0])) 155 | union = max(pred[1], gt[1]) + 1 - min(pred[0], gt[0]) 156 | return float(intersection)/union 157 | 158 | def evalrank(model_path, data_path=None, split='dev', fold5=False): 159 | """ 160 | Evaluate a trained model. 161 | """ 162 | # load model and options 163 | checkpoint = torch.load(model_path) 164 | opt = checkpoint['opt'] 165 | 166 | if data_path is not None: 167 | opt.data_path = data_path 168 | opt.vocab_path = "./vocab/" 169 | # load vocabulary 170 | vocab = pickle.load(open(os.path.join( 171 | opt.vocab_path, 'vocab.pkl'), 'rb')) 172 | 173 | opt.vocab_size = len(vocab) 174 | 175 | # construct model 176 | model = VSE(opt) 177 | 178 | # load model state 179 | model.load_state_dict(checkpoint['model']) 180 | print(opt) 181 | 182 | ####### input video files 183 | path= os.path.join(opt.data_path, opt.data_name)+"/Caption/charades_"+ str(split) + ".csv" 184 | df=pandas.read_csv(open(path,'rb')) 185 | #columns=df.columns 186 | inds=df['video'] 187 | desc=df['description'] 188 | 189 | print('Loading dataset') 190 | data_loader = get_test_loader(split, opt.data_name, vocab, opt.crop_size, 191 | opt.batch_size, opt.workers, opt) 192 | 193 | print('Computing results...') 194 | img_embs, cap_embs, attn_index, lengths_img = encode_data(model, data_loader) 195 | 196 | print(img_embs.shape) 197 | print(cap_embs.shape) 198 | print('Images: %d, Captions: %d' % 199 | (img_embs.shape[0], cap_embs.shape[0])) 200 | 201 | # retrieve moments 202 | r13, r15, r17 = t2i(img_embs, cap_embs, df, attn_index, lengths_img, measure=opt.measure, return_ranks=True) 203 | 204 | def t2i(images, captions, df, attn_index, lengths_img, npts=None, measure='cosine', return_ranks=False): 205 | """ 206 | Text->Images (Image Search) 207 | Images: (N, K) matrix of images 208 | Captions: (N, K) matrix of captions 209 | """ 210 | inds=df['video'] 211 | desc=df['description'] 212 | start_segment=df['start_segment'] 213 | end_segment=df['end_segment'] 214 | 215 | if npts is None: 216 | npts = images.shape[0] 217 | ims = numpy.array([images[i] for i in range(0, len(images), 1)]) 218 | 219 | ranks = numpy.zeros(int(npts)) 220 | top1 = numpy.zeros(int(npts)) 221 | average_ranks = [] 222 | average_iou = [] 223 | correct_num05=0 224 | correct_num07=0 225 | correct_num03=0 226 | 227 | R5IOU5=0 228 | R5IOU7=0 229 | R5IOU3=0 230 | R10IOU3=0 231 | R10IOU5=0 232 | R10IOU7=0 233 | 234 | for index in range(int(npts)): 235 | att_inds=attn_index[index,:] 236 | len_img=lengths_img[index] 237 | gt_start=start_segment[index] 238 | gt_end=end_segment[index] 239 | break_128=np.floor(len_img*2/3)-1 240 | rank1_start=att_inds[0] 241 | if (rank1_start=0.5: 250 | correct_num05+=1 251 | if iou>=0.7: 252 | correct_num07+=1 253 | if iou>=0.3: 254 | correct_num03+=1 255 | 256 | for j1 in range(5): 257 | if (att_inds[j1]=0.5: 266 | R5IOU5+=1 267 | break 268 | 269 | for j1 in range(5): 270 | if (att_inds[j1]=0.7: 279 | R5IOU7+=1 280 | break 281 | 282 | for j1 in range(5): 283 | if (att_inds[j1]=0.3: 292 | R5IOU3+=1 293 | break 294 | 295 | for j1 in range(10): 296 | if (att_inds[j1]=0.5: 305 | R10IOU5+=1 306 | break 307 | 308 | for j1 in range(10): 309 | if (att_inds[j1]=0.7: 318 | R10IOU7+=1 319 | break 320 | 321 | for j1 in range(10): 322 
| if (att_inds[j1]=0.3: 331 | R10IOU3+=1 332 | break 333 | 334 | 335 | 336 | 337 | ############################ 338 | 339 | # Compute metrics 340 | R1IoU05=correct_num05 341 | R1IoU07=correct_num07 342 | R1IoU03=correct_num03 343 | total_length=images.shape[0] 344 | #print('total length',total_length) 345 | print("R@1 IoU0.3: %f" %(R1IoU03/float(total_length))) 346 | print("R@5 IoU0.3: %f" %(R5IOU3/float(total_length))) 347 | print("R@10 IoU0.3: %f" %(R10IOU3/float(total_length))) 348 | 349 | print("R@1 IoU0.5: %f" %(R1IoU05/float(total_length))) 350 | print("R@5 IoU0.5: %f" %(R5IOU5/float(total_length))) 351 | print("R@10 IoU0.5: %f" %(R10IOU5/float(total_length))) 352 | 353 | print("R@1 IoU0.7: %f" %(R1IoU07/float(total_length))) 354 | print("R@5 IoU0.7: %f" %(R5IOU7/float(total_length))) 355 | print("R@10 IoU0.7: %f" %(R10IOU7/float(total_length))) 356 | 357 | 358 | return R1IoU03, R1IoU05, R1IoU07 359 | 360 | -------------------------------------------------------------------------------- /model_charades.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.init 4 | import torchvision.models as models 5 | from torch.autograd import Variable 6 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 7 | import torch.backends.cudnn as cudnn 8 | from torch.nn.utils.clip_grad import clip_grad_norm 9 | import numpy as np 10 | from collections import OrderedDict 11 | 12 | 13 | def l2norm(X): 14 | """L2-normalize columns of X 15 | """ 16 | norm = torch.pow(X, 2).sum(dim=1).sqrt() 17 | X = X / norm[:,None] 18 | return X 19 | 20 | 21 | def EncoderImage(data_name, img_dim, embed_size, finetune=False, 22 | cnn_type='vgg19', use_abs=False, no_imgnorm=False): 23 | """A wrapper to image encoders. Chooses between an encoder that uses 24 | precomputed image features, `EncoderImagePrecomp`, or an encoder that 25 | computes image features on the fly `EncoderImageFull`. We used Precomp 26 | """ 27 | #print img_dim 28 | if data_name.endswith('_precomp'): 29 | img_enc = EncoderImagePrecomp( 30 | img_dim, embed_size, use_abs, no_imgnorm) 31 | else: 32 | img_enc = EncoderImageFull( 33 | embed_size, finetune, cnn_type, use_abs, no_imgnorm) 34 | 35 | return img_enc 36 | 37 | 38 | # tutorials/09 - Image Captioning 39 | class EncoderImageFull(nn.Module): 40 | 41 | def __init__(self, embed_size, finetune=False, cnn_type='vgg19', 42 | use_abs=False, no_imgnorm=False): 43 | """Load pretrained VGG19 and replace top fc layer.""" 44 | super(EncoderImageFull, self).__init__() 45 | self.embed_size = embed_size 46 | self.no_imgnorm = no_imgnorm 47 | self.use_abs = use_abs 48 | 49 | # Load a pre-trained model 50 | self.cnn = self.get_cnn(cnn_type, True) 51 | 52 | # For efficient memory usage. 
53 | for param in self.cnn.parameters(): 54 | param.requires_grad = finetune 55 | 56 | # Replace the last fully connected layer of CNN with a new one 57 | if cnn_type.startswith('vgg'): 58 | self.fc = nn.Linear(self.cnn.classifier._modules['6'].in_features, 59 | embed_size) 60 | self.cnn.classifier = nn.Sequential( 61 | *list(self.cnn.classifier.children())[:-1]) 62 | elif cnn_type.startswith('resnet'): 63 | self.fc = nn.Linear(self.cnn.module.fc.in_features, embed_size) 64 | self.cnn.module.fc = nn.Sequential() 65 | 66 | self.init_weights() 67 | 68 | def get_cnn(self, arch, pretrained): 69 | """Load a pretrained CNN and parallelize over GPUs 70 | """ 71 | if pretrained: 72 | #print("=> using pre-trained model '{}'".format(arch)) 73 | model = models.__dict__[arch](pretrained=True) 74 | else: 75 | #print("=> creating model '{}'".format(arch)) 76 | model = models.__dict__[arch]() 77 | 78 | if arch.startswith('alexnet') or arch.startswith('vgg'): 79 | model.features = nn.DataParallel(model.features) 80 | model.cuda() 81 | else: 82 | model = nn.DataParallel(model).cuda() 83 | 84 | return model 85 | 86 | def init_weights(self): 87 | """Xavier initialization for the fully connected layer 88 | """ 89 | r = np.sqrt(6.) / np.sqrt(self.fc.in_features + 90 | self.fc.out_features) 91 | self.fc.weight.data.uniform_(-r, r) 92 | self.fc.bias.data.fill_(0) 93 | 94 | def forward(self, images): 95 | """Extract image feature vectors.""" 96 | features = self.cnn(images) 97 | 98 | # normalization in the image embedding space 99 | features = l2norm(features) 100 | 101 | # linear projection to the joint embedding space 102 | features = self.fc(features) 103 | 104 | # normalization in the joint embedding space 105 | if not self.no_imgnorm: 106 | features = l2norm(features) 107 | 108 | # take the absolute value of the embedding (used in order embeddings) 109 | if self.use_abs: 110 | features = torch.abs(features) 111 | 112 | return features 113 | 114 | 115 | 116 | 117 | def cross_attention(x1, x2, dim=2): 118 | """Returns cosine similarity based attentionbetween x1 and x2, computed along dim.""" 119 | w1=torch.bmm(x1, x2.unsqueeze(2)) 120 | return w1 121 | 122 | 123 | 124 | class EncoderImagePrecomp(nn.Module): 125 | 126 | def __init__(self, img_dim, embed_size, use_abs=False,no_imgnorm=False): 127 | super(EncoderImagePrecomp, self).__init__() 128 | self.embed_size = embed_size 129 | self.no_imgnorm = no_imgnorm 130 | self.use_abs = use_abs 131 | 132 | self.ws1 = nn.Linear(img_dim, embed_size) 133 | self.softmax = nn.Softmax(0) 134 | self.fc = nn.Linear(embed_size, embed_size) 135 | 136 | self.init_weights() 137 | 138 | def init_weights(self): 139 | """Xavier initialization for the fully connected layer 140 | """ 141 | r = np.sqrt(6.) / np.sqrt(self.fc.in_features + 142 | self.fc.out_features) 143 | self.fc.weight.data.uniform_(-r, r) 144 | self.fc.bias.data.fill_(0) 145 | r1 = np.sqrt(6.) 
/ np.sqrt(self.ws1.in_features + 146 | self.ws1.out_features) 147 | self.ws1.weight.data.uniform_(-r, r) 148 | self.ws1.bias.data.fill_(0) 149 | 150 | 151 | def forward(self, images, cap_embs,lengths_img): 152 | """Extract image feature vectors.""" 153 | # assuming that the precomputed features are already l2-normalized 154 | size = images.size() 155 | 156 | image_feature=self.ws1(images) 157 | attn_weights=cross_attention(image_feature, cap_embs, dim=2) 158 | 159 | att_weights_s=torch.zeros(attn_weights.shape) 160 | 161 | for i in range(size[0]): 162 | temp=self.softmax(attn_weights[i,0:lengths_img[i],:]) 163 | att_weights_s[i,0:lengths_img[i],:] = temp.data 164 | 165 | attn_weights=Variable(att_weights_s.cuda()) 166 | out=torch.bmm(image_feature.transpose(1,2),attn_weights) 167 | 168 | out=torch.squeeze(out) 169 | 170 | features = self.fc(out) 171 | 172 | # normalize in the joint embedding space 173 | if not self.no_imgnorm: 174 | features = l2norm(features) 175 | 176 | # take the absolute value of embedding 177 | if self.use_abs: 178 | features = torch.abs(features) 179 | 180 | return features, attn_weights 181 | 182 | def load_state_dict(self, state_dict): 183 | """Copies parameters. overwritting the default one to 184 | accept state_dict from Full model 185 | """ 186 | own_state = self.state_dict() 187 | new_state = OrderedDict() 188 | for name, param in state_dict.items(): 189 | if name in own_state: 190 | new_state[name] = param 191 | 192 | super(EncoderImagePrecomp, self).load_state_dict(new_state) 193 | 194 | 195 | # tutorials/08 - Language Model 196 | # RNN Based Language Model 197 | class EncoderText(nn.Module): 198 | 199 | def __init__(self, vocab_size, word_dim, embed_size, num_layers, 200 | use_abs=False): 201 | super(EncoderText, self).__init__() 202 | self.use_abs = use_abs 203 | self.embed_size = embed_size 204 | 205 | # word embedding 206 | self.embed = nn.Embedding(vocab_size, word_dim) 207 | 208 | # caption embedding 209 | self.rnn = nn.GRU(word_dim, embed_size, num_layers, batch_first=True) 210 | 211 | self.init_weights() 212 | 213 | def init_weights(self): 214 | self.embed.weight.data.uniform_(-0.1, 0.1) 215 | 216 | def forward(self, x, lengths): 217 | """Handles variable size captions 218 | """ 219 | # Embed word ids to vectors 220 | 221 | x = self.embed(x) 222 | 223 | packed = pack_padded_sequence(x, lengths, batch_first=True) 224 | 225 | # Forward propagate RNN 226 | out, _ = self.rnn(packed) 227 | 228 | # Reshape *final* output to (batch_size, hidden_size) 229 | padded = pad_packed_sequence(out, batch_first=True) 230 | I = torch.LongTensor(lengths).view(-1, 1, 1) 231 | I = Variable(I.expand(x.size(0), 1, self.embed_size)-1).cuda() 232 | out = torch.gather(padded[0], 1, I).squeeze(1) 233 | 234 | # normalization in the joint embedding space 235 | out = l2norm(out) 236 | 237 | # take absolute value, used by order embeddings 238 | if self.use_abs: 239 | out = torch.abs(out) 240 | 241 | return out 242 | 243 | 244 | def cosine_sim(im, s): 245 | """Cosine similarity between all the image and sentence pairs 246 | """ 247 | return im.mm(s.t()) 248 | 249 | 250 | def order_sim(im, s): 251 | """Order embeddings similarity measure $max(0, s-im)$ 252 | """ 253 | YmX = (s.unsqueeze(1).expand(s.size(0), im.size(0), s.size(1)) 254 | - im.unsqueeze(0).expand(s.size(0), im.size(0), s.size(1))) 255 | score = -YmX.clamp(min=0).pow(2).sum(2).squeeze(2).sqrt().t() 256 | return score 257 | 258 | 259 | class ContrastiveLoss(nn.Module): 260 | """ 261 | Compute contrastive loss 262 | """ 
263 | 264 | def __init__(self, margin=0, measure=False, max_violation=False): 265 | super(ContrastiveLoss, self).__init__() 266 | self.margin = margin 267 | if measure == 'order': 268 | self.sim = order_sim 269 | else: 270 | self.sim = cosine_sim 271 | 272 | self.max_violation = max_violation 273 | 274 | def forward(self, im, s): 275 | # compute image-sentence score matrix 276 | scores = self.sim(im, s) 277 | ##print('scores') 278 | diagonal = scores.diag().view(im.size(0), 1) 279 | d1 = diagonal.expand_as(scores) 280 | d2 = diagonal.t().expand_as(scores) 281 | 282 | # compare every diagonal score to scores in its column 283 | # caption retrieval 284 | cost_s = (self.margin + scores - d1).clamp(min=0) 285 | 286 | # compare every diagonal score to scores in its row 287 | # image retrieval 288 | cost_im = (self.margin + scores - d2).clamp(min=0) 289 | 290 | # clear diagonals 291 | mask = torch.eye(scores.size(0)) > .5 292 | I = Variable(mask) 293 | if torch.cuda.is_available(): 294 | I = I.cuda() 295 | cost_s = cost_s.masked_fill_(I, 0) 296 | cost_im = cost_im.masked_fill_(I, 0) 297 | # keep the maximum violating negative for each query 298 | if self.max_violation: 299 | cost_s = cost_s.max(1)[0] 300 | cost_im = cost_im.max(0)[0] 301 | 302 | return cost_s.sum() + cost_im.sum() 303 | 304 | class VSE(object): 305 | """ 306 | rkiros/uvs model 307 | """ 308 | 309 | def __init__(self, opt): 310 | # tutorials/09 - Image Captioning 311 | # Build Models 312 | self.grad_clip = opt.grad_clip 313 | self.img_enc = EncoderImage(opt.data_name, opt.img_dim, opt.embed_size, 314 | opt.finetune, opt.cnn_type, 315 | use_abs=opt.use_abs, 316 | no_imgnorm=opt.no_imgnorm) 317 | self.txt_enc = EncoderText(opt.vocab_size, opt.word_dim, 318 | opt.embed_size, opt.num_layers, 319 | use_abs=opt.use_abs) 320 | if torch.cuda.is_available(): 321 | self.img_enc.cuda() 322 | self.txt_enc.cuda() 323 | cudnn.benchmark = True 324 | 325 | # Loss and Optimizer 326 | self.criterion = ContrastiveLoss(margin=opt.margin, 327 | measure=opt.measure, 328 | max_violation=opt.max_violation) 329 | params = list(self.txt_enc.parameters()) 330 | params += list(self.img_enc.fc.parameters()) 331 | if opt.finetune: 332 | params += list(self.img_enc.cnn.parameters()) 333 | self.params = params 334 | 335 | self.optimizer = torch.optim.Adam(params, lr=opt.learning_rate) 336 | 337 | self.Eiters = 0 338 | 339 | def state_dict(self): 340 | state_dict = [self.img_enc.state_dict(), self.txt_enc.state_dict()] 341 | return state_dict 342 | 343 | def load_state_dict(self, state_dict): 344 | self.img_enc.load_state_dict(state_dict[0]) 345 | self.txt_enc.load_state_dict(state_dict[1]) 346 | 347 | def train_start(self): 348 | """switch to train mode 349 | """ 350 | self.img_enc.train() 351 | self.txt_enc.train() 352 | 353 | def val_start(self): 354 | """switch to evaluate mode 355 | """ 356 | self.img_enc.eval() 357 | self.txt_enc.eval() 358 | 359 | def forward_emb(self, images, captions, lengths, lengths_img, volatile=False): 360 | """Compute the image and caption embeddings 361 | """ 362 | # Set mini-batch dataset 363 | images = Variable(images, volatile=volatile) 364 | captions = Variable(captions, volatile=volatile) 365 | if torch.cuda.is_available(): 366 | images = images.cuda() 367 | captions = captions.cuda() 368 | 369 | # Forward 370 | cap_init_emb = self.txt_enc(captions, lengths) 371 | img_emb, attn_weights = self.img_enc(images,cap_init_emb,lengths_img) 372 | cap_emp=cap_init_emb 373 | return img_emb, cap_emb, attn_weights 374 | 375 | def 
forward_emb_image(self, images, volatile=False):
        """Compute the image embedding only
        """
        # Set mini-batch dataset
        images = Variable(images, volatile=volatile)

        if torch.cuda.is_available():
            images = images.cuda()
            #captions = captions.cuda()

        # Forward
        img_emb = self.img_enc(images)
        return img_emb

    def forward_emb_caption(self, captions, lengths, volatile=False):
        """Compute the caption embedding only
        """
        # Set mini-batch dataset
        captions = Variable(captions, volatile=volatile)
        if torch.cuda.is_available():
            captions = captions.cuda()

        # Forward
        cap_emb = self.txt_enc(captions, lengths)
        return cap_emb

    def forward_loss(self, img_emb, cap_emb, **kwargs):
        """Compute the loss given pairs of image and caption embeddings
        """
        loss = self.criterion(img_emb, cap_emb)
        self.logger.update('Le', loss.data, img_emb.size(0))
        return loss

    def train_emb(self, images, captions, lengths, lengths_img, ids=None, *args):
        """One training step given images and captions.
        """
        ##print(ids)
        self.Eiters += 1
        self.logger.update('Eit', self.Eiters)
        self.logger.update('lr', self.optimizer.param_groups[0]['lr'])

        # compute the embeddings
        img_emb, cap_emb, attn_weights = self.forward_emb(images, captions, lengths, lengths_img)

        # measure accuracy and record loss
        self.optimizer.zero_grad()
        loss = self.forward_loss(img_emb, cap_emb)

        # compute gradient and do SGD step
        loss.backward()
        if self.grad_clip > 0:
            clip_grad_norm(self.params, self.grad_clip)
        self.optimizer.step()

--------------------------------------------------------------------------------
/test_charades.py:
--------------------------------------------------------------------------------
from vocab import Vocabulary
import evaluation_charades as evaluation

DATA_PATH = '/data/usr/datasets/Text_Video_Moment/'
RUN_PATH = '/home/usr/python/weak_supervised_video_moment/runs/'

evaluation.evalrank(RUN_PATH + "test_charades/model_best.pth.tar", data_path=DATA_PATH, split="test")
--------------------------------------------------------------------------------
/vocab.py:
--------------------------------------------------------------------------------
class Vocabulary(object):
    """Simple vocabulary wrapper."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        if word not in self.word2idx:
            return self.word2idx['<unk>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.word2idx)
--------------------------------------------------------------------------------
/vocab/vocab.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/niluthpol/weak_supervised_video_moment/97fa2c052ef909a1dee4e2dec5b7e7a82cb3338e/vocab/vocab.pkl
--------------------------------------------------------------------------------
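
For reference, this is roughly how the pickled vocabulary is used to turn a text query into word ids, mirroring `data_charades.py`. It is a minimal sketch: the example query is made up, and it assumes the `'<start>'`, `'<end>'` and `'<unk>'` special tokens are present in `vocab/vocab.pkl`.

```python
import pickle
import nltk
import torch
from vocab import Vocabulary  # class definition must be importable for unpickling

with open('./vocab/vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

query = "A person opens the door."  # hypothetical example query
tokens = nltk.tokenize.word_tokenize(query.lower())

# Bracket the token ids with the special start/end tokens; words missing from
# the vocabulary fall back to '<unk>' inside Vocabulary.__call__ (assumed token).
caption = [vocab('<start>')] + [vocab(t) for t in tokens] + [vocab('<end>')]
target = torch.Tensor(caption)
print(len(vocab), target)
```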