├── imgs ├── ant.PNG ├── diag.JPG └── rec.PNG ├── data └── README.md ├── models_anticipation └── README.md ├── models_recognition └── README.md ├── LICENSE ├── non_local_embedded_gaussian.py ├── utils.py ├── network.py ├── README.md ├── dataset_recognition.py ├── dataset_anticipation.py ├── main_recognition.py └── main_anticipation.py /imgs/ant.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dibschat/tempAgg/HEAD/imgs/ant.PNG -------------------------------------------------------------------------------- /imgs/diag.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dibschat/tempAgg/HEAD/imgs/diag.JPG -------------------------------------------------------------------------------- /imgs/rec.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dibschat/tempAgg/HEAD/imgs/rec.PNG -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | Follow the [RU-LSTM](https://github.com/fpv-iplab/rulstm) repository to download the RGB, Flow and Obj features and the train/val/test splits, and keep them in the `data/ek55` or `data/ek100` folder depending on the dataset. 2 | 3 | For ROI features, we consider the union of the hand-object interaction bbox annotations provided by the authors of EPIC-KITCHENS-100 ([link](https://github.com/epic-kitchens/epic-kitchens-100-hand-object-bboxes)) as input and extract RGB features with TSN as explained [here](https://github.com/fpv-iplab/rulstm#feature-extraction). 4 | -------------------------------------------------------------------------------- /models_anticipation/README.md: -------------------------------------------------------------------------------- 1 | Create a folder `ek100` by running: 2 | 3 | `mkdir -p ek100` 4 | 5 | Download the anticipation models pretrained on the EPIC-KITCHENS-100 train split. 6 | * RGB: [link](https://drive.google.com/file/d/10CN66X8GCnqVaAT3U79wC2YmukpuBsQG/view?usp=sharing) 7 | * Flow: [link](https://drive.google.com/file/d/1oh3xQpKOF1NZNvEUPQ0fmFnhIf-FXkDg/view?usp=sharing) 8 | * Obj: [link](https://drive.google.com/file/d/16TIm67FElwfv772HFmPGoZU7yq3w6IYg/view?usp=sharing) 9 | * ROI: [link](https://drive.google.com/file/d/1lMQAAD-gv3ksOvuQyOMuHaj6dJFeVj2X/view?usp=sharing) -------------------------------------------------------------------------------- /models_recognition/README.md: -------------------------------------------------------------------------------- 1 | Create a folder `ek100` by running: 2 | 3 | `mkdir -p ek100` 4 | 5 | Download the recognition models pretrained on the EPIC-KITCHENS-100 train split.
6 | * RGB: [link](https://drive.google.com/file/d/1yic6f8-ZrSyEn3t5OdtCP6Zk8UaEaaVW/view?usp=sharing) 7 | * Flow: [link](https://drive.google.com/file/d/1zURKFv2Fd0CKlnimd-2zaZ3rWPSzgt41/view?usp=sharing) 8 | * Obj: [link](https://drive.google.com/file/d/17BfIYJklllcT9BoLy53kdrvePYJjYPeo/view?usp=sharing) 9 | * ROI: [link](https://drive.google.com/file/d/1xghs_RQ_4SeMsvnkDdPeKSjTFe6I6QcI/view?usp=sharing) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Dibyadip Chatterjee 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /non_local_embedded_gaussian.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | 7 | class NONLocalBlock1D(nn.Module): 8 | def __init__(self, args, recent_dim, spanning_dim, latent_dim): 9 | super(NONLocalBlock1D, self).__init__() 10 | 11 | self.in_dim1 = recent_dim 12 | self.in_dim2 = spanning_dim 13 | 14 | self.scale = args.scale 15 | self.scale_factor = args.scale_factor 16 | 17 | self.dropout_rate = args.dropout_rate 18 | 19 | self.latent_dim = latent_dim 20 | self.video_feat_dim = args.video_feat_dim 21 | 22 | self.theta = nn.Conv1d(in_channels=self.in_dim1, out_channels=self.latent_dim, 23 | kernel_size=1, stride=1, padding=0) 24 | nn.init.normal_(self.theta.weight, mean=0, std=0.01) 25 | nn.init.constant_(self.theta.bias, 0) 26 | 27 | self.phi = nn.Conv1d(in_channels=self.in_dim2, out_channels=self.latent_dim, 28 | kernel_size=1, stride=1, padding=0) 29 | nn.init.normal_(self.phi.weight, mean=0, std=0.01) 30 | nn.init.constant_(self.phi.bias, 0) 31 | 32 | self.g = nn.Conv1d(in_channels=self.in_dim2, out_channels=self.latent_dim, 33 | kernel_size=1, stride=1, padding=0) 34 | nn.init.normal_(self.g.weight, mean=0, std=0.01) 35 | nn.init.constant_(self.g.bias, 0) 36 | 37 | if self.scale: 38 | self.scale_factor = torch.tensor([self.latent_dim ** self.scale_factor], requires_grad=True).to('cuda') 39 | 40 | # """Pre-activation style non-linearity.""" 41 | self.final_layers = nn.Sequential( 42 | nn.LayerNorm(torch.Size([self.latent_dim, self.video_feat_dim])), 43 | nn.ReLU(), 44 | nn.Conv1d(in_channels=self.latent_dim, 
out_channels=self.in_dim1, kernel_size=1, stride=1, padding=0), 45 | nn.Dropout(p=self.dropout_rate), 46 | ) 47 | 48 | def forward(self, x_past, x_curr): 49 | theta_x = self.theta(x_curr) 50 | theta_x = theta_x.permute(0, 2, 1) 51 | 52 | phi_x = self.phi(x_past) 53 | 54 | g_x = self.g(x_past) 55 | g_x = g_x.permute(0, 2, 1) 56 | 57 | # (N, C, num_feat1), (N, C, num_feat2) -> (N, num_feat1, num_feat2) 58 | theta_phi = torch.matmul(theta_x, phi_x) 59 | 60 | if self.scale: 61 | theta_phi = theta_phi * self.scale_factor 62 | 63 | p_x = F.softmax(theta_phi, dim=-1) 64 | 65 | # (N, C, num_feat2), (N, num_feat1, num_feat2) -> (B, C, num_feat1) 66 | t_x = torch.matmul(p_x, g_x) 67 | 68 | t_x = t_x.permute(0, 2, 1).contiguous() 69 | 70 | W_t = self.final_layers(t_x) 71 | 72 | z_x = W_t + x_curr 73 | return z_x 74 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ Set of utilities """ 3 | from torch import nn 4 | import torch 5 | from torch.autograd import Variable 6 | import numpy as np 7 | from torch.nn.modules.loss import _Loss 8 | from torch.nn import functional as F 9 | 10 | class MeanTopKRecallMeter(object): 11 | def __init__(self, num_classes, k=5): 12 | self.num_classes = num_classes 13 | self.k = k 14 | self.reset() 15 | 16 | def reset(self): 17 | self.tps = np.zeros(self.num_classes) 18 | self.nums = np.zeros(self.num_classes) 19 | 20 | def add(self, scores, labels): 21 | tp = (np.argsort(scores, axis=1)[:, -self.k:] == labels.reshape(-1, 1)).max(1) 22 | for l in np.unique(labels): 23 | self.tps[l]+=tp[labels==l].sum() 24 | self.nums[l]+=(labels==l).sum() 25 | 26 | def value(self): 27 | recalls = (self.tps/self.nums)[self.nums>0] 28 | if len(recalls)>0: 29 | return recalls.mean()*100 30 | else: 31 | return None 32 | 33 | 34 | class ValueMeter(object): 35 | def __init__(self): 36 | self.sum = 0 37 | self.total = 0 38 | 39 | def add(self, value, n): 40 | self.sum += value * n 41 | self.total += n 42 | 43 | def value(self): 44 | return self.sum / self.total 45 | 46 | 47 | def topk_accuracy(scores, labels, ks, selected_class=None): 48 | """Computes TOP-K accuracies for different values of k 49 | Args: 50 | scores: numpy nd array, shape = (instance_count, label_count) 51 | labels: numpy nd array, shape = (instance_count,) 52 | ks: tuple of integers 53 | Returns: 54 | list of float: TOP-K accuracy for each k in ks 55 | """ 56 | if selected_class is not None: 57 | idx = labels == selected_class 58 | scores = scores[idx] 59 | labels = labels[idx] 60 | rankings = scores.argsort()[:, ::-1] 61 | maxk = np.max(ks) # trim to max k to avoid extra computation 62 | 63 | # compute true positives in the top-maxk predictions 64 | tp = rankings[:, :maxk] == labels.reshape(-1, 1) 65 | 66 | # trim to selected ks and compute accuracies 67 | return [tp[:, :k].max(1).mean() for k in ks] 68 | 69 | 70 | def topk_accuracy_save_validation_pred(scores, labels, ks, modality, no_classes = 2513, selected_class=None): 71 | """Computes TOP-K accuracies for different values of k 72 | Args: 73 | scores: numpy nd array, shape = (instance_count, label_count) 74 | labels: numpy nd array, shape = (instance_count,) 75 | ks: tuple of integers 76 | 77 | Returns: 78 | list of float: TOP-K accuracy for each k in ks 79 | """ 80 | if selected_class is not None: 81 | idx = labels == selected_class 82 | scores = scores[idx] 83 | labels = labels[idx] 84 | ranking = 
scores.argsort()[:, ::-1] 85 | maxk = np.max(ks) # trim to max k to avoid extra computation 86 | 87 | # compute true positives in the top-maxk predictions 88 | tp = ranking[:, :maxk] == labels.reshape(-1, 1) 89 | 90 | allzs = np.zeros((no_classes,), dtype=int) 91 | allzs_correct = np.zeros((no_classes,), dtype=int) 92 | for aa in range(len(labels)): 93 | curr_label = labels[aa] 94 | curr_pred = ranking[:, :maxk][aa][0] 95 | allzs[curr_label] = allzs[curr_label] + 1 96 | if curr_label == curr_pred: 97 | allzs_correct[curr_label] = allzs_correct[curr_label] + 1 98 | 99 | for aa in range(no_classes): 100 | with open('validation_pred_'+str(modality)+'.txt', 'a') as f: 101 | f.write("%d\t%d\n" % (allzs_correct[aa], allzs[aa])) 102 | 103 | # trim to selected ks and compute accuracies 104 | return [tp[:, :k].max(1).mean() for k in ks] 105 | 106 | 107 | def topk_recall(scores, labels, k=5, classes=None): 108 | unique = np.unique(labels) 109 | if classes is None: 110 | classes = unique 111 | else: 112 | classes = np.intersect1d(classes, unique) 113 | recalls = 0 114 | 115 | for c in classes: 116 | recalls += topk_accuracy(scores, labels, ks=(k,), selected_class=c)[0] 117 | return recalls / len(classes) 118 | 119 | 120 | '''def topk_recall_multiple_timesteps(preds, labels, k=5, classes=None): 121 | accs = np.array([topk_recall(preds[:, t, :], labels, k, classes) 122 | for t in range(preds.shape[1])]) 123 | return accs.reshape(1, -1)''' 124 | 125 | 126 | def get_marginal_indexes(actions, mode): 127 | """For each verb/noun retrieve the list of actions containing that verb/name 128 | Input: 129 | mode: "verb" or "noun" 130 | Output: 131 | a list of numpy array of indexes. If verb/noun 3 is contained in actions 2,8,19, 132 | then output[3] will be np.array([2,8,19]) 133 | """ 134 | vi = [] 135 | for v in range(actions[mode].max() + 1): 136 | vals = actions[actions[mode] == v].index.values 137 | if len(vals) > 0: 138 | vi.append(vals) 139 | else: 140 | vi.append(np.array([0])) 141 | return vi 142 | 143 | 144 | def marginalize(probs, indexes): 145 | mprobs = [] 146 | for ilist in indexes: 147 | mprobs.append(probs[:, ilist].sum(1)) 148 | return np.array(mprobs).T 149 | 150 | 151 | def softmax(x): 152 | """Compute softmax values for each sets of scores in x.""" 153 | xx = x 154 | x = x.reshape((-1, x.shape[-1])) 155 | e_x = np.exp(x - np.max(x, 1).reshape(-1, 1)) 156 | res = e_x / e_x.sum(axis=1).reshape(-1, 1) 157 | return res.reshape(xx.shape) 158 | 159 | 160 | def predictions_to_json(task, verb_scores, noun_scores, action_scores, action_ids, a_to_vn, top_actions=100, version='0.1', sls=None): 161 | """Save verb, noun and action predictions to json for submitting them to the EPIC-Kitchens leaderboard""" 162 | 163 | predictions = {'version': version, 'challenge': task, 'results': {}} 164 | 165 | if sls is not None: 166 | if task == 'action_anticipation': 167 | predictions['sls_pt'] = 1 168 | predictions['sls_tl'] = 4 169 | predictions['sls_td'] = 4 170 | elif task == 'action_recognition': 171 | predictions['sls_pt'] = 1 172 | predictions['sls_tl'] = 4 173 | predictions['sls_td'] = 4 174 | 175 | row_idxs = np.argsort(action_scores)[:, ::-1] 176 | top_100_idxs = row_idxs[:, :top_actions] 177 | 178 | action_scores = action_scores[np.arange( 179 | len(action_scores)).reshape(-1, 1), top_100_idxs] 180 | 181 | for i, v, n, a, ai in zip(action_ids, verb_scores, noun_scores, action_scores, top_100_idxs): 182 | predictions['results'][str(i)] = {} 183 | predictions['results'][str(i)]['verb'] = {str(ii): float(vv) 
for ii, vv in enumerate(v)} 184 | predictions['results'][str(i)]['noun'] = {str(ii): float(nn) for ii, nn in enumerate(n)} 185 | predictions['results'][str(i)]['action'] = {"%d,%d" % a_to_vn[ii]: float(aa) for ii, aa in zip(ai, a)} 186 | return predictions 187 | -------------------------------------------------------------------------------- /network.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from torch import nn 3 | from non_local_embedded_gaussian import NONLocalBlock1D 4 | import torch 5 | import torch.nn.functional as F 6 | 7 | 8 | class CouplingBlocks(nn.Module): 9 | def __init__(self, args, recent_dim, in_dim_past): 10 | super(CouplingBlocks, self).__init__() 11 | 12 | self.dropout_rate = args.dropout_rate 13 | 14 | self.video_feat_dim = args.video_feat_dim 15 | self.latent_dim = args.latent_dim 16 | self.linear_dim = args.linear_dim 17 | 18 | self.recent_dim = recent_dim 19 | self.in_dim_past = in_dim_past 20 | self.past_attention = args.past_attention 21 | 22 | if self.past_attention: 23 | self.NLB_past = NONLocalBlock1D(args, self.in_dim_past, self.in_dim_past, self.latent_dim) 24 | self.NLB_recent = NONLocalBlock1D(args, self.recent_dim, self.in_dim_past, self.latent_dim) 25 | 26 | self.fc_recent = nn.Sequential( 27 | nn.Linear(in_features=2 * self.recent_dim * self.video_feat_dim, out_features=self.linear_dim), 28 | nn.ReLU(), 29 | nn.Dropout(self.dropout_rate), 30 | nn.Linear(in_features=self.linear_dim, out_features=self.linear_dim) 31 | ) 32 | self.fc_context = nn.Sequential( 33 | nn.Linear(in_features=self.in_dim_past * self.video_feat_dim + 2 * self.recent_dim * self.video_feat_dim, 34 | out_features=self.linear_dim), 35 | nn.ReLU(), 36 | nn.Dropout(self.dropout_rate), 37 | nn.Linear(in_features=self.linear_dim, 38 | out_features=self.linear_dim) 39 | ) 40 | 41 | def forward(self, spanning_snippets, recent_snippets): 42 | batch_size = spanning_snippets.size(0) 43 | 44 | if self.past_attention: 45 | nle_x_past = F.relu(self.NLB_past(spanning_snippets, spanning_snippets)) 46 | nle_x_future = F.relu(self.NLB_recent(nle_x_past, recent_snippets)) 47 | all_x_future = torch.cat((nle_x_future, recent_snippets), 1) 48 | all_x_task = torch.cat((nle_x_past, all_x_future), 1) 49 | else: 50 | nle_x_future = F.relu(self.NLB_recent(spanning_snippets, recent_snippets)) 51 | all_x_future = torch.cat((nle_x_future, recent_snippets), 1) 52 | all_x_task = torch.cat((spanning_snippets, all_x_future), 1) 53 | 54 | output_future_fc = self.fc_recent(all_x_future.view(batch_size, -1)) 55 | output_task_fc = self.fc_context(all_x_task.view(batch_size, -1)) 56 | 57 | return output_future_fc, output_task_fc 58 | 59 | 60 | class TemporalAggregateBlocks(nn.Module): 61 | def __init__(self, args): 62 | super(TemporalAggregateBlocks, self).__init__() 63 | 64 | self.linear_dim = args.linear_dim 65 | 66 | self.recent_dim = args.recent_dim 67 | self.span_dim1 = args.span_dim1 68 | self.span_dim2 = args.span_dim2 69 | self.span_dim3 = args.span_dim3 70 | 71 | self.CB1 = CouplingBlocks(args, self.recent_dim, self.span_dim1) 72 | self.CB2 = CouplingBlocks(args, self.recent_dim, self.span_dim2) 73 | self.CB3 = CouplingBlocks(args, self.recent_dim, self.span_dim3) 74 | 75 | self.fc_recent_tab = nn.Sequential( 76 | nn.Linear(in_features=3 * self.linear_dim, out_features=self.linear_dim) 77 | ) 78 | 79 | def forward(self, spanning_snippets, recent_snippets): 80 | cb_recent1, cb_past1 = self.CB1(spanning_snippets[0], recent_snippets) 81 | 
cb_recent2, cb_past2 = self.CB2(spanning_snippets[1], recent_snippets) 82 | cb_recent3, cb_past3 = self.CB3(spanning_snippets[2], recent_snippets) 83 | 84 | cat_cb_recent = torch.cat((cb_recent1, cb_recent2, cb_recent3), 1) 85 | out_tab_recent = self.fc_recent_tab(cat_cb_recent) 86 | 87 | stack_cb_past = torch.stack((cb_past1, cb_past2, cb_past3), 0) 88 | out_tab_past = torch.max(stack_cb_past, 0)[0].squeeze(0) 89 | 90 | return out_tab_recent, out_tab_past 91 | 92 | 93 | class Network(nn.Module): 94 | def __init__(self, args): 95 | super(Network, self).__init__() 96 | 97 | self.n_classes = args.num_class 98 | self.linear_dim = args.linear_dim 99 | 100 | self.TAB1 = TemporalAggregateBlocks(args) 101 | self.TAB2 = TemporalAggregateBlocks(args) 102 | self.TAB3 = TemporalAggregateBlocks(args) 103 | self.TAB4 = TemporalAggregateBlocks(args) 104 | 105 | self.cls_act1 = nn.Sequential(nn.Linear(in_features=2 * self.linear_dim, out_features=self.n_classes)) 106 | self.cls_act2 = nn.Sequential(nn.Linear(in_features=2 * self.linear_dim, out_features=self.n_classes)) 107 | self.cls_act3 = nn.Sequential(nn.Linear(in_features=2 * self.linear_dim, out_features=self.n_classes)) 108 | self.cls_act4 = nn.Sequential(nn.Linear(in_features=2 * self.linear_dim, out_features=self.n_classes)) 109 | 110 | self.add_verb_loss = args.add_verb_loss 111 | self.add_noun_loss = args.add_noun_loss 112 | if args.add_verb_loss: 113 | self.cls_verb1 = nn.Sequential(nn.Linear(in_features=2 * self.linear_dim, out_features=args.verb_class)) 114 | self.cls_verb2 = nn.Sequential(nn.Linear(in_features=2 * self.linear_dim, out_features=args.verb_class)) 115 | self.cls_verb3 = nn.Sequential(nn.Linear(in_features=2 * self.linear_dim, out_features=args.verb_class)) 116 | self.cls_verb4 = nn.Sequential(nn.Linear(in_features=2 * self.linear_dim, out_features=args.verb_class)) 117 | 118 | if args.add_noun_loss: 119 | self.cls_noun1 = nn.Sequential(nn.Linear(in_features=2 * self.linear_dim, out_features=args.noun_class)) 120 | self.cls_noun2 = nn.Sequential(nn.Linear(in_features=2 * self.linear_dim, out_features=args.noun_class)) 121 | self.cls_noun3 = nn.Sequential(nn.Linear(in_features=2 * self.linear_dim, out_features=args.noun_class)) 122 | self.cls_noun4 = nn.Sequential(nn.Linear(in_features=2 * self.linear_dim, out_features=args.noun_class)) 123 | 124 | def forward(self, spanning_snippets, recent_snippets): 125 | out_tab_recent1, out_tab_past1 = self.TAB1(spanning_snippets, recent_snippets[0]) 126 | out_tab_recent2, out_tab_past2 = self.TAB2(spanning_snippets, recent_snippets[1]) 127 | out_tab_recent3, out_tab_past3 = self.TAB3(spanning_snippets, recent_snippets[2]) 128 | out_tab_recent4, out_tab_past4 = self.TAB4(spanning_snippets, recent_snippets[3]) 129 | 130 | cat_tab1 = torch.cat((out_tab_recent1, out_tab_past1), 1) 131 | pred_act1 = self.cls_act1(cat_tab1) 132 | 133 | cat_tab2 = torch.cat((out_tab_recent2, out_tab_past2), 1) 134 | pred_act2 = self.cls_act2(cat_tab2) 135 | 136 | cat_tab3 = torch.cat((out_tab_recent3, out_tab_past3), 1) 137 | pred_act3 = self.cls_act3(cat_tab3) 138 | 139 | cat_tab4 = torch.cat((out_tab_recent4, out_tab_past4), 1) 140 | pred_act4 = self.cls_act4(cat_tab4) 141 | 142 | if self.add_verb_loss: 143 | pred_verb1 = self.cls_verb1(cat_tab1) 144 | pred_verb2 = self.cls_verb2(cat_tab2) 145 | pred_verb3 = self.cls_verb3(cat_tab3) 146 | pred_verb4 = self.cls_verb4(cat_tab4) 147 | else: 148 | pred_verb1 = None 149 | pred_verb2 = None 150 | pred_verb3 = None 151 | pred_verb4 = None 152 | 153 | if 
self.add_noun_loss: 154 | pred_noun1 = self.cls_noun1(cat_tab1) 155 | pred_noun2 = self.cls_noun2(cat_tab2) 156 | pred_noun3 = self.cls_noun3(cat_tab3) 157 | pred_noun4 = self.cls_noun4(cat_tab4) 158 | else: 159 | pred_noun1 = None 160 | pred_noun2 = None 161 | pred_noun3 = None 162 | pred_noun4 = None 163 | 164 | return pred_act1, pred_act2, pred_act3, pred_act4, \ 165 | pred_verb1, pred_verb2, pred_verb3, pred_verb4, \ 166 | pred_noun1, pred_noun2, pred_noun3, pred_noun4 167 | 168 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Temporal Aggregate Representations for Long-Range Video Understanding 2 | 3 | This repository provides the official PyTorch implementation of our papers: 4 | 5 | F. Sener, D. Singhania and A. Yao, "**Temporal Aggregate Representations for Long-Range Video Understanding**", ECCV 2020 [[paper](https://www.ecva.net/papers/eccv_2020/papers_ECCV/papers/123610154.pdf)] 6 | 7 | F. Sener, D. Chatterjee and A. Yao, "**Technical Report: Temporal Aggregate Representations**", arXiv:2106.03152, 2021 [[paper](https://arxiv.org/pdf/2106.03152.pdf)] 8 | 9 | ![model](imgs/diag.JPG) 10 | 11 | If you use the code/models hosted in this repository, please cite the following papers: 12 | 13 | ``` 14 | @inproceedings{sener2020temporal, 15 | title={Temporal aggregate representations for long-range video understanding}, 16 | author={Sener, Fadime and Singhania, Dipika and Yao, Angela}, 17 | booktitle={European Conference on Computer Vision}, 18 | pages={154--171}, 19 | year={2020}, 20 | organization={Springer} 21 | } 22 | ``` 23 | 24 | ``` 25 | @article{sener2021technical, 26 | title={Technical Report: Temporal Aggregate Representations}, 27 | author={Sener, Fadime and Chatterjee, Dibyadip and Yao, Angela}, 28 | journal={arXiv preprint arXiv:2106.03152}, 29 | year={2021} 30 | } 31 | ``` 32 | 33 | ## Dependencies 34 | * Python3 35 | * PyTorch 36 | * Numpy, Pandas, PIL 37 | * lmdb, tqdm 38 | 39 | ## Overview 40 | 41 | This repository provides code to train, validate and test our models on the [EPIC-KITCHENS-55](https://openaccess.thecvf.com/content_ECCV_2018/papers/Dima_Damen_Scaling_Egocentric_Vision_ECCV_2018_paper.pdf) and [EPIC-KITCHENS-100](https://arxiv.org/pdf/2006.13256.pdf) datasets for the tasks of action anticipation and action recognition. 42 | 43 | ### Features 44 | 45 | Follow the [RU-LSTM](https://github.com/fpv-iplab/rulstm) repository to download the RGB, Flow and Obj features and the train/val/test splits, and keep them in the `data/ek55` or `data/ek100` folder depending on the dataset. 46 | 47 | For ROI features, we consider the union of the hand-object interaction bbox annotations provided by the authors of EPIC-KITCHENS-100 ([link](https://github.com/epic-kitchens/epic-kitchens-100-hand-object-bboxes)) as input and extract RGB features with TSN as explained [here](https://github.com/fpv-iplab/rulstm#feature-extraction). 48 | 49 | ### Pretrained Models 50 | 51 | Pretrained models are available only for the EPIC-KITCHENS-100 dataset, trained on its train split. They are provided in the folders `models_anticipation` and `models_recognition`.
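The checkpoints above are plain `torch.save` dictionaries written by `save_model()` in `main_recognition.py` (keys `state_dict`, `epoch`, `perf`, `best_perf`). The sketch below only illustrates how such a checkpoint could be restored for validation; it is not part of the repository, and the hyper-parameter values and the file name are assumptions that must be matched to the model you actually downloaded.

```python
from argparse import Namespace

import torch

from network import Network

# Hyper-parameters mirroring the EPIC-KITCHENS-100 RGB settings used in the
# validation commands below; adjust them for other modalities
# (e.g. Obj uses --video_feat_dim 352).
args = Namespace(num_class=3806, verb_class=97, noun_class=300,
                 video_feat_dim=1024, latent_dim=512, linear_dim=512,
                 dropout_rate=0.3, scale=True, scale_factor=-0.5,
                 past_attention=True, recent_dim=5,
                 span_dim1=5, span_dim2=3, span_dim3=2,
                 add_verb_loss=True, add_noun_loss=True)

model = Network(args)  # note: the non-local block pins its scale factor to 'cuda'

# save_model() stores {'state_dict', 'epoch', 'perf', 'best_perf'}; the file name
# below is a placeholder for whichever checkpoint you downloaded.
checkpoint = torch.load('models_recognition/ek100/rgb_best.pth.tar', map_location='cpu')
model.load_state_dict(checkpoint['state_dict'])  # restore the pretrained weights
model.eval()  # disable dropout before running validation
```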
52 | 53 | ### Validation 54 | 55 | To validate our model, run the following: 56 | 57 | #### EPIC-KITCHENS-55 58 | ##### Action Anticipation 59 | * RGB: `python main_anticipation.py --mode validate --path_to_data data/ek55 --path_to_models models_anticipation/ek55 --modality rgb --video_feat_dim 1024` 60 | * Flow: `python main_anticipation.py --mode validate --path_to_data data/ek55 --path_to_models models_anticipation/ek55 --modality flow --video_feat_dim 1024` 61 | * Obj: `python main_anticipation.py --mode validate --path_to_data data/ek55 --path_to_models models_anticipation/ek55 --modality obj --video_feat_dim 352` 62 | * ROI: `python main_anticipation.py --mode validate --path_to_data data/ek55 --path_to_models models_anticipation/ek55 --modality roi --video_feat_dim 1024` 63 | * Late Fusion: `python main_anticipation.py --mode validate --path_to_data data/ek55 --path_to_models models_anticipation/ek55 --modality late_fusion` 64 | 65 | ##### Action Recognition 66 | * RGB: `python main_recognition.py --mode validate --path_to_data data/ek55 --path_to_models models_recognition/ek55 --modality rgb --video_feat_dim 1024` 67 | * Flow: `python main_recognition.py --mode validate --path_to_data data/ek55 --path_to_models models_recognition/ek55 --modality flow --video_feat_dim 1024` 68 | * Obj: `python main_recognition.py --mode validate --path_to_data data/ek55 --path_to_models models_recognition/ek55 --modality obj --video_feat_dim 352` 69 | * ROI: `python main_recognition.py --mode validate --path_to_data data/ek55 --path_to_models models_recognition/ek55 --modality roi --video_feat_dim 1024` 70 | * Late Fusion: `python main_recognition.py --mode validate --path_to_data data/ek55 --path_to_models models_recognition/ek55 --modality late_fusion` 71 | 72 | #### EPIC-KITCHENS-100 73 | ##### Action Anticipation 74 | * RGB: `python main_anticipation.py --mode validate --ek100 --path_to_data data/ek100 --path_to_models models_anticipation/ek100/ --modality rgb --video_feat_dim 1024 --num_class 3806 --verb_class 97 --noun_class 300` 75 | * Flow: `python main_anticipation.py --mode validate --ek100 --path_to_data data/ek100 --path_to_models models_anticipation/ek100/ --modality flow --video_feat_dim 1024 --num_class 3806 --verb_class 97 --noun_class 300` 76 | * Obj: `python main_anticipation.py --mode validate --ek100 --path_to_data data/ek100 --path_to_models models_anticipation/ek100/ --modality obj --video_feat_dim 352 --num_class 3806 --verb_class 97 --noun_class 300` 77 | * ROI: `python main_anticipation.py --mode validate --ek100 --path_to_data data/ek100 --path_to_models models_anticipation/ek100/ --modality roi --video_feat_dim 1024 --num_class 3806 --verb_class 97 --noun_class 300` 78 | * Late Fusion: `python main_anticipation.py --mode validate --ek100 --path_to_data data/ek100 --path_to_models models_anticipation/ek100/ --modality late_fusion --num_class 3806 --verb_class 97 --noun_class 300` 79 | 80 | ##### Action Recognition 81 | * RGB: `python main_recognition.py --mode validate --ek100 --path_to_data data/ek100 --path_to_models models_recognition/ek100/ --modality rgb --video_feat_dim 1024 --num_class 3806 --verb_class 97 --noun_class 300` 82 | * Flow: `python main_recognition.py --mode validate --ek100 --path_to_data data/ek100 --path_to_models models_recognition/ek100/ --modality flow --video_feat_dim 1024 --num_class 3806 --verb_class 97 --noun_class 300` 83 | * Obj: `python main_recognition.py --mode validate --ek100 --path_to_data data/ek100 --path_to_models 
models_recognition/ek100/ --modality obj --video_feat_dim 352 --num_class 3806 --verb_class 97 --noun_class 300` 84 | * ROI: `python main_recognition.py --mode validate --ek100 --path_to_data data/ek100 --path_to_models models_recognition/ek100/ --modality roi --video_feat_dim 1024 --num_class 3806 --verb_class 97 --noun_class 300` 85 | * Late Fusion: `python main_recognition.py --mode validate --ek100 --path_to_data data/ek100 --path_to_models models_recognition/ek100/ --modality late_fusion --num_class 3806 --verb_class 97 --noun_class 300` 86 | 87 | 88 | Here are the validation results on EPIC-KITCHENS-100 as provided in our paper. 89 | 90 | * Anticipation 91 | ![ant](imgs/ant.PNG) 92 | 93 | * Recognition 94 | ![rec](imgs/rec.PNG) 95 | 96 | ### Testing and submitting the results to the server 97 | 98 | To test your model on the EPIC-KITCHENS-100 test split, run the following: 99 | ##### Action Anticipation 100 | * `mkdir -p jsons/anticipation` 101 | * `python main_anticipation.py --mode test --json_directory jsons/anticipation --ek100 --path_to_data data/ek100 --path_to_models models_anticipation/ek100/ --modality late_fusion --num_class 3806 --verb_class 97 --noun_class 300` 102 | 103 | ##### Action Recognition 104 | * `mkdir -p jsons/recognition` 105 | * `python main_recognition.py --mode test --json_directory jsons/recognition --ek100 --path_to_data data/ek100 --path_to_models models_recognition/ek100/ --modality late_fusion --num_class 3806 --verb_class 97 --noun_class 300` 106 | 107 | 108 | ### Custom Training 109 | 110 | To train the model, run the following: 111 | 112 | #### EPIC-KITCHENS-55 113 | ##### Action Anticipation 114 | * RGB: `python main_anticipation.py --mode train --path_to_data data/ek55 --path_to_models models_anticipation/ek55 --modality rgb --video_feat_dim 1024` 115 | * Flow: `python main_anticipation.py --mode train --path_to_data data/ek55 --path_to_models models_anticipation/ek55 --modality flow --video_feat_dim 1024` 116 | * Obj: `python main_anticipation.py --mode train --path_to_data data/ek55 --path_to_models models_anticipation/ek55 --modality obj --video_feat_dim 352` 117 | * ROI: `python main_anticipation.py --mode train --path_to_data data/ek55 --path_to_models models_anticipation/ek55 --modality roi --video_feat_dim 1024` 118 | 119 | ##### Action Recognition 120 | * RGB: `python main_recognition.py --mode train --path_to_data data/ek55 --path_to_models models_recognition/ek55 --modality rgb --video_feat_dim 1024` 121 | * Flow: `python main_recognition.py --mode train --path_to_data data/ek55 --path_to_models models_recognition/ek55 --modality flow --video_feat_dim 1024` 122 | * Obj: `python main_recognition.py --mode train --path_to_data data/ek55 --path_to_models models_recognition/ek55 --modality obj --video_feat_dim 352` 123 | * ROI: `python main_recognition.py --mode train --path_to_data data/ek55 --path_to_models models_recognition/ek55 --modality roi --video_feat_dim 1024` 124 | 125 | #### EPIC-KITCHENS-100 126 | ##### Action Anticipation 127 | * RGB: `python main_anticipation.py --mode train --ek100 --path_to_data data/ek100 --path_to_models models_anticipation/ek100/ --modality rgb --video_feat_dim 1024 --num_class 3806 --verb_class 97 --noun_class 300` 128 | * Flow: `python main_anticipation.py --mode train --ek100 --path_to_data data/ek100 --path_to_models models_anticipation/ek100/ --modality flow --video_feat_dim 1024 --num_class 3806 --verb_class 97 --noun_class 300` 129 | * Obj: `python main_anticipation.py --mode train --ek100 --path_to_data
data/ek100 --path_to_models models_anticipation/ek100/ --modality obj --video_feat_dim 352 --num_class 3806 --verb_class 97 --noun_class 300` 130 | * ROI: `python main_anticipation.py --mode train --ek100 --path_to_data data/ek100 --path_to_models models_anticipation/ek100/ --modality roi --video_feat_dim 1024 --num_class 3806 --verb_class 97 --noun_class 300` 131 | 132 | ##### Action Recognition 133 | * RGB: `python main_recognition.py --mode train --ek100 --path_to_data data/ek100 --path_to_models models_recognition/ek100/ --modality rgb --video_feat_dim 1024 --num_class 3806 --verb_class 97 --noun_class 300` 134 | * Flow: `python main_recognition.py --mode train --ek100 --path_to_data data/ek100 --path_to_models models_recognition/ek100/ --modality flow --video_feat_dim 1024 --num_class 3806 --verb_class 97 --noun_class 300` 135 | * Obj: `python main_recognition.py --mode train --ek100 --path_to_data data/ek100 --path_to_models models_recognition/ek100/ --modality obj --video_feat_dim 352 --num_class 3806 --verb_class 97 --noun_class 300` 136 | * ROI: `python main_recognition.py --mode train --ek100 --path_to_data data/ek100 --path_to_models models_recognition/ek100/ --modality roi --video_feat_dim 1024 --num_class 3806 --verb_class 97 --noun_class 300` 137 | 138 | Please refer to the papers for more technical details. 139 | 140 | ## Acknowledgements 141 | This code is based on [RU-LSTM](https://github.com/fpv-iplab/rulstm); we are grateful to the collaborators and maintainers of that repository. 142 | -------------------------------------------------------------------------------- /dataset_recognition.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ Implements a dataset object which allows to read representations from LMDB datasets.""" 4 | import lmdb 5 | import numpy as np 6 | import pandas as pd 7 | from torch.utils import data 8 | from tqdm import tqdm 9 | 10 | 11 | class SequenceDataset(data.Dataset): 12 | def __init__(self, path_to_lmdb, 13 | path_to_csv, 14 | label_type='action', 15 | img_tmpl="frame_{:010d}.jpg", 16 | challenge=False, 17 | fps=30, 18 | args=None): 19 | """ 20 | Inputs: 21 | path_to_lmdb: path to the folder containing the LMDB dataset 22 | path_to_csv: path to training/validation csv 23 | label_type: which label to return (verb, noun, or action) 24 | img_tmpl: image template to load the features 25 | challenge: allows to load csvs containing only time-stamp for the challenge 26 | fps: framerate 27 | """ 28 | 29 | # read the csv file 30 | if challenge: 31 | self.annotations = pd.read_csv(path_to_csv, header=None, names=['video', 'start', 'end']) 32 | else: 33 | self.annotations = pd.read_csv(path_to_csv, header=None, 34 | names=['video', 'start', 'end', 'verb', 'noun', 'action']) 35 | 36 | self.challenge = challenge 37 | self.path_to_lmdb = path_to_lmdb 38 | self.fps = fps 39 | self.label_type = label_type 40 | self.img_tmpl = img_tmpl 41 | 42 | self.recent_sec1 = args.recent_sec1 43 | self.recent_sec2 = args.recent_sec2 44 | self.recent_sec3 = args.recent_sec3 45 | self.recent_sec4 = args.recent_sec4 46 | self.recent_dim = args.recent_dim 47 | 48 | self.spanning_sec = args.spanning_sec 49 | self.span_dim1 = args.span_dim1 50 | self.span_dim2 = args.span_dim2 51 | self.span_dim3 = args.span_dim3 52 | 53 | self.feat_dim = args.video_feat_dim 54 | 55 | self.debug_on = args.debug_on 56 | 57 | # initialize some lists 58 | self.ids = [] # action ids 59 | self.discarded_ids = [] # list of ids
discarded (e.g., if there were 60 | # no enough frames before the beginning of the action 61 | self.discarded_labels = [] # list of labels discarded (e.g., if there 62 | # were no enough frames before the beginning of the action 63 | self.recent_frames = [] # recent past 64 | self.spanning_frames = [] # spanning past 65 | self.labels = [] # labels of each action 66 | 67 | # populate them 68 | self.__populate_lists() 69 | 70 | # if a list to datasets has been provided, load all of them 71 | if isinstance(self.path_to_lmdb, list): 72 | self.env = [lmdb.open(l_m, readonly=True, lock=False) for l_m in self.path_to_lmdb] 73 | else: 74 | # otherwise, just load the single LMDB dataset 75 | self.env = lmdb.open(self.path_to_lmdb, readonly=True, lock=False) 76 | 77 | def __populate_lists(self): 78 | count_debug = 0 79 | """ Samples a sequence for each action and populates the lists. """ 80 | for _, a in tqdm(self.annotations.iterrows(), 'Populating Dataset', total=len(self.annotations)): 81 | count_debug += 1 82 | if self.debug_on: 83 | if count_debug > 10: 84 | break 85 | 86 | # sample frames before the beginning of the action 87 | recent_f, spanning_f = self.__get_snippet_features(a.start, a.end, a.video) 88 | 89 | # check if there were enough frames before the beginning of the action 90 | # if the smaller frame is at least 1, the sequence is valid 91 | if spanning_f is not None and recent_f is not None: 92 | self.spanning_frames.append(spanning_f) 93 | self.recent_frames.append(recent_f) 94 | self.ids.append(a.name) 95 | 96 | # handle whether a list of labels is required (e.g., [verb, noun]), rather than a single action 97 | if isinstance(self.label_type, list): 98 | if self.challenge: # if sampling for the challenge, there are no labels, just add -1 99 | self.labels.append(-1) 100 | else: 101 | # otherwise get the required labels 102 | self.labels.append(a[self.label_type].values.astype(int)) 103 | else: # single label version 104 | if self.challenge: 105 | self.labels.append(-1) 106 | else: 107 | self.labels.append(a[self.label_type]) 108 | else: 109 | # if the sequence is invalid, do nothing, but add the id to the discarded_ids list 110 | self.discarded_ids.append(a.name) 111 | if isinstance(self.label_type, list): 112 | if self.challenge: # if sampling for the challenge, there are no labels, just add -1 113 | self.discarded_labels.append(-1) 114 | else: 115 | # otherwise get the required labels 116 | self.discarded_labels.append(a[self.label_type].values.astype(int)) 117 | else: #single label version 118 | if self.challenge: 119 | self.discarded_labels.append(-1) 120 | else: 121 | self.discarded_labels.append(a[self.label_type]) 122 | 123 | def __get_snippet_features(self, point_start, point_end, video): 124 | 125 | # Spanning snippets 126 | start_spanning = max(point_start - (self.spanning_sec * self.fps), 0) 127 | end_spanning = point_end + (self.spanning_sec * self.fps) 128 | 129 | select_spanning_frames1 = np.linspace(start_spanning, end_spanning, self.span_dim1 + 1, dtype=int) 130 | select_spanning_frames2 = np.linspace(start_spanning, end_spanning, self.span_dim2 + 1, dtype=int) 131 | select_spanning_frames3 = np.linspace(start_spanning, end_spanning, self.span_dim3 + 1, dtype=int) 132 | 133 | spanning_past = [self.__get_frames_from_indices(video, select_spanning_frames1), 134 | self.__get_frames_from_indices(video, select_spanning_frames2), 135 | self.__get_frames_from_indices(video, select_spanning_frames3)] 136 | 137 | # Recent snippets 138 | start_recent1 = int(max(point_start - 
(self.recent_sec1 * self.fps), 0)) 139 | end_recent1 = int(point_end + (self.recent_sec1 * self.fps)) 140 | start_recent2 = int(max(point_start - (self.recent_sec2 * self.fps), 0)) 141 | end_recent2 = int(point_end + (self.recent_sec2 * self.fps)) 142 | start_recent3 = int(max(point_start - (self.recent_sec3 * self.fps), 0)) 143 | end_recent3 = int(point_end + (self.recent_sec3 * self.fps)) 144 | start_recent4 = int(max(point_start - (self.recent_sec4 * self.fps), 0)) 145 | end_recent4 = int(point_end + (self.recent_sec4 * self.fps)) 146 | 147 | select_recent_frames1 = np.linspace(start_recent1, end_recent1, self.recent_dim + 1, dtype=int) 148 | select_recent_frames2 = np.linspace(start_recent2, end_recent2, self.recent_dim + 1, dtype=int) 149 | select_recent_frames3 = np.linspace(start_recent3, end_recent3, self.recent_dim + 1, dtype=int) 150 | select_recent_frames4 = np.linspace(start_recent4, end_recent4, self.recent_dim + 1, dtype=int) 151 | 152 | recent_past = [self.__get_frames_from_indices(video, select_recent_frames1), 153 | self.__get_frames_from_indices(video, select_recent_frames2), 154 | self.__get_frames_from_indices(video, select_recent_frames3), 155 | self.__get_frames_from_indices(video, select_recent_frames4)] 156 | 157 | return recent_past, spanning_past 158 | 159 | def __get_frames_from_indices(self, video, indices): 160 | list_data = [] 161 | for kkl in range(len(indices) - 1): 162 | cur_start = np.floor(indices[kkl]).astype('int') 163 | cur_end = np.floor(indices[kkl + 1]).astype('int') 164 | list_frames = list(range(cur_start, cur_end + 1)) 165 | list_data.append(self.__get_frames(list_frames, video)) 166 | return list_data 167 | 168 | def __get_frames(self, frames, video): 169 | """ format file names using the image template """ 170 | frames = np.array(list(map(lambda x: video + "_" + self.img_tmpl.format(x), frames))) 171 | return frames 172 | 173 | def __len__(self): 174 | return len(self.ids) 175 | 176 | def __getitem__(self, index): 177 | """ sample a given sequence """ 178 | 179 | # get spanning and recent frames 180 | spanning_frames = self.spanning_frames[index] 181 | recent_frames = self.recent_frames[index] 182 | 183 | # return a dictionary containing the id of the current sequence 184 | # this is useful to produce the jsons for the challenge 185 | out = {'id': self.ids[index]} 186 | 187 | # read representations for spanning and recent frames 188 | out['recent_features'], out['spanning_features'] = read_data(recent_frames, spanning_frames, self.env, 189 | self.feat_dim) 190 | 191 | # get the label of the current sequence 192 | label = self.labels[index] 193 | out['label'] = label 194 | 195 | return out 196 | 197 | 198 | def read_representations(recent_frames, spanning_frames, env, feat_dim): 199 | """ Reads a set of representations, given their frame names and an LMDB environment.""" 200 | 201 | recent_features1 = [] 202 | recent_features2 = [] 203 | recent_features3 = [] 204 | recent_features4 = [] 205 | spanning_features1 = [] 206 | spanning_features2 = [] 207 | spanning_features3 = [] 208 | for e in env: 209 | spanning_features1.append(get_max_pooled_features(e, spanning_frames[0], feat_dim)) 210 | spanning_features2.append(get_max_pooled_features(e, spanning_frames[1], feat_dim)) 211 | spanning_features3.append(get_max_pooled_features(e, spanning_frames[2], feat_dim)) 212 | 213 | recent_features1.append(get_max_pooled_features(e, recent_frames[0], feat_dim)) 214 | recent_features2.append(get_max_pooled_features(e, recent_frames[1], feat_dim)) 215 | 
recent_features3.append(get_max_pooled_features(e, recent_frames[2], feat_dim)) 216 | recent_features4.append(get_max_pooled_features(e, recent_frames[3], feat_dim)) 217 | 218 | spanning_features1 = np.concatenate(spanning_features1, axis=-1) 219 | spanning_features2 = np.concatenate(spanning_features2, axis=-1) 220 | spanning_features3 = np.concatenate(spanning_features3, axis=-1) 221 | 222 | recent_features1 = np.concatenate(recent_features1, axis=-1) 223 | recent_features2 = np.concatenate(recent_features2, axis=-1) 224 | recent_features3 = np.concatenate(recent_features3, axis=-1) 225 | recent_features4 = np.concatenate(recent_features4, axis=-1) 226 | 227 | spanning_snippet_features = [spanning_features1, spanning_features2, spanning_features3] 228 | recent_snippet_features = [recent_features1, recent_features2, recent_features3, recent_features4] 229 | 230 | return recent_snippet_features, spanning_snippet_features 231 | 232 | 233 | def get_max_pooled_features(env, frame_names, feat_dim): 234 | list_features = [] 235 | missing_features = [] 236 | 237 | for kkl in range(len(frame_names)): 238 | with env.begin() as e: 239 | pool_list = [] 240 | for name in frame_names[kkl]: 241 | dd = e.get(name.strip().encode('utf-8')) 242 | if dd is None: 243 | continue 244 | data_curr = np.frombuffer(dd, 'float32') # convert to numpy array 245 | feat_dim = data_curr.shape[0] 246 | pool_list.append(data_curr) 247 | 248 | if len(pool_list) == 0: # Missing frames indices 249 | missing_features.append(kkl) 250 | list_features.append(np.zeros(feat_dim, dtype='float32')) 251 | else: 252 | max_pool = np.max(np.array(pool_list), 0) 253 | list_features.append(max_pool.squeeze()) 254 | 255 | for index in missing_features[::-1]: 256 | list_features[index] = list_features[0] 257 | 258 | list_features = np.stack(list_features) 259 | return list_features 260 | 261 | 262 | def read_data(recent_frames, spanning_frames, env, feat_dim): 263 | """A wrapper form read_representations to handle loading from more environments. 
264 | This is used for multimodal data loading (e.g., RGB + Flow)""" 265 | 266 | # if env is a list 267 | if isinstance(env, list): 268 | # read the representations from all environments 269 | return read_representations(recent_frames, spanning_frames, env, feat_dim) 270 | else: 271 | # otherwise, just read the representations 272 | env = [env] 273 | return read_representations(recent_frames, spanning_frames, env, feat_dim) 274 | -------------------------------------------------------------------------------- /dataset_anticipation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ Implements a dataset object which allows to read representations from LMDB datasets.""" 4 | import lmdb 5 | import numpy as np 6 | import pandas as pd 7 | from torch.utils import data 8 | from tqdm import tqdm 9 | 10 | 11 | class SequenceDataset(data.Dataset): 12 | def __init__(self, path_to_lmdb, 13 | path_to_csv, 14 | time_step=1, 15 | label_type='action', 16 | img_tmpl="frame_{:010d}.jpg", 17 | challenge=False, 18 | fps=30, 19 | args=None): 20 | """ 21 | Inputs: 22 | path_to_lmdb: path to the folder containing the LMDB dataset 23 | path_to_csv: path to training/validation csv 24 | time_step: in seconds 25 | label_type: which label to return (verb, noun, or action) 26 | img_tmpl: image template to load the features 27 | challenge: allows to load csvs containing only time-stamp for the challenge 28 | fps: framerate 29 | """ 30 | 31 | # read the csv file 32 | if challenge: 33 | self.annotations = pd.read_csv(path_to_csv, header=None, names=['video', 'start', 'end']) 34 | else: 35 | self.annotations = pd.read_csv(path_to_csv, header=None, 36 | names=['video', 'start', 'end', 'verb', 'noun', 'action']) 37 | 38 | self.challenge = challenge 39 | self.path_to_lmdb = path_to_lmdb 40 | self.time_step = time_step 41 | self.fps = fps 42 | self.label_type = label_type 43 | self.img_tmpl = img_tmpl 44 | 45 | self.recent_sec1 = args.recent_sec1 46 | self.recent_sec2 = args.recent_sec2 47 | self.recent_sec3 = args.recent_sec3 48 | self.recent_sec4 = args.recent_sec4 49 | self.recent_dim = args.recent_dim 50 | 51 | self.spanning_sec = args.spanning_sec 52 | self.span_dim1 = args.span_dim1 53 | self.span_dim2 = args.span_dim2 54 | self.span_dim3 = args.span_dim3 55 | 56 | self.feat_dim = args.video_feat_dim 57 | 58 | self.debug_on = args.debug_on 59 | 60 | # initialize some lists 61 | self.ids = [] # action ids 62 | self.discarded_ids = [] # list of ids discarded (e.g., if there were 63 | # no enough frames before the beginning of the action 64 | self.discarded_labels = [] # list of labels discarded (e.g., if there 65 | # were no enough frames before the beginning of the action 66 | self.recent_frames = [] # recent past 67 | self.spanning_frames = [] # spanning past 68 | self.labels = [] # labels of each action 69 | 70 | # populate them 71 | self.__populate_lists() 72 | 73 | # if a list to datasets has been provided, load all of them 74 | if isinstance(self.path_to_lmdb, list): 75 | self.env = [lmdb.open(l_m, readonly=True, lock=False) for l_m in self.path_to_lmdb] 76 | else: 77 | # otherwise, just load the single LMDB dataset 78 | self.env = lmdb.open(self.path_to_lmdb, readonly=True, lock=False) 79 | 80 | def __populate_lists(self): 81 | count_debug = 0 82 | """ Samples a sequence for each action and populates the lists. 
""" 83 | for _, a in tqdm(self.annotations.iterrows(), 'Populating Dataset', total=len(self.annotations)): 84 | count_debug += 1 85 | if self.debug_on: 86 | if count_debug > 10: 87 | break 88 | 89 | # sample frames before the beginning of the action 90 | recent_f, spanning_f = self.__get_snippet_features(a.start, a.video) 91 | 92 | # check if there were enough frames before the beginning of the action 93 | # if the smaller frame is at least 1, the sequence is valid 94 | if spanning_f is not None and recent_f is not None: 95 | self.spanning_frames.append(spanning_f) 96 | self.recent_frames.append(recent_f) 97 | self.ids.append(a.name) 98 | 99 | # handle whether a list of labels is required (e.g., [verb, noun]), rather than a single action 100 | if isinstance(self.label_type, list): 101 | if self.challenge: # if sampling for the challenge, there are no labels, just add -1 102 | self.labels.append(-1) 103 | else: 104 | # otherwise get the required labels 105 | self.labels.append(a[self.label_type].values.astype(int)) 106 | else: # single label version 107 | if self.challenge: 108 | self.labels.append(-1) 109 | else: 110 | self.labels.append(a[self.label_type]) 111 | else: 112 | # if the sequence is invalid, do nothing, but add the id to the discarded_ids list 113 | self.discarded_ids.append(a.name) 114 | if isinstance(self.label_type, list): 115 | if self.challenge: # if sampling for the challenge, there are no labels, just add -1 116 | self.discarded_labels.append(-1) 117 | else: 118 | # otherwise get the required labels 119 | self.discarded_labels.append(a[self.label_type].values.astype(int)) 120 | else: #single label version 121 | if self.challenge: 122 | self.discarded_labels.append(-1) 123 | else: 124 | self.discarded_labels.append(a[self.label_type]) 125 | 126 | def __get_snippet_features(self, point, video): 127 | time_stamps = self.time_step 128 | 129 | # compute the time stamp corresponding to the beginning of the action 130 | end_time_stamp = point / self.fps 131 | 132 | # subtract time stamps to the timestamp of the last frame 133 | end_time_stamp = end_time_stamp - time_stamps 134 | if end_time_stamp < 2: 135 | return None, None 136 | 137 | # Spanning snippets 138 | end_spanning = np.floor(end_time_stamp * self.fps).astype(int) 139 | start_spanning = max(end_spanning - (self.spanning_sec * self.fps), 0) 140 | 141 | # different spanning granularities (scale) for spanning feature 142 | select_spanning_frames1 = np.linspace(start_spanning, end_spanning, self.span_dim1 + 1, dtype=int) 143 | select_spanning_frames2 = np.linspace(start_spanning, end_spanning, self.span_dim2 + 1, dtype=int) 144 | select_spanning_frames3 = np.linspace(start_spanning, end_spanning, self.span_dim3 + 1, dtype=int) 145 | 146 | spanning_past = [self.__get_frames_from_indices(video, select_spanning_frames1), 147 | self.__get_frames_from_indices(video, select_spanning_frames2), 148 | self.__get_frames_from_indices(video, select_spanning_frames3)] 149 | 150 | # Recent snippets 151 | end_recent = end_spanning 152 | # different temporal granularities for recent feature 153 | start_recent1 = max(end_recent - self.recent_sec1 * self.fps, 0) 154 | start_recent2 = max(end_recent - self.recent_sec2 * self.fps, 0) 155 | start_recent3 = max(end_recent - self.recent_sec3 * self.fps, 0) 156 | start_recent4 = max(end_recent - self.recent_sec4 * self.fps, 0) 157 | 158 | select_recent_frames1 = np.linspace(start_recent1, end_recent, self.recent_dim + 1, dtype=int) 159 | select_recent_frames2 = np.linspace(start_recent2, 
end_recent, self.recent_dim + 1, dtype=int) 160 | select_recent_frames3 = np.linspace(start_recent3, end_recent, self.recent_dim + 1, dtype=int) 161 | select_recent_frames4 = np.linspace(start_recent4, end_recent, self.recent_dim + 1, dtype=int) 162 | 163 | recent_past = [self.__get_frames_from_indices(video, select_recent_frames1), 164 | self.__get_frames_from_indices(video, select_recent_frames2), 165 | self.__get_frames_from_indices(video, select_recent_frames3), 166 | self.__get_frames_from_indices(video, select_recent_frames4)] 167 | 168 | return recent_past, spanning_past 169 | 170 | def __get_frames_from_indices(self, video, indices): 171 | list_data = [] 172 | for kkl in range(len(indices) - 1): 173 | cur_start = np.floor(indices[kkl]).astype('int') 174 | cur_end = np.floor(indices[kkl + 1]).astype('int') 175 | list_frames = list(range(cur_start, cur_end + 1)) 176 | list_data.append(self.__get_frames(list_frames, video)) 177 | return list_data 178 | 179 | def __get_frames(self, frames, video): 180 | """ format file names using the image template """ 181 | frames = np.array(list(map(lambda x: video + "_" + self.img_tmpl.format(x), frames))) 182 | return frames 183 | 184 | def __len__(self): 185 | return len(self.ids) 186 | 187 | def __getitem__(self, index): 188 | """ sample a given sequence """ 189 | 190 | # get spanning and recent frames 191 | spanning_frames = self.spanning_frames[index] 192 | recent_frames = self.recent_frames[index] 193 | 194 | # return a dictionary containing the id of the current sequence 195 | # this is useful to produce the jsons for the challenge 196 | out = {'id': self.ids[index]} 197 | 198 | # read representations for spanning and recent frames 199 | out['recent_features'], out['spanning_features'] = read_data(recent_frames, spanning_frames, self.env, 200 | self.feat_dim) 201 | 202 | # get the label of the current sequence 203 | label = self.labels[index] 204 | out['label'] = label 205 | 206 | return out 207 | 208 | 209 | def read_representations(recent_frames, spanning_frames, env, feat_dim): 210 | """ Reads a set of representations, given their frame names and an LMDB environment.""" 211 | 212 | recent_features1 = [] 213 | recent_features2 = [] 214 | recent_features3 = [] 215 | recent_features4 = [] 216 | spanning_features1 = [] 217 | spanning_features2 = [] 218 | spanning_features3 = [] 219 | for e in env: 220 | spanning_features1.append(get_max_pooled_features(e, spanning_frames[0], feat_dim)) 221 | spanning_features2.append(get_max_pooled_features(e, spanning_frames[1], feat_dim)) 222 | spanning_features3.append(get_max_pooled_features(e, spanning_frames[2], feat_dim)) 223 | 224 | recent_features1.append(get_max_pooled_features(e, recent_frames[0], feat_dim)) 225 | recent_features2.append(get_max_pooled_features(e, recent_frames[1], feat_dim)) 226 | recent_features3.append(get_max_pooled_features(e, recent_frames[2], feat_dim)) 227 | recent_features4.append(get_max_pooled_features(e, recent_frames[3], feat_dim)) 228 | 229 | spanning_features1 = np.concatenate(spanning_features1, axis=-1) 230 | spanning_features2 = np.concatenate(spanning_features2, axis=-1) 231 | spanning_features3 = np.concatenate(spanning_features3, axis=-1) 232 | 233 | recent_features1 = np.concatenate(recent_features1, axis=-1) 234 | recent_features2 = np.concatenate(recent_features2, axis=-1) 235 | recent_features3 = np.concatenate(recent_features3, axis=-1) 236 | recent_features4 = np.concatenate(recent_features4, axis=-1) 237 | 238 | spanning_snippet_features = 
[spanning_features1, spanning_features2, spanning_features3] 239 | recent_snippet_features = [recent_features1, recent_features2, recent_features3, recent_features4] 240 | 241 | return recent_snippet_features, spanning_snippet_features 242 | 243 | 244 | def get_max_pooled_features(env, frame_names, feat_dim): 245 | list_features = [] 246 | missing_features = [] 247 | 248 | #print(f'frame_names len={len(frame_names)} and\nsnippets frames={frame_names}') 249 | for kkl in range(len(frame_names)): 250 | with env.begin() as e: 251 | pool_list = [] 252 | for name in frame_names[kkl]: 253 | dd = e.get(name.strip().encode('utf-8')) 254 | if dd is None: 255 | continue 256 | data_curr = np.frombuffer(dd, 'float32') # convert to numpy array 257 | feat_dim = data_curr.shape[0] 258 | pool_list.append(data_curr) 259 | 260 | if len(pool_list) == 0: # Missing frames indices 261 | missing_features.append(kkl) 262 | list_features.append(np.zeros(feat_dim, dtype='float32')) 263 | else: 264 | max_pool = np.max(np.array(pool_list), 0) 265 | list_features.append(max_pool.squeeze()) 266 | 267 | if(len(missing_features)>0): 268 | if(max(missing_features)>=len(frame_names)-1): 269 | for index in missing_features[::-1]: 270 | list_features[index] = list_features[max(index-1, 0)] 271 | else: 272 | # Reversing and adding next frames to previous frames to fill in indexes with many empty at start 273 | for index in missing_features[::-1]: 274 | list_features[index] = list_features[index + 1] 275 | 276 | list_features = np.stack(list_features) 277 | return list_features 278 | 279 | 280 | def read_data(recent_frames, spanning_frames, env, feat_dim): 281 | """A wrapper form read_representations to handle loading from more environments. 282 | This is used for multimodal data loading (e.g., RGB + Flow)""" 283 | 284 | # if env is a list 285 | if isinstance(env, list): 286 | # read the representations from all environments 287 | return read_representations(recent_frames, spanning_frames, env, feat_dim) 288 | else: 289 | # otherwise, just read the representations 290 | env = [env] 291 | return read_representations(recent_frames, spanning_frames, env, feat_dim) 292 | -------------------------------------------------------------------------------- /main_recognition.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from argparse import ArgumentParser 4 | from dataset_recognition import SequenceDataset 5 | from os.path import join 6 | import torch 7 | from torch.utils.data import DataLoader 8 | from utils import ValueMeter, topk_accuracy, topk_accuracy_save_validation_pred, topk_recall 9 | from utils import get_marginal_indexes, marginalize, softmax, predictions_to_json 10 | from tqdm import tqdm 11 | import numpy as np 12 | import pandas as pd 13 | import json 14 | from network import Network 15 | from torch.optim import lr_scheduler 16 | from torch import nn 17 | import copy 18 | import pickle as pkl 19 | 20 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 21 | 22 | COMP_PATH = 'tempAgg_ant_rec/' 23 | 24 | pd.options.display.float_format = '{:05.2f}'.format 25 | 26 | parser = ArgumentParser(description="Training for Action Recognition") 27 | parser.add_argument('--mode', type=str, default='train', choices=['train', 'validate', 'test', 'validate_json'], 28 | help="Whether to perform training, validation or test. 
If test/validate_json is selected, " 29 | "--json_directory must be used to provide a directory in which to save the generated jsons.") 30 | parser.add_argument('--path_to_data', type=str, default=COMP_PATH + 'DATA_EPIC_ALL/', 31 | help="Path to the data folder, containing all LMDB datasets") 32 | parser.add_argument('--path_to_models', type=str, default=COMP_PATH + '/models_recognition/', 33 | help="Path to the directory where to save all models") 34 | 35 | parser.add_argument('--json_directory', type=str, default=COMP_PATH + '/models_recognition/', 36 | help='Directory in which to save the generated jsons.') 37 | parser.add_argument('--task', type=str, default='action_recognition', 38 | choices=['action_anticipation', 'action_recognition'], 39 | help='Task to tackle: anticipation or recognition') 40 | 41 | parser.add_argument('--img_tmpl', type=str, default='frame_{:010d}.jpg', 42 | help='Template to use to load the representation of a given frame') 43 | parser.add_argument('--resume', action='store_true', help='Whether to resume suspended training') 44 | parser.add_argument('--best_model', type=str, default='best', choices=['best', 'last'], help='') 45 | 46 | parser.add_argument('--modality', type=str, default='obj', choices=['rgb', 'flow', 'obj', 'roi', 'late_fusion'], 47 | help="Modality. rgb/flow/obj/roi represent single branches or late fusion of all.") 48 | parser.add_argument('--weight_rgb', type=float, default=0.5, help='') 49 | parser.add_argument('--weight_flow', type=float, default=0.5, help='') 50 | parser.add_argument('--weight_obj', type=float, default=0.5, help='') 51 | parser.add_argument('--weight_roi', type=float, default=0.5, help='') 52 | 53 | parser.add_argument('--num_workers', type=int, default=0, help="Number of parallel thread to fetch the data") 54 | parser.add_argument('--display_every', type=int, default=10, help="Display every n iterations") 55 | 56 | parser.add_argument('--schedule_on', type=int, default=1, help='') 57 | parser.add_argument('--schedule_epoch', type=int, default=10, help='') 58 | 59 | parser.add_argument('--num_class', type=int, default=2513, help='Number of classes') 60 | parser.add_argument('--verb_class', type=int, default=125, help='') 61 | parser.add_argument('--noun_class', type=int, default=352, help='') 62 | parser.add_argument('--lr', type=float, default=1e-4, help="Learning rate") 63 | parser.add_argument('--latent_dim', type=int, default=512, help='') 64 | parser.add_argument('--linear_dim', type=int, default=512, help='') 65 | parser.add_argument('--dropout_rate', type=float, default=0.3, help='') 66 | parser.add_argument('--scale_factor', type=float, default=-.5, help='') 67 | parser.add_argument('--scale', type=bool, default=True, help='') 68 | parser.add_argument('--batch_size', type=int, default=10, help="Batch Size") 69 | parser.add_argument('--epochs', type=int, default=25, help="Training epochs") 70 | parser.add_argument('--video_feat_dim', type=int, default=352, choices=[352, 1024], help='') 71 | parser.add_argument('--past_attention', type=bool, default=True, help='') 72 | 73 | # Spanning snippets 74 | parser.add_argument('--spanning_sec', type=float, default=6.0, help='') 75 | parser.add_argument('--span_dim1', type=int, default=5, help='') 76 | parser.add_argument('--span_dim2', type=int, default=3, help='') 77 | parser.add_argument('--span_dim3', type=int, default=2, help='') 78 | 79 | # Recent snippets 80 | parser.add_argument('--recent_dim', type=int, default=5, help='') 81 | parser.add_argument('--recent_sec1', 
type=float, default=0.0, help='') 82 | parser.add_argument('--recent_sec2', type=float, default=1.0, help='') 83 | parser.add_argument('--recent_sec3', type=float, default=2.0, help='') 84 | parser.add_argument('--recent_sec4', type=float, default=3.0, help='') 85 | 86 | # Adding verb and noun loss 87 | parser.add_argument('--verb_noun_scores', type=bool, default=True, help='') 88 | parser.add_argument('--add_verb_loss', action='store_true', default=True, help='Whether to train with verb loss or not') 89 | parser.add_argument('--add_noun_loss', action='store_true', default=True, help='Whether to train with verb loss or not') 90 | parser.add_argument('--verb_loss_weight', type=float, default=1.0, help='') 91 | parser.add_argument('--noun_loss_weight', type=float, default=1.0, help='') 92 | parser.add_argument('--ek100', action='store_true', help="Whether to use EPIC-KITCHENS-100") 93 | parser.add_argument('--trainval', type=bool, default=False, help='Whether to train on train+val or only train') 94 | 95 | parser.add_argument('--topK', type=int, default=1, help='') 96 | 97 | # Debugging True 98 | parser.add_argument('--debug_on', type=bool, default=False, help='') 99 | 100 | args = parser.parse_args() 101 | 102 | 103 | def make_model_name(arg_save): 104 | save_name = "arec_mod_{}_span_{}_s1_{}_s2_{}_s3_{}_recent_{}_r1_{}_r2_{}_r3_{}_r4_{}_bs_{}_drop_{}_lr_{}_dimLa_{}_" \ 105 | "dimLi_{}_epoc_{}".format(arg_save.modality, arg_save.spanning_sec, arg_save.span_dim1, 106 | arg_save.span_dim2, arg_save.span_dim3, arg_save.recent_dim, 107 | arg_save.recent_sec1, arg_save.recent_sec2, arg_save.recent_sec3, 108 | arg_save.recent_sec4, arg_save.batch_size, arg_save.dropout_rate, arg_save.lr, 109 | arg_save.latent_dim, arg_save.linear_dim, arg_save.epochs) 110 | if arg_save.add_verb_loss: 111 | save_name = save_name + '_vb' 112 | if arg_save.add_noun_loss: 113 | save_name = save_name + '_nn' 114 | return save_name 115 | 116 | 117 | def save_model(model, epoch, perf, best_perf, is_best=False): 118 | torch.save({'state_dict': model.state_dict(), 'epoch': epoch, 119 | 'perf': perf, 'best_perf': best_perf}, join(args.path_to_models, exp_name + '.pth.tar')) 120 | if is_best: 121 | torch.save({'state_dict': model.state_dict(), 'epoch': epoch, 'perf': perf, 'best_perf': best_perf}, join( 122 | args.path_to_models, exp_name + '_best.pth.tar')) 123 | 124 | 125 | def get_validation_ids(): 126 | unseen_participants_ids = pd.read_csv(join(args.path_to_data, 'validation_unseen_participants_ids.csv'), names=['id'], squeeze=True) 127 | tail_verbs_ids = pd.read_csv(join(args.path_to_data, 'validation_tail_verbs_ids.csv'), names=['id'], squeeze=True) 128 | tail_nouns_ids = pd.read_csv(join(args.path_to_data, 'validation_tail_nouns_ids.csv'), names=['id'], squeeze=True) 129 | tail_actions_ids = pd.read_csv(join(args.path_to_data, 'validation_tail_actions_ids.csv'), names=['id'], squeeze=True) 130 | 131 | return unseen_participants_ids, tail_verbs_ids, tail_nouns_ids, tail_actions_ids 132 | 133 | 134 | def get_many_shot(): 135 | """Get many shot verbs, nouns and actions for class-aware metrics (Mean Top-5 Recall)""" 136 | # read the list of many shot verbs 137 | many_shot_verbs = pd.read_csv(join(args.path_to_data, 'EPIC_many_shot_verbs.csv'))['verb_class'].values 138 | # read the list of many shot nouns 139 | many_shot_nouns = pd.read_csv( 140 | join(args.path_to_data, 'EPIC_many_shot_nouns.csv'))['noun_class'].values 141 | 142 | # read the list of actions 143 | actions = pd.read_csv(join(args.path_to_data, 
'actions.csv')) 144 | # map actions to (verb, noun) pairs 145 | a_to_vn = {a[1]['id']: tuple(a[1][['verb', 'noun']].values) 146 | for a in actions.iterrows()} 147 | 148 | # create the list of many shot actions 149 | # an action is "many shot" if at least one 150 | # between the related verb and noun are many shot 151 | many_shot_actions = [] 152 | for a, (v, n) in a_to_vn.items(): 153 | if v in many_shot_verbs or n in many_shot_nouns: 154 | many_shot_actions.append(a) 155 | 156 | return many_shot_verbs, many_shot_nouns, many_shot_actions 157 | 158 | 159 | def get_scores(model, loader, challenge=False): 160 | model.eval() 161 | predictions_act = [] 162 | predictions_noun = [] 163 | predictions_verb = [] 164 | labels = [] 165 | ids = [] 166 | with torch.set_grad_enabled(False): 167 | for batch in tqdm(loader, 'Evaluating...', len(loader)): 168 | x_spanning = batch['spanning_features'] 169 | x_recent = batch['recent_features'] 170 | if type(x_spanning) == list: 171 | x_spanning = [xx.to(device) for xx in x_spanning] 172 | x_recent = [xx.to(device) for xx in x_recent] 173 | else: 174 | x_spanning = x_spanning.to(device) 175 | x_recent = x_recent.to(device) 176 | 177 | y_label = batch['label'].numpy() 178 | ids.append(batch['id']) 179 | 180 | pred_act1, pred_act2, pred_act3, pred_act4, pred_verb1, pred_verb2, pred_verb3, pred_verb4, \ 181 | pred_noun1, pred_noun2, pred_noun3, pred_noun4 = model(x_spanning, x_recent) 182 | 183 | pred_ensemble_act = pred_act1.detach() + pred_act2.detach() + pred_act3.detach() + pred_act4.detach() 184 | pred_ensemble_act = pred_ensemble_act.cpu().numpy() 185 | pred_ensemble_verb = pred_verb1.detach() + pred_verb2.detach() + pred_verb3.detach() + pred_verb4.detach() 186 | pred_ensemble_verb = pred_ensemble_verb.cpu().numpy() 187 | pred_ensemble_noun = pred_noun1.detach() + pred_noun2.detach() + pred_noun3.detach() + pred_noun4.detach() 188 | pred_ensemble_noun = pred_ensemble_noun.cpu().numpy() 189 | 190 | predictions_act.append(pred_ensemble_act) 191 | predictions_verb.append(pred_ensemble_verb) 192 | predictions_noun.append(pred_ensemble_noun) 193 | labels.append(y_label) 194 | 195 | action_scores = np.concatenate(predictions_act) 196 | labels = np.concatenate(labels) 197 | ids = np.concatenate(ids) 198 | 199 | if args.verb_noun_scores: # use the verb and noun scores 200 | verb_scores = np.concatenate(predictions_verb) 201 | noun_scores = np.concatenate(predictions_noun) 202 | else: # marginalize the action scores to get the noun and verb scores 203 | actions = pd.read_csv(join(args.path_to_data, 'actions.csv'), index_col='id') 204 | vi = get_marginal_indexes(actions, 'verb') 205 | ni = get_marginal_indexes(actions, 'noun') 206 | action_prob = softmax(action_scores.reshape(-1, action_scores.shape[-1])) 207 | verb_scores = marginalize(action_prob, vi) # .reshape( action_scores.shape[0], action_scores.shape[1], -1) 208 | noun_scores = marginalize(action_prob, ni) # .reshape( action_scores.shape[0], action_scores.shape[1], -1) 209 | 210 | if labels.max() > 0 and not challenge: 211 | return verb_scores, noun_scores, action_scores, labels[:, 0], labels[:, 1], labels[:, 2], ids 212 | else: 213 | return verb_scores, noun_scores, action_scores, ids 214 | 215 | 216 | def get_scores_late_fusion(models, loaders, challenge=False): 217 | verb_scores = [] 218 | noun_scores = [] 219 | action_scores = [] 220 | outputs = [] 221 | for model, loader in zip(models, loaders): 222 | outputs = get_scores(model, loader, challenge) 223 | verb_scores.append(outputs[0]) 224 | 
noun_scores.append(outputs[1]) 225 | action_scores.append(outputs[2]) 226 | 227 | verb_scores[0] = verb_scores[0] * args.weight_rgb 228 | verb_scores[1] = verb_scores[1] * args.weight_flow 229 | verb_scores[2] = verb_scores[2] * args.weight_obj 230 | verb_scores[3] = verb_scores[3] * args.weight_roi 231 | 232 | noun_scores[0] = noun_scores[0] * args.weight_rgb 233 | noun_scores[1] = noun_scores[1] * args.weight_flow 234 | noun_scores[2] = noun_scores[2] * args.weight_obj 235 | noun_scores[3] = noun_scores[3] * args.weight_roi 236 | 237 | action_scores[0] = action_scores[0] * args.weight_rgb 238 | action_scores[1] = action_scores[1] * args.weight_flow 239 | action_scores[2] = action_scores[2] * args.weight_obj 240 | action_scores[3] = action_scores[3] * args.weight_roi 241 | 242 | verb_scores = sum(verb_scores) 243 | noun_scores = sum(noun_scores) 244 | action_scores = sum(action_scores) 245 | 246 | #return [verb_scores, noun_scores, action_scores] + list(outputs[3:]) 247 | return [verb_scores, noun_scores, action_scores] + list(outputs[3:]) 248 | 249 | 250 | def log(mode, epoch, total_loss_meter, ensemble_accuracy_meter, 251 | action_loss_meter, verb_loss_meter, noun_loss_meter, 252 | accuracy_action1_meter, accuracy_action2_meter, accuracy_action3_meter, accuracy_action4_meter, 253 | best_perf=None, green=False): 254 | if green: 255 | print('\033[92m', end="") 256 | print( 257 | "[{}] Epoch: {:.2f}. ".format(mode, epoch), 258 | "Total Loss: {:.2f}. ".format(total_loss_meter.value()), 259 | "Act. Loss: {:.2f}. ".format(action_loss_meter.value()), 260 | "Verb Loss: {:.2f}. ".format(verb_loss_meter.value()), 261 | "Noun Loss: {:.2f}. ".format(noun_loss_meter.value()), 262 | "Acc. Act1: {:.2f}% ".format(accuracy_action1_meter.value()), 263 | "Acc. Act2: {:.2f}% ".format(accuracy_action2_meter.value()), 264 | "Acc. Act3: {:.2f}% ".format(accuracy_action3_meter.value()), 265 | "Acc. 
Act4: {:.2f}% ".format(accuracy_action4_meter.value()), 266 | "Ensemble Acc.: {:.2f}% ".format(ensemble_accuracy_meter.value()), 267 | end="") 268 | 269 | if best_perf: 270 | print("[best: {:.2f}]%".format(best_perf), end="") 271 | 272 | print('\033[0m') 273 | 274 | 275 | def train_validation(model, loaders, optimizer, epochs, start_epoch, start_best_perf, schedule_on): 276 | """Training/Validation code""" 277 | 278 | best_perf = start_best_perf # to keep track of the best performing epoch 279 | 280 | loss_act_TAB1 = nn.CrossEntropyLoss() 281 | loss_act_TAB2 = nn.CrossEntropyLoss() 282 | loss_act_TAB3 = nn.CrossEntropyLoss() 283 | loss_act_TAB4 = nn.CrossEntropyLoss() 284 | if args.add_verb_loss: 285 | print('Add verb losses') 286 | loss_verb_TAB1 = nn.CrossEntropyLoss() 287 | loss_verb_TAB2 = nn.CrossEntropyLoss() 288 | loss_verb_TAB3 = nn.CrossEntropyLoss() 289 | loss_verb_TAB4 = nn.CrossEntropyLoss() 290 | if args.add_noun_loss: 291 | print('Add noun losses') 292 | loss_noun_TAB1 = nn.CrossEntropyLoss() 293 | loss_noun_TAB2 = nn.CrossEntropyLoss() 294 | loss_noun_TAB3 = nn.CrossEntropyLoss() 295 | loss_noun_TAB4 = nn.CrossEntropyLoss() 296 | 297 | for epoch in range(start_epoch, epochs): 298 | if schedule_on is not None: 299 | schedule_on.step() 300 | 301 | # define training and validation meters 302 | total_loss_meter = {'training': ValueMeter(), 'validation': ValueMeter()} 303 | action_loss_meter = {'training': ValueMeter(), 'validation': ValueMeter()} 304 | verb_loss_meter = {'training': ValueMeter(), 'validation': ValueMeter()} 305 | noun_loss_meter = {'training': ValueMeter(), 'validation': ValueMeter()} 306 | 307 | ensemble_accuracy_meter = {'training': ValueMeter(), 'validation': ValueMeter()} 308 | accuracy_action1_meter = {'training': ValueMeter(), 'validation': ValueMeter()} 309 | accuracy_action2_meter = {'training': ValueMeter(), 'validation': ValueMeter()} 310 | accuracy_action3_meter = {'training': ValueMeter(), 'validation': ValueMeter()} 311 | accuracy_action4_meter = {'training': ValueMeter(), 'validation': ValueMeter()} 312 | 313 | for mode in ['training', 'validation']: 314 | 315 | # enable gradients only if training 316 | with torch.set_grad_enabled(mode == 'training'): 317 | if mode == 'training': 318 | model.train() 319 | else: 320 | model.eval() 321 | 322 | for i, batch in enumerate(loaders[mode]): 323 | x_spanning = batch['spanning_features'] 324 | x_recent = batch['recent_features'] 325 | if type(x_spanning) == list: 326 | x_spanning = [xx.to(device) for xx in x_spanning] 327 | x_recent = [xx.to(device) for xx in x_recent] 328 | else: 329 | x_spanning = x_spanning.to(device) 330 | x_recent = x_recent.to(device) 331 | 332 | y_label = batch['label'].to(device) 333 | bs = y_label.shape[0] # batch size 334 | 335 | pred_act1, pred_act2, pred_act3, pred_act4, pred_verb1, pred_verb2, pred_verb3, pred_verb4, \ 336 | pred_noun1, pred_noun2, pred_noun3, pred_noun4 = model(x_spanning, x_recent) 337 | 338 | loss = loss_act_TAB1(pred_act1, y_label[:, 2]) + \ 339 | loss_act_TAB2(pred_act2, y_label[:, 2]) + \ 340 | loss_act_TAB3(pred_act3, y_label[:, 2]) + \ 341 | loss_act_TAB4(pred_act4, y_label[:, 2]) 342 | action_loss_meter[mode].add(loss.item(), bs) 343 | 344 | if args.add_verb_loss: 345 | verb_loss = loss_verb_TAB1(pred_verb1, y_label[:, 0]) + \ 346 | loss_verb_TAB2(pred_verb2, y_label[:, 0]) + \ 347 | loss_verb_TAB3(pred_verb3, y_label[:, 0]) + \ 348 | loss_verb_TAB4(pred_verb4, y_label[:, 0]) 349 | verb_loss_meter[mode].add(verb_loss.item(), bs) 350 | loss = loss + 
args.verb_loss_weight * verb_loss 351 | else: 352 | verb_loss_meter[mode].add(-1, bs) 353 | 354 | if args.add_noun_loss: 355 | noun_loss = loss_noun_TAB1(pred_noun1, y_label[:, 1]) + \ 356 | loss_noun_TAB2(pred_noun2, y_label[:, 1]) + \ 357 | loss_noun_TAB3(pred_noun3, y_label[:, 1]) + \ 358 | loss_noun_TAB4(pred_noun4, y_label[:, 1]) 359 | noun_loss_meter[mode].add(noun_loss.item(), bs) 360 | loss = loss + args.noun_loss_weight * noun_loss 361 | else: 362 | noun_loss_meter[mode].add(-1, bs) 363 | 364 | label_curr = y_label[:, 2].detach().cpu().numpy() 365 | acc_future1 = topk_accuracy(pred_act1.detach().cpu().numpy(), label_curr, (args.topK,))[0] * 100 366 | acc_future2 = topk_accuracy(pred_act2.detach().cpu().numpy(), label_curr, (args.topK,))[0] * 100 367 | acc_future3 = topk_accuracy(pred_act3.detach().cpu().numpy(), label_curr, (args.topK,))[0] * 100 368 | acc_future4 = topk_accuracy(pred_act4.detach().cpu().numpy(), label_curr, (args.topK,))[0] * 100 369 | accuracy_action1_meter[mode].add(acc_future1, bs) 370 | accuracy_action2_meter[mode].add(acc_future2, bs) 371 | accuracy_action3_meter[mode].add(acc_future3, bs) 372 | accuracy_action4_meter[mode].add(acc_future4, bs) 373 | 374 | pred_ensemble = pred_act1.detach() + pred_act2.detach() + pred_act3.detach() + pred_act4.detach() 375 | pred_ensemble = pred_ensemble.cpu().numpy() 376 | acc_ensemble = topk_accuracy(pred_ensemble, label_curr, (args.topK,))[0] * 100 377 | 378 | # store the values in the meters to keep incremental averages 379 | total_loss_meter[mode].add(loss.item(), bs) 380 | ensemble_accuracy_meter[mode].add(acc_ensemble, bs) 381 | 382 | # if in training mode 383 | if mode == 'training': 384 | optimizer.zero_grad() 385 | loss.backward() 386 | optimizer.step() 387 | 388 | # log training during loop - avoid logging the very first batch. It can be biased. 
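# The values printed by log() below are the incremental averages kept by the meters over the epoch so
# far, not per-batch numbers: "Ensemble Acc." is the top-k accuracy of the summed logits of the four
# temporal aggregation blocks (pred_act1..pred_act4), while "Acc. Act1".."Acc. Act4" report each block
# on its own.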
389 | if mode == 'training' and i != 0 and i % args.display_every == 0: 390 | epoch_curr = epoch + i / len(loaders[mode]) # compute decimal epoch for logging 391 | log(mode, epoch_curr, total_loss_meter[mode], ensemble_accuracy_meter[mode], 392 | action_loss_meter[mode], verb_loss_meter[mode], noun_loss_meter[mode], 393 | accuracy_action1_meter[mode], accuracy_action2_meter[mode], 394 | accuracy_action3_meter[mode], accuracy_action4_meter[mode]) 395 | 396 | # log at the end of each epoch 397 | log(mode, epoch + 1, total_loss_meter[mode], ensemble_accuracy_meter[mode], 398 | action_loss_meter[mode], verb_loss_meter[mode], noun_loss_meter[mode], 399 | accuracy_action1_meter[mode], accuracy_action2_meter[mode], 400 | accuracy_action3_meter[mode], accuracy_action4_meter[mode], 401 | max(ensemble_accuracy_meter[mode].value(), best_perf) if mode == 'validation' else None, green=True) 402 | 403 | if best_perf < ensemble_accuracy_meter['validation'].value(): 404 | best_perf = ensemble_accuracy_meter['validation'].value() 405 | is_best = True 406 | else: 407 | is_best = False 408 | with open(args.path_to_models + '/' + exp_name + '.txt', 'a') as f: 409 | f.write("%d - %0.2f\n" % (epoch + 1, ensemble_accuracy_meter['validation'].value())) 410 | 411 | # save checkpoint at the end of each train/val epoch 412 | save_model(model, epoch + 1, ensemble_accuracy_meter['validation'].value(), best_perf, is_best=is_best) 413 | 414 | with open(args.path_to_models + '/' + exp_name + '.txt', 'a') as f: 415 | f.write("%d - %0.2f\n" % (epochs + 1, best_perf)) 416 | 417 | 418 | def load_checkpoint(model): 419 | model_add = '.pth.tar' 420 | if args.best_model == 'best': 421 | print('args.best_model == True') 422 | model_add = '_best.pth.tar' 423 | 424 | chk = torch.load(join(args.path_to_models, exp_name + model_add)) 425 | epoch = chk['epoch'] 426 | best_perf = chk['best_perf'] 427 | perf = chk['perf'] 428 | model.load_state_dict(chk['state_dict']) 429 | return epoch, perf, best_perf 430 | 431 | 432 | def get_loader(mode, override_modality=None): 433 | if override_modality: 434 | path_to_lmdb = join(args.path_to_data, override_modality) 435 | else: 436 | path_to_lmdb = join(args.path_to_data, args.modality) 437 | 438 | if args.trainval: 439 | csv_file = 'trainval' 440 | else: 441 | csv_file = mode 442 | 443 | kargs = { 444 | 'path_to_lmdb': path_to_lmdb, 445 | 'path_to_csv': join(args.path_to_data, "{}.csv".format(csv_file)), 446 | 'label_type': ['verb', 'noun', 'action'], 447 | 'img_tmpl': args.img_tmpl, 448 | 'challenge': 'test' in mode, 449 | 'args': args 450 | } 451 | _set = SequenceDataset(**kargs) 452 | return DataLoader(_set, batch_size=args.batch_size, num_workers=args.num_workers, 453 | pin_memory=True, shuffle=mode == 'training') 454 | 455 | 456 | def get_model(): 457 | if not args.modality == 'late_fusion': 458 | return Network(args) 459 | elif args.modality == 'late_fusion': 460 | obj_model = Network(args) 461 | rgb_model = Network(args_rgb) 462 | flow_model = Network(args_flow) 463 | roi_model = Network(args_roi) 464 | 465 | model_add = '.pth.tar' 466 | if args.best_model == 'best': 467 | print('args.best_model == True') 468 | model_add = '_best.pth.tar' 469 | 470 | 471 | checkpoint_rgb = torch.load(join(args.path_to_models, exp_rgb.replace(f'{args.modality}', 'rgb') + model_add)) 472 | checkpoint_flow = torch.load(join(args.path_to_models, exp_flow.replace(f'{args.modality}', 'flow') + model_add)) 473 | checkpoint_obj = torch.load(join(args.path_to_models, exp_name.replace(f'{args.modality}', 'obj') + 
model_add)) 474 | checkpoint_roi = torch.load(join(args.path_to_models, exp_roi.replace(f'{args.modality}', 'roi') + model_add)) 475 | 476 | print(f"Loaded checkpoint for model rgb. Epoch: {checkpoint_rgb['epoch']}. Perf: {checkpoint_rgb['perf']:.2f}.") 477 | print(f"Loaded checkpoint for model flow. Epoch: {checkpoint_flow['epoch']}. Perf: {checkpoint_flow['perf']:.2f}.") 478 | print(f"Loaded checkpoint for model obj. Epoch: {checkpoint_obj['epoch']}. Perf: {checkpoint_obj['perf']:.2f}.") 479 | print(f"Loaded checkpoint for model roi. Epoch: {checkpoint_roi['epoch']}. Perf: {checkpoint_roi['perf']:.2f}.") 480 | 481 | rgb_model.load_state_dict(checkpoint_rgb['state_dict']) 482 | flow_model.load_state_dict(checkpoint_flow['state_dict']) 483 | obj_model.load_state_dict(checkpoint_obj['state_dict']) 484 | roi_model.load_state_dict(checkpoint_roi['state_dict']) 485 | 486 | return [rgb_model, flow_model, obj_model, roi_model] 487 | 488 | 489 | def main(): 490 | model = get_model() 491 | if type(model) == list: 492 | model = [m.to(device) for m in model] 493 | else: 494 | model.to(device) 495 | 496 | if args.mode == 'train': 497 | loaders = {m: get_loader(m) for m in ['training', 'validation']} 498 | 499 | if args.resume: 500 | start_epoch, _, start_best_perf = load_checkpoint(model) 501 | else: 502 | start_epoch = 0 503 | start_best_perf = 0 504 | 505 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) 506 | 507 | schedule_on = None 508 | if args.schedule_on: 509 | schedule_on = lr_scheduler.StepLR(optimizer, args.schedule_epoch, gamma=0.1, last_epoch=-1) 510 | 511 | train_validation(model, loaders, optimizer, args.epochs, start_epoch, start_best_perf, schedule_on) 512 | 513 | elif args.mode == 'validate': 514 | if args.modality == 'late_fusion': 515 | loaders = [get_loader('validation', 'rgb'), 516 | get_loader('validation', 'flow'), 517 | get_loader('validation', 'obj'), 518 | get_loader('validation', 'roi')] 519 | verb_scores, noun_scores, action_scores, verb_labels, noun_labels, action_labels, ids = get_scores_late_fusion( 520 | model, loaders) 521 | else: 522 | epoch, perf, _ = load_checkpoint(model) 523 | print("Loaded checkpoint for model {}. Epoch: {}. 
Perf: {:0.2f}.".format(type(model), epoch, perf)) 524 | 525 | loader = get_loader('validation') 526 | verb_scores, noun_scores, action_scores, verb_labels, noun_labels, action_labels, ids = get_scores(model, loader) 527 | 528 | verb_accuracies = topk_accuracy(verb_scores, verb_labels, (args.topK,))[0] 529 | noun_accuracies = topk_accuracy(noun_scores, noun_labels, (args.topK,))[0] 530 | action_accuracies = topk_accuracy(action_scores, action_labels, (args.topK,))[0] 531 | 532 | verb_accuracies_5 = topk_accuracy(verb_scores, verb_labels, (5,))[0] 533 | noun_accuracies_5 = topk_accuracy(noun_scores, noun_labels, (5,))[0] 534 | action_accuracies_5 = topk_accuracy(action_scores, action_labels, (5,))[0] 535 | 536 | many_shot_verbs, many_shot_nouns, many_shot_actions = get_many_shot() 537 | verb_recalls = topk_recall(verb_scores, verb_labels, k=args.topK, classes=many_shot_verbs) 538 | noun_recalls = topk_recall(noun_scores, noun_labels, k=args.topK, classes=many_shot_nouns) 539 | action_recalls = topk_recall(action_scores, action_labels, k=args.topK, classes=many_shot_actions) 540 | 541 | unseen, tail_verbs, tail_nouns, tail_actions = get_validation_ids() 542 | 543 | unseen_bool_idx = pd.Series(ids).isin(unseen).values 544 | tail_verbs_bool_idx = pd.Series(ids).isin(tail_verbs).values 545 | tail_nouns_bool_idx = pd.Series(ids).isin(tail_nouns).values 546 | tail_actions_bool_idx = pd.Series(ids).isin(tail_actions).values 547 | 548 | tail_verb_accuracies = topk_accuracy(verb_scores[tail_verbs_bool_idx], verb_labels[tail_verbs_bool_idx], (args.topK,))[0] 549 | tail_noun_accuracies = topk_accuracy(noun_scores[tail_nouns_bool_idx], noun_labels[tail_nouns_bool_idx], (args.topK,))[0] 550 | tail_action_accuracies = topk_accuracy(action_scores[tail_actions_bool_idx], action_labels[tail_actions_bool_idx], (args.topK,))[0] 551 | 552 | unseen_verb_accuracies = topk_accuracy(verb_scores[unseen_bool_idx], verb_labels[unseen_bool_idx], (args.topK,))[0] 553 | unseen_noun_accuracies = topk_accuracy(noun_scores[unseen_bool_idx], noun_labels[unseen_bool_idx], (args.topK,))[0] 554 | unseen_action_accuracies = topk_accuracy(action_scores[unseen_bool_idx], action_labels[unseen_bool_idx], (args.topK,))[0] 555 | 556 | print(f'Overall Top-1 Acc. (Verb) = {verb_accuracies*100:.2f}') 557 | print(f'Overall Top-1 Acc. (Noun) = {noun_accuracies*100:.2f}') 558 | print(f'Overall Top-1 Acc. (Action) = {action_accuracies*100:.2f}') 559 | print(f'Overall Top-5 Acc. (Verb) = {verb_accuracies_5*100:.2f}') 560 | print(f'Overall Top-5 Acc. (Noun) = {noun_accuracies_5*100:.2f}') 561 | print(f'Overall Top-5 Acc. (Action) = {action_accuracies_5*100:.2f}') 562 | print(f'Unseen Top-1 Acc. (Verb) = {unseen_verb_accuracies*100:.2f}') 563 | print(f'Unseen Top-1 Acc. (Noun) = {unseen_noun_accuracies*100:.2f}') 564 | print(f'Unseen Top-1 Acc. (Action) = {unseen_action_accuracies*100:.2f}') 565 | print(f'Tail Top-1 Acc. (Verb) = {tail_verb_accuracies*100:.2f}') 566 | print(f'Tail Top-1 Acc. (Noun) = {tail_noun_accuracies*100:.2f}') 567 | print(f'Tail Top-1 Acc. 
(Action) = {tail_action_accuracies*100:.2f}') 568 | 569 | elif args.mode == 'test': 570 | if args.ek100: 571 | mm = ['timestamps'] 572 | else: 573 | mm = ['seen', 'unseen'] 574 | 575 | for m in mm: 576 | if args.modality == 'late_fusion': 577 | loaders = [get_loader("test_{}".format(m), 'rgb'), 578 | get_loader("test_{}".format(m), 'flow'), 579 | get_loader("test_{}".format(m), 'obj'), 580 | get_loader("test_{}".format(m), 'roi')] 581 | discarded_ids = loaders[0].dataset.discarded_ids 582 | verb_scores, noun_scores, action_scores, ids = get_scores_late_fusion(model, loaders) 583 | else: 584 | loader = get_loader("test_{}".format(m)) 585 | epoch, perf, _ = load_checkpoint(model) 586 | print("Loaded checkpoint for model {}. Epoch: {}. Perf: {:.2f}.".format(type(model), epoch, perf)) 587 | 588 | discarded_ids = loader.dataset.discarded_ids 589 | verb_scores, noun_scores, action_scores, ids = get_scores(model, loader) 590 | 591 | ids = list(ids) + list(discarded_ids) 592 | verb_scores = np.concatenate((verb_scores, np.zeros((len(discarded_ids), *verb_scores.shape[1:])))) 593 | noun_scores = np.concatenate((noun_scores, np.zeros((len(discarded_ids), *noun_scores.shape[1:])))) 594 | action_scores = np.concatenate((action_scores, np.zeros((len(discarded_ids), *action_scores.shape[1:])))) 595 | 596 | actions = pd.read_csv(join(args.path_to_data, 'actions.csv')) 597 | 598 | # map actions to (verb, noun) pairs 599 | a_to_vn = {a[1]['id']: tuple(a[1][['verb', 'noun']].values) 600 | for a in actions.iterrows()} 601 | 602 | predictions = predictions_to_json(args.task, verb_scores, noun_scores, action_scores, ids, a_to_vn, version = '0.2' if args.ek100 else '0.1', sls=True) 603 | if args.ek100: 604 | with open(join(args.json_directory,exp_name+f"_test.json"), 'w') as f: 605 | f.write(json.dumps(predictions, indent=4, separators=(',',': '))) 606 | else: 607 | with open(join(args.json_directory, exp_name + "_{}.json".format(m)), 'w') as f: 608 | f.write(json.dumps(predictions, indent=4, separators=(',', ': '))) 609 | print('Printing done') 610 | 611 | 612 | if __name__ == '__main__': 613 | 614 | if args.mode == 'test': 615 | assert args.json_directory is not None 616 | 617 | exp_name = make_model_name(args) 618 | print("Save file name ", exp_name) 619 | print("Printing Arguments ") 620 | print(args) 621 | 622 | # Considering args parameters from object model 623 | if args.modality == 'late_fusion': 624 | assert (args.mode != 'train') 625 | 626 | args_rgb = copy.deepcopy(args) 627 | args_rgb.video_feat_dim = 1024 628 | exp_rgb = make_model_name(args_rgb) 629 | 630 | args_flow = copy.deepcopy(args_rgb) 631 | exp_flow = make_model_name(args_flow) 632 | 633 | args_roi = copy.deepcopy(args_rgb) 634 | exp_roi = make_model_name(args_roi) 635 | 636 | # uncomment the next line when using TSM instead of TSN for rgb 637 | #args_rgb.video_feat_dim = 2048 638 | 639 | main() -------------------------------------------------------------------------------- /main_anticipation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from argparse import ArgumentParser 4 | from dataset_anticipation import SequenceDataset 5 | from os.path import join 6 | import torch 7 | from torch.utils.data import DataLoader 8 | import torch.nn.functional as F 9 | from utils import ValueMeter, topk_accuracy, topk_accuracy_save_validation_pred, topk_recall, MeanTopKRecallMeter 10 | from utils import get_marginal_indexes, marginalize, softmax, predictions_to_json 11 | from tqdm 
import tqdm 12 | import numpy as np 13 | import pandas as pd 14 | import json 15 | from network import Network 16 | from torch.optim import lr_scheduler 17 | from torch import nn 18 | import copy 19 | 20 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 21 | 22 | COMP_PATH = 'tempAgg_ant_rec/' 23 | 24 | pd.options.display.float_format = '{:05.2f}'.format 25 | 26 | parser = ArgumentParser(description="Training for Action Anticipation") 27 | parser.add_argument('--mode', type=str, default='train', choices=['train', 'validate', 'train_val', 'test', 'validate_json'], 28 | help="Whether to perform training, validation or test. If test/validate_json is selected, " 29 | "--json_directory must be used to provide a directory in which to save the generated jsons.") 30 | parser.add_argument('--path_to_data', type=str, default=COMP_PATH + 'DATA_EPIC_ALL/', 31 | help="Path to the data folder, containing all LMDB datasets") 32 | parser.add_argument('--path_to_models', type=str, default=COMP_PATH + '/models_anticipation/', 33 | help="Path to the directory where to save all models") 34 | 35 | parser.add_argument('--json_directory', type=str, default=COMP_PATH + '/models_anticipation/', 36 | help='Directory in which to save the generated jsons.') 37 | parser.add_argument('--task', type=str, default='action_anticipation', 38 | choices=['action_anticipation', 'action_recognition'], 39 | help='Task to tackle: anticipation or recognition') 40 | 41 | parser.add_argument('--img_tmpl', type=str, default='frame_{:010d}.jpg', 42 | help='Template to use to load the representation of a given frame') 43 | parser.add_argument('--resume', action='store_true', help='Whether to resume suspended training') 44 | parser.add_argument('--best_model', type=str, default='best', choices=['best', 'last'], help='') 45 | 46 | parser.add_argument('--modality', type=str, default='obj', choices=['rgb', 'flow', 'obj', 'roi', 'late_fusion'], 47 | help="Modality. 
rgb/flow/obj/roi represent single branches or late fusion of all.") 48 | 49 | parser.add_argument('--weight_rgb', type=float, default=0.4, help='') 50 | parser.add_argument('--weight_flow', type=float, default=0.1, help='') 51 | parser.add_argument('--weight_obj', type=float, default=0.25, help='') 52 | parser.add_argument('--weight_roi', type=float, default=0.25, help='') 53 | 54 | parser.add_argument('--num_workers', type=int, default=0, help="Number of parallel threads to fetch the data") 55 | parser.add_argument('--display_every', type=int, default=10, help="Display every n iterations") 56 | 57 | parser.add_argument('--schedule_on', type=int, default=1, help='') 58 | parser.add_argument('--schedule_epoch', type=int, default=10, help='') 59 | 60 | parser.add_argument('--num_class', type=int, default=2513, help='Number of classes') 61 | parser.add_argument('--verb_class', type=int, default=125, help='') 62 | parser.add_argument('--noun_class', type=int, default=352, help='') 63 | parser.add_argument('--lr', type=float, default=1e-4, help="Learning rate") 64 | parser.add_argument('--latent_dim', type=int, default=512, help='') 65 | parser.add_argument('--linear_dim', type=int, default=512, help='') 66 | parser.add_argument('--dropout_rate', type=float, default=0.3, help='') 67 | parser.add_argument('--scale_factor', type=float, default=-.5, help='') 68 | parser.add_argument('--scale', type=bool, default=True, help='') 69 | parser.add_argument('--batch_size', type=int, default=10, help="Batch Size") 70 | parser.add_argument('--epochs', type=int, default=15, help="Training epochs") 71 | parser.add_argument('--video_feat_dim', type=int, default=352, choices=[352, 1024], help='') 72 | parser.add_argument('--past_attention', type=bool, default=True, help='') 73 | 74 | # Spanning snippets 75 | parser.add_argument('--spanning_sec', type=float, default=6, help='') 76 | parser.add_argument('--span_dim1', type=int, default=5, help='') 77 | parser.add_argument('--span_dim2', type=int, default=3, help='') 78 | parser.add_argument('--span_dim3', type=int, default=2, help='') 79 | 80 | # Recent snippets 81 | parser.add_argument('--recent_dim', type=int, default=2, help='') 82 | parser.add_argument('--recent_sec1', type=float, default=1.6, help='') 83 | parser.add_argument('--recent_sec2', type=float, default=1.2, help='') 84 | parser.add_argument('--recent_sec3', type=float, default=0.8, help='') 85 | parser.add_argument('--recent_sec4', type=float, default=0.4, help='') 86 | 87 | # Adding verb and noun loss 88 | parser.add_argument('--verb_noun_scores', type=bool, default=True, help='') 89 | parser.add_argument('--add_verb_loss', type=bool, default=True, help='Whether to train with verb loss or not') 90 | parser.add_argument('--add_noun_loss', type=bool, default=True, help='Whether to train with noun loss or not') 91 | parser.add_argument('--verb_loss_weight', type=float, default=1.0, help='') 92 | parser.add_argument('--noun_loss_weight', type=float, default=1.0, help='') 93 | parser.add_argument('--ek100', action='store_true', help="Whether to use EPIC-KITCHENS-100") 94 | parser.add_argument('--trainval', type=bool, default=False, help='Whether to train on train+val or only train') 95 | 96 | parser.add_argument('--topK', type=int, default=1, help='') 97 | 98 | parser.add_argument('--alpha', type=float, default=1, help="Distance between time-steps in seconds") 99 | 100 | # Debugging True 101 | parser.add_argument('--debug_on', type=bool, default=False, help='') 102 | 103 | args = parser.parse_args() 
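# Typical invocations (indicative only; all flags are defined above, and the feature/CSV layout is the
# one described in data/README.md and models_anticipation/README.md):
#   python main_anticipation.py --mode train --modality rgb --video_feat_dim 1024 --ek100
#   python main_anticipation.py --mode validate --modality obj --ek100
#   python main_anticipation.py --mode validate --modality late_fusion --ek100
# Note that 'late_fusion' is evaluation-only (the script asserts args.mode != 'train' for it) and
# expects the four single-branch checkpoints (rgb, flow, obj, roi) to be present under --path_to_models.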
104 | 105 | 106 | def make_model_name(arg_save): 107 | save_name = "anti_mod_{}_span_{}_s1_{}_s2_{}_s3_{}_recent_{}_r1_{}_r2_{}_r3_{}_r4_{}_bs_{}_drop_{}_lr_{}_dimLa_{}_" \ 108 | "dimLi_{}_epoc_{}".format(arg_save.modality, arg_save.spanning_sec, arg_save.span_dim1, 109 | arg_save.span_dim2, arg_save.span_dim3, arg_save.recent_dim, 110 | arg_save.recent_sec1, arg_save.recent_sec2, arg_save.recent_sec3, 111 | arg_save.recent_sec4, arg_save.batch_size, arg_save.dropout_rate, arg_save.lr, 112 | arg_save.latent_dim, arg_save.linear_dim, arg_save.epochs) 113 | if arg_save.add_verb_loss: 114 | save_name = save_name + '_vb' 115 | if arg_save.add_noun_loss: 116 | save_name = save_name + '_nn' 117 | return save_name 118 | 119 | 120 | def save_model(model, epoch, perf, best_perf, is_best=False): 121 | torch.save({'state_dict': model.state_dict(), 'epoch': epoch, 122 | 'perf': perf, 'best_perf': best_perf}, join(args.path_to_models, exp_name + '.pth.tar')) 123 | if is_best: 124 | torch.save({'state_dict': model.state_dict(), 'epoch': epoch, 'perf': perf, 'best_perf': best_perf}, join( 125 | args.path_to_models, exp_name + '_best.pth.tar')) 126 | 127 | 128 | def get_validation_ids(): 129 | unseen_participants_ids = pd.read_csv(join(args.path_to_data, 'validation_unseen_participants_ids.csv'), names=['id'], squeeze=True) 130 | tail_verbs_ids = pd.read_csv(join(args.path_to_data, 'validation_tail_verbs_ids.csv'), names=['id'], squeeze=True) 131 | tail_nouns_ids = pd.read_csv(join(args.path_to_data, 'validation_tail_nouns_ids.csv'), names=['id'], squeeze=True) 132 | tail_actions_ids = pd.read_csv(join(args.path_to_data, 'validation_tail_actions_ids.csv'), names=['id'], squeeze=True) 133 | 134 | return unseen_participants_ids, tail_verbs_ids, tail_nouns_ids, tail_actions_ids 135 | 136 | 137 | def get_many_shot(): 138 | """Get many shot verbs, nouns and actions for class-aware metrics (Mean Top-5 Recall)""" 139 | # read the list of many shot verbs 140 | many_shot_verbs = pd.read_csv(join(args.path_to_data, 'EPIC_many_shot_verbs.csv'))['verb_class'].values 141 | # read the list of many shot nouns 142 | many_shot_nouns = pd.read_csv( 143 | join(args.path_to_data, 'EPIC_many_shot_nouns.csv'))['noun_class'].values 144 | 145 | # read the list of actions 146 | actions = pd.read_csv(join(args.path_to_data, 'actions.csv')) 147 | # map actions to (verb, noun) pairs 148 | a_to_vn = {a[1]['id']: tuple(a[1][['verb', 'noun']].values) 149 | for a in actions.iterrows()} 150 | 151 | # create the list of many shot actions 152 | # an action is "many shot" if at least one 153 | # between the related verb and noun are many shot 154 | many_shot_actions = [] 155 | for a, (v, n) in a_to_vn.items(): 156 | if v in many_shot_verbs or n in many_shot_nouns: 157 | many_shot_actions.append(a) 158 | 159 | return many_shot_verbs, many_shot_nouns, many_shot_actions 160 | 161 | 162 | def get_scores(model, loader, challenge=False, include_discarded=False): 163 | model.eval() 164 | predictions_act = [] 165 | predictions_noun = [] 166 | predictions_verb = [] 167 | labels = [] 168 | ids = [] 169 | with torch.set_grad_enabled(False): 170 | for batch in tqdm(loader, 'Evaluating...', len(loader)): 171 | x_spanning = batch['spanning_features'] 172 | x_recent = batch['recent_features'] 173 | 174 | if type(x_spanning) == list: 175 | x_spanning = [xx.to(device) for xx in x_spanning] 176 | x_recent = [xx.to(device) for xx in x_recent] 177 | else: 178 | x_spanning = x_spanning.to(device) 179 | x_recent = x_recent.to(device) 180 | 181 | y_label = 
batch['label'].numpy() 182 | ids.append(batch['id']) 183 | 184 | pred_act1, pred_act2, pred_act3, pred_act4, pred_verb1, pred_verb2, pred_verb3, pred_verb4, \ 185 | pred_noun1, pred_noun2, pred_noun3, pred_noun4 = model(x_spanning, x_recent) 186 | 187 | pred_ensemble_act = pred_act1.detach() + pred_act2.detach() + pred_act3.detach() + pred_act4.detach() 188 | pred_ensemble_act = pred_ensemble_act.cpu().numpy() 189 | predictions_act.append(pred_ensemble_act) 190 | if args.add_verb_loss and args.add_noun_loss: 191 | pred_ensemble_verb = pred_verb1.detach() + pred_verb2.detach() + pred_verb3.detach() + pred_verb4.detach() 192 | pred_ensemble_verb = pred_ensemble_verb.cpu().numpy() 193 | pred_ensemble_noun = pred_noun1.detach() + pred_noun2.detach() + pred_noun3.detach() + pred_noun4.detach() 194 | pred_ensemble_noun = pred_ensemble_noun.cpu().numpy() 195 | 196 | predictions_verb.append(pred_ensemble_verb) 197 | predictions_noun.append(pred_ensemble_noun) 198 | 199 | labels.append(y_label) 200 | 201 | action_scores = np.concatenate(predictions_act) 202 | labels = np.concatenate(labels) 203 | ids = np.concatenate(ids) 204 | 205 | if args.verb_noun_scores: # use the verb and noun scores 206 | verb_scores = np.concatenate(predictions_verb) 207 | noun_scores = np.concatenate(predictions_noun) 208 | else: # marginalize the action scores to get the noun and verb scores 209 | actions = pd.read_csv(join(args.path_to_data, 'actions.csv'), index_col='id') 210 | vi = get_marginal_indexes(actions, 'verb') 211 | ni = get_marginal_indexes(actions, 'noun') 212 | action_prob = softmax(action_scores.reshape(-1, action_scores.shape[-1])) 213 | verb_scores = marginalize(action_prob, vi) # .reshape( action_scores.shape[0], action_scores.shape[1], -1) 214 | noun_scores = marginalize(action_prob, ni) # .reshape( action_scores.shape[0], action_scores.shape[1], -1) 215 | 216 | if include_discarded: 217 | dlab = np.array(loader.dataset.discarded_labels) 218 | dislab = np.array(loader.dataset.discarded_ids) 219 | ids = np.concatenate([ids, dislab]) 220 | num_disc = len(dlab) 221 | labels = np.concatenate([labels, dlab]) 222 | verb_scores = np.concatenate((verb_scores, np.zeros((num_disc, *verb_scores.shape[1:])))) 223 | noun_scores = np.concatenate((noun_scores, np.zeros((num_disc, *noun_scores.shape[1:])))) 224 | action_scores = np.concatenate((action_scores, np.zeros((num_disc, *action_scores.shape[1:])))) 225 | 226 | if labels.max() > 0 and not challenge: 227 | return verb_scores, noun_scores, action_scores, labels[:, 0], labels[:, 1], labels[:, 2], ids 228 | else: 229 | return verb_scores, noun_scores, action_scores, ids 230 | 231 | 232 | def get_scores_late_fusion(models, loaders, challenge=False, include_discarded=False): 233 | verb_scores = [] 234 | noun_scores = [] 235 | action_scores = [] 236 | outputs = [] 237 | for model, loader in zip(models, loaders): 238 | outputs = get_scores(model, loader, challenge, include_discarded) 239 | verb_scores.append(outputs[0]) 240 | noun_scores.append(outputs[1]) 241 | action_scores.append(outputs[2]) 242 | 243 | verb_scores[0] = verb_scores[0] * args.weight_rgb 244 | verb_scores[1] = verb_scores[1] * args.weight_flow 245 | verb_scores[2] = verb_scores[2] * args.weight_obj 246 | verb_scores[3] = verb_scores[3] * args.weight_roi 247 | 248 | noun_scores[0] = noun_scores[0] * args.weight_rgb 249 | noun_scores[1] = noun_scores[1] * args.weight_flow 250 | noun_scores[2] = noun_scores[2] * args.weight_obj 251 | noun_scores[3] = noun_scores[3] * args.weight_roi 252 | 253 | 
action_scores[0] = action_scores[0] * args.weight_rgb 254 | action_scores[1] = action_scores[1] * args.weight_flow 255 | action_scores[2] = action_scores[2] * args.weight_obj 256 | action_scores[3] = action_scores[3] * args.weight_roi 257 | 258 | verb_scores = sum(verb_scores) 259 | noun_scores = sum(noun_scores) 260 | action_scores = sum(action_scores) 261 | 262 | return [verb_scores, noun_scores, action_scores] + list(outputs[3:]) 263 | 264 | 265 | def log(mode, epoch, total_loss_meter, ensemble_accuracy_meter, 266 | action_loss_meter, verb_loss_meter, noun_loss_meter, 267 | accuracy_action1_meter, accuracy_action2_meter, accuracy_action3_meter, accuracy_action4_meter, 268 | best_perf=None, green=False): 269 | if green: 270 | print('\033[92m', end="") 271 | print( 272 | "[{}] Epoch: {:.2f}. ".format(mode, epoch), 273 | "Total Loss: {:.2f}. ".format(total_loss_meter.value()), 274 | "Act. Loss: {:.2f}. ".format(action_loss_meter.value()), 275 | "Verb Loss: {:.2f}. ".format(verb_loss_meter.value()), 276 | "Noun Loss: {:.2f}. ".format(noun_loss_meter.value()), 277 | "Acc. Act1: {:.2f}% ".format(accuracy_action1_meter.value()), 278 | "Acc. Act2: {:.2f}% ".format(accuracy_action2_meter.value()), 279 | "Acc. Act3: {:.2f}% ".format(accuracy_action3_meter.value()), 280 | "Acc. Act4: {:.2f}% ".format(accuracy_action4_meter.value()), 281 | "Ensemble Acc.: {:.2f}% ".format(ensemble_accuracy_meter.value()), 282 | end="") 283 | 284 | if best_perf: 285 | print("[best: {:.2f}]%".format(best_perf), end="") 286 | 287 | print('\033[0m') 288 | 289 | 290 | def train_validation(model, loaders, optimizer, epochs, start_epoch, start_best_perf, schedule_on): 291 | """Training/Validation code""" 292 | 293 | best_perf = start_best_perf # to keep track of the best performing epoch 294 | 295 | loss_act_TAB1 = nn.CrossEntropyLoss() 296 | loss_act_TAB2 = nn.CrossEntropyLoss() 297 | loss_act_TAB3 = nn.CrossEntropyLoss() 298 | loss_act_TAB4 = nn.CrossEntropyLoss() 299 | if args.add_verb_loss: 300 | print('Add verb losses') 301 | loss_verb_TAB1 = nn.CrossEntropyLoss() 302 | loss_verb_TAB2 = nn.CrossEntropyLoss() 303 | loss_verb_TAB3 = nn.CrossEntropyLoss() 304 | loss_verb_TAB4 = nn.CrossEntropyLoss() 305 | if args.add_noun_loss: 306 | print('Add noun losses') 307 | loss_noun_TAB1 = nn.CrossEntropyLoss() 308 | loss_noun_TAB2 = nn.CrossEntropyLoss() 309 | loss_noun_TAB3 = nn.CrossEntropyLoss() 310 | loss_noun_TAB4 = nn.CrossEntropyLoss() 311 | 312 | for epoch in range(start_epoch, epochs): 313 | if schedule_on is not None: 314 | schedule_on.step() 315 | 316 | # define training and validation meters 317 | total_loss_meter = {'training': ValueMeter(), 'validation': ValueMeter()} 318 | action_loss_meter = {'training': ValueMeter(), 'validation': ValueMeter()} 319 | verb_loss_meter = {'training': ValueMeter(), 'validation': ValueMeter()} 320 | noun_loss_meter = {'training': ValueMeter(), 'validation': ValueMeter()} 321 | 322 | ensemble_accuracy_meter = {'training': MeanTopKRecallMeter(args.num_class), 'validation': MeanTopKRecallMeter(args.num_class)} 323 | accuracy_action1_meter = {'training': ValueMeter(), 'validation': ValueMeter()} 324 | accuracy_action2_meter = {'training': ValueMeter(), 'validation': ValueMeter()} 325 | accuracy_action3_meter = {'training': ValueMeter(), 'validation': ValueMeter()} 326 | accuracy_action4_meter = {'training': ValueMeter(), 'validation': ValueMeter()} 327 | 328 | for mode in ['training', 'validation']: 329 | 330 | # enable gradients only if training 331 | with torch.set_grad_enabled(mode 
== 'training'): 332 | if mode == 'training': 333 | model.train() 334 | else: 335 | model.eval() 336 | 337 | for i, batch in enumerate(loaders[mode]): 338 | x_spanning = batch['spanning_features'] 339 | x_recent = batch['recent_features'] 340 | if type(x_spanning) == list: 341 | x_spanning = [xx.to(device) for xx in x_spanning] 342 | x_recent = [xx.to(device) for xx in x_recent] 343 | else: 344 | x_spanning = x_spanning.to(device) 345 | x_recent = x_recent.to(device) 346 | 347 | y_label = batch['label'].to(device) 348 | bs = y_label.shape[0] # batch size 349 | 350 | pred_act1, pred_act2, pred_act3, pred_act4, pred_verb1, pred_verb2, pred_verb3, pred_verb4, \ 351 | pred_noun1, pred_noun2, pred_noun3, pred_noun4 = model(x_spanning, x_recent) 352 | 353 | loss = loss_act_TAB1(pred_act1, y_label[:, 2]) + \ 354 | loss_act_TAB2(pred_act2, y_label[:, 2]) + \ 355 | loss_act_TAB3(pred_act3, y_label[:, 2]) + \ 356 | loss_act_TAB4(pred_act4, y_label[:, 2]) 357 | action_loss_meter[mode].add(loss.item(), bs) 358 | 359 | if args.add_verb_loss: 360 | verb_loss = loss_verb_TAB1(pred_verb1, y_label[:, 0]) + \ 361 | loss_verb_TAB2(pred_verb2, y_label[:, 0]) + \ 362 | loss_verb_TAB3(pred_verb3, y_label[:, 0]) + \ 363 | loss_verb_TAB4(pred_verb4, y_label[:, 0]) 364 | verb_loss_meter[mode].add(verb_loss.item(), bs) 365 | loss = loss + args.verb_loss_weight * verb_loss 366 | else: 367 | verb_loss_meter[mode].add(-1, bs) 368 | 369 | if args.add_noun_loss: 370 | noun_loss = loss_noun_TAB1(pred_noun1, y_label[:, 1]) + \ 371 | loss_noun_TAB2(pred_noun2, y_label[:, 1]) + \ 372 | loss_noun_TAB3(pred_noun3, y_label[:, 1]) + \ 373 | loss_noun_TAB4(pred_noun4, y_label[:, 1]) 374 | noun_loss_meter[mode].add(noun_loss.item(), bs) 375 | loss = loss + args.noun_loss_weight * noun_loss 376 | else: 377 | noun_loss_meter[mode].add(-1, bs) 378 | 379 | label_curr = y_label[:, 2].detach().cpu().numpy() 380 | acc_future1 = topk_accuracy(pred_act1.detach().cpu().numpy(), label_curr, (args.topK,))[0] * 100 381 | acc_future2 = topk_accuracy(pred_act2.detach().cpu().numpy(), label_curr, (args.topK,))[0] * 100 382 | acc_future3 = topk_accuracy(pred_act3.detach().cpu().numpy(), label_curr, (args.topK,))[0] * 100 383 | acc_future4 = topk_accuracy(pred_act4.detach().cpu().numpy(), label_curr, (args.topK,))[0] * 100 384 | 385 | accuracy_action1_meter[mode].add(acc_future1, bs) 386 | accuracy_action2_meter[mode].add(acc_future2, bs) 387 | accuracy_action3_meter[mode].add(acc_future3, bs) 388 | accuracy_action4_meter[mode].add(acc_future4, bs) 389 | 390 | pred_ensemble = pred_act1.detach() + pred_act2.detach() + pred_act3.detach() + pred_act4.detach() 391 | pred_ensemble = pred_ensemble.cpu().numpy() 392 | 393 | # store the values in the meters to keep incremental averages 394 | total_loss_meter[mode].add(loss.item(), bs) 395 | ensemble_accuracy_meter[mode].add(pred_ensemble, label_curr) 396 | 397 | # if in training mode 398 | if mode == 'training': 399 | optimizer.zero_grad() 400 | loss.backward() 401 | optimizer.step() 402 | 403 | # log training during loop - avoid logging the very first batch. It can be biased. 
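# Unlike the recognition script, the ensemble meter here is a MeanTopKRecallMeter fed with the raw
# summed logits and the labels, so the "Ensemble Acc." value reported by log() is a class-aware mean
# top-k recall (also the quantity used below to select the best checkpoint), whereas "Acc. Act1".."Acc.
# Act4" remain plain top-k accuracies of the individual temporal aggregation blocks.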
404 | if mode == 'training' and i != 0 and i % args.display_every == 0: 405 | epoch_curr = epoch + i / len(loaders[mode]) # compute decimal epoch for logging 406 | log(mode, epoch_curr, total_loss_meter[mode], ensemble_accuracy_meter[mode], 407 | action_loss_meter[mode], verb_loss_meter[mode], noun_loss_meter[mode], 408 | accuracy_action1_meter[mode], accuracy_action2_meter[mode], 409 | accuracy_action3_meter[mode], accuracy_action4_meter[mode]) 410 | 411 | # log at the end of each epoch 412 | log(mode, epoch + 1, total_loss_meter[mode], ensemble_accuracy_meter[mode], 413 | action_loss_meter[mode], verb_loss_meter[mode], noun_loss_meter[mode], 414 | accuracy_action1_meter[mode], accuracy_action2_meter[mode], 415 | accuracy_action3_meter[mode], accuracy_action4_meter[mode], 416 | max(ensemble_accuracy_meter[mode].value(), best_perf) if mode == 'validation' else None, green=True) 417 | 418 | if best_perf < ensemble_accuracy_meter['validation'].value(): 419 | best_perf = ensemble_accuracy_meter['validation'].value() 420 | is_best = True 421 | else: 422 | is_best = False 423 | with open(args.path_to_models + '/' + exp_name + '.txt', 'a') as f: 424 | f.write("%d - %0.2f\n" % (epoch + 1, ensemble_accuracy_meter['validation'].value())) 425 | 426 | # save checkpoint at the end of each train/val epoch 427 | save_model(model, epoch + 1, ensemble_accuracy_meter['validation'].value(), best_perf, is_best=is_best) 428 | 429 | with open(args.path_to_models + '/' + exp_name + '.txt', 'a') as f: 430 | f.write("%d - %0.2f\n" % (epochs + 1, best_perf)) 431 | 432 | 433 | def load_checkpoint(model): 434 | model_add = '.pth.tar' 435 | if args.best_model == 'best': 436 | print('args.best_model == True') 437 | model_add = '_best.pth.tar' 438 | 439 | chk = torch.load(join(args.path_to_models, exp_name + model_add)) 440 | epoch = chk['epoch'] 441 | best_perf = chk['best_perf'] 442 | perf = chk['perf'] 443 | model.load_state_dict(chk['state_dict']) 444 | return epoch, perf, best_perf 445 | 446 | 447 | def get_loader(mode, override_modality=None): 448 | if override_modality: 449 | path_to_lmdb = join(args.path_to_data, override_modality) 450 | else: 451 | path_to_lmdb = join(args.path_to_data, args.modality) 452 | 453 | if mode=='training' and args.trainval: 454 | csv_file = 'trainval' 455 | else: 456 | csv_file = mode 457 | 458 | kargs = { 459 | 'path_to_lmdb': path_to_lmdb, 460 | 'path_to_csv': join(args.path_to_data, "{}.csv".format(csv_file)), 461 | 'time_step': args.alpha, 462 | 'label_type': ['verb', 'noun', 'action'], 463 | 'img_tmpl': args.img_tmpl, 464 | 'challenge': 'test' in mode, 465 | 'args': args 466 | } 467 | _set = SequenceDataset(**kargs) 468 | 469 | return DataLoader(_set, batch_size=args.batch_size, num_workers=args.num_workers, 470 | pin_memory=True, shuffle=mode == 'training') 471 | 472 | 473 | def get_model(): 474 | if not args.modality == 'late_fusion': 475 | return Network(args) 476 | elif args.modality=='late_fusion': 477 | obj_model = Network(args) 478 | rgb_model = Network(args_rgb) 479 | flow_model = Network(args_flow) 480 | roi_model = Network(args_roi) 481 | 482 | model_add = '.pth.tar' 483 | if args.best_model == 'best': 484 | print('args.best_model == True') 485 | model_add = '_best.pth.tar' 486 | 487 | checkpoint_rgb = torch.load(join(args.path_to_models, exp_rgb.replace(f'{args.modality}', 'rgb') + model_add)) 488 | checkpoint_flow = torch.load(join(args.path_to_models, exp_flow.replace(f'{args.modality}', 'flow') + model_add)) 489 | checkpoint_obj = 
torch.load(join(args.path_to_models, exp_name.replace(f'{args.modality}', 'obj') + model_add)) 490 | checkpoint_roi = torch.load(join(args.path_to_models, exp_roi.replace(f'{args.modality}', 'roi') + model_add)) 491 | 492 | print(f"Loaded checkpoint for model rgb. Epoch: {checkpoint_rgb['epoch']}. Perf: {checkpoint_rgb['perf']:.2f}.") 493 | print(f"Loaded checkpoint for model flow. Epoch: {checkpoint_flow['epoch']}. Perf: {checkpoint_flow['perf']:.2f}.") 494 | print(f"Loaded checkpoint for model obj. Epoch: {checkpoint_obj['epoch']}. Perf: {checkpoint_obj['perf']:.2f}.") 495 | print(f"Loaded checkpoint for model roi. Epoch: {checkpoint_roi['epoch']}. Perf: {checkpoint_roi['perf']:.2f}.") 496 | 497 | rgb_model.load_state_dict(checkpoint_rgb['state_dict']) 498 | flow_model.load_state_dict(checkpoint_flow['state_dict']) 499 | obj_model.load_state_dict(checkpoint_obj['state_dict']) 500 | roi_model.load_state_dict(checkpoint_roi['state_dict']) 501 | 502 | return [rgb_model, flow_model, obj_model, roi_model] 503 | 504 | 505 | 506 | 507 | def main(): 508 | model = get_model() 509 | if type(model) == list: 510 | model = [m.to(device) for m in model] 511 | else: 512 | model.to(device) 513 | 514 | if args.mode == 'train': 515 | loaders = {m: get_loader(m) for m in ['training', 'validation']} 516 | 517 | if args.resume: 518 | start_epoch, _, start_best_perf = load_checkpoint(model) 519 | else: 520 | start_epoch = 0 521 | start_best_perf = 0 522 | 523 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) 524 | 525 | schedule_on = None 526 | if args.schedule_on: 527 | schedule_on = lr_scheduler.StepLR(optimizer, args.schedule_epoch, gamma=0.1, last_epoch=-1) 528 | 529 | train_validation(model, loaders, optimizer, args.epochs, 530 | start_epoch, start_best_perf, schedule_on) 531 | 532 | elif args.mode == 'validate': 533 | if args.modality == 'late_fusion': 534 | loaders = [get_loader('validation', 'rgb'), 535 | get_loader('validation', 'flow'), 536 | get_loader('validation', 'obj'), 537 | get_loader('validation', 'roi')] 538 | verb_scores, noun_scores, action_scores, verb_labels, noun_labels, action_labels, ids = get_scores_late_fusion(model, loaders, include_discarded=args.ek100) 539 | else: 540 | epoch, perf, _ = load_checkpoint(model) 541 | print("Loaded checkpoint for model {}. Epoch: {}. 
Perf: {:0.2f}.".format(type(model), epoch, perf)) 542 | 543 | loader = get_loader('validation') 544 | verb_scores, noun_scores, action_scores, verb_labels, noun_labels, action_labels, ids = get_scores(model, loader, include_discarded=args.ek100) 545 | 546 | if not args.ek100: 547 | verb_accuracies = topk_accuracy(verb_scores, verb_labels, (1,))[0] 548 | noun_accuracies = topk_accuracy(noun_scores, noun_labels, (1,))[0] 549 | action_accuracies = topk_accuracy_save_validation_pred(action_scores, action_labels, (1,), no_classes = args.num_class)[0] 550 | 551 | many_shot_verbs, many_shot_nouns, many_shot_actions = get_many_shot() 552 | 553 | verb_recalls = topk_recall(verb_scores, verb_labels, k=1, classes=many_shot_verbs) 554 | noun_recalls = topk_recall(noun_scores, noun_labels, k=1, classes=many_shot_nouns) 555 | action_recalls = topk_recall(action_scores, action_labels, k=1, classes=many_shot_actions) 556 | 557 | print("Verb Accuracy: {:.2f}%".format(verb_accuracies * 100)) 558 | print("Noun Accuracy: {:.2f}%".format(noun_accuracies * 100)) 559 | print("Action Accuracy: {:.2f}%".format(action_accuracies * 100)) 560 | print("Verb Recall: {:.2f}%".format(verb_recalls * 100)) 561 | print("Noun Recall: {:.2f}%".format(noun_recalls * 100)) 562 | print("Action Recall: {:.2f}%".format(action_recalls * 100)) 563 | 564 | else: 565 | many_shot_verbs, many_shot_nouns, many_shot_actions = get_many_shot() 566 | verb_accuracies = topk_accuracy(verb_scores, verb_labels, (1,))[0] 567 | noun_accuracies = topk_accuracy(noun_scores, noun_labels, (1,))[0] 568 | action_accuracies = topk_accuracy_save_validation_pred(action_scores, action_labels, (1,), 569 | modality = args.modality, no_classes = args.num_class)[0] 570 | verb_recalls = topk_recall(verb_scores, verb_labels, k=1) 571 | noun_recalls = topk_recall(noun_scores, noun_labels, k=1) 572 | action_recalls = topk_recall(action_scores, action_labels, k=1) 573 | 574 | overall_verb_recalls = topk_recall(verb_scores, verb_labels, k=5) 575 | overall_noun_recalls = topk_recall(noun_scores, noun_labels, k=5) 576 | overall_action_recalls = topk_recall(action_scores, action_labels, k=5) 577 | 578 | unseen, tail_verbs, tail_nouns, tail_actions = get_validation_ids() 579 | 580 | unseen_bool_idx = pd.Series(ids).isin(unseen).values 581 | tail_verbs_bool_idx = pd.Series(ids).isin(tail_verbs).values 582 | tail_nouns_bool_idx = pd.Series(ids).isin(tail_nouns).values 583 | tail_actions_bool_idx = pd.Series(ids).isin(tail_actions).values 584 | 585 | tail_verb_recalls = topk_recall(verb_scores[tail_verbs_bool_idx], verb_labels[tail_verbs_bool_idx], k=5) 586 | tail_noun_recalls = topk_recall(noun_scores[tail_nouns_bool_idx], noun_labels[tail_nouns_bool_idx], k=5) 587 | tail_action_recalls = topk_recall(action_scores[tail_actions_bool_idx], action_labels[tail_actions_bool_idx], k=5) 588 | 589 | 590 | unseen_verb_recalls = topk_recall(verb_scores[unseen_bool_idx], verb_labels[unseen_bool_idx], k=5) 591 | unseen_noun_recalls = topk_recall(noun_scores[unseen_bool_idx], noun_labels[unseen_bool_idx], k=5) 592 | unseen_action_recalls = topk_recall(action_scores[unseen_bool_idx], action_labels[unseen_bool_idx], k=5) 593 | 594 | print(f'Overall Mean Top-5 Recall (Verb) = {overall_verb_recalls*100:.2f}') 595 | print(f'Overall Mean Top-5 Recall (Noun) = {overall_noun_recalls*100:.2f}') 596 | print(f'Overall Mean Top-5 Recall (Action) = {overall_action_recalls*100:.2f}') 597 | print(f'Unseen Mean Top-5 Recall (Verb) = {unseen_verb_recalls*100:.2f}') 598 | print(f'Unseen Mean Top-5 
Recall (Noun) = {unseen_noun_recalls*100:.2f}') 599 | print(f'Unseen Mean Top-5 Recall (Action) = {unseen_action_recalls*100:.2f}') 600 | print(f'Tail Mean Top-5 Recall (Verb) = {tail_verb_recalls*100:.2f}') 601 | print(f'Tail Mean Top-5 Recall (Noun) = {tail_noun_recalls*100:.2f}') 602 | print(f'Tail Mean Top-5 Recall (Action) = {tail_action_recalls*100:.2f}') 603 | 604 | elif args.mode == 'test': 605 | if args.ek100: 606 | mm = ['timestamps'] 607 | else: 608 | mm = ['seen', 'unseen'] 609 | 610 | for m in mm: 611 | if args.modality == 'late_fusion': 612 | loaders = [get_loader("test_{}".format(m), 'rgb'), 613 | get_loader("test_{}".format(m), 'flow'), 614 | get_loader("test_{}".format(m), 'obj'), 615 | get_loader("test_{}".format(m), 'roi')] 616 | discarded_ids = loaders[0].dataset.discarded_ids 617 | verb_scores, noun_scores, action_scores, ids = get_scores_late_fusion(model, loaders) 618 | else: 619 | loader = get_loader("test_{}".format(m)) 620 | epoch, perf, _ = load_checkpoint(model) 621 | print("Loaded checkpoint for model {}. Epoch: {}. Perf: {:.2f}.".format(type(model), epoch, perf)) 622 | 623 | discarded_ids = loader.dataset.discarded_ids 624 | verb_scores, noun_scores, action_scores, ids = get_scores(model, loader) 625 | 626 | ids = list(ids) + list(discarded_ids) 627 | verb_scores = np.concatenate((verb_scores, np.zeros((len(discarded_ids), *verb_scores.shape[1:])))) 628 | noun_scores = np.concatenate((noun_scores, np.zeros((len(discarded_ids), *noun_scores.shape[1:])))) 629 | action_scores = np.concatenate((action_scores, np.zeros((len(discarded_ids), *action_scores.shape[1:])))) 630 | 631 | actions = pd.read_csv(join(args.path_to_data, 'actions.csv')) 632 | 633 | # map actions to (verb, noun) pairs 634 | a_to_vn = {a[1]['id']: tuple(a[1][['verb', 'noun']].values) 635 | for a in actions.iterrows()} 636 | 637 | predictions = predictions_to_json(args.task, verb_scores, noun_scores, action_scores, ids, a_to_vn, version = '0.2' if args.ek100 else '0.1', sls=True) 638 | 639 | if args.ek100: 640 | with open(join(args.json_directory, exp_name + f"_test.json"), 'w') as f: 641 | f.write(json.dumps(predictions, indent=4, separators=(',', ': '))) 642 | else: 643 | with open(join(args.json_directory, exp_name + "_{}.json".format(m)), 'w') as f: 644 | f.write(json.dumps(predictions, indent=4, separators=(',', ': '))) 645 | print('Printing done') 646 | 647 | 648 | 649 | if __name__ == '__main__': 650 | 651 | if args.mode == 'test': 652 | assert args.json_directory is not None 653 | 654 | exp_name = make_model_name(args) 655 | print("Save file name ", exp_name) 656 | print("Printing Arguments ") 657 | print(args) 658 | 659 | # Considering args parameters from object model 660 | if args.modality == 'late_fusion': 661 | assert (args.mode != 'train') 662 | 663 | args_rgb = copy.deepcopy(args) 664 | args_rgb.video_feat_dim = 1024 665 | exp_rgb = make_model_name(args_rgb) 666 | 667 | args_flow = copy.deepcopy(args_rgb) 668 | exp_flow = make_model_name(args_flow) 669 | 670 | args_roi = copy.deepcopy(args_rgb) 671 | exp_roi = make_model_name(args_roi) 672 | 673 | main() --------------------------------------------------------------------------------
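Both main_recognition.py and main_anticipation.py fall back to get_marginal_indexes and marginalize from utils.py when --verb_noun_scores is disabled, deriving verb and noun scores from the softmaxed action scores rather than from the dedicated verb/noun heads. The snippet below is a rough, self-contained sketch of what that marginalization amounts to; it is written independently of the utils.py helpers (the function name marginalize_actions and its arguments are illustrative only) and assumes that action ids index the score columns directly.

# Rough sketch (not the repository's utils.py implementation): turn per-action probabilities into
# per-verb or per-noun scores by summing the probability mass of all actions sharing the same class.
import numpy as np
import pandas as pd

def marginalize_actions(action_prob, path_to_actions_csv, column='verb'):
    # action_prob: (num_samples, num_actions) softmax scores, columns indexed by action id
    actions = pd.read_csv(path_to_actions_csv, index_col='id')
    num_classes = int(actions[column].max()) + 1
    scores = np.zeros((action_prob.shape[0], num_classes), dtype=action_prob.dtype)
    for action_id, cls in actions[column].items():
        # add this action's probability to the verb (or noun) class it maps to
        scores[:, cls] += action_prob[:, action_id]
    return scores

By default --verb_noun_scores is True in both scripts, so the verb and noun logits predicted by the four temporal aggregation blocks are summed and used directly; the marginalization path is only a fallback.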