├── meter.py ├── run_PDAN.sh ├── README.md ├── charades_i3d_per_video.py ├── PDAN.py ├── apmeter.py └── train_PDAN.py /meter.py: -------------------------------------------------------------------------------- 1 | 2 | class Meter(object): 3 | def reset(self): 4 | pass 5 | 6 | def add(self): 7 | pass 8 | 9 | def value(self): 10 | pass 11 | -------------------------------------------------------------------------------- /run_PDAN.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PATH=/Your/ENV/bin:$PATH 4 | 5 | python train_PDAN.py \ 6 | -dataset charades \ 7 | -mode rgb \ 8 | -model PDAN \ 9 | -train True \ 10 | -num_channel 512 \ 11 | -lr 0.0001 \ 12 | -comp_info charades_PDAN \ 13 | -APtype map \ 14 | -epoch 100 \ 15 | -batch_size 32 # -run_mode debug 16 | 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [WACV2021] PDAN 2 | Implementation for the paper ["PDAN: Pyramid Dilated Attention Network for Action Detection"](https://openaccess.thecvf.com/content/WACV2021/html/Dai_PDAN_Pyramid_Dilated_Attention_Network_for_Action_Detection_WACV_2021_paper.html). 3 | 4 | The code is tested in a Python 3.7 + PyTorch 1.2 environment with one Tesla V100 GPU, and the overall code framework is adapted from the [Super-Events repository](https://github.com/piergiaj/super-events-cvpr18). 5 | 6 | ## Toyota Smarthome Untrimmed 7 | The evaluation code and pre-trained model for PDAN on the [Toyota Smarthome Untrimmed (TSU) dataset](https://project.inria.fr/toyotasmarthome/) can be found in this [repository](https://github.com/dairui01/Toyota_Smarthome/blob/main/pipline/). With the pretrained model, the mAP should exceed 32.7% under the frame-level mAP (f-mAP) cross-subject (CS) protocol. 8 | 9 | ## Charades 10 | For training and testing this code on **Charades**, please download the dataset from this [link](https://prior.allenai.org/projects/charades) and follow this [repository](https://github.com/piergiaj/pytorch-i3d) to extract the snippet-level I3D features. The RGB-pretrained PDAN model can be downloaded via this [link](https://mybox.inria.fr/f/9fa53012b2684cb588b5/?dl=1). If the I3D features are extracted correctly, the pretrained RGB model should achieve ~23.8% per-frame mAP on Charades. Note that this mAP is the one reported in the paper: it is computed over all timesteps and is not the weighted mAP. When using the original Charades localization setting (25 sampled timesteps, without weighting), the mAP should be above 24%.
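To initialize training from the downloaded checkpoint, add `-load_model /path/to/the/checkpoint` (placeholder path) to the command in `run_PDAN.sh`; the validation mAP is printed after every epoch. Before running, edit `rgb_root` (and, if needed, `flow_root`) in `train_PDAN.py` so that they point to the extracted I3D features, and make sure the split file `./data/charades.json` is in place.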
11 | 12 | If you find this work useful for your research, please cite our [paper](https://openaccess.thecvf.com/content/WACV2021/html/Dai_PDAN_Pyramid_Dilated_Attention_Network_for_Action_Detection_WACV_2021_paper.html): 13 | ```bibtex 14 | @inproceedings{dai2021pdan, 15 | title={Pdan: Pyramid dilated attention network for action detection}, 16 | author={Dai, Rui and Das, Srijan and Minciullo, Luca and Garattoni, Lorenzo and Francesca, Gianpiero and Bremond, Francois}, 17 | booktitle={Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision}, 18 | pages={2970--2979}, 19 | year={2021} 20 | } 21 | ``` 22 | Contact: rui.dai@inria.fr 23 | 24 | -------------------------------------------------------------------------------- /charades_i3d_per_video.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data_utl 3 | from torch.utils.data.dataloader import default_collate 4 | 5 | import numpy as np 6 | import json 7 | import csv 8 | # import h5py 9 | 10 | import os 11 | import os.path 12 | from tqdm import tqdm 13 | 14 | 15 | def video_to_tensor(pic): 16 | """Convert a ``numpy.ndarray`` to tensor. 17 | Converts a numpy.ndarray (T x H x W x C) 18 | to a torch.FloatTensor of shape (C x T x H x W) 19 | 20 | Args: 21 | pic (numpy.ndarray): Video to be converted to tensor. 22 | Returns: 23 | Tensor: Converted video. 24 | """ 25 | return torch.from_numpy(pic.transpose([3,0,1,2])) 26 | 27 | 28 | def make_dataset(split_file, split, root, num_classes=157): 29 | dataset = [] 30 | with open(split_file, 'r') as f: 31 | data = json.load(f) 32 | print('split!!!!',split) 33 | i = 0 34 | for vid in tqdm(data.keys()): 35 | if data[vid]['subset'] != split: 36 | continue 37 | 38 | if not os.path.exists(os.path.join(root, vid+'.npy')): 39 | continue 40 | fts = np.load(os.path.join(root, vid+'.npy')) 41 | num_feat = fts.shape[0] 42 | label = np.zeros((num_feat,num_classes), np.float32) 43 | 44 | fps = num_feat/data[vid]['duration'] 45 | for ann in data[vid]['actions']: 46 | for fr in range(0,num_feat,1): 47 | if fr/fps > ann[1] and fr/fps < ann[2]: 48 | label[fr, ann[0]] = 1 # binary classification 49 | dataset.append((vid, label, data[vid]['duration'])) 50 | i += 1 51 | 52 | return dataset 53 | 54 | # make_dataset('multithumos.json', 'training', '/ssd2/thumos/val_i3d_rgb') 55 | 56 | 57 | class MultiThumos(data_utl.Dataset): 58 | 59 | def __init__(self, split_file, split, root, batch_size, classes): 60 | 61 | self.data = make_dataset(split_file, split, root, classes) 62 | self.split_file = split_file 63 | self.batch_size = batch_size 64 | self.root = root 65 | self.in_mem = {} 66 | 67 | def __getitem__(self, index): 68 | """ 69 | Args: 70 | index (int): Index 71 | 72 | Returns: 73 | tuple: (image, target) where target is class_index of the target class. 
74 | """ 75 | entry = self.data[index] 76 | if entry[0] in self.in_mem: 77 | feat = self.in_mem[entry[0]] 78 | else: 79 | # print('here') 80 | feat = np.load(os.path.join(self.root, entry[0]+'.npy')) 81 | # print(feat.shape[-1]) 82 | feat = feat.reshape((feat.shape[0],1,1,feat.shape[-1])) 83 | feat = feat.astype(np.float32) 84 | 85 | 86 | label = entry[1] 87 | return feat, label, [entry[0], entry[2]] 88 | 89 | def __len__(self): 90 | return len(self.data) 91 | 92 | 93 | def mt_collate_fn(batch): 94 | "Pads data and puts it into a tensor of same dimensions" 95 | max_len = 0 96 | for b in batch: 97 | if b[0].shape[0] > max_len: 98 | max_len = b[0].shape[0] 99 | 100 | new_batch = [] 101 | for b in batch: 102 | f = np.zeros((max_len, b[0].shape[1], b[0].shape[2], b[0].shape[3]), np.float32) 103 | m = np.zeros((max_len), np.float32) 104 | l = np.zeros((max_len, b[1].shape[1]), np.float32) 105 | f[:b[0].shape[0]] = b[0] 106 | m[:b[0].shape[0]] = 1 107 | l[:b[0].shape[0], :] = b[1] 108 | new_batch.append([video_to_tensor(f), torch.from_numpy(m), torch.from_numpy(l), b[2]]) 109 | 110 | return default_collate(new_batch) 111 | 112 | -------------------------------------------------------------------------------- /PDAN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.nn.init as init 5 | import os 6 | import copy 7 | 8 | 9 | class PDAN(nn.Module): 10 | def __init__(self, num_stages=1, num_layers=5, num_f_maps=512, dim=1024, num_classes=157): 11 | super(PDAN, self).__init__() 12 | self.stage1 = SSPDAN(num_layers, num_f_maps, dim, num_classes) 13 | self.stages = nn.ModuleList([copy.deepcopy(SSPDAN(num_layers, num_f_maps, num_classes, num_classes)) for s in range(num_stages-1)]) 14 | 15 | def forward(self, x, mask): 16 | out = self.stage1(x, mask) 17 | outputs = out.unsqueeze(0) 18 | for s in self.stages: 19 | out = s(out * mask[:, 0:1, :], mask) 20 | outputs = torch.cat((outputs, out.unsqueeze(0)), dim=0) 21 | return outputs 22 | 23 | class SSPDAN(nn.Module): 24 | def __init__(self, num_layers, num_f_maps, dim, num_classes): 25 | super(SSPDAN, self).__init__() 26 | self.conv_1x1 = nn.Conv1d(dim, num_f_maps, 1) 27 | self.layers = nn.ModuleList([copy.deepcopy(PDAN_Block(2 ** i, num_f_maps, num_f_maps)) for i in range(num_layers)]) 28 | self.conv_out = nn.Conv1d(num_f_maps, num_classes, 1) 29 | 30 | def forward(self, x, mask): 31 | out = self.conv_1x1(x) 32 | for layer in self.layers: 33 | out = layer(out, mask) 34 | out = self.conv_out(out) * mask[:, 0:1, :] 35 | return out 36 | 37 | 38 | class PDAN_Block(nn.Module): 39 | def __init__(self, dilation, in_channels, out_channels): 40 | super(PDAN_Block, self).__init__() 41 | self.conv_attention=DAL(in_channels, out_channels, kernel_size=3, padding=dilation, dilated=dilation) 42 | self.conv_1x1 = nn.Conv1d(out_channels, out_channels, 1) 43 | self.dropout = nn.Dropout() 44 | 45 | def forward(self, x, mask): 46 | out = F.relu(self.conv_attention(x)) 47 | out = self.conv_1x1(out) 48 | out = self.dropout(out) 49 | return (x + out) * mask[:, 0:1, :] 50 | 51 | class DAL(nn.Module): 52 | def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, dilated=1, groups=1, bias=False): 53 | super(DAL, self).__init__() 54 | self.out_channels = out_channels 55 | self.kernel_size = kernel_size 56 | self.stride = stride 57 | self.padding = padding 58 | self.groups = groups 59 | self.dilated = dilated 60 | assert 
self.out_channels % self.groups == 0, "out_channels should be divisible by groups. (example: out_channels: 40, groups: 4)" 61 | self.rel_t = nn.Parameter(torch.randn(out_channels, 1, kernel_size), requires_grad=True) 62 | self.key_conv = nn.Conv1d(in_channels, out_channels, kernel_size=1, bias=bias) 63 | self.query_conv = nn.Conv1d(in_channels, out_channels, kernel_size=1, bias=bias) 64 | self.value_conv = nn.Conv1d(in_channels, out_channels, kernel_size=1, bias=bias) 65 | self.reset_parameters() 66 | 67 | 68 | def forward(self, x): 69 | batch, channels, time = x.size() 70 | padded_x = F.pad(x, (self.padding, self.padding)) 71 | q_out = self.query_conv(x) 72 | k_out = self.key_conv(padded_x) 73 | v_out = self.value_conv(padded_x) 74 | kernel_size = 2*self.dilated + 1 75 | k_out = k_out.unfold(2, kernel_size, self.stride) # unfold(dim, size, step) 76 | k_out=torch.cat((k_out[:,:,:,0].unsqueeze(3),k_out[:,:,:,0+self.dilated].unsqueeze(3),k_out[:,:,:,0+2*self.dilated].unsqueeze(3)),dim=3) #dilated 77 | v_out = v_out.unfold(2, kernel_size, self.stride) 78 | v_out=torch.cat((v_out[:,:,:,0].unsqueeze(3),v_out[:,:,:,0+self.dilated].unsqueeze(3),v_out[:,:,:,0+2*self.dilated].unsqueeze(3)),dim=3) #dilated 79 | v_out = v_out + self.rel_t 80 | k_out = k_out.contiguous().view(batch, self.groups, self.out_channels // self.groups, time, -1) 81 | v_out = v_out.contiguous().view(batch, self.groups, self.out_channels // self.groups, time, -1) 82 | q_out = q_out.view(batch, self.groups, self.out_channels // self.groups, time, 1) 83 | out = q_out * k_out 84 | out = F.softmax(out, dim=-1) 85 | out = torch.einsum('bnctk,bnctk -> bnct', out, v_out).view(batch, -1, time) 86 | return out 87 | 88 | def reset_parameters(self): 89 | init.kaiming_normal_(self.key_conv.weight, mode='fan_out') 90 | init.kaiming_normal_(self.value_conv.weight, mode='fan_out') 91 | init.kaiming_normal_(self.query_conv.weight, mode='fan_out') 92 | init.normal_(self.rel_t, 0, 1) 93 | 94 | 95 | -------------------------------------------------------------------------------- /apmeter.py: -------------------------------------------------------------------------------- 1 | import math 2 | import meter 3 | import numpy as np 4 | import torch 5 | 6 | 7 | class APMeter(meter.Meter): 8 | """ 9 | The APMeter measures the average precision per class. 10 | 11 | The APMeter is designed to operate on `NxK` Tensors `output` and 12 | `target`, and optionally a `Nx1` Tensor weight where (1) the `output` 13 | contains model output scores for `N` examples and `K` classes that ought to 14 | be higher when the model is more convinced that the example should be 15 | positively labeled, and smaller when the model believes the example should 16 | be negatively labeled (for instance, the output of a sigmoid function); (2) 17 | the `target` contains only values 0 (for negative examples) and 1 18 | (for positive examples); and (3) the `weight` ( > 0) represents weight for 19 | each sample.
20 | """ 21 | def __init__(self): 22 | super(APMeter, self).__init__() 23 | self.reset() 24 | 25 | def reset(self): 26 | """Resets the meter with empty member variables""" 27 | self.scores = torch.FloatTensor(torch.FloatStorage()) 28 | self.targets = torch.LongTensor(torch.LongStorage()) 29 | self.weights = torch.FloatTensor(torch.FloatStorage()) 30 | 31 | def add(self, output, target, weight=None): 32 | """ 33 | Args: 34 | output (Tensor): NxK tensor that for each of the N examples 35 | indicates the probability of the example belonging to each of 36 | the K classes, according to the model. The probabilities should 37 | sum to one over all classes 38 | target (Tensor): binary NxK tensort that encodes which of the K 39 | classes are associated with the N-th input 40 | (eg: a row [0, 1, 0, 1] indicates that the example is 41 | associated with classes 2 and 4) 42 | weight (optional, Tensor): Nx1 tensor representing the weight for 43 | each example (each weight > 0) 44 | """ 45 | if not torch.is_tensor(output): 46 | output = torch.from_numpy(output) 47 | if not torch.is_tensor(target): 48 | target = torch.from_numpy(target) 49 | 50 | if weight is not None: 51 | if not torch.is_tensor(weight): 52 | weight = torch.from_numpy(weight) 53 | weight = weight.squeeze() 54 | if output.dim() == 1: 55 | output = output.view(-1, 1) 56 | else: 57 | assert output.dim() == 2, \ 58 | 'wrong output size (should be 1D or 2D with one column \ 59 | per class)' 60 | if target.dim() == 1: 61 | target = target.view(-1, 1) 62 | else: 63 | assert target.dim() == 2, \ 64 | 'wrong target size (should be 1D or 2D with one column \ 65 | per class)' 66 | if weight is not None: 67 | assert weight.dim() == 1, 'Weight dimension should be 1' 68 | assert weight.numel() == target.size(0), \ 69 | 'Weight dimension 1 should be the same as that of target' 70 | assert torch.min(weight) >= 0, 'Weight should be non-negative only' 71 | assert torch.equal(target**2, target), \ 72 | 'targets should be binary (0 or 1)' 73 | if self.scores.numel() > 0: 74 | assert target.size(1) == self.targets.size(1), \ 75 | 'dimensions for output should match previously added examples.' 
76 | 77 | # make sure storage is of sufficient size 78 | if self.scores.storage().size() < self.scores.numel() + output.numel(): 79 | new_size = math.ceil(self.scores.storage().size() * 1.5) 80 | new_weight_size = math.ceil(self.weights.storage().size() * 1.5) 81 | self.scores.storage().resize_(int(new_size + output.numel())) 82 | self.targets.storage().resize_(int(new_size + output.numel())) 83 | if weight is not None: 84 | self.weights.storage().resize_(int(new_weight_size 85 | + output.size(0))) 86 | 87 | # store scores and targets 88 | offset = self.scores.size(0) if self.scores.dim() > 0 else 0 89 | self.scores.resize_(offset + output.size(0), output.size(1)) 90 | self.targets.resize_(offset + target.size(0), target.size(1)) 91 | self.scores.narrow(0, offset, output.size(0)).copy_(output) 92 | self.targets.narrow(0, offset, target.size(0)).copy_(target) 93 | 94 | if weight is not None: 95 | self.weights.resize_(offset + weight.size(0)) 96 | self.weights.narrow(0, offset, weight.size(0)).copy_(weight) 97 | 98 | def value(self): 99 | """Returns the model's average precision for each class 100 | 101 | Return: 102 | ap (FloatTensor): 1xK tensor, with avg precision for each class k 103 | """ 104 | 105 | if self.scores.numel() == 0: 106 | return 0 107 | ap = torch.zeros(self.scores.size(1)) 108 | rg = torch.range(1, self.scores.size(0)).float() 109 | if self.weights.numel() > 0: 110 | weight = self.weights.new(self.weights.size()) 111 | weighted_truth = self.weights.new(self.weights.size()) 112 | 113 | # print(self.scores) 114 | # compute average precision for each class 115 | for k in range(self.scores.size(1)): 116 | # sort scores 117 | scores = self.scores[:, k] 118 | targets = self.targets[:, k] 119 | _, sortind = torch.sort(scores, 0, True) 120 | truth = targets[sortind] 121 | if self.weights.numel() > 0: 122 | weight = self.weights[sortind] 123 | weighted_truth = truth.float() * weight 124 | rg = weight.cumsum(0) 125 | 126 | # compute true positive sums 127 | if self.weights.numel() > 0: 128 | tp = weighted_truth.cumsum(0) 129 | else: 130 | tp = truth.float().cumsum(0) 131 | 132 | # compute precision curve 133 | precision = tp.div(rg) 134 | 135 | # compute average precision 136 | ap[k] = precision[truth.byte()].sum() / max(truth.sum(), 1) 137 | return ap 138 | -------------------------------------------------------------------------------- /train_PDAN.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import time 3 | import os 4 | import argparse 5 | import sys 6 | 7 | import torchvision.models as models 8 | import torch 9 | 10 | 11 | 12 | def str2bool(v): 13 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 14 | return True 15 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 16 | return False 17 | else: 18 | raise argparse.ArgumentTypeError('Boolean value expected.') 19 | 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('-mode', type=str, help='rgb or flow (or joint for eval)') 23 | parser.add_argument('-train', type=str2bool, default='True', help='train or eval') 24 | parser.add_argument('-comp_info', type=str) 25 | parser.add_argument('-rgb_model_file', type=str) 26 | parser.add_argument('-flow_model_file', type=str) 27 | parser.add_argument('-gpu', type=str, default='4') 28 | parser.add_argument('-dataset', type=str, default='charades') 29 | parser.add_argument('-rgb_root', type=str, default='no_root') 30 | parser.add_argument('-flow_root', type=str, default='no_root') 31 | 
parser.add_argument('-type', type=str, default='original') 32 | parser.add_argument('-lr', type=str, default='0.1') 33 | parser.add_argument('-epoch', type=str, default='50') 34 | parser.add_argument('-model', type=str, default='') 35 | parser.add_argument('-APtype', type=str, default='wap') 36 | parser.add_argument('-randomseed', type=str, default='False') 37 | parser.add_argument('-load_model', type=str, default='False') 38 | parser.add_argument('-batch_size', type=str, default='False') 39 | parser.add_argument('-num_channel', type=str, default='False') 40 | parser.add_argument('-run_mode', type=str, default='False') 41 | parser.add_argument('-feat', type=str, default='False') 42 | 43 | 44 | args = parser.parse_args() 45 | 46 | import torch 47 | import torch.nn as nn 48 | import torch.nn.functional as F 49 | import torch.optim as optim 50 | import numpy as np 51 | import random 52 | 53 | # set random seed 54 | if args.randomseed=="False": 55 | SEED = 0 56 | elif args.randomseed=="True": 57 | SEED = random.randint(1, 100000) 58 | else: 59 | SEED = int(args.randomseed) 60 | 61 | torch.manual_seed(SEED) 62 | torch.cuda.manual_seed(SEED) 63 | torch.manual_seed(SEED) 64 | np.random.seed(SEED) 65 | torch.cuda.manual_seed_all(SEED) 66 | random.seed(SEED) 67 | torch.backends.cudnn.deterministic = True 68 | torch.backends.cudnn.benchmark = False 69 | print('Random_SEED!!!:', SEED) 70 | 71 | from torch.optim import lr_scheduler 72 | from torch.autograd import Variable 73 | 74 | import json 75 | 76 | import pickle 77 | import math 78 | 79 | from apmeter import APMeter 80 | 81 | 82 | batch_size = int(args.batch_size) 83 | 84 | 85 | if args.dataset == 'charades': 86 | from charades_i3d_per_video import MultiThumos as Dataset 87 | from charades_i3d_per_video import mt_collate_fn as collate_fn 88 | if args.run_mode == 'debug': 89 | print('debug!!!') 90 | train_split = './data/charades_test.json' 91 | test_split = './data/charades_test.json' 92 | else: 93 | train_split = './data/charades.json' 94 | test_split = './data/charades.json' 95 | # print('load feature from:', args.rgb_root) 96 | rgb_root = '/Path/to/charades_feat_rgb' 97 | skeleton_root = '/Path/to/charades_feat_pose' 98 | flow_root = '/Path/to/charades_feat_flow' 99 | rgb_of=[rgb_root,flow_root] 100 | classes = 157 101 | 102 | 103 | def load_data(train_split, val_split, root): 104 | # Load Data 105 | print('load data', root) 106 | if len(train_split) > 0: 107 | if str(args.feat) == '2d': 108 | dataset = Dataset(train_split, 'training', root, batch_size, classes, int(args.pool_step)) 109 | else: 110 | dataset = Dataset(train_split, 'training', root, batch_size, classes) 111 | dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=8, 112 | pin_memory=True, collate_fn=collate_fn) 113 | dataloader.root = root 114 | else: 115 | 116 | dataset = None 117 | dataloader = None 118 | 119 | if str(args.feat) == '2d': 120 | val_dataset = Dataset(val_split, 'testing', root, batch_size, classes, int(args.pool_step)) 121 | else: 122 | val_dataset = Dataset(val_split, 'testing', root, batch_size, classes) 123 | val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=1, shuffle=True, num_workers=2, 124 | pin_memory=True, collate_fn=collate_fn) 125 | val_dataloader.root = root 126 | 127 | dataloaders = {'train': dataloader, 'val': val_dataloader} 128 | datasets = {'train': dataset, 'val': val_dataset} 129 | return dataloaders, datasets 130 | 131 | 132 | # train the model 133 | def run(models, criterion, 
num_epochs=50): 134 | since = time.time() 135 | 136 | for epoch in range(num_epochs): 137 | print('Epoch {}/{}'.format(epoch, num_epochs - 1)) 138 | print('-' * 10) 139 | 140 | probs = [] 141 | for model, gpu, dataloader, optimizer, sched, model_file in models: 142 | train_map, train_loss = train_step(model, gpu, optimizer, dataloader['train'], epoch) 143 | prob_val, val_loss, val_map = val_step(model, gpu, dataloader['val'], epoch) 144 | probs.append(prob_val) 145 | sched.step(val_loss) 146 | 147 | 148 | def eval_model(model, dataloader, baseline=False): 149 | results = {} 150 | for data in dataloader: 151 | other = data[3] 152 | outputs, loss, probs, _ = run_network(model, data, 0, baseline) 153 | fps = outputs.size()[1] / other[1][0] 154 | 155 | results[other[0][0]] = (outputs.data.cpu().numpy()[0], probs.data.cpu().numpy()[0], data[2].numpy()[0], fps) 156 | return results 157 | 158 | 159 | def run_network(model, data, gpu, epoch=0, baseline=False): 160 | inputs, mask, labels, other = data 161 | # wrap them in Variable 162 | inputs = Variable(inputs.cuda(gpu)) 163 | mask = Variable(mask.cuda(gpu)) 164 | labels = Variable(labels.cuda(gpu)) 165 | 166 | mask_list = torch.sum(mask, 1) 167 | mask_new = np.zeros((mask.size()[0], classes, mask.size()[1])) 168 | for i in range(mask.size()[0]): 169 | mask_new[i, :, :int(mask_list[i])] = np.ones((classes, int(mask_list[i]))) 170 | mask_new = torch.from_numpy(mask_new).float() 171 | mask_new = Variable(mask_new.cuda(gpu)) 172 | 173 | inputs = inputs.squeeze(3).squeeze(3) 174 | #print("inputs",inputs.size()) 175 | activation = model(inputs, mask_new) 176 | 177 | 178 | outputs_final = activation 179 | 180 | #print("outputs_final",outputs_final.size()) 181 | outputs_final = outputs_final[-1] 182 | #print("outputs_final",outputs_final.size()) 183 | outputs_final = outputs_final.permute(0, 2, 1) 184 | probs_f = F.sigmoid(outputs_final) * mask.unsqueeze(2) 185 | loss_f = F.binary_cross_entropy_with_logits(outputs_final, labels, size_average=False) 186 | loss_f = torch.sum(loss_f) / torch.sum(mask) 187 | 188 | loss = loss_f 189 | 190 | corr = torch.sum(mask) 191 | tot = torch.sum(mask) 192 | 193 | return outputs_final, loss, probs_f, corr / tot 194 | 195 | 196 | def train_step(model, gpu, optimizer, dataloader, epoch): 197 | model.train(True) 198 | tot_loss = 0.0 199 | error = 0.0 200 | num_iter = 0. 201 | apm = APMeter() 202 | for data in dataloader: 203 | optimizer.zero_grad() 204 | num_iter += 1 205 | 206 | outputs, loss, probs, err = run_network(model, data, gpu, epoch) 207 | apm.add(probs.data.cpu().numpy()[0], data[2].numpy()[0]) 208 | error += err.data 209 | tot_loss += loss.data 210 | 211 | loss.backward() 212 | optimizer.step() 213 | if args.APtype == 'wap': 214 | train_map = 100 * apm.value() 215 | else: 216 | train_map = 100 * apm.value().mean() 217 | print('train-map:', train_map) 218 | apm.reset() 219 | 220 | epoch_loss = tot_loss / num_iter 221 | 222 | return train_map, epoch_loss 223 | 224 | 225 | def val_step(model, gpu, dataloader, epoch): 226 | model.train(False) 227 | apm = APMeter() 228 | tot_loss = 0.0 229 | error = 0.0 230 | num_iter = 0. 231 | num_preds = 0 232 | 233 | full_probs = {} 234 | 235 | # Iterate over data. 
236 | for data in dataloader: 237 | num_iter += 1 238 | other = data[3] 239 | 240 | outputs, loss, probs, err = run_network(model, data, gpu, epoch) 241 | 242 | apm.add(probs.data.cpu().numpy()[0], data[2].numpy()[0]) 243 | 244 | error += err.data 245 | tot_loss += loss.data 246 | 247 | probs = probs.squeeze() 248 | 249 | full_probs[other[0][0]] = probs.data.cpu().numpy().T 250 | 251 | epoch_loss = tot_loss / num_iter 252 | 253 | 254 | val_map = torch.sum(100 * apm.value()) / torch.nonzero(100 * apm.value()).size()[0] 255 | print('val-map:', val_map) 256 | print(100 * apm.value()) 257 | apm.reset() 258 | 259 | return full_probs, epoch_loss, val_map 260 | 261 | 262 | if __name__ == '__main__': 263 | print(str(args.model)) 264 | print('batch_size:', batch_size) 265 | print('cuda_avail', torch.cuda.is_available()) 266 | 267 | if args.mode == 'flow': 268 | print('flow mode', flow_root) 269 | dataloaders, datasets = load_data(train_split, test_split, flow_root) 270 | elif args.mode == 'skeleton': 271 | print('Pose mode', skeleton_root) 272 | dataloaders, datasets = load_data(train_split, test_split, skeleton_root) 273 | elif args.mode == 'rgb': 274 | print('RGB mode', rgb_root) 275 | dataloaders, datasets = load_data(train_split, test_split, rgb_root) 276 | 277 | if args.train: 278 | num_channel = args.num_channel 279 | if args.mode == 'skeleton': 280 | input_channnel = 256 281 | else: 282 | input_channnel = 1024 283 | 284 | num_classes = classes 285 | mid_channel=int(args.num_channel) 286 | 287 | if args.model=="PDAN": 288 | print("you are processing PDAN_original") 289 | from PDAN import PDAN 290 | # rgb_model = Net(mid_channel, input_channnel, classes) 291 | stage=1 292 | block=5 293 | num_channel=512 294 | input_channnel=1024 295 | num_classes=classes 296 | rgb_model = PDAN(stage, block, num_channel, input_channnel, num_classes) 297 | pytorch_total_params = sum(p.numel() for p in rgb_model.parameters() if p.requires_grad) 298 | print('pytorch_total_params', pytorch_total_params) 299 | #exit() 300 | print ('stage:', stage, 'block:', block, 'num_channel:', num_channel, 'input_channnel:', input_channnel, 301 | 'num_classes:', num_classes) 302 | 303 | 304 | rgb_model=torch.nn.DataParallel(rgb_model) 305 | 306 | if args.load_model!= "False": 307 | rgb_model.load_state_dict(torch.load(str(args.load_model))) 308 | print("loaded",args.load_model) 309 | 310 | pytorch_total_params = sum(p.numel() for p in rgb_model.parameters() if p.requires_grad) 311 | print('pytorch_total_params', pytorch_total_params) 312 | print('num_channel:', num_channel, 'input_channnel:', input_channnel,'num_classes:', num_classes) 313 | rgb_model.cuda() 314 | 315 | criterion = nn.NLLLoss(reduce=False) 316 | lr = float(args.lr) 317 | print(lr) 318 | optimizer = optim.Adam(rgb_model.parameters(), lr=lr) 319 | lr_sched = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=8, verbose=True) 320 | run([(rgb_model, 0, dataloaders, optimizer, lr_sched, args.comp_info)], criterion, num_epochs=int(args.epoch)) 321 | 322 | 323 | --------------------------------------------------------------------------------
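A minimal usage sketch, not part of the original repository, that wires the pieces above together: it builds PDAN with the default Charades configuration used in `train_PDAN.py`, runs a forward pass on random I3D-sized features, and scores the dummy predictions with `APMeter`. The tensor sizes are illustrative only; the shapes mirror what `run_network` in `train_PDAN.py` feeds to the model.

```python
import torch
from PDAN import PDAN
from apmeter import APMeter

# Illustrative sizes: batch of 2 videos, 128 temporal snippets,
# 1024-d I3D features, 157 Charades classes.
B, T, C_in, num_classes = 2, 128, 1024, 157

model = PDAN(num_stages=1, num_layers=5, num_f_maps=512, dim=C_in, num_classes=num_classes)

feats = torch.randn(B, C_in, T)       # (batch, feature_dim, time), as after inputs.squeeze(3).squeeze(3)
mask = torch.ones(B, num_classes, T)  # 1 for valid timesteps, 0 for padding (mask_new in run_network)

out = model(feats, mask)              # (num_stages, batch, num_classes, time)
print(out.shape)                      # torch.Size([1, 2, 157, 128])

# Per-frame probabilities of the last stage, shaped (batch, time, num_classes).
probs = torch.sigmoid(out[-1]).permute(0, 2, 1)

# Dummy binary labels; APMeter.add expects NxK scores and binary NxK targets.
labels = torch.randint(0, 2, (B, T, num_classes)).float()

apm = APMeter()
apm.add(probs[0].detach().numpy(), labels[0].numpy())  # one video at a time, as in train_step
print(100 * apm.value().mean())                        # mAP over the dummy labels
```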