├── .gitignore ├── LICENSE ├── README.md ├── datasets ├── activitynet.py ├── dataset.py ├── diving48.py ├── hmdb51.py ├── kinetics.py └── ucf101.py ├── libs ├── mean.py ├── opts.py ├── spatial_transforms.py ├── target_transforms.py ├── temporal_transforms.py ├── test.py ├── train_epoch.py ├── utils.py └── validation_epoch.py ├── loss ├── hloss.py └── soft_cross_entropy.py ├── models ├── densenet.py ├── grad_reversal.py ├── model.py ├── pre_act_resnet.py ├── resnet.py ├── resnext.py ├── vgg.py └── wide_resnet.py ├── sdn_packages.txt ├── train.py └── utils ├── eval_diving48.py ├── eval_hmdb51.py ├── eval_kinetics.py ├── eval_ucf101.py ├── fps.py ├── hmdb51_json.py ├── kinetics_json.py ├── n_frames_kinetics.py ├── n_frames_ucf101_hmdb51.py ├── ucf101_json.py ├── video_jpg.py ├── video_jpg_diving48.py ├── video_jpg_kinetics.py └── video_jpg_ucf101_hmdb51.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.out 2 | log/ 3 | result/ 4 | pretrain/ 5 | checkpoints/ 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | env/ 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | .hypothesis/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # pyenv 80 | .python-version 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # dotenv 89 | .env 90 | 91 | # virtualenv 92 | .venv 93 | venv/ 94 | ENV/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | .DS_Store 110 | 111 | .vscode 112 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Jinwoo Choi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SDN: Scene Debiasing Network for Action Recognition in PyTorch
2 | We release the code for the paper "Why Can't I Dance in the Mall? Learning to Mitigate Scene Bias in Action Recognition". The code is built upon the [3D-ResNets-PyTorch codebase](https://github.com/kenshohara/3D-ResNets-PyTorch).
3 |
4 | For details, visit our [project website](http://chengao.vision/SDN/) or see our [full paper](https://papers.nips.cc/paper/8372-why-cant-i-dance-in-the-mall-learning-to-mitigate-scene-bias-in-action-recognition.pdf).
5 |
6 | ## Reference
7 | [Jinwoo Choi](https://sites.google.com/site/jchoivision/), [Chen Gao](https://gaochen315.github.io/), [Joseph C. E. Messou](https://josephcmessou.weebly.com/about.html), [Jia-Bin Huang](https://filebox.ece.vt.edu/~jbhuang/index.html). Why Can't I Dance in the Mall? Learning to Mitigate Scene Bias in Action Recognition. Neural Information Processing Systems (NeurIPS) 2019.
8 |
9 | ```
10 | @inproceedings{choi2019sdn,
11 |   title = {Why Can't I Dance in the Mall? Learning to Mitigate Scene Bias in Action Recognition},
12 |   author = {Choi, Jinwoo and Gao, Chen and Messou, C. E. Joseph and Huang, Jia-Bin},
13 |   booktitle={NeurIPS},
14 |   year={2019}
15 | }
16 | ```
17 |
18 | ## Requirements
19 | This codebase was developed and tested with:
20 | - Python 3.6
21 | - PyTorch 0.4.1
22 | - torchvision 0.2.1
23 | - CUDA 9.0
24 | - CUDNN 7.1
25 | - GPU: 2x P100
26 |
27 | You can find the full list of dependencies in `sdn_packages.txt`.
28 |
29 | You can install them with:
30 | ```
31 | pip install -r sdn_packages.txt
32 | ```
33 |
34 | ## Datasets
35 | ### Prepare your dataset
36 | **1. Download and pre-process the data**
37 | - Follow the [3D-ResNets-PyTorch instructions](https://github.com/kenshohara/3D-ResNets-PyTorch#preparation).
38 |
39 | **2. Download the scene and human detection data (numpy files)**
40 | - [Download the Mini-Kinetics scene pseudo labels](https://filebox.ece.vt.edu/~jinchoi/files/sdn/places_data.zip)
41 | - [Download the Mini-Kinetics human detections](https://filebox.ece.vt.edu/~jinchoi/files/sdn/detections.zip)
42 |
43 | ## Train
44 | ### Training on a source dataset (mini-Kinetics)
45 | **- Baseline model without any debiasing**
46 | ```
47 | python train.py \
48 | --video_path \
49 | --annotation_path /kinetics.json \
50 | --result_path \
51 | --root_path \
52 | --dataset kinetics \
53 | --n_classes 200 \
54 | --n_finetune_classes 200 \
55 | --model resnet \
56 | --model_depth 18 \
57 | --resnet_shortcut A \
58 | --batch_size 32 \
59 | --val_batch_size 16 \
60 | --n_threads 16 \
61 | --checkpoint 1 \
62 | --ft_begin_index 0 \
63 | --is_mask_adv \
64 | --learning_rate 0.0001 \
65 | --weight_decay 1e-5 \
66 | --n_epochs 100 \
67 | --pretrain_path
68 | ```
69 |
70 | **- SDN model with scene adversarial loss only**
71 | ```
72 | python train.py \
73 | --video_path \
74 | --annotation_path /kinetics.json \
75 | --result_path \
76 | --root_path \
77 | --dataset kinetics_adv \
78 | --n_classes 200 \
79 | --n_finetune_classes 200 \
80 | --model resnet \
81 | --model_depth 18 \
82 | --resnet_shortcut A \
83 | --batch_size 32 \
84 | --val_batch_size 16 \
85 | --n_threads 16 \
86 | --checkpoint 1 \
87 | --ft_begin_index 0 \
88 | --num_place_hidden_layers 3 \
89 | --new_layer_lr 1e-2 \
90 | --learning_rate 1e-4 \
91 | --warm_up_epochs 5 \
92 | --weight_decay 1e-5 \
93 | --n_epochs 100 \
94 | --place_pred_path \
95 | --is_place_adv \
96 | --is_place_soft \
97 | --alpha 1.0 \
98 | --is_mask_adv \
99 | --num_places_classes 365 \
100 | --pretrain_path
101 | ```
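Conceptually, the scene adversarial loss trains an auxiliary scene classifier on top of the action features through a gradient reversal layer (see `models/grad_reversal.py`): the scene head learns to predict the Places365 pseudo label while the reversed gradient pushes the 3D-CNN backbone to discard scene information. `--alpha` scales the reversed gradient, and `--is_place_soft` trains the head against the soft Places365 distribution (cf. `loss/soft_cross_entropy.py`). The snippet below is a minimal sketch of this idea, not the exact implementation in this repository; class and variable names are illustrative.

```python
# Sketch of scene-adversarial training with a gradient reversal layer (GRL).
# Illustrates what --is_place_adv / --alpha do conceptually; the actual code
# lives in models/grad_reversal.py and the training loop. Names are hypothetical.
import torch
import torch.nn as nn
from torch.autograd import Function


class GradReverse(Function):
    """Identity in the forward pass; multiplies the gradient by -alpha in the backward pass."""

    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        # No gradient is needed for alpha, hence the trailing None.
        return -ctx.alpha * grad_output, None


def grad_reverse(x, alpha=1.0):
    return GradReverse.apply(x, alpha)


class SceneAdversarialHead(nn.Module):
    """Scene (Places365) classifier attached to the action feature through a GRL."""

    def __init__(self, feat_dim, num_places_classes=365, alpha=1.0):
        super().__init__()
        self.alpha = alpha
        self.mlp = nn.Sequential(
            nn.Linear(feat_dim, feat_dim), nn.ReLU(inplace=True),
            nn.Linear(feat_dim, num_places_classes))

    def forward(self, feat):
        # The head still learns to predict the scene, but the reversed gradient
        # pushes the backbone to remove scene-discriminative information.
        return self.mlp(grad_reverse(feat, self.alpha))
```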
102 |
103 | **- Full SDN model with 1) scene adversarial loss and 2) human mask confusion loss**
104 | ```
105 | python train.py \
106 | --video_path \
107 | --annotation_path /kinetics.json \
108 | --result_path \
109 | --root_path \
110 | --dataset kinetics_adv_msk \
111 | --n_classes 200 \
112 | --n_finetune_classes 200 \
113 | --model resnet \
114 | --model_depth 18 \
115 | --resnet_shortcut A \
116 | --batch_size 32 \
117 | --val_batch_size 16 \
118 | --n_threads 16 \
119 | --checkpoint 1 \
120 | --ft_begin_index 0 \
121 | --num_place_hidden_layers 3 \
122 | --num_human_mask_adv_hidden_layers 1 \
123 | --new_layer_lr 1e-4 \
124 | --learning_rate 1e-4 \
125 | --warm_up_epochs 0 \
126 | --weight_decay 1e-5 \
127 | --n_epochs 100 \
128 | --place_pred_path \
129 | --is_place_adv \
130 | --is_place_soft \
131 | --is_mask_entropy \
132 | --alpha 0.5 \
133 | --mask_ratio 1.0 \
134 | --slower_place_mlp \
135 | --not_replace_last_fc \
136 | --num_places_classes 365 \
137 | --human_dets_path \
138 | --pretrain_path
139 | ```
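The human mask confusion loss (`--is_mask_entropy`, weighted by `--weight_entropy_loss`) penalizes confident action predictions on clips whose human regions have been masked out, so the network cannot rely on the background alone. Below is a minimal sketch of such an entropy-maximization loss (cf. `loss/hloss.py`); the variable names and exact formulation in this repository may differ.

```python
# Sketch of the human mask confusion loss: maximize the entropy of the action
# prediction on clips whose human regions have been masked out.
import torch
import torch.nn.functional as F


def entropy_confusion_loss(logits_masked):
    """Negative entropy of the softmax over action classes.

    Minimizing this loss maximizes the entropy, i.e., it pushes the prediction
    on human-masked clips toward a uniform distribution over actions.
    """
    log_p = F.log_softmax(logits_masked, dim=1)
    p = log_p.exp()
    entropy = -(p * log_p).sum(dim=1).mean()
    return -entropy


# Hypothetical usage inside a training step:
# logits_masked = model(masked_clip)   # clip with humans masked out
# loss = action_ce + alpha * scene_adv_loss + w_ent * entropy_confusion_loss(logits_masked)
```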
140 |
141 | ### Finetuning on target datasets
142 | #### [Diving48](http://www.svcl.ucsd.edu/projects/resound/dataset.html) as an example
143 | ```
144 | python train.py \
145 | --dataset diving48 \
146 | --root_path \
147 | --video_path \
148 | --n_classes 200 \
149 | --n_finetune_classes 48 \
150 | --model resnet \
151 | --model_depth 18 \
152 | --resnet_shortcut A \
153 | --ft_begin_index 0 \
154 | --batch_size 32 \
155 | --val_batch_size 16 \
156 | --n_threads 4 \
157 | --checkpoint 1 \
158 | --learning_rate 0.005 \
159 | --weight_decay 1e-5 \
160 | --n_epochs $epoch_ft \
161 | --is_mask_adv \
162 | --annotation_path $anno_path \
163 | --result_path \
164 | --pretrain_path
165 | ```
166 |
167 | ## Test
168 | ```
169 | python train.py \
170 | --dataset diving48 \
171 | --root_path \
172 | --video_path \
173 | --n_finetune_classes 48 \
174 | --n_classes 48 \
175 | --model resnet \
176 | --model_depth 18 \
177 | --resnet_shortcut A \
178 | --batch_size 32 \
179 | --val_batch_size 16 \
180 | --n_threads 4 \
181 | --test \
182 | --test_subset val \
183 | --no_train \
184 | --no_val \
185 | --is_mask_adv \
186 | --annotation_path $anno_path \
187 | --result_path \
188 | --resume_path
189 | ```
190 | This step generates a `val.json` file under `$result_path`.
191 |
192 | ## Evaluation
193 | ```
194 | python utils/eval_diving48.py \
195 | --annotation_path $anno_path \
196 | --prediction_path
197 | ```
198 |
199 | ## Pre-trained model weights
200 | [Download the pre-trained weights](https://www.dropbox.com/scl/fi/j2pgucu8gvpz3jp5ygl91/pre-trained_weights.tar?rlkey=gicecxrpj2o7ipjmhmx0hlcrl&dl=0)
201 |
202 | ## Acknowledgments
203 | This code is built upon the [3D-ResNets-PyTorch codebase](https://github.com/kenshohara/3D-ResNets-PyTorch). We thank Kensho Hara.
204 |
--------------------------------------------------------------------------------
/datasets/activitynet.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.utils.data as data
3 | from PIL import Image
4 | import os
5 | import functools
6 | import json
7 | import copy
8 | import math
9 |
10 | from libs.utils import load_value_file
11 |
12 |
13 | def pil_loader(path):
14 |     # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
15 |     with open(path, 'rb') as f:
16 |         with Image.open(f) as img:
17 |             return img.convert('RGB')
18 |
19 |
20 | def accimage_loader(path):
21 |     try:
22 |         import accimage
23 |         return accimage.Image(path)
24 |     except IOError:
25 |         # Potentially a decoding problem, fall back to PIL.Image
26 |         return pil_loader(path)
27 |
28 |
29 | def get_default_image_loader():
30 |     from torchvision import get_image_backend
31 |     if get_image_backend() == 'accimage':
32 |         return accimage_loader
33 |     else:
34 |         return pil_loader
35 |
36 |
37 | def video_loader(video_dir_path, frame_indices, image_loader):
38 |     video = []
39 |     for i in frame_indices:
40 |         image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i))
41 |         if os.path.exists(image_path):
42 |             video.append(image_loader(image_path))
43 |         else:
44 |             return video
45 |
46 |     return video
47 |
48 |
49 | def get_default_video_loader():
50 |     image_loader = get_default_image_loader()
51 |     return functools.partial(video_loader, image_loader=image_loader)
52 |
53 |
54 | def load_annotation_data(data_file_path):
55 |     with open(data_file_path, 'r') as data_file:
56 |         return json.load(data_file)
57 |
58 |
59 | def get_class_labels(data):
60 |     class_names = []
61 |     index = 0
62 |     for node1 in data['taxonomy']:
63 |         is_leaf = True
64 |         for node2 in data['taxonomy']:
65 |             if node2['parentId'] == node1['nodeId']:
66 |                 is_leaf = False
67 |                 break
68 |         if is_leaf:
69 |             class_names.append(node1['nodeName'])
70 |
71 |     class_labels_map = {}
72 |
73 |     for i, class_name in enumerate(class_names):
74 |         class_labels_map[class_name] = i
75 |
76 |     return class_labels_map
77 |
78 |
79 | def get_video_names_and_annotations(data, subset):
80 |     video_names = []
81 |     annotations = []
82 |
83 |     for key, value in data['database'].items():
84 |         this_subset = value['subset']
85 |         if this_subset == subset:
86 |             if subset == 'testing':
87 |                 video_names.append('v_{}'.format(key))
88 |             else:
89 |                 video_names.append('v_{}'.format(key))
90
| annotations.append(value['annotations']) 91 | 92 | return video_names, annotations 93 | 94 | 95 | def modify_frame_indices(video_dir_path, frame_indices): 96 | modified_indices = [] 97 | for i in frame_indices: 98 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 99 | if not os.path.exists(image_path): 100 | return modified_indices 101 | modified_indices.append(i) 102 | return modified_indices 103 | 104 | 105 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 106 | sample_duration): 107 | data = load_annotation_data(annotation_path) 108 | video_names, annotations = get_video_names_and_annotations(data, subset) 109 | class_to_idx = get_class_labels(data) 110 | idx_to_class = {} 111 | for name, label in class_to_idx.items(): 112 | idx_to_class[label] = name 113 | 114 | dataset = [] 115 | for i in range(len(video_names)): 116 | if i % 1000 == 0: 117 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 118 | 119 | video_path = os.path.join(root_path, video_names[i]) 120 | if not os.path.exists(video_path): 121 | continue 122 | 123 | fps_file_path = os.path.join(video_path, 'fps') 124 | fps = load_value_file(fps_file_path) 125 | 126 | for annotation in annotations[i]: 127 | begin_t = math.ceil(annotation['segment'][0] * fps) 128 | end_t = math.ceil(annotation['segment'][1] * fps) 129 | if begin_t == 0: 130 | begin_t = 1 131 | n_frames = end_t - begin_t 132 | 133 | sample = { 134 | 'video': video_path, 135 | 'segment': [begin_t, end_t], 136 | 'fps': fps, 137 | 'video_id': video_names[i][2:] 138 | } 139 | if len(annotations) != 0: 140 | sample['label'] = class_to_idx[annotation['label']] 141 | else: 142 | sample['label'] = -1 143 | 144 | if n_samples_for_each_video == 1: 145 | frame_indices = list(range(begin_t, end_t)) 146 | frame_indices = modify_frame_indices(sample['video'], 147 | frame_indices) 148 | if len(frame_indices) < 16: 149 | continue 150 | sample['frame_indices'] = frame_indices 151 | dataset.append(sample) 152 | else: 153 | if n_samples_for_each_video > 1: 154 | step = max(1, 155 | math.ceil((n_frames - 1 - sample_duration) / 156 | (n_samples_for_each_video - 1))) 157 | else: 158 | step = sample_duration 159 | for j in range(begin_t, end_t, step): 160 | sample_j = copy.deepcopy(sample) 161 | frame_indices = list(range(j, j + sample_duration)) 162 | frame_indices = modify_frame_indices( 163 | sample_j['video'], frame_indices) 164 | if len(frame_indices) < 16: 165 | continue 166 | sample_j['frame_indices'] = frame_indices 167 | dataset.append(sample_j) 168 | 169 | return dataset, idx_to_class 170 | 171 | 172 | def get_end_t(video_path): 173 | file_names = os.listdir(video_path) 174 | image_file_names = [x for x in file_names if 'image' in x] 175 | image_file_names.sort(reverse=True) 176 | return int(image_file_names[0][6:11]) 177 | 178 | 179 | def make_untrimmed_dataset(root_path, annotation_path, subset, 180 | n_samples_for_each_video, sample_duration): 181 | data = load_annotation_data(annotation_path) 182 | video_names, _ = get_video_names_and_annotations(data, subset) 183 | class_to_idx = get_class_labels(data) 184 | idx_to_class = {} 185 | for name, label in class_to_idx.items(): 186 | idx_to_class[label] = name 187 | 188 | dataset = [] 189 | for i in range(len(video_names)): 190 | if i % 1000 == 0: 191 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 192 | 193 | video_path = os.path.join(root_path, video_names[i]) 194 | if not os.path.exists(video_path): 195 | continue 196 | 197 | fps_file_path = 
os.path.join(video_path, 'fps') 198 | fps = load_value_file(fps_file_path) 199 | 200 | begin_t = 1 201 | end_t = get_end_t(video_path) 202 | n_frames = end_t - begin_t 203 | 204 | sample = { 205 | 'video': video_path, 206 | 'segment': [begin_t, end_t], 207 | 'fps': fps, 208 | 'video_id': video_names[i][2:] 209 | } 210 | 211 | if n_samples_for_each_video >= 1: 212 | step = max(1, 213 | math.ceil((n_frames - 1 - sample_duration) / 214 | (n_samples_for_each_video - 1))) 215 | else: 216 | step = sample_duration 217 | for j in range(begin_t, end_t, step): 218 | sample_j = copy.deepcopy(sample) 219 | frame_indices = list(range(j, j + sample_duration)) 220 | frame_indices = modify_frame_indices(sample_j['video'], 221 | frame_indices) 222 | if len(frame_indices) < 16: 223 | continue 224 | sample_j['frame_indices'] = frame_indices 225 | dataset.append(sample_j) 226 | 227 | return dataset, idx_to_class 228 | 229 | 230 | class ActivityNet(data.Dataset): 231 | """ 232 | Args: 233 | root (string): Root directory path. 234 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 235 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 236 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 237 | and returns a transformed version 238 | target_transform (callable, optional): A function/transform that takes in the 239 | target and transforms it. 240 | loader (callable, optional): A function to load an video given its path and frame indices. 241 | Attributes: 242 | classes (list): List of the class names. 243 | class_to_idx (dict): Dict with items (class_name, class_index). 244 | imgs (list): List of (image path, class_index) tuples 245 | """ 246 | 247 | def __init__(self, 248 | root_path, 249 | annotation_path, 250 | subset, 251 | is_untrimmed_setting=False, 252 | n_samples_for_each_video=1, 253 | spatial_transform=None, 254 | temporal_transform=None, 255 | target_transform=None, 256 | sample_duration=16, 257 | get_loader=get_default_video_loader): 258 | if is_untrimmed_setting: 259 | self.data, self.class_names = make_untrimmed_dataset( 260 | root_path, annotation_path, subset, n_samples_for_each_video, 261 | sample_duration) 262 | else: 263 | self.data, self.class_names = make_dataset( 264 | root_path, annotation_path, subset, n_samples_for_each_video, 265 | sample_duration) 266 | 267 | self.spatial_transform = spatial_transform 268 | self.temporal_transform = temporal_transform 269 | self.target_transform = target_transform 270 | self.loader = get_loader() 271 | 272 | def __getitem__(self, index): 273 | """ 274 | Args: 275 | index (int): Index 276 | Returns: 277 | tuple: (image, target) where target is class_index of the target class. 
278 | """ 279 | path = self.data[index]['video'] 280 | 281 | frame_indices = self.data[index]['frame_indices'] 282 | if self.temporal_transform is not None: 283 | frame_indices = self.temporal_transform(frame_indices) 284 | clip = self.loader(path, frame_indices) 285 | if self.spatial_transform is not None: 286 | self.spatial_transform.randomize_parameters() 287 | clip = [self.spatial_transform(img) for img in clip] 288 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 289 | 290 | target = self.data[index] 291 | if self.target_transform is not None: 292 | target = self.target_transform(target) 293 | 294 | return clip, target 295 | 296 | def __len__(self): 297 | return len(self.data) 298 | -------------------------------------------------------------------------------- /datasets/dataset.py: -------------------------------------------------------------------------------- 1 | from datasets.kinetics import Kinetics, Kinetics_adv, Kinetics_bkgmsk, Kinetics_human_msk, Kinetics_adv_msk 2 | from datasets.activitynet import ActivityNet 3 | from datasets.ucf101 import UCF101 4 | from datasets.hmdb51 import HMDB51 5 | from datasets.diving48 import Diving48 6 | 7 | 8 | def get_training_set(opt, spatial_transform, temporal_transform, 9 | target_transform): 10 | assert opt.dataset in ['kinetics', 'kinetics_adv', 'kinetics_bkgmsk', 'kinetics_adv_msk', 'activitynet', 'ucf101', 'hmdb51', 'diving48'] 11 | 12 | if opt.dataset == 'kinetics': 13 | training_data = Kinetics( 14 | opt.video_path+'/train', 15 | opt.annotation_path, 16 | 'training', 17 | spatial_transform=spatial_transform, 18 | temporal_transform=temporal_transform, 19 | target_transform=target_transform) 20 | elif opt.dataset == 'kinetics_adv': 21 | training_data = Kinetics_adv( 22 | opt.video_path+'/train', 23 | opt.annotation_path, 24 | 'training', 25 | spatial_transform=spatial_transform, 26 | temporal_transform=temporal_transform, 27 | target_transform=target_transform, 28 | place_pred_path=opt.place_pred_path, 29 | is_place_soft_label=opt.is_place_soft) 30 | elif opt.dataset == 'kinetics_bkgmsk': 31 | training_data = Kinetics_bkgmsk( 32 | opt.video_path+'/train', 33 | opt.annotation_path, 34 | 'training', 35 | spatial_transform=spatial_transform, 36 | temporal_transform=temporal_transform, 37 | target_transform=target_transform, 38 | detection_path=opt.human_dets_path, 39 | mask_ratio=opt.mask_ratio) 40 | elif opt.dataset == 'kinetics_adv_msk': 41 | training_data_1 = Kinetics_adv( 42 | opt.video_path+'/train', 43 | opt.annotation_path, 44 | 'training', 45 | spatial_transform=spatial_transform, 46 | temporal_transform=temporal_transform, 47 | target_transform=target_transform, 48 | place_pred_path=opt.place_pred_path, 49 | is_place_soft_label=opt.is_place_soft) 50 | training_data_2 = Kinetics_human_msk( 51 | opt.video_path+'/train', 52 | opt.annotation_path, 53 | 'training', 54 | spatial_transform=spatial_transform, 55 | temporal_transform=temporal_transform, 56 | target_transform=target_transform, 57 | detection_path=opt.human_dets_path, 58 | mask_ratio=opt.mask_ratio) 59 | training_data = [training_data_1, training_data_2] 60 | elif opt.dataset == 'activitynet': 61 | training_data = ActivityNet( 62 | opt.video_path, 63 | opt.annotation_path, 64 | 'training', 65 | False, 66 | spatial_transform=spatial_transform, 67 | temporal_transform=temporal_transform, 68 | target_transform=target_transform) 69 | elif opt.dataset == 'ucf101': 70 | training_data = UCF101( 71 | opt.video_path, 72 | opt.annotation_path, 73 | 'training', 74 | 
spatial_transform=spatial_transform, 75 | temporal_transform=temporal_transform, 76 | target_transform=target_transform) 77 | elif opt.dataset == 'hmdb51': 78 | training_data = HMDB51( 79 | opt.video_path, 80 | opt.annotation_path, 81 | 'training', 82 | spatial_transform=spatial_transform, 83 | temporal_transform=temporal_transform, 84 | target_transform=target_transform) 85 | elif opt.dataset == 'diving48': 86 | training_data = Diving48( 87 | opt.video_path, 88 | opt.annotation_path, 89 | 'training', 90 | spatial_transform=spatial_transform, 91 | temporal_transform=temporal_transform, 92 | target_transform=target_transform) 93 | 94 | return training_data 95 | 96 | 97 | def get_validation_set(opt, spatial_transform, temporal_transform, 98 | target_transform): 99 | assert opt.dataset in ['kinetics', 'kinetics_adv', 'kinetics_bkgmsk', 'kinetics_human_msk', 'kinetics_adv_msk', 'activitynet', 'ucf101', 'hmdb51', 'diving48'] 100 | 101 | if opt.dataset == 'kinetics': 102 | validation_data = Kinetics( 103 | opt.video_path+'/val', 104 | opt.annotation_path, 105 | 'validation', 106 | opt.n_val_samples, 107 | spatial_transform, 108 | temporal_transform, 109 | target_transform, 110 | sample_duration=opt.sample_duration) 111 | elif opt.dataset == 'kinetics_adv': 112 | validation_data = Kinetics_adv( 113 | opt.video_path+'/val', 114 | opt.annotation_path, 115 | 'validation', 116 | opt.n_val_samples, 117 | spatial_transform, 118 | temporal_transform, 119 | target_transform, 120 | sample_duration=opt.sample_duration, 121 | place_pred_path=opt.place_pred_path, 122 | is_place_soft_label=opt.is_place_soft) 123 | elif opt.dataset == 'kinetics_bkgmsk': 124 | validation_data = Kinetics_bkgmsk( 125 | opt.video_path+'/val', 126 | opt.annotation_path, 127 | 'validation', 128 | opt.n_val_samples, 129 | spatial_transform, 130 | temporal_transform, 131 | target_transform, 132 | sample_duration=opt.sample_duration, 133 | detection_path=opt.human_dets_path, 134 | mask_ratio=opt.mask_ratio) 135 | elif opt.dataset == 'kinetics_adv_msk': 136 | validation_data_1 = Kinetics_adv( 137 | opt.video_path+'/val', 138 | opt.annotation_path, 139 | 'validation', 140 | opt.n_val_samples, 141 | spatial_transform, 142 | temporal_transform, 143 | target_transform, 144 | sample_duration=opt.sample_duration, 145 | place_pred_path=opt.place_pred_path, 146 | is_place_soft_label=opt.is_place_soft) 147 | validation_data_2 = Kinetics_human_msk( 148 | opt.video_path+'/val', 149 | opt.annotation_path, 150 | 'validation', 151 | opt.n_val_samples, 152 | spatial_transform, 153 | temporal_transform, 154 | target_transform, 155 | sample_duration=opt.sample_duration, 156 | detection_path=opt.human_dets_path, 157 | mask_ratio=opt.mask_ratio) 158 | validation_data = [validation_data_1, validation_data_2] 159 | elif opt.dataset == 'activitynet': 160 | validation_data = ActivityNet( 161 | opt.video_path, 162 | opt.annotation_path, 163 | 'validation', 164 | False, 165 | opt.n_val_samples, 166 | spatial_transform, 167 | temporal_transform, 168 | target_transform, 169 | sample_duration=opt.sample_duration) 170 | elif opt.dataset == 'ucf101': 171 | validation_data = UCF101( 172 | opt.video_path, 173 | opt.annotation_path, 174 | 'validation', 175 | opt.n_val_samples, 176 | spatial_transform, 177 | temporal_transform, 178 | target_transform, 179 | sample_duration=opt.sample_duration, 180 | vis=opt.vis) 181 | elif opt.dataset == 'hmdb51': 182 | validation_data = HMDB51( 183 | opt.video_path, 184 | opt.annotation_path, 185 | 'validation', 186 | 
opt.n_val_samples, 187 | spatial_transform, 188 | temporal_transform, 189 | target_transform, 190 | sample_duration=opt.sample_duration, 191 | vis=opt.vis) 192 | elif opt.dataset == 'diving48': 193 | validation_data = Diving48( 194 | opt.video_path, 195 | opt.annotation_path, 196 | 'validation', 197 | opt.n_val_samples, 198 | spatial_transform, 199 | temporal_transform, 200 | target_transform, 201 | sample_duration=opt.sample_duration, 202 | vis=opt.vis) 203 | return validation_data 204 | 205 | 206 | def get_test_set(opt, spatial_transform, temporal_transform, target_transform): 207 | assert opt.dataset in ['kinetics', 'activitynet', 'ucf101', 'hmdb51', 'diving48'] 208 | assert opt.test_subset in ['val', 'test'] 209 | 210 | if opt.test_subset == 'val': 211 | subset = 'validation' 212 | elif opt.test_subset == 'test': 213 | subset = 'testing' 214 | if opt.dataset == 'kinetics': 215 | test_data = Kinetics( 216 | opt.video_path, 217 | opt.annotation_path, 218 | subset, 219 | 0, 220 | spatial_transform, 221 | temporal_transform, 222 | target_transform, 223 | sample_duration=opt.sample_duration) 224 | elif opt.dataset == 'activitynet': 225 | test_data = ActivityNet( 226 | opt.video_path, 227 | opt.annotation_path, 228 | subset, 229 | True, 230 | 0, 231 | spatial_transform, 232 | temporal_transform, 233 | target_transform, 234 | sample_duration=opt.sample_duration) 235 | elif opt.dataset == 'ucf101': 236 | test_data = UCF101( 237 | opt.video_path, 238 | opt.annotation_path, 239 | subset, 240 | 0, 241 | spatial_transform, 242 | temporal_transform, 243 | target_transform, 244 | sample_duration=opt.sample_duration) 245 | elif opt.dataset == 'hmdb51': 246 | test_data = HMDB51( 247 | opt.video_path, 248 | opt.annotation_path, 249 | subset, 250 | 0, 251 | spatial_transform, 252 | temporal_transform, 253 | target_transform, 254 | sample_duration=opt.sample_duration) 255 | elif opt.dataset == 'diving48': 256 | test_data = Diving48( 257 | opt.video_path, 258 | opt.annotation_path, 259 | subset, 260 | 0, 261 | spatial_transform, 262 | temporal_transform, 263 | target_transform, 264 | sample_duration=opt.sample_duration) 265 | 266 | return test_data 267 | -------------------------------------------------------------------------------- /datasets/diving48.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import json 8 | import copy 9 | 10 | from libs.utils import load_value_file 11 | import pdb 12 | 13 | def pil_loader(path): 14 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 15 | with open(path, 'rb') as f: 16 | with Image.open(f) as img: 17 | return img.convert('RGB') 18 | 19 | 20 | def accimage_loader(path): 21 | try: 22 | import accimage 23 | return accimage.Image(path) 24 | except IOError: 25 | # Potentially a decoding problem, fall back to PIL.Image 26 | return pil_loader(path) 27 | 28 | 29 | def get_default_image_loader(): 30 | from torchvision import get_image_backend 31 | if get_image_backend() == 'accimage': 32 | return accimage_loader 33 | else: 34 | return pil_loader 35 | 36 | 37 | def video_loader(video_dir_path, frame_indices, image_loader): 38 | video = [] 39 | for i in frame_indices: 40 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 41 | if os.path.exists(image_path): 42 | video.append(image_loader(image_path)) 43 | else: 44 | return video 45 | 46 | 
return video 47 | 48 | 49 | def get_default_video_loader(): 50 | image_loader = get_default_image_loader() 51 | return functools.partial(video_loader, image_loader=image_loader) 52 | 53 | 54 | def load_annotation_data(data_file_path): 55 | with open(data_file_path, 'r') as data_file: 56 | return json.load(data_file) 57 | 58 | 59 | def get_class_labels(data_file_path): 60 | with open(data_file_path, 'r') as data_file: 61 | data = json.load(data_file) 62 | data = ['_'.join(row) for row in data] 63 | class_labels_map = {} 64 | index = 0 65 | for class_label in data: 66 | class_labels_map[class_label] = index 67 | index += 1 68 | return class_labels_map 69 | 70 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 71 | sample_duration): 72 | if subset == 'training': 73 | postfix = 'train' 74 | else: 75 | postfix = 'test' 76 | annotation_file_path = os.path.join(annotation_path, 'Diving48_{}.json'.format(postfix)) 77 | data = load_annotation_data(annotation_file_path) 78 | 79 | class_file_path = os.path.join(annotation_path, 'Diving48_vocab.json') 80 | class_to_idx = get_class_labels(class_file_path) 81 | idx_to_class = {} 82 | for name, label in class_to_idx.items(): 83 | idx_to_class[label] = name 84 | 85 | dataset = [] 86 | for i in range(len(data)): 87 | if i % 1000 == 0: 88 | print('dataset loading [{}/{}]'.format(i, len(data))) 89 | 90 | video_path = os.path.join(root_path, data[i]['vid_name']) 91 | if not os.path.exists(video_path): 92 | continue 93 | 94 | n_frames = data[i]['end_frame'] - data[i]['start_frame'] + 1 95 | if n_frames <= 0: 96 | continue 97 | 98 | begin_t = 1 99 | end_t = n_frames 100 | sample = { 101 | 'video': video_path, 102 | 'segment': [begin_t, end_t], 103 | 'n_frames': n_frames, 104 | 'video_id': data[i]['vid_name'] 105 | } 106 | if len(data) != 0: 107 | sample['label'] = data[i]['label'] 108 | else: 109 | sample['label'] = -1 110 | 111 | if n_samples_for_each_video == 1: 112 | sample['frame_indices'] = list(range(1, n_frames + 1)) 113 | dataset.append(sample) 114 | else: 115 | if n_samples_for_each_video > 1: 116 | step = max(1, 117 | math.ceil((n_frames - 1 - sample_duration) / 118 | (n_samples_for_each_video - 1))) 119 | else: 120 | step = sample_duration 121 | for j in range(1, n_frames, step): 122 | sample_j = copy.deepcopy(sample) 123 | sample_j['frame_indices'] = list( 124 | range(j, min(n_frames + 1, j + sample_duration))) 125 | dataset.append(sample_j) 126 | 127 | return dataset, idx_to_class 128 | 129 | 130 | class Diving48(data.Dataset): 131 | """ 132 | Args: 133 | root (string): Root directory path. 134 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 135 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 136 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 137 | and returns a transformed version 138 | target_transform (callable, optional): A function/transform that takes in the 139 | target and transforms it. 140 | loader (callable, optional): A function to load an video given its path and frame indices. 141 | Attributes: 142 | classes (list): List of the class names. 143 | class_to_idx (dict): Dict with items (class_name, class_index). 
144 | imgs (list): List of (image path, class_index) tuples 145 | """ 146 | 147 | def __init__(self, 148 | root_path, 149 | annotation_path, 150 | subset, 151 | n_samples_for_each_video=1, 152 | spatial_transform=None, 153 | temporal_transform=None, 154 | target_transform=None, 155 | sample_duration=16, 156 | get_loader=get_default_video_loader, 157 | vis=False): 158 | self.data, self.class_names = make_dataset( 159 | root_path, annotation_path, subset, n_samples_for_each_video, 160 | sample_duration) 161 | 162 | self.spatial_transform = spatial_transform 163 | self.temporal_transform = temporal_transform 164 | self.target_transform = target_transform 165 | self.vis = vis 166 | self.loader = get_loader() 167 | 168 | def __getitem__(self, index): 169 | """ 170 | Args: 171 | index (int): Index 172 | Returns: 173 | tuple: (image, target) where target is class_index of the target class. 174 | """ 175 | path = self.data[index]['video'] 176 | 177 | frame_indices = self.data[index]['frame_indices'] 178 | if self.temporal_transform is not None: 179 | frame_indices = self.temporal_transform(frame_indices) 180 | clip = self.loader(path, frame_indices) 181 | if self.spatial_transform is not None: 182 | self.spatial_transform.randomize_parameters() 183 | clip = [self.spatial_transform(img) for img in clip] 184 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 185 | 186 | target = self.data[index] 187 | if self.target_transform is not None: 188 | target = self.target_transform(target) 189 | 190 | if self.vis: 191 | return clip, target, path, frame_indices 192 | else: 193 | return clip, target 194 | 195 | def __len__(self): 196 | return len(self.data) 197 | -------------------------------------------------------------------------------- /datasets/hmdb51.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import json 8 | import copy 9 | 10 | from libs.utils import load_value_file 11 | 12 | 13 | def pil_loader(path): 14 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 15 | with open(path, 'rb') as f: 16 | with Image.open(f) as img: 17 | return img.convert('RGB') 18 | 19 | 20 | def accimage_loader(path): 21 | try: 22 | import accimage 23 | return accimage.Image(path) 24 | except IOError: 25 | # Potentially a decoding problem, fall back to PIL.Image 26 | return pil_loader(path) 27 | 28 | 29 | def get_default_image_loader(): 30 | from torchvision import get_image_backend 31 | if get_image_backend() == 'accimage': 32 | return accimage_loader 33 | else: 34 | return pil_loader 35 | 36 | 37 | def video_loader(video_dir_path, frame_indices, image_loader): 38 | video = [] 39 | for i in frame_indices: 40 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 41 | if os.path.exists(image_path): 42 | video.append(image_loader(image_path)) 43 | else: 44 | return video 45 | 46 | return video 47 | 48 | 49 | def get_default_video_loader(): 50 | image_loader = get_default_image_loader() 51 | return functools.partial(video_loader, image_loader=image_loader) 52 | 53 | 54 | def load_annotation_data(data_file_path): 55 | with open(data_file_path, 'r') as data_file: 56 | return json.load(data_file) 57 | 58 | 59 | def get_class_labels(data): 60 | class_labels_map = {} 61 | index = 0 62 | for class_label in data['labels']: 63 | class_labels_map[class_label] = index 64 | index += 1 65 | return 
class_labels_map 66 | 67 | 68 | def get_video_names_and_annotations(data, subset): 69 | video_names = [] 70 | annotations = [] 71 | 72 | for key, value in data['database'].items(): 73 | this_subset = value['subset'] 74 | if this_subset == subset: 75 | label = value['annotations']['label'] 76 | video_names.append('{}/{}'.format(label, key)) 77 | annotations.append(value['annotations']) 78 | 79 | return video_names, annotations 80 | 81 | 82 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 83 | sample_duration): 84 | data = load_annotation_data(annotation_path) 85 | video_names, annotations = get_video_names_and_annotations(data, subset) 86 | class_to_idx = get_class_labels(data) 87 | idx_to_class = {} 88 | for name, label in class_to_idx.items(): 89 | idx_to_class[label] = name 90 | 91 | dataset = [] 92 | for i in range(len(video_names)): 93 | if i % 1000 == 0: 94 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 95 | 96 | video_path = os.path.join(root_path, video_names[i]) 97 | if not os.path.exists(video_path): 98 | continue 99 | 100 | n_frames_file_path = os.path.join(video_path, 'n_frames') 101 | n_frames = int(load_value_file(n_frames_file_path)) 102 | if n_frames <= 0: 103 | continue 104 | 105 | begin_t = 1 106 | end_t = n_frames 107 | sample = { 108 | 'video': video_path, 109 | 'segment': [begin_t, end_t], 110 | 'n_frames': n_frames, 111 | 'video_id': video_names[i].split('/')[1] 112 | } 113 | if len(annotations) != 0: 114 | sample['label'] = class_to_idx[annotations[i]['label']] 115 | else: 116 | sample['label'] = -1 117 | 118 | if n_samples_for_each_video == 1: 119 | sample['frame_indices'] = list(range(1, n_frames + 1)) 120 | dataset.append(sample) 121 | else: 122 | if n_samples_for_each_video > 1: 123 | step = max(1, 124 | math.ceil((n_frames - 1 - sample_duration) / 125 | (n_samples_for_each_video - 1))) 126 | else: 127 | step = sample_duration 128 | for j in range(1, n_frames, step): 129 | sample_j = copy.deepcopy(sample) 130 | sample_j['frame_indices'] = list( 131 | range(j, min(n_frames + 1, j + sample_duration))) 132 | dataset.append(sample_j) 133 | 134 | return dataset, idx_to_class 135 | 136 | 137 | class HMDB51(data.Dataset): 138 | """ 139 | Args: 140 | root (string): Root directory path. 141 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 142 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 143 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 144 | and returns a transformed version 145 | target_transform (callable, optional): A function/transform that takes in the 146 | target and transforms it. 147 | loader (callable, optional): A function to load an video given its path and frame indices. 148 | Attributes: 149 | classes (list): List of the class names. 150 | class_to_idx (dict): Dict with items (class_name, class_index). 
151 | imgs (list): List of (image path, class_index) tuples 152 | """ 153 | 154 | def __init__(self, 155 | root_path, 156 | annotation_path, 157 | subset, 158 | n_samples_for_each_video=1, 159 | spatial_transform=None, 160 | temporal_transform=None, 161 | target_transform=None, 162 | sample_duration=16, 163 | get_loader=get_default_video_loader, 164 | vis=False): 165 | self.data, self.class_names = make_dataset( 166 | root_path, annotation_path, subset, n_samples_for_each_video, 167 | sample_duration) 168 | 169 | self.spatial_transform = spatial_transform 170 | self.temporal_transform = temporal_transform 171 | self.target_transform = target_transform 172 | self.vis = vis 173 | self.loader = get_loader() 174 | 175 | def __getitem__(self, index): 176 | """ 177 | Args: 178 | index (int): Index 179 | Returns: 180 | tuple: (image, target) where target is class_index of the target class. 181 | """ 182 | path = self.data[index]['video'] 183 | 184 | frame_indices = self.data[index]['frame_indices'] 185 | if self.temporal_transform is not None: 186 | frame_indices = self.temporal_transform(frame_indices) 187 | clip = self.loader(path, frame_indices) 188 | if self.spatial_transform is not None: 189 | self.spatial_transform.randomize_parameters() 190 | clip = [self.spatial_transform(img) for img in clip] 191 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 192 | 193 | target = self.data[index] 194 | if self.target_transform is not None: 195 | target = self.target_transform(target) 196 | 197 | if self.vis: 198 | return clip, target, path, frame_indices 199 | else: 200 | return clip, target 201 | 202 | def __len__(self): 203 | return len(self.data) 204 | -------------------------------------------------------------------------------- /datasets/ucf101.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import json 8 | import copy 9 | 10 | from libs.utils import load_value_file 11 | 12 | 13 | def pil_loader(path): 14 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 15 | with open(path, 'rb') as f: 16 | with Image.open(f) as img: 17 | return img.convert('RGB') 18 | 19 | 20 | def accimage_loader(path): 21 | try: 22 | import accimage 23 | return accimage.Image(path) 24 | except IOError: 25 | # Potentially a decoding problem, fall back to PIL.Image 26 | return pil_loader(path) 27 | 28 | 29 | def get_default_image_loader(): 30 | from torchvision import get_image_backend 31 | if get_image_backend() == 'accimage': 32 | return accimage_loader 33 | else: 34 | return pil_loader 35 | 36 | 37 | def video_loader(video_dir_path, frame_indices, image_loader): 38 | video = [] 39 | for i in frame_indices: 40 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 41 | if os.path.exists(image_path): 42 | video.append(image_loader(image_path)) 43 | else: 44 | return video 45 | 46 | return video 47 | 48 | 49 | def get_default_video_loader(): 50 | image_loader = get_default_image_loader() 51 | return functools.partial(video_loader, image_loader=image_loader) 52 | 53 | 54 | def load_annotation_data(data_file_path): 55 | with open(data_file_path, 'r') as data_file: 56 | return json.load(data_file) 57 | 58 | 59 | def get_class_labels(data): 60 | class_labels_map = {} 61 | index = 0 62 | for class_label in data['labels']: 63 | class_labels_map[class_label] = index 64 | index += 1 65 | return 
class_labels_map 66 | 67 | 68 | def get_video_names_and_annotations(data, subset): 69 | video_names = [] 70 | annotations = [] 71 | 72 | for key, value in data['database'].items(): 73 | this_subset = value['subset'] 74 | if this_subset == subset: 75 | label = value['annotations']['label'] 76 | video_names.append('{}/{}'.format(label, key)) 77 | annotations.append(value['annotations']) 78 | 79 | return video_names, annotations 80 | 81 | 82 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 83 | sample_duration): 84 | data = load_annotation_data(annotation_path) 85 | video_names, annotations = get_video_names_and_annotations(data, subset) 86 | class_to_idx = get_class_labels(data) 87 | idx_to_class = {} 88 | for name, label in class_to_idx.items(): 89 | idx_to_class[label] = name 90 | 91 | dataset = [] 92 | for i in range(len(video_names)): 93 | if i % 1000 == 0: 94 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 95 | 96 | video_path = os.path.join(root_path, video_names[i]) 97 | if not os.path.exists(video_path): 98 | continue 99 | 100 | n_frames_file_path = os.path.join(video_path, 'n_frames') 101 | n_frames = int(load_value_file(n_frames_file_path)) 102 | if n_frames <= 0: 103 | continue 104 | 105 | begin_t = 1 106 | end_t = n_frames 107 | sample = { 108 | 'video': video_path, 109 | 'segment': [begin_t, end_t], 110 | 'n_frames': n_frames, 111 | 'video_id': video_names[i].split('/')[1] 112 | } 113 | if len(annotations) != 0: 114 | sample['label'] = class_to_idx[annotations[i]['label']] 115 | else: 116 | sample['label'] = -1 117 | 118 | if n_samples_for_each_video == 1: 119 | sample['frame_indices'] = list(range(1, n_frames + 1)) 120 | dataset.append(sample) 121 | else: 122 | if n_samples_for_each_video > 1: 123 | step = max(1, 124 | math.ceil((n_frames - 1 - sample_duration) / 125 | (n_samples_for_each_video - 1))) 126 | else: 127 | step = sample_duration 128 | for j in range(1, n_frames, step): 129 | sample_j = copy.deepcopy(sample) 130 | sample_j['frame_indices'] = list( 131 | range(j, min(n_frames + 1, j + sample_duration))) 132 | dataset.append(sample_j) 133 | 134 | return dataset, idx_to_class 135 | 136 | 137 | class UCF101(data.Dataset): 138 | """ 139 | Args: 140 | root (string): Root directory path. 141 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 142 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 143 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 144 | and returns a transformed version 145 | target_transform (callable, optional): A function/transform that takes in the 146 | target and transforms it. 147 | loader (callable, optional): A function to load an video given its path and frame indices. 148 | Attributes: 149 | classes (list): List of the class names. 150 | class_to_idx (dict): Dict with items (class_name, class_index). 
151 | imgs (list): List of (image path, class_index) tuples 152 | """ 153 | 154 | def __init__(self, 155 | root_path, 156 | annotation_path, 157 | subset, 158 | n_samples_for_each_video=1, 159 | spatial_transform=None, 160 | temporal_transform=None, 161 | target_transform=None, 162 | sample_duration=16, 163 | get_loader=get_default_video_loader, 164 | vis=False): 165 | self.data, self.class_names = make_dataset( 166 | root_path, annotation_path, subset, n_samples_for_each_video, 167 | sample_duration) 168 | 169 | self.spatial_transform = spatial_transform 170 | self.temporal_transform = temporal_transform 171 | self.target_transform = target_transform 172 | self.vis = vis 173 | self.loader = get_loader() 174 | 175 | def __getitem__(self, index): 176 | """ 177 | Args: 178 | index (int): Index 179 | Returns: 180 | tuple: (image, target) where target is class_index of the target class. 181 | """ 182 | path = self.data[index]['video'] 183 | 184 | frame_indices = self.data[index]['frame_indices'] 185 | if self.temporal_transform is not None: 186 | frame_indices = self.temporal_transform(frame_indices) 187 | clip = self.loader(path, frame_indices) 188 | if self.spatial_transform is not None: 189 | self.spatial_transform.randomize_parameters() 190 | clip = [self.spatial_transform(img) for img in clip] 191 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 192 | 193 | target = self.data[index] 194 | if self.target_transform is not None: 195 | target = self.target_transform(target) 196 | 197 | if self.vis: 198 | return clip, target, path, frame_indices 199 | else: 200 | return clip, target 201 | 202 | def __len__(self): 203 | return len(self.data) 204 | -------------------------------------------------------------------------------- /libs/mean.py: -------------------------------------------------------------------------------- 1 | def get_mean(norm_value=255, dataset='activitynet'): 2 | assert dataset in ['activitynet', 'kinetics'] 3 | 4 | if dataset == 'activitynet': 5 | return [ 6 | 114.7748 / norm_value, 107.7354 / norm_value, 99.4750 / norm_value 7 | ] 8 | elif dataset == 'kinetics': 9 | # Kinetics (10 videos for each class) 10 | return [ 11 | 110.63666788 / norm_value, 103.16065604 / norm_value, 12 | 96.29023126 / norm_value 13 | ] 14 | 15 | 16 | def get_std(norm_value=255): 17 | # Kinetics (10 videos for each class) 18 | return [ 19 | 38.7568578 / norm_value, 37.88248729 / norm_value, 20 | 40.02898126 / norm_value 21 | ] 22 | -------------------------------------------------------------------------------- /libs/opts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def parse_opts(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument( 7 | '--root_path', 8 | default='/root/data/ActivityNet', 9 | type=str, 10 | help='Root directory path of data') 11 | parser.add_argument( 12 | '--video_path', 13 | default='video_kinetics_jpg', 14 | type=str, 15 | help='Directory path of Videos') 16 | parser.add_argument( 17 | '--annotation_path', 18 | default='kinetics.json', 19 | type=str, 20 | help='Annotation file path') 21 | parser.add_argument( 22 | '--prediction_path', 23 | default='kinetics.json', 24 | type=str, 25 | help='Prediction file path') 26 | parser.add_argument( 27 | '--result_path', 28 | default='results', 29 | type=str, 30 | help='Result directory path') 31 | parser.add_argument( 32 | '--place_pred_path', 33 | default='place', 34 | type=str, 35 | help='place prediction directory full path') 36 | parser.add_argument( 
37 | '--human_dets_path', 38 | default='dets', 39 | type=str, 40 | help='human detection directory full path') 41 | parser.add_argument( 42 | '--mask_ratio', 43 | default=0.5, 44 | type=float, 45 | help='mask out background ratio, higher measn mask out more') 46 | 47 | parser.add_argument( 48 | '--dataset', 49 | default='kinetics', 50 | type=str, 51 | help='Used dataset (activitynet | kinetics | ucf101 | hmdb51)') 52 | parser.add_argument( 53 | '--n_classes', 54 | default=400, 55 | type=int, 56 | help= 57 | 'Number of classes (activitynet: 200, kinetics: 400, ucf101: 101, hmdb51: 51)' 58 | ) 59 | parser.add_argument( 60 | '--n_finetune_classes', 61 | default=400, 62 | type=int, 63 | help= 64 | 'Number of classes for fine-tuning. n_classes is set to the number when pretraining.' 65 | ) 66 | parser.add_argument( 67 | '--sample_size', 68 | default=112, 69 | type=int, 70 | help='Height and width of inputs') 71 | parser.add_argument( 72 | '--sample_duration', 73 | default=16, 74 | type=int, 75 | help='Temporal duration of inputs') 76 | parser.add_argument( 77 | '--initial_scale', 78 | default=1.0, 79 | type=float, 80 | help='Initial scale for multiscale cropping') 81 | parser.add_argument( 82 | '--n_scales', 83 | default=5, 84 | type=int, 85 | help='Number of scales for multiscale cropping') 86 | parser.add_argument( 87 | '--scale_step', 88 | default=0.84089641525, 89 | type=float, 90 | help='Scale step for multiscale cropping') 91 | parser.add_argument( 92 | '--train_crop', 93 | default='corner', 94 | type=str, 95 | help= 96 | 'Spatial cropping method in training. random is uniform. corner is selection from 4 corners and 1 center. (random | corner | center)' 97 | ) 98 | parser.add_argument( 99 | '--learning_rate', 100 | default=0.1, 101 | type=float, 102 | help= 103 | 'Initial learning rate (divided by 10 while training by lr scheduler)') 104 | parser.add_argument( 105 | '--new_layer_lr', 106 | default=0.1, 107 | type=float, 108 | help= 109 | 'Initial learning rate for new layers (divided by 10 while training by lr scheduler)') 110 | parser.add_argument( 111 | '--warm_up_epochs', 112 | default=10, 113 | type=int, 114 | help='number of epochs need to warm up the new layers') 115 | parser.add_argument('--momentum', default=0.9, type=float, help='Momentum') 116 | parser.add_argument( 117 | '--dampening', default=0.9, type=float, help='dampening of SGD') 118 | parser.add_argument( 119 | '--weight_decay', default=1e-3, type=float, help='Weight Decay') 120 | parser.add_argument( 121 | '--mean_dataset', 122 | default='activitynet', 123 | type=str, 124 | help= 125 | 'dataset for mean values of mean subtraction (activitynet | kinetics)') 126 | parser.add_argument( 127 | '--no_mean_norm', 128 | action='store_true', 129 | help='If true, inputs are not normalized by mean.') 130 | parser.set_defaults(no_mean_norm=False) 131 | parser.add_argument( 132 | '--std_norm', 133 | action='store_true', 134 | help='If true, inputs are normalized by standard deviation.') 135 | parser.set_defaults(std_norm=False) 136 | parser.add_argument( 137 | '--nesterov', action='store_true', help='Nesterov momentum') 138 | parser.set_defaults(nesterov=False) 139 | parser.add_argument( 140 | '--optimizer', 141 | default='sgd', 142 | type=str, 143 | help='Currently only support SGD') 144 | parser.add_argument( 145 | '--lr_patience', 146 | default=10, 147 | type=int, 148 | help='Patience of LR scheduler. See documentation of ReduceLROnPlateau.' 
149 | )
150 | parser.add_argument(
151 | '--batch_size', default=128, type=int, help='Batch Size')
152 | parser.add_argument(
153 | '--val_batch_size', default=16, type=int, help='Batch Size for Validation')
154 | parser.add_argument(
155 | '--n_epochs',
156 | default=200,
157 | type=int,
158 | help='Number of total epochs to run')
159 | parser.add_argument(
160 | '--begin_epoch',
161 | default=1,
162 | type=int,
163 | help=
164 | 'Training begins at this epoch. Previously trained model indicated by resume_path is loaded.'
165 | )
166 | parser.add_argument(
167 | '--n_val_samples',
168 | default=3,
169 | type=int,
170 | help='Number of validation samples for each activity')
171 | parser.add_argument(
172 | '--resume_path',
173 | default='',
174 | type=str,
175 | help='Saved data (.pth) of previous training')
176 | parser.add_argument(
177 | '--pretrain_path', default='', type=str, help='Pretrained model (.pth)')
178 | parser.add_argument(
179 | '--vis',
180 | action='store_true',
181 | help='If true, visualization is enabled.')
182 | parser.set_defaults(vis=False)
183 | parser.add_argument(
184 | '--is_place_adv',
185 | action='store_true',
186 | help='If true, using place adversarial training.')
187 | parser.set_defaults(is_place_adv=False)
188 | parser.add_argument(
189 | '--is_place_soft',
190 | action='store_true',
191 | help='If true, using placenet soft label.')
192 | parser.set_defaults(is_place_soft=False)
193 | parser.add_argument(
194 | '--is_place_entropy',
195 | action='store_true',
196 | help='If true, using place entropy loss for training.')
197 | parser.set_defaults(is_place_entropy=False)
198 | parser.add_argument(
199 | '--is_entropy_max',
200 | action='store_true',
201 | help='If true, using place entropy maximization training.')
202 | parser.set_defaults(is_entropy_max=False)
203 | parser.add_argument(
204 | '--is_mask_adv',
205 | action='store_false',
206 | help='If true, using human mask branch for training.')
207 | parser.set_defaults(is_mask_adv=True)
208 | parser.add_argument(
209 | '--is_mask_cross_entropy',
210 | action='store_true',
211 | help='If true, using human mask cross entropy loss.')
212 | parser.set_defaults(is_mask_cross_entropy=False)
213 | parser.add_argument(
214 | '--is_mask_entropy',
215 | action='store_true',
216 | help='If true, using human mask entropy loss.')
217 | parser.set_defaults(is_mask_entropy=False)
218 | parser.add_argument(
219 | '--is_mask_conf_dual_loader',
220 | action='store_true',
221 | help='If true, using two data loaders for human mask action confusion loss.')
222 | parser.set_defaults(is_mask_conf_dual_loader=False)
223 | parser.add_argument(
224 | '--slower_place_mlp',
225 | action='store_true',
226 | help='If true, using slower learning rate for place mlp')
227 | parser.set_defaults(slower_place_mlp=False)
228 | parser.add_argument(
229 | '--slower_hm_mlp',
230 | action='store_true',
231 | help='If true, using slower learning rate for human mask mlp')
232 | parser.set_defaults(slower_hm_mlp=False)
233 | parser.add_argument(
234 | '--weight_entropy_loss',
235 | default=1.0,
236 | type=float,
237 | help='weight of the entropy loss')
238 | parser.add_argument(
239 | '--num_place_hidden_layers',
240 | default=1,
241 | type=int,
242 | help='Number of hidden layers in the place prediction MLP')
243 | parser.add_argument(
244 | '--num_human_mask_adv_hidden_layers',
245 | default=1,
246 | type=int,
247 | help='Number of hidden layers in the human masked prediction MLP')
248 | parser.add_argument(
249 | '--alpha',
250 | default=1.0,
251 | type=float,
252 | help='lambda of the grad reversal layer, higher means a higher impact of the adversarial training'
253 | )
254 | parser.add_argument(
255 | '--alpha_hm',
256 | default=1.0,
257 | type=float,
258 | help='lambda of the grad reversal layer for human mask confusion loss branch, higher means a higher impact of the adversarial training'
259 | )
260 | parser.add_argument(
261 | '--num_places_classes',
262 | default=0,
263 | type=int,
264 | help='Number of place classes')
265 | parser.add_argument(
266 | '--ft_begin_index',
267 | default=0,
268 | type=int,
269 | help='Begin block index of fine-tuning')
270 | parser.add_argument(
271 | '--not_replace_last_fc',
272 | action='store_true',
273 | help='If true, DO NOT replace the last fc layer (classifier) of a network with a new one; if false, replace the last fc layer')
274 | parser.set_defaults(not_replace_last_fc=False)
275 | parser.add_argument(
276 | '--no_train',
277 | action='store_true',
278 | help='If true, training is not performed.')
279 | parser.set_defaults(no_train=False)
280 | parser.add_argument(
281 | '--no_val',
282 | action='store_true',
283 | help='If true, validation is not performed.')
284 | parser.set_defaults(no_val=False)
285 | parser.add_argument(
286 | '--test', action='store_true', help='If true, test is performed.')
287 | parser.set_defaults(test=False)
288 | parser.add_argument(
289 | '--test_subset',
290 | default='val',
291 | type=str,
292 | help='Used subset in test (val | test)')
293 | parser.add_argument(
294 | '--scale_in_test',
295 | default=1.0,
296 | type=float,
297 | help='Spatial scale in test')
298 | parser.add_argument(
299 | '--crop_position_in_test',
300 | default='c',
301 | type=str,
302 | help='Cropping method (c | tl | tr | bl | br) in test')
303 | parser.add_argument(
304 | '--no_softmax_in_test',
305 | action='store_true',
306 | help='If true, output for each clip is not normalized using softmax.')
307 | parser.set_defaults(no_softmax_in_test=False)
308 | parser.add_argument(
309 | '--no_cuda', action='store_true', help='If true, cuda is not used.')
310 | parser.set_defaults(no_cuda=False)
311 | parser.add_argument(
312 | '--n_threads',
313 | default=4,
314 | type=int,
315 | help='Number of threads for multi-thread loading')
316 | parser.add_argument(
317 | '--checkpoint',
318 | default=10,
319 | type=int,
320 | help='Trained model is saved every this many epochs.')
321 | parser.add_argument(
322 | '--no_hflip',
323 | action='store_true',
324 | help='If true, horizontal flipping is not performed.')
325 | parser.set_defaults(no_hflip=False)
326 | parser.add_argument(
327 | '--norm_value',
328 | default=1,
329 | type=int,
330 | help=
331 | 'If 1, range of inputs is [0-255]. 
If 255, range of inputs is [0-1].') 332 | parser.add_argument( 333 | '--model', 334 | default='resnet', 335 | type=str, 336 | help='(resnet | preresnet | wideresnet | resnext | densenet | ') 337 | parser.add_argument( 338 | '--model_depth', 339 | default=18, 340 | type=int, 341 | help='Depth of resnet (10 | 18 | 34 | 50 | 101)') 342 | parser.add_argument( 343 | '--resnet_shortcut', 344 | default='B', 345 | type=str, 346 | help='Shortcut type of resnet (A | B)') 347 | parser.add_argument( 348 | '--wide_resnet_k', default=2, type=int, help='Wide resnet k') 349 | parser.add_argument( 350 | '--resnext_cardinality', 351 | default=32, 352 | type=int, 353 | help='ResNeXt cardinality') 354 | parser.add_argument( 355 | '--manual_seed', default=1, type=int, help='Manually set random seed') 356 | 357 | args = parser.parse_args() 358 | 359 | return args 360 | -------------------------------------------------------------------------------- /libs/spatial_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | import numbers 4 | import collections 5 | import numpy as np 6 | import torch 7 | from PIL import Image, ImageOps 8 | try: 9 | import accimage 10 | except ImportError: 11 | accimage = None 12 | 13 | 14 | class Compose(object): 15 | """Composes several transforms together. 16 | Args: 17 | transforms (list of ``Transform`` objects): list of transforms to compose. 18 | Example: 19 | >>> transforms.Compose([ 20 | >>> transforms.CenterCrop(10), 21 | >>> transforms.ToTensor(), 22 | >>> ]) 23 | """ 24 | 25 | def __init__(self, transforms): 26 | self.transforms = transforms 27 | 28 | def __call__(self, img): 29 | for t in self.transforms: 30 | img = t(img) 31 | return img 32 | 33 | def randomize_parameters(self): 34 | for t in self.transforms: 35 | t.randomize_parameters() 36 | 37 | 38 | class ToTensor(object): 39 | """Convert a ``PIL.Image`` or ``numpy.ndarray`` to tensor. 40 | Converts a PIL.Image or numpy.ndarray (H x W x C) in the range 41 | [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0]. 42 | """ 43 | 44 | def __init__(self, norm_value=255): 45 | self.norm_value = norm_value 46 | 47 | def __call__(self, pic): 48 | """ 49 | Args: 50 | pic (PIL.Image or numpy.ndarray): Image to be converted to tensor. 51 | Returns: 52 | Tensor: Converted image. 
53 | """ 54 | if isinstance(pic, np.ndarray): 55 | # handle numpy array 56 | img = torch.from_numpy(pic.transpose((2, 0, 1))) 57 | # backward compatibility 58 | return img.float().div(self.norm_value) 59 | 60 | if accimage is not None and isinstance(pic, accimage.Image): 61 | nppic = np.zeros( 62 | [pic.channels, pic.height, pic.width], dtype=np.float32) 63 | pic.copyto(nppic) 64 | return torch.from_numpy(nppic) 65 | 66 | # handle PIL Image 67 | if pic.mode == 'I': 68 | img = torch.from_numpy(np.array(pic, np.int32, copy=False)) 69 | elif pic.mode == 'I;16': 70 | img = torch.from_numpy(np.array(pic, np.int16, copy=False)) 71 | else: 72 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes())) 73 | # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK 74 | if pic.mode == 'YCbCr': 75 | nchannel = 3 76 | elif pic.mode == 'I;16': 77 | nchannel = 1 78 | else: 79 | nchannel = len(pic.mode) 80 | img = img.view(pic.size[1], pic.size[0], nchannel) 81 | # put it from HWC to CHW format 82 | # yikes, this transpose takes 80% of the loading time/CPU 83 | img = img.transpose(0, 1).transpose(0, 2).contiguous() 84 | if isinstance(img, torch.ByteTensor): 85 | return img.float().div(self.norm_value) 86 | else: 87 | return img 88 | 89 | def randomize_parameters(self): 90 | pass 91 | 92 | 93 | class Normalize(object): 94 | """Normalize an tensor image with mean and standard deviation. 95 | Given mean: (R, G, B) and std: (R, G, B), 96 | will normalize each channel of the torch.*Tensor, i.e. 97 | channel = (channel - mean) / std 98 | Args: 99 | mean (sequence): Sequence of means for R, G, B channels respecitvely. 100 | std (sequence): Sequence of standard deviations for R, G, B channels 101 | respecitvely. 102 | """ 103 | 104 | def __init__(self, mean, std): 105 | self.mean = mean 106 | self.std = std 107 | 108 | def __call__(self, tensor): 109 | """ 110 | Args: 111 | tensor (Tensor): Tensor image of size (C, H, W) to be normalized. 112 | Returns: 113 | Tensor: Normalized image. 114 | """ 115 | # TODO: make efficient 116 | for t, m, s in zip(tensor, self.mean, self.std): 117 | t.sub_(m).div_(s) 118 | return tensor 119 | 120 | def randomize_parameters(self): 121 | pass 122 | 123 | 124 | class Scale(object): 125 | """Rescale the input PIL.Image to the given size. 126 | Args: 127 | size (sequence or int): Desired output size. If size is a sequence like 128 | (w, h), output size will be matched to this. If size is an int, 129 | smaller edge of the image will be matched to this number. 130 | i.e, if height > width, then image will be rescaled to 131 | (size * height / width, size) 132 | interpolation (int, optional): Desired interpolation. Default is 133 | ``PIL.Image.BILINEAR`` 134 | """ 135 | 136 | def __init__(self, size, interpolation=Image.BILINEAR): 137 | assert isinstance(size, 138 | int) or (isinstance(size, collections.Iterable) and 139 | len(size) == 2) 140 | self.size = size 141 | self.interpolation = interpolation 142 | 143 | def __call__(self, img): 144 | """ 145 | Args: 146 | img (PIL.Image): Image to be scaled. 147 | Returns: 148 | PIL.Image: Rescaled image. 
149 | """ 150 | if isinstance(self.size, int): 151 | w, h = img.size 152 | if (w <= h and w == self.size) or (h <= w and h == self.size): 153 | return img 154 | if w < h: 155 | ow = self.size 156 | oh = int(self.size * h / w) 157 | return img.resize((ow, oh), self.interpolation) 158 | else: 159 | oh = self.size 160 | ow = int(self.size * w / h) 161 | return img.resize((ow, oh), self.interpolation) 162 | else: 163 | return img.resize(self.size, self.interpolation) 164 | 165 | def randomize_parameters(self): 166 | pass 167 | 168 | 169 | class CenterCrop(object): 170 | """Crops the given PIL.Image at the center. 171 | Args: 172 | size (sequence or int): Desired output size of the crop. If size is an 173 | int instead of sequence like (h, w), a square crop (size, size) is 174 | made. 175 | """ 176 | 177 | def __init__(self, size): 178 | if isinstance(size, numbers.Number): 179 | self.size = (int(size), int(size)) 180 | else: 181 | self.size = size 182 | 183 | def __call__(self, img): 184 | """ 185 | Args: 186 | img (PIL.Image): Image to be cropped. 187 | Returns: 188 | PIL.Image: Cropped image. 189 | """ 190 | w, h = img.size 191 | th, tw = self.size 192 | x1 = int(round((w - tw) / 2.)) 193 | y1 = int(round((h - th) / 2.)) 194 | return img.crop((x1, y1, x1 + tw, y1 + th)) 195 | 196 | def randomize_parameters(self): 197 | pass 198 | 199 | 200 | class CornerCrop(object): 201 | 202 | def __init__(self, size, crop_position=None): 203 | self.size = size 204 | if crop_position is None: 205 | self.randomize = True 206 | else: 207 | self.randomize = False 208 | self.crop_position = crop_position 209 | self.crop_positions = ['c', 'tl', 'tr', 'bl', 'br'] 210 | 211 | def __call__(self, img): 212 | image_width = img.size[0] 213 | image_height = img.size[1] 214 | 215 | if self.crop_position == 'c': 216 | th, tw = (self.size, self.size) 217 | x1 = int(round((image_width - tw) / 2.)) 218 | y1 = int(round((image_height - th) / 2.)) 219 | x2 = x1 + tw 220 | y2 = y1 + th 221 | elif self.crop_position == 'tl': 222 | x1 = 0 223 | y1 = 0 224 | x2 = self.size 225 | y2 = self.size 226 | elif self.crop_position == 'tr': 227 | x1 = image_width - self.size 228 | y1 = 0 229 | x2 = image_width 230 | y2 = self.size 231 | elif self.crop_position == 'bl': 232 | x1 = 0 233 | y1 = image_height - self.size 234 | x2 = self.size 235 | y2 = image_height 236 | elif self.crop_position == 'br': 237 | x1 = image_width - self.size 238 | y1 = image_height - self.size 239 | x2 = image_width 240 | y2 = image_height 241 | 242 | img = img.crop((x1, y1, x2, y2)) 243 | 244 | return img 245 | 246 | def randomize_parameters(self): 247 | if self.randomize: 248 | self.crop_position = self.crop_positions[random.randint( 249 | 0, 250 | len(self.crop_positions) - 1)] 251 | 252 | 253 | class RandomHorizontalFlip(object): 254 | """Horizontally flip the given PIL.Image randomly with a probability of 0.5.""" 255 | 256 | def __call__(self, img): 257 | """ 258 | Args: 259 | img (PIL.Image): Image to be flipped. 260 | Returns: 261 | PIL.Image: Randomly flipped image. 262 | """ 263 | if self.p < 0.5: 264 | return img.transpose(Image.FLIP_LEFT_RIGHT) 265 | return img 266 | 267 | def randomize_parameters(self): 268 | self.p = random.random() 269 | 270 | 271 | class MultiScaleCornerCrop(object): 272 | """Crop the given PIL.Image to randomly selected size. 273 | A crop of size is selected from scales of the original size. 274 | A position of cropping is randomly selected from 4 corners and 1 center. 275 | This crop is finally resized to given size. 
276 | Args: 277 | scales: cropping scales of the original size 278 | size: size of the smaller edge 279 | interpolation: Default: PIL.Image.BILINEAR 280 | """ 281 | 282 | def __init__(self, 283 | scales, 284 | size, 285 | interpolation=Image.BILINEAR, 286 | crop_positions=['c', 'tl', 'tr', 'bl', 'br']): 287 | self.scales = scales 288 | self.size = size 289 | self.interpolation = interpolation 290 | 291 | self.crop_positions = crop_positions 292 | 293 | def __call__(self, img): 294 | min_length = min(img.size[0], img.size[1]) 295 | crop_size = int(min_length * self.scale) 296 | 297 | image_width = img.size[0] 298 | image_height = img.size[1] 299 | 300 | if self.crop_position == 'c': 301 | center_x = image_width // 2 302 | center_y = image_height // 2 303 | box_half = crop_size // 2 304 | x1 = center_x - box_half 305 | y1 = center_y - box_half 306 | x2 = center_x + box_half 307 | y2 = center_y + box_half 308 | elif self.crop_position == 'tl': 309 | x1 = 0 310 | y1 = 0 311 | x2 = crop_size 312 | y2 = crop_size 313 | elif self.crop_position == 'tr': 314 | x1 = image_width - crop_size 315 | y1 = 0 316 | x2 = image_width 317 | y2 = crop_size 318 | elif self.crop_position == 'bl': 319 | x1 = 0 320 | y1 = image_height - crop_size 321 | x2 = crop_size 322 | y2 = image_height 323 | elif self.crop_position == 'br': 324 | x1 = image_width - crop_size 325 | y1 = image_height - crop_size 326 | x2 = image_width 327 | y2 = image_height 328 | 329 | img = img.crop((x1, y1, x2, y2)) 330 | 331 | return img.resize((self.size, self.size), self.interpolation) 332 | 333 | def randomize_parameters(self): 334 | self.scale = self.scales[random.randint(0, len(self.scales) - 1)] 335 | self.crop_position = self.crop_positions[random.randint( 336 | 0, 337 | len(self.crop_positions) - 1)] 338 | 339 | 340 | class MultiScaleRandomCrop(object): 341 | 342 | def __init__(self, scales, size, interpolation=Image.BILINEAR): 343 | self.scales = scales 344 | self.size = size 345 | self.interpolation = interpolation 346 | 347 | def __call__(self, img): 348 | min_length = min(img.size[0], img.size[1]) 349 | crop_size = int(min_length * self.scale) 350 | 351 | image_width = img.size[0] 352 | image_height = img.size[1] 353 | 354 | x1 = self.tl_x * (image_width - crop_size) 355 | y1 = self.tl_y * (image_height - crop_size) 356 | x2 = x1 + crop_size 357 | y2 = y1 + crop_size 358 | 359 | img = img.crop((x1, y1, x2, y2)) 360 | 361 | return img.resize((self.size, self.size), self.interpolation) 362 | 363 | def randomize_parameters(self): 364 | self.scale = self.scales[random.randint(0, len(self.scales) - 1)] 365 | self.tl_x = random.random() 366 | self.tl_y = random.random() 367 | -------------------------------------------------------------------------------- /libs/target_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | 4 | 5 | class Compose(object): 6 | 7 | def __init__(self, transforms): 8 | self.transforms = transforms 9 | 10 | def __call__(self, target): 11 | dst = [] 12 | for t in self.transforms: 13 | dst.append(t(target)) 14 | return dst 15 | 16 | 17 | class ClassLabel(object): 18 | 19 | def __call__(self, target): 20 | return target['label'] 21 | 22 | 23 | class VideoID(object): 24 | 25 | def __call__(self, target): 26 | return target['video_id'] 27 | -------------------------------------------------------------------------------- /libs/temporal_transforms.py: -------------------------------------------------------------------------------- 1 | 
import random 2 | import math 3 | 4 | 5 | class LoopPadding(object): 6 | 7 | def __init__(self, size): 8 | self.size = size 9 | 10 | def __call__(self, frame_indices): 11 | out = frame_indices 12 | 13 | for index in out: 14 | if len(out) >= self.size: 15 | break 16 | out.append(index) 17 | 18 | return out 19 | 20 | 21 | class TemporalBeginCrop(object): 22 | """Temporally crop the given frame indices at a beginning. 23 | 24 | If the number of frames is less than the size, 25 | loop the indices as many times as necessary to satisfy the size. 26 | 27 | Args: 28 | size (int): Desired output size of the crop. 29 | """ 30 | 31 | def __init__(self, size): 32 | self.size = size 33 | 34 | def __call__(self, frame_indices): 35 | out = frame_indices[:self.size] 36 | 37 | for index in out: 38 | if len(out) >= self.size: 39 | break 40 | out.append(index) 41 | 42 | return out 43 | 44 | 45 | class TemporalCenterCrop(object): 46 | """Temporally crop the given frame indices at a center. 47 | 48 | If the number of frames is less than the size, 49 | loop the indices as many times as necessary to satisfy the size. 50 | 51 | Args: 52 | size (int): Desired output size of the crop. 53 | """ 54 | 55 | def __init__(self, size): 56 | self.size = size 57 | 58 | def __call__(self, frame_indices): 59 | """ 60 | Args: 61 | frame_indices (list): frame indices to be cropped. 62 | Returns: 63 | list: Cropped frame indices. 64 | """ 65 | 66 | center_index = len(frame_indices) // 2 67 | begin_index = max(0, center_index - (self.size // 2)) 68 | end_index = min(begin_index + self.size, len(frame_indices)) 69 | 70 | out = frame_indices[begin_index:end_index] 71 | 72 | for index in out: 73 | if len(out) >= self.size: 74 | break 75 | out.append(index) 76 | 77 | return out 78 | 79 | 80 | class TemporalRandomCrop(object): 81 | """Temporally crop the given frame indices at a random location. 82 | 83 | If the number of frames is less than the size, 84 | loop the indices as many times as necessary to satisfy the size. 85 | 86 | Args: 87 | size (int): Desired output size of the crop. 88 | """ 89 | 90 | def __init__(self, size): 91 | self.size = size 92 | 93 | def __call__(self, frame_indices): 94 | """ 95 | Args: 96 | frame_indices (list): frame indices to be cropped. 97 | Returns: 98 | list: Cropped frame indices. 
99 | """ 100 | 101 | rand_end = max(0, len(frame_indices) - self.size - 1) 102 | begin_index = random.randint(0, rand_end) 103 | end_index = min(begin_index + self.size, len(frame_indices)) 104 | 105 | out = frame_indices[begin_index:end_index] 106 | 107 | for index in out: 108 | if len(out) >= self.size: 109 | break 110 | out.append(index) 111 | 112 | return out 113 | -------------------------------------------------------------------------------- /libs/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import torch.nn.functional as F 4 | import time 5 | import os 6 | import sys 7 | import json 8 | import pdb 9 | 10 | from libs.utils import AverageMeter 11 | 12 | # PyTorch 0.3 13 | def calculate_video_results(output_buffer, video_id, test_results, class_names): 14 | video_outputs = torch.stack(output_buffer) 15 | average_scores = torch.mean(video_outputs, dim=0) 16 | sorted_scores, locs = torch.topk(average_scores, k=10) 17 | 18 | video_results = [] 19 | for i in range(sorted_scores.size(0)): 20 | video_results.append({ 21 | 'label': class_names[locs[i]], 22 | 'score': sorted_scores[i] 23 | }) 24 | 25 | test_results['results'][video_id] = video_results 26 | 27 | # PyTorch 0.4 28 | def calculate_video_results_pt_0_4(output_buffer, video_id, test_results, class_names): 29 | video_outputs = torch.stack(output_buffer) 30 | average_scores = torch.mean(video_outputs, dim=0) 31 | sorted_scores, locs = torch.topk(average_scores, k=10) 32 | 33 | video_results = [] 34 | for i in range(sorted_scores.size(0)): 35 | video_results.append({ 36 | 'label': class_names[locs[i].item()], 37 | 'score': sorted_scores[i].cpu().numpy().item() 38 | }) 39 | 40 | test_results['results'][video_id] = video_results 41 | 42 | 43 | def test(data_loader, model, opt, class_names): 44 | print('test') 45 | 46 | model.eval() 47 | 48 | # pytroch version check 49 | torch_version = float(torch.__version__[:3]) 50 | 51 | batch_time = AverageMeter() 52 | data_time = AverageMeter() 53 | 54 | end_time = time.time() 55 | output_buffer = [] 56 | previous_video_id = '' 57 | test_results = {'results': {}} 58 | for i, (inputs, targets) in enumerate(data_loader): 59 | data_time.update(time.time() - end_time) 60 | 61 | inputs = Variable(inputs, volatile=True) 62 | outputs = model(inputs) 63 | if not opt.no_softmax_in_test: 64 | outputs = F.softmax(outputs) 65 | 66 | for j in range(outputs.size(0)): 67 | if not (i == 0 and j == 0) and targets[j] != previous_video_id: 68 | if torch_version < 0.4: 69 | calculate_video_results(output_buffer, previous_video_id, 70 | test_results, class_names) 71 | else: 72 | calculate_video_results_pt_0_4(output_buffer, previous_video_id, 73 | test_results, class_names) 74 | output_buffer = [] 75 | output_buffer.append(outputs[j].data.cpu()) 76 | previous_video_id = targets[j] 77 | 78 | if (i % 100) == 0: 79 | with open( 80 | os.path.join(opt.result_path, '{}.json'.format( 81 | opt.test_subset)), 'w') as f: 82 | json.dump(test_results, f) 83 | 84 | batch_time.update(time.time() - end_time) 85 | end_time = time.time() 86 | 87 | print('[{}/{}]\t' 88 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 89 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'.format( 90 | i + 1, 91 | len(data_loader), 92 | batch_time=batch_time, 93 | data_time=data_time)) 94 | with open( 95 | os.path.join(opt.result_path, '{}.json'.format(opt.test_subset)), 96 | 'w') as f: 97 | json.dump(test_results, f) 98 | 
-------------------------------------------------------------------------------- /libs/utils.py: --------------------------------------------------------------------------------
1 | import csv
2 | 
3 | class AverageMeter(object):
4 | """Computes and stores the average and current value"""
5 | 
6 | def __init__(self):
7 | self.reset()
8 | 
9 | def reset(self):
10 | self.val = 0
11 | self.avg = 0
12 | self.sum = 0
13 | self.count = 0
14 | 
15 | def update(self, val, n=1):
16 | self.val = val
17 | self.sum += val * n
18 | self.count += n
19 | self.avg = self.sum / self.count
20 | 
21 | 
22 | class Logger(object):
23 | 
24 | def __init__(self, path, header):
25 | self.log_file = open(path, 'w')
26 | self.logger = csv.writer(self.log_file, delimiter='\t')
27 | 
28 | self.logger.writerow(header)
29 | self.header = header
30 | 
31 | def __del__(self):
32 | self.log_file.close()
33 | 
34 | def log(self, values):
35 | write_values = []
36 | for col in self.header:
37 | assert col in values
38 | write_values.append(values[col])
39 | 
40 | self.logger.writerow(write_values)
41 | self.log_file.flush()
42 | 
43 | 
44 | def load_value_file(file_path):
45 | with open(file_path, 'r') as input_file:
46 | value = float(input_file.read().rstrip('\n\r'))
47 | 
48 | return value
49 | 
50 | 
51 | def calculate_accuracy(outputs, targets):
52 | batch_size = targets.size(0)
53 | 
54 | _, pred = outputs.topk(1, 1, True)
55 | pred = pred.t()
56 | correct = pred.eq(targets.view(1, -1))
57 | n_correct_elems = correct.float().sum().data[0]  # PyTorch 0.3-style indexing; see calculate_accuracy_pt_0_4 below for PyTorch >= 0.4
58 | 
59 | return n_correct_elems / batch_size
60 | 
61 | def calculate_accuracy_pt_0_4(outputs, targets):
62 | batch_size = targets.size(0)
63 | 
64 | _, pred = outputs.topk(1, 1, True)
65 | pred = pred.t()
66 | correct = pred.eq(targets.view(1, -1))
67 | n_correct_elems = correct.float().sum().item()  # PyTorch >= 0.4 uses .item()
68 | 
69 | return n_correct_elems / batch_size
-------------------------------------------------------------------------------- /loss/hloss.py: --------------------------------------------------------------------------------
1 | ## code from https://discuss.pytorch.org/t/calculating-the-entropy-loss/14510
2 | 
3 | from torch import nn
4 | import torch.nn.functional as F
5 | 
6 | class HLoss(nn.Module):
7 | """
8 | Entropy loss: returns the entropy of the input logits, or the negative entropy when is_maximization=True (so that minimizing the loss maximizes the entropy)
9 | """
10 | def __init__(self, is_maximization=False):
11 | super(HLoss, self).__init__()
12 | self.is_neg = is_maximization
13 | 
14 | def forward(self, x):
15 | b = F.softmax(x, dim=1) * F.log_softmax(x, dim=1)
16 | if self.is_neg:
17 | # b = 1.0 * b.sum() # summation over batches
18 | b = 1.0 * b.sum(dim=1).mean() # sum over classes, mean over the batch
19 | else:
20 | # b = -1.0 * b.sum()
21 | b = -1.0 * b.sum(dim=1).mean() # sum over classes, mean over the batch
22 | return b
-------------------------------------------------------------------------------- /loss/soft_cross_entropy.py: --------------------------------------------------------------------------------
1 | ## code from https://discuss.pytorch.org/t/cross-entropy-for-soft-label/16093 and https://discuss.pytorch.org/t/how-should-i-implement-cross-entropy-loss-with-continuous-target-outputs/10720/21
2 | 
3 | import torch
4 | from torch import nn
5 | 
6 | class SoftCrossEntropy(nn.Module):
7 | def __init__(self):
8 | super(SoftCrossEntropy, self).__init__()
9 | return
10 | 
11 | def forward(self, inputs, target):
12 | """
13 | :param inputs: predictions
14 | :param target: soft target label distribution
15 | :return: loss
16 | """
17 | logsoftmax = nn.LogSoftmax(dim=1)
18 | 
19 | return 
torch.mean(torch.sum(- target * logsoftmax(inputs), 1)) -------------------------------------------------------------------------------- /models/densenet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from collections import OrderedDict 5 | import math 6 | 7 | __all__ = [ 8 | 'DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet264' 9 | ] 10 | 11 | 12 | def densenet121(**kwargs): 13 | model = DenseNet( 14 | num_init_features=64, 15 | growth_rate=32, 16 | block_config=(6, 12, 24, 16), 17 | **kwargs) 18 | return model 19 | 20 | 21 | def densenet169(**kwargs): 22 | model = DenseNet( 23 | num_init_features=64, 24 | growth_rate=32, 25 | block_config=(6, 12, 32, 32), 26 | **kwargs) 27 | return model 28 | 29 | 30 | def densenet201(**kwargs): 31 | model = DenseNet( 32 | num_init_features=64, 33 | growth_rate=32, 34 | block_config=(6, 12, 48, 32), 35 | **kwargs) 36 | return model 37 | 38 | 39 | def densenet264(**kwargs): 40 | model = DenseNet( 41 | num_init_features=64, 42 | growth_rate=32, 43 | block_config=(6, 12, 64, 48), 44 | **kwargs) 45 | return model 46 | 47 | 48 | def get_fine_tuning_parameters(model, ft_begin_index): 49 | if ft_begin_index == 0: 50 | return model.parameters() 51 | 52 | ft_module_names = [] 53 | for i in range(ft_begin_index, 5): 54 | ft_module_names.append('denseblock{}'.format(i)) 55 | ft_module_names.append('transition{}'.format(i)) 56 | ft_module_names.append('norm5') 57 | ft_module_names.append('classifier') 58 | 59 | parameters = [] 60 | for k, v in model.named_parameters(): 61 | for ft_module in ft_module_names: 62 | if ft_module in k: 63 | parameters.append({'params': v}) 64 | break 65 | else: 66 | parameters.append({'params': v, 'lr': 0.0}) 67 | 68 | return parameters 69 | 70 | 71 | class _DenseLayer(nn.Sequential): 72 | 73 | def __init__(self, num_input_features, growth_rate, bn_size, drop_rate): 74 | super(_DenseLayer, self).__init__() 75 | self.add_module('norm.1', nn.BatchNorm3d(num_input_features)) 76 | self.add_module('relu.1', nn.ReLU(inplace=True)) 77 | self.add_module('conv.1', 78 | nn.Conv3d( 79 | num_input_features, 80 | bn_size * growth_rate, 81 | kernel_size=1, 82 | stride=1, 83 | bias=False)) 84 | self.add_module('norm.2', nn.BatchNorm3d(bn_size * growth_rate)) 85 | self.add_module('relu.2', nn.ReLU(inplace=True)) 86 | self.add_module('conv.2', 87 | nn.Conv3d( 88 | bn_size * growth_rate, 89 | growth_rate, 90 | kernel_size=3, 91 | stride=1, 92 | padding=1, 93 | bias=False)) 94 | self.drop_rate = drop_rate 95 | 96 | def forward(self, x): 97 | new_features = super(_DenseLayer, self).forward(x) 98 | if self.drop_rate > 0: 99 | new_features = F.dropout( 100 | new_features, p=self.drop_rate, training=self.training) 101 | return torch.cat([x, new_features], 1) 102 | 103 | 104 | class _DenseBlock(nn.Sequential): 105 | 106 | def __init__(self, num_layers, num_input_features, bn_size, growth_rate, 107 | drop_rate): 108 | super(_DenseBlock, self).__init__() 109 | for i in range(num_layers): 110 | layer = _DenseLayer(num_input_features + i * growth_rate, 111 | growth_rate, bn_size, drop_rate) 112 | self.add_module('denselayer%d' % (i + 1), layer) 113 | 114 | 115 | class _Transition(nn.Sequential): 116 | 117 | def __init__(self, num_input_features, num_output_features): 118 | super(_Transition, self).__init__() 119 | self.add_module('norm', nn.BatchNorm3d(num_input_features)) 120 | self.add_module('relu', nn.ReLU(inplace=True)) 121 
| self.add_module('conv', 122 | nn.Conv3d( 123 | num_input_features, 124 | num_output_features, 125 | kernel_size=1, 126 | stride=1, 127 | bias=False)) 128 | self.add_module('pool', nn.AvgPool3d(kernel_size=2, stride=2)) 129 | 130 | 131 | class DenseNet(nn.Module): 132 | """Densenet-BC model class 133 | Args: 134 | growth_rate (int) - how many filters to add each layer (k in paper) 135 | block_config (list of 4 ints) - how many layers in each pooling block 136 | num_init_features (int) - the number of filters to learn in the first convolution layer 137 | bn_size (int) - multiplicative factor for number of bottle neck layers 138 | (i.e. bn_size * k features in the bottleneck layer) 139 | drop_rate (float) - dropout rate after each dense layer 140 | num_classes (int) - number of classification classes 141 | """ 142 | 143 | def __init__(self, 144 | sample_size, 145 | sample_duration, 146 | growth_rate=32, 147 | block_config=(6, 12, 24, 16), 148 | num_init_features=64, 149 | bn_size=4, 150 | drop_rate=0, 151 | num_classes=1000): 152 | 153 | super(DenseNet, self).__init__() 154 | 155 | self.sample_size = sample_size 156 | self.sample_duration = sample_duration 157 | 158 | # First convolution 159 | self.features = nn.Sequential( 160 | OrderedDict([ 161 | ('conv0', 162 | nn.Conv3d( 163 | 3, 164 | num_init_features, 165 | kernel_size=7, 166 | stride=(1, 2, 2), 167 | padding=(3, 3, 3), 168 | bias=False)), 169 | ('norm0', nn.BatchNorm3d(num_init_features)), 170 | ('relu0', nn.ReLU(inplace=True)), 171 | ('pool0', nn.MaxPool3d(kernel_size=3, stride=2, padding=1)), 172 | ])) 173 | 174 | # Each denseblock 175 | num_features = num_init_features 176 | for i, num_layers in enumerate(block_config): 177 | block = _DenseBlock( 178 | num_layers=num_layers, 179 | num_input_features=num_features, 180 | bn_size=bn_size, 181 | growth_rate=growth_rate, 182 | drop_rate=drop_rate) 183 | self.features.add_module('denseblock%d' % (i + 1), block) 184 | num_features = num_features + num_layers * growth_rate 185 | if i != len(block_config) - 1: 186 | trans = _Transition( 187 | num_input_features=num_features, 188 | num_output_features=num_features // 2) 189 | self.features.add_module('transition%d' % (i + 1), trans) 190 | num_features = num_features // 2 191 | 192 | # Final batch norm 193 | self.features.add_module('norm5', nn.BatchNorm2d(num_features)) 194 | 195 | for m in self.modules(): 196 | if isinstance(m, nn.Conv3d): 197 | m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out') 198 | elif isinstance(m, nn.BatchNorm3d) or isinstance(m, nn.BatchNorm2d): 199 | m.weight.data.fill_(1) 200 | m.bias.data.zero_() 201 | 202 | # Linear layer 203 | self.classifier = nn.Linear(num_features, num_classes) 204 | 205 | def forward(self, x): 206 | features = self.features(x) 207 | out = F.relu(features, inplace=True) 208 | last_duration = int(math.ceil(self.sample_duration / 16)) 209 | last_size = int(math.floor(self.sample_size / 32)) 210 | out = F.avg_pool3d( 211 | out, kernel_size=(last_duration, last_size, last_size)).view( 212 | features.size(0), -1) 213 | out = self.classifier(out) 214 | return out 215 | -------------------------------------------------------------------------------- /models/grad_reversal.py: -------------------------------------------------------------------------------- 1 | ## code from https://github.com/jindongwang/transferlearning/tree/master/code/deep/DANN(RevGrad) 2 | ## original paper: Ganin Y, Lempitsky V. Unsupervised domain adaptation by backpropagation. ICML 2015. 
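## Usage note (illustrative; not part of the original file): ReverseLayerF below acts as
## the identity in the forward pass, and in the backward pass it multiplies the incoming
## gradient by -alpha. It is applied functionally inside the networks, e.g.
##     reversed_features = ReverseLayerF.apply(features, alpha)
## so the backbone that produced `features` is updated to confuse whatever classifier
## consumes `reversed_features` (the place / human-mask MLP heads in models/resnet.py).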
3 | 4 | from torch.autograd import Function 5 | 6 | class ReverseLayerF(Function): 7 | @staticmethod 8 | def forward(ctx, x, alpha): 9 | ctx.alpha = alpha 10 | return x.view_as(x) 11 | 12 | @staticmethod 13 | def backward(ctx, grad_output): 14 | output = grad_output.neg() * ctx.alpha 15 | return output, None 16 | -------------------------------------------------------------------------------- /models/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from models import resnet, pre_act_resnet, wide_resnet, resnext, densenet, vgg 5 | import pdb 6 | 7 | def generate_model(opt): 8 | assert opt.model in [ 9 | 'resnet', 'preresnet', 'wideresnet', 'resnext', 'densenet', 'vgg' 10 | ] 11 | 12 | if opt.model == 'resnet': 13 | assert opt.model_depth in [10, 18, 34, 50, 101, 152, 200] 14 | 15 | from models.resnet import get_fine_tuning_parameters, get_adv_fine_tuning_parameters 16 | 17 | if opt.model_depth == 10: 18 | model = resnet.resnet10( 19 | num_classes=opt.n_classes, 20 | shortcut_type=opt.resnet_shortcut, 21 | sample_size=opt.sample_size, 22 | sample_duration=opt.sample_duration, 23 | is_adv=opt.is_place_adv, 24 | is_human_mask_adv=opt.is_mask_adv, 25 | alpha=opt.alpha, 26 | alpha_hm=opt.alpha_hm, 27 | num_places_classes=opt.num_places_classes, 28 | num_place_hidden_layers=opt.num_place_hidden_layers, 29 | num_human_mask_adv_hidden_layers=opt.num_human_mask_adv_hidden_layers) 30 | elif opt.model_depth == 18: 31 | model = resnet.resnet18( 32 | num_classes=opt.n_classes, 33 | shortcut_type=opt.resnet_shortcut, 34 | sample_size=opt.sample_size, 35 | sample_duration=opt.sample_duration, 36 | is_adv=opt.is_place_adv, 37 | is_human_mask_adv=opt.is_mask_adv, 38 | alpha=opt.alpha, 39 | alpha_hm=opt.alpha_hm, 40 | num_places_classes=opt.num_places_classes, 41 | num_place_hidden_layers=opt.num_place_hidden_layers, 42 | num_human_mask_adv_hidden_layers=opt.num_human_mask_adv_hidden_layers) 43 | elif opt.model_depth == 34: 44 | model = resnet.resnet34( 45 | num_classes=opt.n_classes, 46 | shortcut_type=opt.resnet_shortcut, 47 | sample_size=opt.sample_size, 48 | sample_duration=opt.sample_duration, 49 | is_adv=opt.is_place_adv, 50 | is_human_mask_adv=opt.is_mask_adv, 51 | alpha=opt.alpha, 52 | alpha_hm=opt.alpha_hm, 53 | num_places_classes=opt.num_places_classes, 54 | num_place_hidden_layers=opt.num_place_hidden_layers, 55 | num_human_mask_adv_hidden_layers=opt.num_human_mask_adv_hidden_layers) 56 | elif opt.model_depth == 50: 57 | model = resnet.resnet50( 58 | num_classes=opt.n_classes, 59 | shortcut_type=opt.resnet_shortcut, 60 | sample_size=opt.sample_size, 61 | sample_duration=opt.sample_duration, 62 | is_adv=opt.is_place_adv, 63 | is_human_mask_adv=opt.is_mask_adv, 64 | alpha=opt.alpha, 65 | alpha_hm=opt.alpha_hm, 66 | num_places_classes=opt.num_places_classes, 67 | num_place_hidden_layers=opt.num_place_hidden_layers, 68 | num_human_mask_adv_hidden_layers=opt.num_human_mask_adv_hidden_layers) 69 | elif opt.model_depth == 101: 70 | model = resnet.resnet101( 71 | num_classes=opt.n_classes, 72 | shortcut_type=opt.resnet_shortcut, 73 | sample_size=opt.sample_size, 74 | sample_duration=opt.sample_duration, 75 | is_adv=opt.is_place_adv, 76 | is_human_mask_adv=opt.is_mask_adv, 77 | alpha=opt.alpha, 78 | alpha_hm=opt.alpha_hm, 79 | num_places_classes=opt.num_places_classes, 80 | num_place_hidden_layers=opt.num_place_hidden_layers, 81 | num_human_mask_adv_hidden_layers=opt.num_human_mask_adv_hidden_layers) 82 | elif 
opt.model_depth == 152: 83 | model = resnet.resnet152( 84 | num_classes=opt.n_classes, 85 | shortcut_type=opt.resnet_shortcut, 86 | sample_size=opt.sample_size, 87 | sample_duration=opt.sample_duration, 88 | is_adv=opt.is_place_adv, 89 | is_human_mask_adv=opt.is_mask_adv, 90 | alpha=opt.alpha, 91 | alpha_hm=opt.alpha_hm, 92 | num_places_classes=opt.num_places_classes, 93 | num_place_hidden_layers=opt.num_place_hidden_layers, 94 | num_human_mask_adv_hidden_layers=opt.num_human_mask_adv_hidden_layers) 95 | elif opt.model_depth == 200: 96 | model = resnet.resnet200( 97 | num_classes=opt.n_classes, 98 | shortcut_type=opt.resnet_shortcut, 99 | sample_size=opt.sample_size, 100 | sample_duration=opt.sample_duration, 101 | is_adv=opt.is_place_adv, 102 | is_human_mask_adv=opt.is_mask_adv, 103 | alpha=opt.alpha, 104 | alpha_hm=opt.alpha_hm, 105 | num_places_classes=opt.num_places_classes, 106 | num_place_hidden_layers=opt.num_place_hidden_layers, 107 | num_human_mask_adv_hidden_layers=opt.num_human_mask_adv_hidden_layers) 108 | elif opt.model == 'wideresnet': 109 | assert opt.model_depth in [50] 110 | 111 | from models.wide_resnet import get_fine_tuning_parameters 112 | 113 | if opt.model_depth == 50: 114 | model = wide_resnet.resnet50( 115 | num_classes=opt.n_classes, 116 | shortcut_type=opt.resnet_shortcut, 117 | k=opt.wide_resnet_k, 118 | sample_size=opt.sample_size, 119 | sample_duration=opt.sample_duration) 120 | elif opt.model == 'resnext': 121 | assert opt.model_depth in [50, 101, 152] 122 | 123 | from models.resnext import get_fine_tuning_parameters 124 | 125 | if opt.model_depth == 50: 126 | model = resnext.resnet50( 127 | num_classes=opt.n_classes, 128 | shortcut_type=opt.resnet_shortcut, 129 | cardinality=opt.resnext_cardinality, 130 | sample_size=opt.sample_size, 131 | sample_duration=opt.sample_duration) 132 | elif opt.model_depth == 101: 133 | model = resnext.resnet101( 134 | num_classes=opt.n_classes, 135 | shortcut_type=opt.resnet_shortcut, 136 | cardinality=opt.resnext_cardinality, 137 | sample_size=opt.sample_size, 138 | sample_duration=opt.sample_duration) 139 | elif opt.model_depth == 152: 140 | model = resnext.resnet152( 141 | num_classes=opt.n_classes, 142 | shortcut_type=opt.resnet_shortcut, 143 | cardinality=opt.resnext_cardinality, 144 | sample_size=opt.sample_size, 145 | sample_duration=opt.sample_duration) 146 | elif opt.model == 'preresnet': 147 | assert opt.model_depth in [18, 34, 50, 101, 152, 200] 148 | 149 | from models.pre_act_resnet import get_fine_tuning_parameters 150 | 151 | if opt.model_depth == 18: 152 | model = pre_act_resnet.resnet18( 153 | num_classes=opt.n_classes, 154 | shortcut_type=opt.resnet_shortcut, 155 | sample_size=opt.sample_size, 156 | sample_duration=opt.sample_duration) 157 | elif opt.model_depth == 34: 158 | model = pre_act_resnet.resnet34( 159 | num_classes=opt.n_classes, 160 | shortcut_type=opt.resnet_shortcut, 161 | sample_size=opt.sample_size, 162 | sample_duration=opt.sample_duration) 163 | elif opt.model_depth == 50: 164 | model = pre_act_resnet.resnet50( 165 | num_classes=opt.n_classes, 166 | shortcut_type=opt.resnet_shortcut, 167 | sample_size=opt.sample_size, 168 | sample_duration=opt.sample_duration) 169 | elif opt.model_depth == 101: 170 | model = pre_act_resnet.resnet101( 171 | num_classes=opt.n_classes, 172 | shortcut_type=opt.resnet_shortcut, 173 | sample_size=opt.sample_size, 174 | sample_duration=opt.sample_duration) 175 | elif opt.model_depth == 152: 176 | model = pre_act_resnet.resnet152( 177 | num_classes=opt.n_classes, 178 | 
shortcut_type=opt.resnet_shortcut, 179 | sample_size=opt.sample_size, 180 | sample_duration=opt.sample_duration) 181 | elif opt.model_depth == 200: 182 | model = pre_act_resnet.resnet200( 183 | num_classes=opt.n_classes, 184 | shortcut_type=opt.resnet_shortcut, 185 | sample_size=opt.sample_size, 186 | sample_duration=opt.sample_duration) 187 | elif opt.model == 'densenet': 188 | assert opt.model_depth in [121, 169, 201, 264] 189 | 190 | from models.densenet import get_fine_tuning_parameters 191 | 192 | if opt.model_depth == 121: 193 | model = densenet.densenet121( 194 | num_classes=opt.n_classes, 195 | sample_size=opt.sample_size, 196 | sample_duration=opt.sample_duration) 197 | elif opt.model_depth == 169: 198 | model = densenet.densenet169( 199 | num_classes=opt.n_classes, 200 | sample_size=opt.sample_size, 201 | sample_duration=opt.sample_duration) 202 | elif opt.model_depth == 201: 203 | model = densenet.densenet201( 204 | num_classes=opt.n_classes, 205 | sample_size=opt.sample_size, 206 | sample_duration=opt.sample_duration) 207 | elif opt.model_depth == 264: 208 | model = densenet.densenet264( 209 | num_classes=opt.n_classes, 210 | sample_size=opt.sample_size, 211 | sample_duration=opt.sample_duration) 212 | elif opt.model == 'vgg': 213 | 214 | from models.vgg import get_fine_tuning_parameters, get_adv_fine_tuning_parameters 215 | 216 | model = vgg.build_vgg( 217 | num_classes=opt.n_classes, 218 | is_adv=opt.is_place_adv, 219 | is_human_mask_adv=opt.is_mask_adv, 220 | alpha=opt.alpha, 221 | alpha_hm=opt.alpha_hm, 222 | num_places_classes=opt.num_places_classes, 223 | num_place_hidden_layers=opt.num_place_hidden_layers, 224 | num_human_mask_adv_hidden_layers=opt.num_human_mask_adv_hidden_layers) 225 | 226 | if not opt.no_cuda: 227 | model = model.cuda() 228 | model = nn.DataParallel(model, device_ids=None) 229 | 230 | if opt.pretrain_path: 231 | print('loading pretrained model {}'.format(opt.pretrain_path)) 232 | pretrain = torch.load(opt.pretrain_path) 233 | 234 | if opt.model != 'vgg': 235 | assert opt.arch == pretrain['arch'] 236 | # else: 237 | # pdb.set_trace() 238 | # pdb.set_trace() 239 | 240 | # model.load_state_dict(pretrain['state_dict']) 241 | model_dict = model.state_dict() 242 | 243 | # 1. filter out unnecessary keys and the last fc layers' weights 244 | pretrained_dict = dict() 245 | if 'state_dict' in pretrain: 246 | for k,v in pretrain['state_dict'].items(): 247 | if ((k in model_dict) and (v.shape == model_dict[k].shape)): 248 | pretrained_dict[k] = v 249 | else: 250 | for k,v in pretrain.items(): 251 | new_k = 'module.vgg.'+ k 252 | if ((new_k in model_dict) and (v.shape == model_dict[new_k].shape)): 253 | pretrained_dict[new_k] = v 254 | # 2. overwrite entries in the existing state dict 255 | model_dict.update(pretrained_dict) 256 | # 3. 
load the new state dict 257 | model.load_state_dict(model_dict) 258 | 259 | if not opt.not_replace_last_fc: 260 | if opt.model == 'densenet': 261 | model.module.classifier = nn.Linear( 262 | model.module.classifier.in_features, opt.n_finetune_classes) 263 | model.module.classifier = model.module.classifier.cuda() 264 | else: 265 | model.module.fc = nn.Linear(model.module.fc.in_features, 266 | opt.n_finetune_classes) 267 | model.module.fc = model.module.fc.cuda() 268 | 269 | if opt.is_place_adv or opt.is_mask_cross_entropy or opt.is_mask_entropy: 270 | # pdb.set_trace() 271 | parameters = get_adv_fine_tuning_parameters(model, opt.ft_begin_index, opt.new_layer_lr, not_replace_last_fc=opt.not_replace_last_fc, is_human_mask_adv=opt.is_mask_adv, slower_place_mlp=opt.slower_place_mlp, slower_hm_mlp=opt.slower_hm_mlp) 272 | else: 273 | parameters = get_fine_tuning_parameters(model, opt.ft_begin_index) 274 | 275 | return model, parameters 276 | else: 277 | if opt.pretrain_path: 278 | print('loading pretrained model {}'.format(opt.pretrain_path)) 279 | pretrain = torch.load(opt.pretrain_path) 280 | 281 | if opt.model != 'vgg': 282 | assert opt.arch == pretrain['arch'] 283 | # else: 284 | # pdb.set_trace() 285 | 286 | # model.load_state_dict(pretrain['state_dict']) 287 | model_dict = model.state_dict() 288 | 289 | # 1. filter out unnecessary keys and the last fc layers' weights 290 | pretrained_dict = dict() 291 | if 'state_dict' in pretrain: 292 | for k,v in pretrain['state_dict'].items(): 293 | if ((k in model_dict) and (v.shape == model_dict[k].shape)): 294 | pretrained_dict[k] = v 295 | else: 296 | for k,v in pretrain.items(): 297 | new_k = 'module.vgg.'+ k 298 | if ((new_k in model_dict) and (v.shape == model_dict[new_k].shape)): 299 | pretrained_dict[new_k] = v 300 | # 2. overwrite entries in the existing state dict 301 | model_dict.update(pretrained_dict) 302 | # 3. 
load the new state dict 303 | model.load_state_dict(model_dict) 304 | 305 | if not opt.not_replace_last_fc: 306 | if opt.model == 'densenet': 307 | model.classifier = nn.Linear( 308 | model.classifier.in_features, opt.n_finetune_classes) 309 | else: 310 | model.fc = nn.Linear(model.fc.in_features, 311 | opt.n_finetune_classes) 312 | 313 | if opt.is_place_adv: 314 | parameters = get_adv_fine_tuning_parameters(model, opt.ft_begin_index, opt.new_layer_lr, not_replace_last_fc=opt.not_replace_last_fc, is_human_mask_adv=opt.is_mask_adv, slower_place_mlp=opt.slower_place_mlp, slower_hm_mlp=opt.slower_hm_mlp) 315 | else: 316 | parameters = get_fine_tuning_parameters(model, opt.ft_begin_index) 317 | return model, parameters 318 | 319 | return model, model.parameters() 320 | -------------------------------------------------------------------------------- /models/pre_act_resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = [ 9 | 'PreActivationResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 10 | 'resnet152', 'resnet200' 11 | ] 12 | 13 | 14 | def conv3x3x3(in_planes, out_planes, stride=1): 15 | # 3x3x3 convolution with padding 16 | return nn.Conv3d( 17 | in_planes, 18 | out_planes, 19 | kernel_size=3, 20 | stride=stride, 21 | padding=1, 22 | bias=False) 23 | 24 | 25 | def downsample_basic_block(x, planes, stride): 26 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 27 | zero_pads = torch.Tensor( 28 | out.size(0), planes - out.size(1), out.size(2), out.size(3), 29 | out.size(4)).zero_() 30 | if isinstance(out.data, torch.cuda.FloatTensor): 31 | zero_pads = zero_pads.cuda() 32 | 33 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 34 | 35 | return out 36 | 37 | 38 | class PreActivationBasicBlock(nn.Module): 39 | expansion = 1 40 | 41 | def __init__(self, inplanes, planes, stride=1, downsample=None): 42 | super(PreActivationBasicBlock, self).__init__() 43 | self.bn1 = nn.BatchNorm3d(inplanes) 44 | self.conv1 = conv3x3x3(inplanes, planes, stride) 45 | self.bn2 = nn.BatchNorm3d(planes) 46 | self.conv2 = conv3x3x3(planes, planes) 47 | self.relu = nn.ReLU(inplace=True) 48 | self.downsample = downsample 49 | self.stride = stride 50 | 51 | def forward(self, x): 52 | residual = x 53 | 54 | out = self.bn1(x) 55 | out = self.relu(out) 56 | out = self.conv1(out) 57 | 58 | out = self.bn2(out) 59 | out = self.relu(out) 60 | out = self.conv2(out) 61 | 62 | if self.downsample is not None: 63 | residual = self.downsample(x) 64 | 65 | out += residual 66 | 67 | return out 68 | 69 | 70 | class PreActivationBottleneck(nn.Module): 71 | expansion = 4 72 | 73 | def __init__(self, inplanes, planes, stride=1, downsample=None): 74 | super(PreActivationBottleneck, self).__init__() 75 | self.bn1 = nn.BatchNorm3d(inplanes) 76 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 77 | self.bn2 = nn.BatchNorm3d(planes) 78 | self.conv2 = nn.Conv3d( 79 | planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 80 | self.bn3 = nn.BatchNorm3d(planes) 81 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False) 82 | self.relu = nn.ReLU(inplace=True) 83 | self.downsample = downsample 84 | self.stride = stride 85 | 86 | def forward(self, x): 87 | residual = x 88 | 89 | out = self.bn1(x) 90 | out = self.relu(out) 91 | out = self.conv1(out) 92 | 93 | out = 
self.bn2(out) 94 | out = self.relu(out) 95 | out = self.conv2(out) 96 | 97 | out = self.bn3(out) 98 | out = self.relu(out) 99 | out = self.conv3(out) 100 | 101 | if self.downsample is not None: 102 | residual = self.downsample(x) 103 | 104 | out += residual 105 | 106 | return out 107 | 108 | 109 | class PreActivationResNet(nn.Module): 110 | 111 | def __init__(self, 112 | block, 113 | layers, 114 | sample_size, 115 | sample_duration, 116 | shortcut_type='B', 117 | num_classes=400): 118 | self.inplanes = 64 119 | super(PreActivationResNet, self).__init__() 120 | self.conv1 = nn.Conv3d( 121 | 3, 122 | 64, 123 | kernel_size=7, 124 | stride=(1, 2, 2), 125 | padding=(3, 3, 3), 126 | bias=False) 127 | self.bn1 = nn.BatchNorm3d(64) 128 | self.relu = nn.ReLU(inplace=True) 129 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 130 | self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type) 131 | self.layer2 = self._make_layer( 132 | block, 128, layers[1], shortcut_type, stride=2) 133 | self.layer3 = self._make_layer( 134 | block, 256, layers[2], shortcut_type, stride=2) 135 | self.layer4 = self._make_layer( 136 | block, 512, layers[3], shortcut_type, stride=2) 137 | last_duration = int(math.ceil(sample_duration / 16)) 138 | last_size = int(math.ceil(sample_size / 32)) 139 | self.avgpool = nn.AvgPool3d( 140 | (last_duration, last_size, last_size), stride=1) 141 | self.fc = nn.Linear(512 * block.expansion, num_classes) 142 | 143 | for m in self.modules(): 144 | if isinstance(m, nn.Conv3d): 145 | m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out') 146 | elif isinstance(m, nn.BatchNorm3d): 147 | m.weight.data.fill_(1) 148 | m.bias.data.zero_() 149 | 150 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 151 | downsample = None 152 | if stride != 1 or self.inplanes != planes * block.expansion: 153 | if shortcut_type == 'A': 154 | downsample = partial( 155 | downsample_basic_block, 156 | planes=planes * block.expansion, 157 | stride=stride) 158 | else: 159 | downsample = nn.Sequential( 160 | nn.Conv3d( 161 | self.inplanes, 162 | planes * block.expansion, 163 | kernel_size=1, 164 | stride=stride, 165 | bias=False), nn.BatchNorm3d(planes * block.expansion)) 166 | 167 | layers = [] 168 | layers.append(block(self.inplanes, planes, stride, downsample)) 169 | self.inplanes = planes * block.expansion 170 | for i in range(1, blocks): 171 | layers.append(block(self.inplanes, planes)) 172 | 173 | return nn.Sequential(*layers) 174 | 175 | def forward(self, x): 176 | x = self.conv1(x) 177 | x = self.bn1(x) 178 | x = self.relu(x) 179 | x = self.maxpool(x) 180 | 181 | x = self.layer1(x) 182 | x = self.layer2(x) 183 | x = self.layer3(x) 184 | x = self.layer4(x) 185 | 186 | x = self.avgpool(x) 187 | 188 | x = x.view(x.size(0), -1) 189 | x = self.fc(x) 190 | 191 | return x 192 | 193 | 194 | def get_fine_tuning_parameters(model, ft_begin_index): 195 | if ft_begin_index == 0: 196 | return model.parameters() 197 | 198 | ft_module_names = [] 199 | for i in range(ft_begin_index, 5): 200 | ft_module_names.append('layer{}'.format(i)) 201 | ft_module_names.append('fc') 202 | 203 | parameters = [] 204 | for k, v in model.named_parameters(): 205 | for ft_module in ft_module_names: 206 | if ft_module in k: 207 | parameters.append({'params': v}) 208 | break 209 | else: 210 | parameters.append({'params': v, 'lr': 0.0}) 211 | 212 | return parameters 213 | 214 | 215 | def resnet18(**kwargs): 216 | """Constructs a ResNet-18 model. 
217 | """ 218 | model = PreActivationResNet(PreActivationBasicBlock, [2, 2, 2, 2], **kwargs) 219 | return model 220 | 221 | 222 | def resnet34(**kwargs): 223 | """Constructs a ResNet-34 model. 224 | """ 225 | model = PreActivationResNet(PreActivationBasicBlock, [3, 4, 6, 3], **kwargs) 226 | return model 227 | 228 | 229 | def resnet50(**kwargs): 230 | """Constructs a ResNet-50 model. 231 | """ 232 | model = PreActivationResNet(PreActivationBottleneck, [3, 4, 6, 3], **kwargs) 233 | return model 234 | 235 | 236 | def resnet101(**kwargs): 237 | """Constructs a ResNet-101 model. 238 | """ 239 | model = PreActivationResNet(PreActivationBottleneck, [3, 4, 23, 3], 240 | **kwargs) 241 | return model 242 | 243 | 244 | def resnet152(**kwargs): 245 | """Constructs a ResNet-101 model. 246 | """ 247 | model = PreActivationResNet(PreActivationBottleneck, [3, 8, 36, 3], 248 | **kwargs) 249 | return model 250 | 251 | 252 | def resnet200(**kwargs): 253 | """Constructs a ResNet-101 model. 254 | """ 255 | model = PreActivationResNet(PreActivationBottleneck, [3, 24, 36, 3], 256 | **kwargs) 257 | return model 258 | -------------------------------------------------------------------------------- /models/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | # adv 8 | from models.grad_reversal import ReverseLayerF 9 | import pdb 10 | __all__ = [ 11 | 'ResNet', 'resnet10', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 12 | 'resnet152', 'resnet200' 13 | ] 14 | 15 | def conv3x3x3(in_planes, out_planes, stride=1): 16 | # 3x3x3 convolution with padding 17 | return nn.Conv3d( 18 | in_planes, 19 | out_planes, 20 | kernel_size=3, 21 | stride=stride, 22 | padding=1, 23 | bias=False) 24 | 25 | 26 | def downsample_basic_block(x, planes, stride): 27 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 28 | zero_pads = torch.Tensor( 29 | out.size(0), planes - out.size(1), out.size(2), out.size(3), 30 | out.size(4)).zero_() 31 | if isinstance(out.data, torch.cuda.FloatTensor): 32 | zero_pads = zero_pads.cuda() 33 | 34 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 35 | 36 | return out 37 | 38 | 39 | class BasicBlock(nn.Module): 40 | expansion = 1 41 | 42 | def __init__(self, inplanes, planes, stride=1, downsample=None): 43 | super(BasicBlock, self).__init__() 44 | self.conv1 = conv3x3x3(inplanes, planes, stride) 45 | self.bn1 = nn.BatchNorm3d(planes) 46 | self.relu = nn.ReLU(inplace=True) 47 | self.conv2 = conv3x3x3(planes, planes) 48 | self.bn2 = nn.BatchNorm3d(planes) 49 | self.downsample = downsample 50 | self.stride = stride 51 | 52 | def forward(self, x): 53 | residual = x 54 | 55 | out = self.conv1(x) 56 | out = self.bn1(out) 57 | out = self.relu(out) 58 | 59 | out = self.conv2(out) 60 | out = self.bn2(out) 61 | 62 | if self.downsample is not None: 63 | residual = self.downsample(x) 64 | 65 | out += residual 66 | out = self.relu(out) 67 | 68 | return out 69 | 70 | 71 | class Bottleneck(nn.Module): 72 | expansion = 4 73 | 74 | def __init__(self, inplanes, planes, stride=1, downsample=None): 75 | super(Bottleneck, self).__init__() 76 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 77 | self.bn1 = nn.BatchNorm3d(planes) 78 | self.conv2 = nn.Conv3d( 79 | planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 80 | self.bn2 = nn.BatchNorm3d(planes) 81 | self.conv3 = 
nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False) 82 | self.bn3 = nn.BatchNorm3d(planes * 4) 83 | self.relu = nn.ReLU(inplace=True) 84 | self.downsample = downsample 85 | self.stride = stride 86 | 87 | def forward(self, x): 88 | residual = x 89 | 90 | out = self.conv1(x) 91 | out = self.bn1(out) 92 | out = self.relu(out) 93 | 94 | out = self.conv2(out) 95 | out = self.bn2(out) 96 | out = self.relu(out) 97 | 98 | out = self.conv3(out) 99 | out = self.bn3(out) 100 | 101 | if self.downsample is not None: 102 | residual = self.downsample(x) 103 | 104 | out += residual 105 | out = self.relu(out) 106 | 107 | return out 108 | 109 | 110 | class MLP_Block(nn.Module): 111 | expansion = 1 112 | 113 | def __init__(self, inplanes, planes): 114 | super(MLP_Block, self).__init__() 115 | self.fc = nn.Linear(inplanes, planes) 116 | self.bn = nn.BatchNorm1d(planes) 117 | self.relu = nn.ReLU(inplace=True) 118 | 119 | def forward(self, x): 120 | out = self.fc(x) 121 | out = self.bn(out) 122 | out = self.relu(out) 123 | 124 | return out 125 | 126 | 127 | class ResNet(nn.Module): 128 | 129 | def __init__(self, 130 | block, 131 | layers, 132 | sample_size, 133 | sample_duration, 134 | shortcut_type='B', 135 | num_classes=400, 136 | is_adv=False, 137 | is_human_mask_adv=False, 138 | alpha=0.0, 139 | alpha_hm=0.0, 140 | num_places_classes=365, 141 | num_place_hidden_layers=1, 142 | num_human_mask_adv_hidden_layers=1): 143 | self.inplanes = 64 144 | 145 | # adv 146 | self.is_adv = is_adv 147 | self.is_human_mask_adv = is_human_mask_adv 148 | self.alpha = alpha 149 | self.alpha_hm = alpha_hm 150 | self.num_places_classes = num_places_classes 151 | 152 | super(ResNet, self).__init__() 153 | self.conv1 = nn.Conv3d( 154 | 3, 155 | 64, 156 | kernel_size=7, 157 | stride=(1, 2, 2), 158 | padding=(3, 3, 3), 159 | bias=False) 160 | self.bn1 = nn.BatchNorm3d(64) 161 | self.relu = nn.ReLU(inplace=True) 162 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 163 | self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type) 164 | self.layer2 = self._make_layer( 165 | block, 128, layers[1], shortcut_type, stride=2) 166 | self.layer3 = self._make_layer( 167 | block, 256, layers[2], shortcut_type, stride=2) 168 | self.layer4 = self._make_layer( 169 | block, 512, layers[3], shortcut_type, stride=2) 170 | last_duration = int(math.ceil(sample_duration / 16)) 171 | last_size = int(math.ceil(sample_size / 32)) 172 | self.avgpool = nn.AvgPool3d( 173 | (last_duration, last_size, last_size), stride=1) 174 | self.fc = nn.Linear(512 * block.expansion, num_classes) 175 | 176 | # human mask adv 177 | if self.is_human_mask_adv: 178 | self.hm_mlp = nn.Sequential() 179 | self.hm_mlp = self._make_mlp_layer(MLP_Block, 512 * block.expansion, 512 * block.expansion, num_human_mask_adv_hidden_layers) 180 | self.hm_mlp.add_module('hm_last_fc', nn.Linear(512 * block.expansion, num_classes)) 181 | 182 | # adv 183 | if self.is_adv: 184 | self.place_mlp = nn.Sequential() 185 | self.place_mlp = self._make_mlp_layer(MLP_Block, 512 * block.expansion, 512 * block.expansion, num_place_hidden_layers) 186 | self.place_mlp.add_module('p_last_fc', nn.Linear(512 * block.expansion, self.num_places_classes)) 187 | 188 | for m in self.modules(): 189 | if isinstance(m, nn.Conv3d): 190 | m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out') 191 | elif isinstance(m, nn.BatchNorm3d): 192 | m.weight.data.fill_(1) 193 | m.bias.data.zero_() 194 | 195 | def _make_mlp_layer(self, block, inplanes, planes, blocks): 196 | layers = [] 197 
| layers.append(block(inplanes, planes)) 198 | for i in range(1, blocks): 199 | layers.append(block(inplanes, planes)) 200 | 201 | return nn.Sequential(*layers) 202 | 203 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 204 | downsample = None 205 | if stride != 1 or self.inplanes != planes * block.expansion: 206 | if shortcut_type == 'A': 207 | downsample = partial( 208 | downsample_basic_block, 209 | planes=planes * block.expansion, 210 | stride=stride) 211 | else: 212 | downsample = nn.Sequential( 213 | nn.Conv3d( 214 | self.inplanes, 215 | planes * block.expansion, 216 | kernel_size=1, 217 | stride=stride, 218 | bias=False), nn.BatchNorm3d(planes * block.expansion)) 219 | 220 | layers = [] 221 | layers.append(block(self.inplanes, planes, stride, downsample)) 222 | self.inplanes = planes * block.expansion 223 | for i in range(1, blocks): 224 | layers.append(block(self.inplanes, planes)) 225 | 226 | return nn.Sequential(*layers) 227 | 228 | def forward(self, x): 229 | x = self.conv1(x) 230 | x = self.bn1(x) 231 | x = self.relu(x) 232 | x = self.maxpool(x) 233 | 234 | x = self.layer1(x) 235 | x = self.layer2(x) 236 | x = self.layer3(x) 237 | x = self.layer4(x) 238 | 239 | x = self.avgpool(x) 240 | 241 | x = x.view(x.size(0), -1) 242 | 243 | # adv 244 | if self.is_human_mask_adv: 245 | rev_x_hm = ReverseLayerF.apply(x, self.alpha_hm) 246 | if self.is_adv: 247 | rev_x = ReverseLayerF.apply(x, self.alpha) 248 | dom_x = self.place_mlp(rev_x) 249 | 250 | x = self.fc(x) 251 | 252 | if self.is_human_mask_adv and self.is_adv: 253 | hm_x = self.hm_mlp(rev_x_hm) 254 | return x, dom_x, hm_x 255 | elif self.is_adv: 256 | return x, dom_x 257 | elif self.is_human_mask_adv: 258 | hm_x = self.hm_mlp(rev_x_hm) 259 | return x, hm_x 260 | else: 261 | return x 262 | 263 | def get_fine_tuning_parameters(model, ft_begin_index): 264 | if ft_begin_index == 0: 265 | return model.parameters() 266 | 267 | ft_module_names = [] 268 | for i in range(ft_begin_index, 5): 269 | ft_module_names.append('layer{}'.format(i)) 270 | ft_module_names.append('fc') 271 | 272 | parameters = [] 273 | for k, v in model.named_parameters(): 274 | for ft_module in ft_module_names: 275 | if ft_module in k: 276 | parameters.append({'params': v}) 277 | break 278 | else: 279 | parameters.append({'params': v, 'lr': 0.0}) 280 | 281 | return parameters 282 | 283 | def get_adv_fine_tuning_parameters(model, ft_begin_index, new_layer_lr, not_replace_last_fc=False, is_human_mask_adv=False, slower_place_mlp=False, slower_hm_mlp=False): 284 | ft_module_names, frozen_module_names = [], [] 285 | 286 | for i in range(0, ft_begin_index): 287 | frozen_module_names.append('layer{}'.format(i)) 288 | for i in range(ft_begin_index, 5): 289 | ft_module_names.append('layer{}'.format(i)) 290 | 291 | new_module_names = [] 292 | 293 | if not slower_place_mlp: 294 | new_module_names.append('place_mlp') 295 | else: 296 | ft_module_names.append('place_mlp') 297 | if is_human_mask_adv: 298 | if not slower_hm_mlp: 299 | new_module_names.append('hm_mlp') 300 | else: 301 | ft_module_names.append('hm_mlp') 302 | if not not_replace_last_fc: 303 | new_module_names.append('fc') 304 | 305 | pretrained_parameters, new_parameters = [], [] 306 | for k, v in model.named_parameters(): 307 | for ft_module in ft_module_names: 308 | if ft_module in k: 309 | print('finetune params:{}'.format(k)) 310 | pretrained_parameters.append(v) 311 | break 312 | else: 313 | for new_module in new_module_names: 314 | if new_module in k: 315 | print('new 
params:{}'.format(k)) 316 | new_parameters.append(v) 317 | break 318 | else: 319 | for frozen_module in frozen_module_names: 320 | if frozen_module in k: 321 | print('frozen:{}'.format(k)) 322 | pretrained_parameters.append(v) 323 | break 324 | else: 325 | print('finetune params:{}'.format(k)) 326 | pretrained_parameters.append(v) 327 | 328 | return [pretrained_parameters, new_parameters] 329 | 330 | def resnet10(**kwargs): 331 | """Constructs a ResNet-18 model. 332 | """ 333 | model = ResNet(BasicBlock, [1, 1, 1, 1], **kwargs) 334 | return model 335 | 336 | 337 | def resnet18(**kwargs): 338 | """Constructs a ResNet-18 model. 339 | """ 340 | model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) 341 | return model 342 | 343 | 344 | def resnet34(**kwargs): 345 | """Constructs a ResNet-34 model. 346 | """ 347 | model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) 348 | return model 349 | 350 | 351 | def resnet50(**kwargs): 352 | """Constructs a ResNet-50 model. 353 | """ 354 | model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) 355 | return model 356 | 357 | 358 | def resnet101(**kwargs): 359 | """Constructs a ResNet-101 model. 360 | """ 361 | model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) 362 | return model 363 | 364 | 365 | def resnet152(**kwargs): 366 | """Constructs a ResNet-101 model. 367 | """ 368 | model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) 369 | return model 370 | 371 | 372 | def resnet200(**kwargs): 373 | """Constructs a ResNet-101 model. 374 | """ 375 | model = ResNet(Bottleneck, [3, 24, 36, 3], **kwargs) 376 | return model -------------------------------------------------------------------------------- /models/resnext.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = ['ResNeXt', 'resnet50', 'resnet101'] 9 | 10 | 11 | def conv3x3x3(in_planes, out_planes, stride=1): 12 | # 3x3x3 convolution with padding 13 | return nn.Conv3d( 14 | in_planes, 15 | out_planes, 16 | kernel_size=3, 17 | stride=stride, 18 | padding=1, 19 | bias=False) 20 | 21 | 22 | def downsample_basic_block(x, planes, stride): 23 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 24 | zero_pads = torch.Tensor( 25 | out.size(0), planes - out.size(1), out.size(2), out.size(3), 26 | out.size(4)).zero_() 27 | if isinstance(out.data, torch.cuda.FloatTensor): 28 | zero_pads = zero_pads.cuda() 29 | 30 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 31 | 32 | return out 33 | 34 | 35 | class ResNeXtBottleneck(nn.Module): 36 | expansion = 2 37 | 38 | def __init__(self, inplanes, planes, cardinality, stride=1, 39 | downsample=None): 40 | super(ResNeXtBottleneck, self).__init__() 41 | mid_planes = cardinality * int(planes / 32) 42 | self.conv1 = nn.Conv3d(inplanes, mid_planes, kernel_size=1, bias=False) 43 | self.bn1 = nn.BatchNorm3d(mid_planes) 44 | self.conv2 = nn.Conv3d( 45 | mid_planes, 46 | mid_planes, 47 | kernel_size=3, 48 | stride=stride, 49 | padding=1, 50 | groups=cardinality, 51 | bias=False) 52 | self.bn2 = nn.BatchNorm3d(mid_planes) 53 | self.conv3 = nn.Conv3d( 54 | mid_planes, planes * self.expansion, kernel_size=1, bias=False) 55 | self.bn3 = nn.BatchNorm3d(planes * self.expansion) 56 | self.relu = nn.ReLU(inplace=True) 57 | self.downsample = downsample 58 | self.stride = stride 59 | 60 | def forward(self, x): 61 | residual = x 62 | 63 | out = self.conv1(x) 64 | out = 
self.bn1(out) 65 | out = self.relu(out) 66 | 67 | out = self.conv2(out) 68 | out = self.bn2(out) 69 | out = self.relu(out) 70 | 71 | out = self.conv3(out) 72 | out = self.bn3(out) 73 | 74 | if self.downsample is not None: 75 | residual = self.downsample(x) 76 | 77 | out += residual 78 | out = self.relu(out) 79 | 80 | return out 81 | 82 | 83 | class ResNeXt(nn.Module): 84 | 85 | def __init__(self, 86 | block, 87 | layers, 88 | sample_size, 89 | sample_duration, 90 | shortcut_type='B', 91 | cardinality=32, 92 | num_classes=400): 93 | self.inplanes = 64 94 | super(ResNeXt, self).__init__() 95 | self.conv1 = nn.Conv3d( 96 | 3, 97 | 64, 98 | kernel_size=7, 99 | stride=(1, 2, 2), 100 | padding=(3, 3, 3), 101 | bias=False) 102 | self.bn1 = nn.BatchNorm3d(64) 103 | self.relu = nn.ReLU(inplace=True) 104 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 105 | self.layer1 = self._make_layer(block, 128, layers[0], shortcut_type, 106 | cardinality) 107 | self.layer2 = self._make_layer( 108 | block, 256, layers[1], shortcut_type, cardinality, stride=2) 109 | self.layer3 = self._make_layer( 110 | block, 512, layers[2], shortcut_type, cardinality, stride=2) 111 | self.layer4 = self._make_layer( 112 | block, 1024, layers[3], shortcut_type, cardinality, stride=2) 113 | last_duration = int(math.ceil(sample_duration / 16)) 114 | last_size = int(math.ceil(sample_size / 32)) 115 | self.avgpool = nn.AvgPool3d( 116 | (last_duration, last_size, last_size), stride=1) 117 | self.fc = nn.Linear(cardinality * 32 * block.expansion, num_classes) 118 | 119 | for m in self.modules(): 120 | if isinstance(m, nn.Conv3d): 121 | m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out') 122 | elif isinstance(m, nn.BatchNorm3d): 123 | m.weight.data.fill_(1) 124 | m.bias.data.zero_() 125 | 126 | def _make_layer(self, 127 | block, 128 | planes, 129 | blocks, 130 | shortcut_type, 131 | cardinality, 132 | stride=1): 133 | downsample = None 134 | if stride != 1 or self.inplanes != planes * block.expansion: 135 | if shortcut_type == 'A': 136 | downsample = partial( 137 | downsample_basic_block, 138 | planes=planes * block.expansion, 139 | stride=stride) 140 | else: 141 | downsample = nn.Sequential( 142 | nn.Conv3d( 143 | self.inplanes, 144 | planes * block.expansion, 145 | kernel_size=1, 146 | stride=stride, 147 | bias=False), nn.BatchNorm3d(planes * block.expansion)) 148 | 149 | layers = [] 150 | layers.append( 151 | block(self.inplanes, planes, cardinality, stride, downsample)) 152 | self.inplanes = planes * block.expansion 153 | for i in range(1, blocks): 154 | layers.append(block(self.inplanes, planes, cardinality)) 155 | 156 | return nn.Sequential(*layers) 157 | 158 | def forward(self, x): 159 | x = self.conv1(x) 160 | x = self.bn1(x) 161 | x = self.relu(x) 162 | x = self.maxpool(x) 163 | 164 | x = self.layer1(x) 165 | x = self.layer2(x) 166 | x = self.layer3(x) 167 | x = self.layer4(x) 168 | 169 | x = self.avgpool(x) 170 | 171 | x = x.view(x.size(0), -1) 172 | x = self.fc(x) 173 | 174 | return x 175 | 176 | 177 | def get_fine_tuning_parameters(model, ft_begin_index): 178 | if ft_begin_index == 0: 179 | return model.parameters() 180 | 181 | ft_module_names = [] 182 | for i in range(ft_begin_index, 5): 183 | ft_module_names.append('layer{}'.format(i)) 184 | ft_module_names.append('fc') 185 | 186 | parameters = [] 187 | for k, v in model.named_parameters(): 188 | for ft_module in ft_module_names: 189 | if ft_module in k: 190 | parameters.append({'params': v}) 191 | break 192 | else: 193 | 
parameters.append({'params': v, 'lr': 0.0}) 194 | 195 | return parameters 196 | 197 | 198 | def resnet50(**kwargs): 199 | """Constructs a ResNet-50 model. 200 | """ 201 | model = ResNeXt(ResNeXtBottleneck, [3, 4, 6, 3], **kwargs) 202 | return model 203 | 204 | 205 | def resnet101(**kwargs): 206 | """Constructs a ResNet-101 model. 207 | """ 208 | model = ResNeXt(ResNeXtBottleneck, [3, 4, 23, 3], **kwargs) 209 | return model 210 | 211 | 212 | def resnet152(**kwargs): 213 | """Constructs a ResNet-101 model. 214 | """ 215 | model = ResNeXt(ResNeXtBottleneck, [3, 8, 36, 3], **kwargs) 216 | return model 217 | -------------------------------------------------------------------------------- /models/vgg.py: -------------------------------------------------------------------------------- 1 | """ VGG16 network Class 2 | Adapted from Gurkirt Singh's code: https://github.com/gurkirt/realtime-action-detection/blob/master/ssd.py 3 | """ 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.autograd import Variable 9 | import os 10 | # adv 11 | from models.grad_reversal import ReverseLayerF 12 | import pdb 13 | 14 | class MLP_Block(nn.Module): 15 | def __init__(self, inplanes, planes): 16 | super(MLP_Block, self).__init__() 17 | self.fc = nn.Linear(inplanes, planes) 18 | # self.bn = nn.BatchNorm1d(planes) 19 | self.relu = nn.ReLU(inplace=True) 20 | 21 | def forward(self, x): 22 | out = self.fc(x) 23 | # out = self.bn(out) 24 | out = self.relu(out) 25 | 26 | return out 27 | 28 | class VGG16(nn.Module): 29 | def __init__(self, 30 | base, 31 | num_classes, 32 | is_adv=False, 33 | is_human_mask_adv=False, 34 | alpha=0.0, 35 | alpha_hm=0.0, 36 | num_places_classes=365, 37 | num_place_hidden_layers=1, 38 | num_human_mask_adv_hidden_layers=1): 39 | super(VGG16, self).__init__() 40 | self.num_classes = num_classes 41 | self.size = 300 42 | self.is_adv = is_adv 43 | self.is_human_mask_adv = is_human_mask_adv 44 | self.alpha = alpha 45 | self.alpha_hm = alpha_hm 46 | self.num_places_classes = num_places_classes 47 | 48 | self.vgg = nn.ModuleList(base) 49 | self.avgpool = nn.AdaptiveAvgPool2d((7, 7)) 50 | self.mlp = nn.Sequential( 51 | nn.Linear(1024 * 7 * 7, 4096), 52 | nn.ReLU(True), 53 | nn.Dropout(), 54 | nn.Linear(4096, 4096), 55 | nn.ReLU(True), 56 | nn.Dropout(), 57 | ) 58 | self.fc = nn.Linear(4096, self.num_classes) 59 | 60 | # human mask adv 61 | if self.is_human_mask_adv: 62 | self.hm_mlp = nn.Sequential() 63 | self.hm_mlp = self._make_mlp_layer(MLP_Block, 4096, 4096, num_human_mask_adv_hidden_layers) 64 | self.hm_mlp.add_module('hm_last_fc', nn.Linear(4096, num_classes)) 65 | 66 | # adv 67 | if self.is_adv: 68 | self.place_mlp = nn.Sequential() 69 | self.place_mlp = self._make_mlp_layer(MLP_Block, 4096, 4096, num_place_hidden_layers) 70 | self.place_mlp.add_module('p_last_fc', nn.Linear(4096, self.num_places_classes)) 71 | 72 | def forward(self, x): 73 | for k in range(len(self.vgg)): 74 | x = self.vgg[k](x) 75 | 76 | x = self.avgpool(x) 77 | x = x.view(x.size(0), -1) 78 | x = self.mlp(x) 79 | 80 | # adv 81 | if self.is_human_mask_adv: 82 | rev_x_hm = ReverseLayerF.apply(x, self.alpha_hm) 83 | if self.is_adv: 84 | rev_x = ReverseLayerF.apply(x, self.alpha) 85 | dom_x = self.place_mlp(rev_x) 86 | 87 | x = self.fc(x) 88 | 89 | if self.is_human_mask_adv and self.is_adv: 90 | hm_x = self.hm_mlp(rev_x_hm) 91 | return x, dom_x, hm_x 92 | elif self.is_adv: 93 | return x, dom_x 94 | elif self.is_human_mask_adv: 95 | hm_x = self.hm_mlp(rev_x_hm) 96 | return x, hm_x 97 | 
else: 98 | return x 99 | 100 | def _make_mlp_layer(self, block, inplanes, planes, blocks): 101 | layers = [] 102 | layers.append(block(inplanes, planes)) 103 | for i in range(1, blocks): 104 | layers.append(block(inplanes, planes)) 105 | 106 | return nn.Sequential(*layers) 107 | 108 | def load_weights(self, base_file): 109 | other, ext = os.path.splitext(base_file) 110 | if ext == '.pkl' or '.pth': 111 | print('Loading weights into state dict...') 112 | self.load_state_dict(torch.load(base_file, map_location=lambda storage, loc: storage)) 113 | print('Finished!') 114 | else: 115 | print('Sorry only .pth and .pkl files supported.') 116 | 117 | 118 | # This function is derived from torchvision VGG make_layers() 119 | # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 120 | def vgg(cfg, i, batch_norm=False): 121 | layers = [] 122 | in_channels = i 123 | for v in cfg: 124 | if v == 'M': 125 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 126 | elif v == 'C': 127 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 128 | else: 129 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 130 | if batch_norm: 131 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 132 | else: 133 | layers += [conv2d, nn.ReLU(inplace=True)] 134 | in_channels = v 135 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 136 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 137 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 138 | layers += [pool5, conv6, 139 | nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] 140 | return layers 141 | 142 | base = { 143 | '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 144 | 512, 512, 512], 145 | '512': [], 146 | } 147 | 148 | def build_vgg(**kwargs): 149 | # def build_vgg(size=300, num_classes=24): 150 | # if size != 300: 151 | # print("Error: Sorry only SSD300 is supported currently!") 152 | # return 153 | model = VGG16(vgg(base['300'], 3), **kwargs) 154 | return model 155 | 156 | def get_fine_tuning_parameters(model, ft_begin_index): 157 | if ft_begin_index > 0: 158 | print('Finetuing only partial layers is not supported') 159 | return 160 | 161 | ft_module_names, new_module_names = [], [] 162 | ft_module_names.append('vgg') 163 | 164 | new_module_names.append('mlp') 165 | new_module_names.append('fc') 166 | 167 | pretrained_parameters, new_parameters = [], [] 168 | for k, v in model.named_parameters(): 169 | for ft_module in ft_module_names: 170 | if ft_module in k: 171 | print('finetune params:{}'.format(k)) 172 | pretrained_parameters.append(v) 173 | break 174 | else: 175 | for new_module in new_module_names: 176 | if new_module in k: 177 | print('new params:{}'.format(k)) 178 | new_parameters.append(v) 179 | break 180 | 181 | return [pretrained_parameters, new_parameters] 182 | 183 | def get_adv_fine_tuning_parameters(model, ft_begin_index, new_layer_lr, not_replace_last_fc=False, is_human_mask_adv=False, slower_place_mlp=False, slower_hm_mlp=False): 184 | if ft_begin_index > 0: 185 | print('Finetuing only partial layers is not supported') 186 | return 187 | 188 | ft_module_names, new_module_names = [], [] 189 | ft_module_names.append('vgg') 190 | 191 | new_module_names.append('mlp') 192 | if not slower_place_mlp: 193 | new_module_names.append('place_mlp') 194 | else: 195 | ft_module_names.append('place_mlp') 196 | if is_human_mask_adv: 197 | if not slower_hm_mlp: 198 | new_module_names.append('hm_mlp') 199 | else: 200 | ft_module_names.append('hm_mlp') 201 | if 
not not_replace_last_fc: 202 | new_module_names.append('fc') 203 | 204 | 205 | pretrained_parameters, new_parameters = [], [] 206 | for k, v in model.named_parameters(): 207 | for ft_module in ft_module_names: 208 | if ft_module in k: 209 | print('finetune params:{}'.format(k)) 210 | pretrained_parameters.append(v) 211 | break 212 | else: 213 | for new_module in new_module_names: 214 | if new_module in k: 215 | print('new params:{}'.format(k)) 216 | new_parameters.append(v) 217 | break 218 | 219 | return [pretrained_parameters, new_parameters] 220 | -------------------------------------------------------------------------------- /models/wide_resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = ['WideResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101'] 9 | 10 | 11 | def conv3x3x3(in_planes, out_planes, stride=1): 12 | # 3x3x3 convolution with padding 13 | return nn.Conv3d( 14 | in_planes, 15 | out_planes, 16 | kernel_size=3, 17 | stride=stride, 18 | padding=1, 19 | bias=False) 20 | 21 | 22 | def downsample_basic_block(x, planes, stride): 23 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 24 | zero_pads = torch.Tensor( 25 | out.size(0), planes - out.size(1), out.size(2), out.size(3), 26 | out.size(4)).zero_() 27 | if isinstance(out.data, torch.cuda.FloatTensor): 28 | zero_pads = zero_pads.cuda() 29 | 30 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 31 | 32 | return out 33 | 34 | 35 | class WideBottleneck(nn.Module): 36 | expansion = 2 37 | 38 | def __init__(self, inplanes, planes, stride=1, downsample=None): 39 | super(WideBottleneck, self).__init__() 40 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 41 | self.bn1 = nn.BatchNorm3d(planes) 42 | self.conv2 = nn.Conv3d( 43 | planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 44 | self.bn2 = nn.BatchNorm3d(planes) 45 | self.conv3 = nn.Conv3d( 46 | planes, planes * self.expansion, kernel_size=1, bias=False) 47 | self.bn3 = nn.BatchNorm3d(planes * self.expansion) 48 | self.relu = nn.ReLU(inplace=True) 49 | self.downsample = downsample 50 | self.stride = stride 51 | 52 | def forward(self, x): 53 | residual = x 54 | 55 | out = self.conv1(x) 56 | out = self.bn1(out) 57 | out = self.relu(out) 58 | 59 | out = self.conv2(out) 60 | out = self.bn2(out) 61 | out = self.relu(out) 62 | 63 | out = self.conv3(out) 64 | out = self.bn3(out) 65 | 66 | if self.downsample is not None: 67 | residual = self.downsample(x) 68 | 69 | out += residual 70 | out = self.relu(out) 71 | 72 | return out 73 | 74 | 75 | class WideResNet(nn.Module): 76 | 77 | def __init__(self, 78 | block, 79 | layers, 80 | sample_size, 81 | sample_duration, 82 | k=1, 83 | shortcut_type='B', 84 | num_classes=400): 85 | self.inplanes = 64 86 | super(WideResNet, self).__init__() 87 | self.conv1 = nn.Conv3d( 88 | 3, 89 | 64, 90 | kernel_size=7, 91 | stride=(1, 2, 2), 92 | padding=(3, 3, 3), 93 | bias=False) 94 | self.bn1 = nn.BatchNorm3d(64) 95 | self.relu = nn.ReLU(inplace=True) 96 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 97 | self.layer1 = self._make_layer(block, 64 * k, layers[0], shortcut_type) 98 | self.layer2 = self._make_layer( 99 | block, 128 * k, layers[1], shortcut_type, stride=2) 100 | self.layer3 = self._make_layer( 101 | block, 256 * k, layers[2], shortcut_type, 
stride=2) 102 | self.layer4 = self._make_layer( 103 | block, 512 * k, layers[3], shortcut_type, stride=2) 104 | last_duration = int(math.ceil(sample_duration / 16)) 105 | last_size = int(math.ceil(sample_size / 32)) 106 | self.avgpool = nn.AvgPool3d( 107 | (last_duration, last_size, last_size), stride=1) 108 | self.fc = nn.Linear(512 * k * block.expansion, num_classes) 109 | 110 | for m in self.modules(): 111 | if isinstance(m, nn.Conv3d): 112 | m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out') 113 | elif isinstance(m, nn.BatchNorm3d): 114 | m.weight.data.fill_(1) 115 | m.bias.data.zero_() 116 | 117 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 118 | downsample = None 119 | if stride != 1 or self.inplanes != planes * block.expansion: 120 | if shortcut_type == 'A': 121 | downsample = partial( 122 | downsample_basic_block, 123 | planes=planes * block.expansion, 124 | stride=stride) 125 | else: 126 | downsample = nn.Sequential( 127 | nn.Conv3d( 128 | self.inplanes, 129 | planes * block.expansion, 130 | kernel_size=1, 131 | stride=stride, 132 | bias=False), nn.BatchNorm3d(planes * block.expansion)) 133 | 134 | layers = [] 135 | layers.append(block(self.inplanes, planes, stride, downsample)) 136 | self.inplanes = planes * block.expansion 137 | for i in range(1, blocks): 138 | layers.append(block(self.inplanes, planes)) 139 | 140 | return nn.Sequential(*layers) 141 | 142 | def forward(self, x): 143 | x = self.conv1(x) 144 | x = self.bn1(x) 145 | x = self.relu(x) 146 | x = self.maxpool(x) 147 | 148 | x = self.layer1(x) 149 | x = self.layer2(x) 150 | x = self.layer3(x) 151 | x = self.layer4(x) 152 | 153 | x = self.avgpool(x) 154 | 155 | x = x.view(x.size(0), -1) 156 | x = self.fc(x) 157 | 158 | return x 159 | 160 | 161 | def get_fine_tuning_parameters(model, ft_begin_index): 162 | if ft_begin_index == 0: 163 | return model.parameters() 164 | 165 | ft_module_names = [] 166 | for i in range(ft_begin_index, 5): 167 | ft_module_names.append('layer{}'.format(i)) 168 | ft_module_names.append('fc') 169 | 170 | parameters = [] 171 | for k, v in model.named_parameters(): 172 | for ft_module in ft_module_names: 173 | if ft_module in k: 174 | parameters.append({'params': v}) 175 | break 176 | else: 177 | parameters.append({'params': v, 'lr': 0.0}) 178 | 179 | return parameters 180 | 181 | 182 | def resnet50(**kwargs): 183 | """Constructs a ResNet-50 model. 
184 | """ 185 | model = WideResNet(WideBottleneck, [3, 4, 6, 3], **kwargs) 186 | return model 187 | -------------------------------------------------------------------------------- /sdn_packages.txt: -------------------------------------------------------------------------------- 1 | # Name Version Build Channel 2 | _libgcc_mutex 0.1 main 3 | absl-py 0.7.1 4 | asn1crypto 0.24.0 py36_0 5 | astor 0.7.1 6 | attrs 19.1.0 py36_1 7 | backcall 0.1.0 py36_0 8 | blas 1.0 mkl 9 | bleach 1.5.0 10 | bleach 3.1.0 py36_0 11 | bzip2 1.0.6 h14c3975_5 12 | ca-certificates 2019.11.28 hecc5488_0 conda-forge 13 | cairo 1.14.12 h8948797_3 14 | certifi 2019.11.28 py36_0 conda-forge 15 | cffi 1.12.3 py36h2e261b9_0 16 | chardet 3.0.4 py36_1 17 | cryptography 2.6.1 py36h1ba5d50_0 18 | cuda90 1.0 h6433d27_0 pytorch 19 | cudatoolkit 9.0 h13b8566_0 20 | cudnn 7.6.0 cuda9.0_0 21 | cycler 0.10.0 py36_0 22 | cython 0.29.6 py36he6710b0_0 23 | dbus 1.13.6 h746ee38_0 24 | decorator 4.4.0 py36_1 25 | defusedxml 0.5.0 py36_1 26 | entrypoints 0.3 py36_0 27 | enum34 1.1.6 28 | expat 2.2.6 he6710b0_0 29 | ffmpeg 4.0 hcdf2ecd_0 30 | fontconfig 2.13.0 h9420a91_0 31 | freeglut 3.0.0 hf484d3e_5 32 | freetype 2.9.1 h8a8886c_1 33 | gast 0.2.2 34 | glib 2.56.2 hd408876_0 35 | gmp 6.1.2 h6c8ec71_1 36 | graphite2 1.3.13 h23475e2_0 37 | grpcio 1.20.0 38 | gst-plugins-base 1.14.0 hbbd80ab_1 39 | gstreamer 1.14.0 hb453b48_1 40 | h5py 2.9.0 41 | harfbuzz 1.8.8 hffaf4a1_0 42 | hdf5 1.10.2 hba1933b_1 43 | html5lib 0.9999999 44 | icu 58.2 h9c2bf20_1 45 | idna 2.8 py36_0 46 | intel-openmp 2019.3 199 47 | ipykernel 5.1.0 py36h39e3cac_0 48 | ipython 7.4.0 py36h39e3cac_0 49 | ipython_genutils 0.2.0 py36_0 50 | jasper 2.0.14 h07fcdf6_1 51 | jedi 0.13.3 py36_0 52 | jinja2 2.10.1 py36_0 53 | joblib 0.13.2 py36_0 54 | jpeg 9b h024ee3a_2 55 | jsonschema 3.0.1 py36_0 56 | jupyter_client 5.2.4 py36_0 57 | jupyter_core 4.4.0 py36_0 58 | jupyterlab 0.35.4 py36hf63ae98_0 59 | jupyterlab_server 0.2.0 py36_0 60 | Keras-Applications 1.0.7 61 | Keras-Preprocessing 1.0.9 62 | kiwisolver 1.0.1 py36hf484d3e_0 63 | libedit 3.1.20181209 hc058e9b_0 64 | libffi 3.2.1 hd88cf55_4 65 | libgcc-ng 9.1.0 hdf63c60_0 66 | libgfortran-ng 7.3.0 hdf63c60_0 67 | libglu 9.0.0 hf484d3e_1 68 | libopencv 3.4.2 hb342d67_1 69 | libopus 1.3 h7b6447c_0 70 | libpng 1.6.36 hbc83047_0 71 | libsodium 1.0.16 h1bed415_0 72 | libstdcxx-ng 8.2.0 hdf63c60_1 73 | libtiff 4.0.10 h2733197_2 74 | libuuid 1.0.3 h1bed415_2 75 | libvpx 1.7.0 h439df22_0 76 | libxcb 1.13 h1bed415_1 77 | libxml2 2.9.9 he19cac6_0 78 | Markdown 3.1 79 | markupsafe 1.1.1 py36h7b6447c_0 80 | matplotlib 3.0.3 py36h5429711_0 81 | mistune 0.8.4 py36h7b6447c_0 82 | mkl 2018.0.3 1 83 | mkl-service 1.1.2 py36h90e4bf4_5 84 | mkl_fft 1.0.6 py36h7dd41cf_0 85 | mkl_random 1.0.1 py36h4414c95_1 86 | mock 2.0.0 87 | nbconvert 5.4.1 py36_3 88 | nbformat 4.4.0 py36_0 89 | nccl 1.3.5 cuda9.0_0 90 | ncurses 6.1 he6710b0_1 91 | ninja 1.9.0 py36hfd86e86_0 92 | notebook 5.7.8 py36_0 93 | numpy 1.15.4 py36h1d66e8a_0 94 | numpy-base 1.15.4 py36h81de0dd_0 95 | olefile 0.46 py36_0 96 | opencv 3.4.2 py36h6fd60c2_1 97 | openssl 1.1.1d h7b6447c_3 98 | packaging 19.0 py36_0 99 | pandas 0.24.2 py36he6710b0_0 100 | pandoc 2.2.3.2 0 101 | pandocfilters 1.4.2 py36_1 102 | parso 0.4.0 py_0 103 | pbr 5.1.3 104 | pcre 8.43 he6710b0_0 105 | pexpect 4.7.0 py36_0 106 | pickleshare 0.7.5 py36_0 107 | pillow 6.1.0 py36h34e0f95_0 108 | pip 19.1.1 py36_0 109 | pixman 0.38.0 h7b6447c_0 110 | prometheus_client 0.6.0 py36_0 111 | prompt_toolkit 2.0.9 py36_0 112 | 
protobuf 3.7.1 113 | ptyprocess 0.6.0 py36_0 114 | py-opencv 3.4.2 py36hb342d67_1 115 | pycocotools 2.0.0 116 | pycparser 2.19 py36_0 117 | pygments 2.3.1 py36_0 118 | pyopenssl 19.0.0 py36_0 119 | pyparsing 2.4.0 py_0 120 | pyqt 5.9.2 py36h05f1152_2 121 | pyrsistent 0.14.11 py36h7b6447c_0 122 | pysocks 1.7.0 py36_0 123 | python 3.6.8 h0371630_0 124 | python-dateutil 2.8.0 py36_0 125 | pytorch 0.4.1 py36ha74772b_0 126 | pytz 2019.1 py_0 127 | pyyaml 3.13 py36h14c3975_0 128 | pyzmq 18.0.0 py36he6710b0_0 129 | qt 5.9.7 h5867ecd_1 130 | readline 7.0 h7b6447c_5 131 | requests 2.21.0 py36_0 132 | scikit-learn 0.20.1 py36h4989274_0 133 | scikit-video 1.1.11 134 | scipy 1.1.0 py36hfa4b5c9_1 135 | send2trash 1.5.0 py36_0 136 | setuptools 41.0.1 py36_0 137 | sip 4.19.8 py36hf484d3e_0 138 | six 1.12.0 py36_0 139 | sqlite 3.29.0 h7b6447c_0 140 | termcolor 1.1.0 141 | terminado 0.8.2 py36_0 142 | testpath 0.4.2 py36_0 143 | tk 8.6.8 hbc83047_0 144 | torchvision 0.2.1 py36_0 145 | tornado 6.0.2 py36h7b6447c_0 146 | traitlets 4.3.2 py36_0 147 | urllib3 1.24.2 py36_0 148 | wcwidth 0.1.7 py36_0 149 | webencodings 0.5.1 py36_1 150 | Werkzeug 0.15.2 151 | wheel 0.33.4 py36_0 152 | x264 1!152.20180806 h14c3975_0 conda-forge 153 | xz 5.2.4 h14c3975_4 154 | yaml 0.1.7 had09818_2 155 | zeromq 4.3.1 he6710b0_3 156 | zlib 1.2.11 h7b6447c_3 157 | zstd 1.3.7 h0b5b093_0 158 | -------------------------------------------------------------------------------- /utils/eval_diving48.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import sys, os 6 | sys.path.append('/home/jinchoi/src/3D-ResNets-PyTorch') 7 | from opts import parse_opts 8 | import pdb 9 | 10 | class Diving48classification(object): 11 | def __init__(self, ground_truth_filename=None, class_def_filename=None, prediction_filename=None, 12 | subset='validation', verbose=False, top_k=1): 13 | if not ground_truth_filename: 14 | raise IOError('Please input a valid ground truth file.') 15 | if not prediction_filename: 16 | raise IOError('Please input a valid prediction file.') 17 | self.subset = subset 18 | self.verbose = verbose 19 | self.top_k = top_k 20 | self.ap = None 21 | self.hit_at_k = None 22 | # Import ground truth and predictions. 23 | self.ground_truth = self._import_ground_truth( 24 | ground_truth_filename) 25 | self.activity_index = self._get_class_labels(class_def_filename) 26 | self.prediction = self._import_prediction(prediction_filename) 27 | 28 | if self.verbose: 29 | print ('[INIT] Loaded annotations from {} subset.'.format(subset)) 30 | nr_gt = len(self.ground_truth) 31 | print ('\tNumber of ground truth instances: {}'.format(nr_gt)) 32 | nr_pred = len(self.prediction) 33 | print ('\tNumber of predictions: {}'.format(nr_pred)) 34 | 35 | def _get_class_labels(self, data_file_path): 36 | with open(data_file_path, 'r') as data_file: 37 | data = json.load(data_file) 38 | data = ['_'.join(row) for row in data] 39 | class_labels_map = {} 40 | index = 0 41 | for class_label in data: 42 | class_labels_map[class_label] = index 43 | index += 1 44 | return class_labels_map 45 | 46 | def _import_ground_truth(self, ground_truth_filename): 47 | """Reads ground truth file, checks if it is well formatted, and returns 48 | the ground truth instances and the activity classes. 49 | 50 | Parameters 51 | ---------- 52 | ground_truth_filename : str 53 | Full path to the ground truth json file. 
54 | 55 | Outputs 56 | ------- 57 | ground_truth : df 58 | Data frame containing the ground truth instances. 59 | activity_index : dict 60 | Dictionary containing class index. 61 | """ 62 | with open(ground_truth_filename, 'r') as fobj: 63 | data = json.load(fobj) 64 | 65 | # pdb.set_trace() 66 | # Checking format 67 | # if not all([field in data.keys() for field in self.gt_fields]): 68 | # raise IOError('Please input a valid ground truth file.') 69 | 70 | # Initialize data frame 71 | video_lst, label_lst = [], [] 72 | for cur_data in data: 73 | video_lst.append(cur_data['vid_name']) 74 | label_lst.append(cur_data['label']) 75 | ground_truth = pd.DataFrame({'video-id': video_lst, 76 | 'label': label_lst}) 77 | ground_truth = ground_truth.drop_duplicates().reset_index(drop=True) 78 | 79 | return ground_truth 80 | 81 | def _import_prediction(self, prediction_filename): 82 | """Reads prediction file, checks if it is well formatted, and returns 83 | the prediction instances. 84 | 85 | Parameters 86 | ---------- 87 | prediction_filename : str 88 | Full path to the prediction json file. 89 | 90 | Outputs 91 | ------- 92 | prediction : df 93 | Data frame containing the prediction instances. 94 | """ 95 | with open(prediction_filename, 'r') as fobj: 96 | data = json.load(fobj) 97 | # Checking format... 98 | # if not all([field in data.keys() for field in self.pred_fields]): 99 | # raise IOError('Please input a valid prediction file.') 100 | 101 | # Initialize data frame 102 | video_lst, label_lst, score_lst = [], [], [] 103 | for videoid, v in data['results'].items(): 104 | for result in v: 105 | label = self.activity_index[result['label']] 106 | video_lst.append(videoid) 107 | label_lst.append(label) 108 | score_lst.append(result['score']) 109 | prediction = pd.DataFrame({'video-id': video_lst, 110 | 'label': label_lst, 111 | 'score': score_lst}) 112 | return prediction 113 | 114 | def evaluate(self): 115 | """Evaluates a prediction file. For the detection task we measure the 116 | interpolated mean average precision to measure the performance of a 117 | method. 118 | """ 119 | hit_at_k = compute_video_hit_at_k(self.ground_truth, 120 | self.prediction, top_k=self.top_k) 121 | if self.verbose: 122 | print ('[RESULTS] Performance on Diving48 video ' 123 | 'classification task.') 124 | print ('\tError@{}: {}'.format(self.top_k, 1.0 - hit_at_k)) 125 | #print '\tAvg Hit@{}: {}'.format(self.top_k, avg_hit_at_k) 126 | self.hit_at_k = hit_at_k 127 | 128 | ################################################################################ 129 | # Metrics 130 | ################################################################################ 131 | def compute_video_hit_at_k(ground_truth, prediction, top_k=3): 132 | """Compute accuracy at k prediction between ground truth and 133 | predictions data frames. This code is greatly inspired by evaluation 134 | performed in Karpathy et al. CVPR14. 135 | 136 | Parameters 137 | ---------- 138 | ground_truth : df 139 | Data frame containing the ground truth instances. 140 | Required fields: ['video-id', 'label'] 141 | prediction : df 142 | Data frame containing the prediction instances. 143 | Required fields: ['video-id, 'label', 'score'] 144 | 145 | Outputs 146 | ------- 147 | acc : float 148 | Top k accuracy score. 
149 | """ 150 | video_ids = np.unique(ground_truth['video-id'].values) 151 | avg_hits_per_vid = np.zeros(video_ids.size) 152 | for i, vid in enumerate(video_ids): 153 | pred_idx = prediction['video-id'] == vid 154 | if not pred_idx.any(): 155 | continue 156 | this_pred = prediction.loc[pred_idx].reset_index(drop=True) 157 | # Get top K predictions sorted by decreasing score. 158 | sort_idx = this_pred['score'].values.argsort()[::-1][:top_k] 159 | this_pred = this_pred.loc[sort_idx].reset_index(drop=True) 160 | # Get labels and compare against ground truth. 161 | pred_label = this_pred['label'].tolist() 162 | gt_idx = ground_truth['video-id'] == vid 163 | gt_label = ground_truth.loc[gt_idx]['label'].tolist() 164 | avg_hits_per_vid[i] = np.mean([1 if this_label in pred_label else 0 165 | for this_label in gt_label]) 166 | return float(avg_hits_per_vid.mean()) 167 | 168 | 169 | if __name__ == '__main__': 170 | opt = parse_opts() 171 | opt.class_def_path = os.path.join(opt.root_path, opt.annotation_path, 'Diving48_vocab.json') 172 | 173 | opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path, 'Diving48_test.json') 174 | 175 | # pdb.set_trace() 176 | diving48_classification = Diving48classification(opt.annotation_path, opt.class_def_path, opt.prediction_path, subset='val', verbose=True, top_k=1) 177 | diving48_classification.evaluate() 178 | print(diving48_classification.hit_at_k) 179 | -------------------------------------------------------------------------------- /utils/eval_hmdb51.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pdb 6 | import sys, os 7 | sys.path.append('/home/jinchoi/src/3D-ResNets-PyTorch') 8 | from opts import parse_opts 9 | 10 | 11 | class HMDBclassification(object): 12 | 13 | def __init__(self, ground_truth_filename=None, prediction_filename=None, 14 | subset='validation', verbose=False, top_k=1): 15 | if not ground_truth_filename: 16 | raise IOError('Please input a valid ground truth file.') 17 | if not prediction_filename: 18 | raise IOError('Please input a valid prediction file.') 19 | self.subset = subset 20 | self.verbose = verbose 21 | self.top_k = top_k 22 | self.ap = None 23 | self.hit_at_k = None 24 | # Import ground truth and predictions. 25 | self.ground_truth, self.activity_index = self._import_ground_truth( 26 | ground_truth_filename) 27 | self.prediction = self._import_prediction(prediction_filename) 28 | 29 | if self.verbose: 30 | print('[INIT] Loaded annotations from {} subset.'.format(subset)) 31 | nr_gt = len(self.ground_truth) 32 | print ('\tNumber of ground truth instances: {}'.format(nr_gt)) 33 | nr_pred = len(self.prediction) 34 | print ('\tNumber of predictions: {}'.format(nr_pred)) 35 | 36 | def _import_ground_truth(self, ground_truth_filename): 37 | """Reads ground truth file, checks if it is well formatted, and returns 38 | the ground truth instances and the activity classes. 39 | 40 | Parameters 41 | ---------- 42 | ground_truth_filename : str 43 | Full path to the ground truth json file. 44 | 45 | Outputs 46 | ------- 47 | ground_truth : df 48 | Data frame containing the ground truth instances. 49 | activity_index : dict 50 | Dictionary containing class index. 
51 | """ 52 | with open(ground_truth_filename, 'r') as fobj: 53 | data = json.load(fobj) 54 | # Checking format 55 | # if not all([field in data.keys() for field in self.gt_fields]): 56 | # raise IOError('Please input a valid ground truth file.') 57 | 58 | # Initialize data frame 59 | activity_index, cidx = {}, 0 60 | video_lst, label_lst = [], [] 61 | for videoid, v in data['database'].items(): 62 | if self.subset != v['subset']: 63 | continue 64 | this_label = v['annotations']['label'] 65 | if this_label not in activity_index: 66 | activity_index[this_label] = cidx 67 | cidx += 1 68 | video_lst.append(videoid) 69 | label_lst.append(activity_index[this_label]) 70 | ground_truth = pd.DataFrame({'video-id': video_lst, 71 | 'label': label_lst}) 72 | ground_truth = ground_truth.drop_duplicates().reset_index(drop=True) 73 | return ground_truth, activity_index 74 | 75 | def _import_prediction(self, prediction_filename): 76 | """Reads prediction file, checks if it is well formatted, and returns 77 | the prediction instances. 78 | 79 | Parameters 80 | ---------- 81 | prediction_filename : str 82 | Full path to the prediction json file. 83 | 84 | Outputs 85 | ------- 86 | prediction : df 87 | Data frame containing the prediction instances. 88 | """ 89 | with open(prediction_filename, 'r') as fobj: 90 | data = json.load(fobj) 91 | # Checking format... 92 | # if not all([field in data.keys() for field in self.pred_fields]): 93 | # raise IOError('Please input a valid prediction file.') 94 | 95 | # Initialize data frame 96 | video_lst, label_lst, score_lst = [], [], [] 97 | for videoid, v in data['results'].items(): 98 | for result in v: 99 | label = self.activity_index[result['label']] 100 | video_lst.append(videoid) 101 | label_lst.append(label) 102 | score_lst.append(result['score']) 103 | prediction = pd.DataFrame({'video-id': video_lst, 104 | 'label': label_lst, 105 | 'score': score_lst}) 106 | return prediction 107 | 108 | def evaluate(self): 109 | """Evaluates a prediction file. For the detection task we measure the 110 | interpolated mean average precision to measure the performance of a 111 | method. 112 | """ 113 | hit_at_k = compute_video_hit_at_k(self.ground_truth, 114 | self.prediction, top_k=self.top_k) 115 | if self.verbose: 116 | print ('[RESULTS] Performance on HMDB-51 video ' 117 | 'classification task.') 118 | print ('\tError@{}: {}'.format(self.top_k, 1.0 - hit_at_k)) 119 | #print '\tAvg Hit@{}: {}'.format(self.top_k, avg_hit_at_k) 120 | self.hit_at_k = hit_at_k 121 | 122 | ################################################################################ 123 | # Metrics 124 | ################################################################################ 125 | def compute_video_hit_at_k(ground_truth, prediction, top_k=3): 126 | """Compute accuracy at k prediction between ground truth and 127 | predictions data frames. This code is greatly inspired by evaluation 128 | performed in Karpathy et al. CVPR14. 129 | 130 | Parameters 131 | ---------- 132 | ground_truth : df 133 | Data frame containing the ground truth instances. 134 | Required fields: ['video-id', 'label'] 135 | prediction : df 136 | Data frame containing the prediction instances. 137 | Required fields: ['video-id, 'label', 'score'] 138 | 139 | Outputs 140 | ------- 141 | acc : float 142 | Top k accuracy score. 
143 | """ 144 | video_ids = np.unique(ground_truth['video-id'].values) 145 | avg_hits_per_vid = np.zeros(video_ids.size) 146 | for i, vid in enumerate(video_ids): 147 | pred_idx = prediction['video-id'] == vid 148 | if not pred_idx.any(): 149 | continue 150 | this_pred = prediction.loc[pred_idx].reset_index(drop=True) 151 | # Get top K predictions sorted by decreasing score. 152 | sort_idx = this_pred['score'].values.argsort()[::-1][:top_k] 153 | this_pred = this_pred.loc[sort_idx].reset_index(drop=True) 154 | # Get labels and compare against ground truth. 155 | pred_label = this_pred['label'].tolist() 156 | gt_idx = ground_truth['video-id'] == vid 157 | gt_label = ground_truth.loc[gt_idx]['label'].tolist() 158 | avg_hits_per_vid[i] = np.mean([1 if this_label in pred_label else 0 159 | for this_label in gt_label]) 160 | return float(avg_hits_per_vid.mean()) 161 | 162 | if __name__ == '__main__': 163 | opt = parse_opts() 164 | opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path) 165 | 166 | hmdb_classification = HMDBclassification(opt.annotation_path, opt.prediction_path, subset='validation', verbose=True, top_k=1) 167 | hmdb_classification.evaluate() 168 | print(hmdb_classification.hit_at_k) 169 | -------------------------------------------------------------------------------- /utils/eval_kinetics.py: -------------------------------------------------------------------------------- 1 | import json 2 | import urllib2 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | API = 'http://ec2-52-11-11-89.us-west-2.compute.amazonaws.com/challenge17/api.py' 8 | 9 | def get_blocked_videos(api=API): 10 | api_url = '{}?action=get_blocked'.format(api) 11 | req = urllib2.Request(api_url) 12 | response = urllib2.urlopen(req) 13 | return json.loads(response.read()) 14 | 15 | class KINETICSclassification(object): 16 | GROUND_TRUTH_FIELDS = ['database', 'labels'] 17 | PREDICTION_FIELDS = ['results', 'version', 'external_data'] 18 | 19 | def __init__(self, ground_truth_filename=None, prediction_filename=None, 20 | ground_truth_fields=GROUND_TRUTH_FIELDS, 21 | prediction_fields=PREDICTION_FIELDS, 22 | subset='validation', verbose=False, top_k=1, 23 | check_status=True): 24 | if not ground_truth_filename: 25 | raise IOError('Please input a valid ground truth file.') 26 | if not prediction_filename: 27 | raise IOError('Please input a valid prediction file.') 28 | self.subset = subset 29 | self.verbose = verbose 30 | self.gt_fields = ground_truth_fields 31 | self.pred_fields = prediction_fields 32 | self.top_k = top_k 33 | self.ap = None 34 | self.hit_at_k = None 35 | self.check_status = check_status 36 | # Retrieve blocked videos from server. 37 | if self.check_status: 38 | self.blocked_videos = get_blocked_videos() 39 | else: 40 | self.blocked_videos = list() 41 | # Import ground truth and predictions. 42 | self.ground_truth, self.activity_index = self._import_ground_truth( 43 | ground_truth_filename) 44 | self.prediction = self._import_prediction(prediction_filename) 45 | 46 | if self.verbose: 47 | print '[INIT] Loaded annotations from {} subset.'.format(subset) 48 | nr_gt = len(self.ground_truth) 49 | print '\tNumber of ground truth instances: {}'.format(nr_gt) 50 | nr_pred = len(self.prediction) 51 | print '\tNumber of predictions: {}'.format(nr_pred) 52 | 53 | def _import_ground_truth(self, ground_truth_filename): 54 | """Reads ground truth file, checks if it is well formatted, and returns 55 | the ground truth instances and the activity classes. 
56 | 57 | Parameters 58 | ---------- 59 | ground_truth_filename : str 60 | Full path to the ground truth json file. 61 | 62 | Outputs 63 | ------- 64 | ground_truth : df 65 | Data frame containing the ground truth instances. 66 | activity_index : dict 67 | Dictionary containing class index. 68 | """ 69 | with open(ground_truth_filename, 'r') as fobj: 70 | data = json.load(fobj) 71 | # Checking format 72 | # if not all([field in data.keys() for field in self.gt_fields]): 73 | # raise IOError('Please input a valid ground truth file.') 74 | 75 | # Initialize data frame 76 | activity_index, cidx = {}, 0 77 | video_lst, label_lst = [], [] 78 | for videoid, v in data['database'].iteritems(): 79 | if self.subset != v['subset']: 80 | continue 81 | if videoid in self.blocked_videos: 82 | continue 83 | this_label = v['annotations']['label'] 84 | if this_label not in activity_index: 85 | activity_index[this_label] = cidx 86 | cidx += 1 87 | video_lst.append(videoid[:-14]) 88 | label_lst.append(activity_index[this_label]) 89 | ground_truth = pd.DataFrame({'video-id': video_lst, 90 | 'label': label_lst}) 91 | ground_truth = ground_truth.drop_duplicates().reset_index(drop=True) 92 | return ground_truth, activity_index 93 | 94 | def _import_prediction(self, prediction_filename): 95 | """Reads prediction file, checks if it is well formatted, and returns 96 | the prediction instances. 97 | 98 | Parameters 99 | ---------- 100 | prediction_filename : str 101 | Full path to the prediction json file. 102 | 103 | Outputs 104 | ------- 105 | prediction : df 106 | Data frame containing the prediction instances. 107 | """ 108 | with open(prediction_filename, 'r') as fobj: 109 | data = json.load(fobj) 110 | # Checking format... 111 | # if not all([field in data.keys() for field in self.pred_fields]): 112 | # raise IOError('Please input a valid prediction file.') 113 | 114 | # Initialize data frame 115 | video_lst, label_lst, score_lst = [], [], [] 116 | for videoid, v in data['results'].iteritems(): 117 | if videoid in self.blocked_videos: 118 | continue 119 | for result in v: 120 | label = self.activity_index[result['label']] 121 | video_lst.append(videoid) 122 | label_lst.append(label) 123 | score_lst.append(result['score']) 124 | prediction = pd.DataFrame({'video-id': video_lst, 125 | 'label': label_lst, 126 | 'score': score_lst}) 127 | return prediction 128 | 129 | def evaluate(self): 130 | """Evaluates a prediction file. For the detection task we measure the 131 | interpolated mean average precision to measure the performance of a 132 | method. 
133 | """ 134 | hit_at_k = compute_video_hit_at_k(self.ground_truth, 135 | self.prediction, top_k=self.top_k) 136 | # avg_hit_at_k = compute_video_hit_at_k( 137 | # self.ground_truth, self.prediction, top_k=self.top_k, avg=True) 138 | if self.verbose: 139 | print ('[RESULTS] Performance on Kinetics video ' 140 | 'classification task.') 141 | # print '\tMean Average Precision: {}'.format(ap.mean()) 142 | print '\tError@{}: {}'.format(self.top_k, 1.0 - hit_at_k) 143 | #print '\tAvg Hit@{}: {}'.format(self.top_k, avg_hit_at_k) 144 | # self.ap = ap 145 | self.hit_at_k = hit_at_k 146 | # self.avg_hit_at_k = avg_hit_at_k 147 | 148 | ################################################################################ 149 | # Metrics 150 | ################################################################################ 151 | 152 | def compute_video_hit_at_k(ground_truth, prediction, top_k=3, avg=False): 153 | """Compute accuracy at k prediction between ground truth and 154 | predictions data frames. This code is greatly inspired by evaluation 155 | performed in Karpathy et al. CVPR14. 156 | 157 | Parameters 158 | ---------- 159 | ground_truth : df 160 | Data frame containing the ground truth instances. 161 | Required fields: ['video-id', 'label'] 162 | prediction : df 163 | Data frame containing the prediction instances. 164 | Required fields: ['video-id, 'label', 'score'] 165 | 166 | Outputs 167 | ------- 168 | acc : float 169 | Top k accuracy score. 170 | """ 171 | video_ids = np.unique(ground_truth['video-id'].values) 172 | avg_hits_per_vid = np.zeros(video_ids.size) 173 | for i, vid in enumerate(video_ids): 174 | pred_idx = prediction['video-id'] == vid 175 | if not pred_idx.any(): 176 | continue 177 | this_pred = prediction.loc[pred_idx].reset_index(drop=True) 178 | # Get top K predictions sorted by decreasing score. 179 | sort_idx = this_pred['score'].values.argsort()[::-1][:top_k] 180 | this_pred = this_pred.loc[sort_idx].reset_index(drop=True) 181 | # Get labels and compare against ground truth. 182 | pred_label = this_pred['label'].tolist() 183 | gt_idx = ground_truth['video-id'] == vid 184 | gt_label = ground_truth.loc[gt_idx]['label'].tolist() 185 | avg_hits_per_vid[i] = np.mean([1 if this_label in pred_label else 0 186 | for this_label in gt_label]) 187 | if not avg: 188 | avg_hits_per_vid[i] = np.ceil(avg_hits_per_vid[i]) 189 | return float(avg_hits_per_vid.mean()) 190 | -------------------------------------------------------------------------------- /utils/eval_ucf101.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import sys, os 6 | sys.path.append('/home/jinchoi/src/3D-ResNets-PyTorch') 7 | from opts import parse_opts 8 | 9 | class UCFclassification(object): 10 | 11 | def __init__(self, ground_truth_filename=None, prediction_filename=None, 12 | subset='validation', verbose=False, top_k=1): 13 | if not ground_truth_filename: 14 | raise IOError('Please input a valid ground truth file.') 15 | if not prediction_filename: 16 | raise IOError('Please input a valid prediction file.') 17 | self.subset = subset 18 | self.verbose = verbose 19 | self.top_k = top_k 20 | self.ap = None 21 | self.hit_at_k = None 22 | # Import ground truth and predictions. 
23 | self.ground_truth, self.activity_index = self._import_ground_truth( 24 | ground_truth_filename) 25 | self.prediction = self._import_prediction(prediction_filename) 26 | 27 | if self.verbose: 28 | print ('[INIT] Loaded annotations from {} subset.'.format(subset)) 29 | nr_gt = len(self.ground_truth) 30 | print ('\tNumber of ground truth instances: {}'.format(nr_gt)) 31 | nr_pred = len(self.prediction) 32 | print ('\tNumber of predictions: {}'.format(nr_pred)) 33 | 34 | def _import_ground_truth(self, ground_truth_filename): 35 | """Reads ground truth file, checks if it is well formatted, and returns 36 | the ground truth instances and the activity classes. 37 | 38 | Parameters 39 | ---------- 40 | ground_truth_filename : str 41 | Full path to the ground truth json file. 42 | 43 | Outputs 44 | ------- 45 | ground_truth : df 46 | Data frame containing the ground truth instances. 47 | activity_index : dict 48 | Dictionary containing class index. 49 | """ 50 | with open(ground_truth_filename, 'r') as fobj: 51 | data = json.load(fobj) 52 | # Checking format 53 | # if not all([field in data.keys() for field in self.gt_fields]): 54 | # raise IOError('Please input a valid ground truth file.') 55 | 56 | # Initialize data frame 57 | activity_index, cidx = {}, 0 58 | video_lst, label_lst = [], [] 59 | for videoid, v in data['database'].items(): 60 | if self.subset != v['subset']: 61 | continue 62 | this_label = v['annotations']['label'] 63 | if this_label not in activity_index: 64 | activity_index[this_label] = cidx 65 | cidx += 1 66 | video_lst.append(videoid) 67 | label_lst.append(activity_index[this_label]) 68 | ground_truth = pd.DataFrame({'video-id': video_lst, 69 | 'label': label_lst}) 70 | ground_truth = ground_truth.drop_duplicates().reset_index(drop=True) 71 | return ground_truth, activity_index 72 | 73 | def _import_prediction(self, prediction_filename): 74 | """Reads prediction file, checks if it is well formatted, and returns 75 | the prediction instances. 76 | 77 | Parameters 78 | ---------- 79 | prediction_filename : str 80 | Full path to the prediction json file. 81 | 82 | Outputs 83 | ------- 84 | prediction : df 85 | Data frame containing the prediction instances. 86 | """ 87 | with open(prediction_filename, 'r') as fobj: 88 | data = json.load(fobj) 89 | # Checking format... 90 | # if not all([field in data.keys() for field in self.pred_fields]): 91 | # raise IOError('Please input a valid prediction file.') 92 | 93 | # Initialize data frame 94 | video_lst, label_lst, score_lst = [], [], [] 95 | for videoid, v in data['results'].items(): 96 | for result in v: 97 | label = self.activity_index[result['label']] 98 | video_lst.append(videoid) 99 | label_lst.append(label) 100 | score_lst.append(result['score']) 101 | prediction = pd.DataFrame({'video-id': video_lst, 102 | 'label': label_lst, 103 | 'score': score_lst}) 104 | return prediction 105 | 106 | def evaluate(self): 107 | """Evaluates a prediction file. For the detection task we measure the 108 | interpolated mean average precision to measure the performance of a 109 | method. 
110 | """ 111 | hit_at_k = compute_video_hit_at_k(self.ground_truth, 112 | self.prediction, top_k=self.top_k) 113 | if self.verbose: 114 | print ('[RESULTS] Performance on UCF101 video ' 115 | 'classification task.') 116 | print ('\tError@{}: {}'.format(self.top_k, 1.0 - hit_at_k)) 117 | #print '\tAvg Hit@{}: {}'.format(self.top_k, avg_hit_at_k) 118 | self.hit_at_k = hit_at_k 119 | 120 | ################################################################################ 121 | # Metrics 122 | ################################################################################ 123 | def compute_video_hit_at_k(ground_truth, prediction, top_k=3): 124 | """Compute accuracy at k prediction between ground truth and 125 | predictions data frames. This code is greatly inspired by evaluation 126 | performed in Karpathy et al. CVPR14. 127 | 128 | Parameters 129 | ---------- 130 | ground_truth : df 131 | Data frame containing the ground truth instances. 132 | Required fields: ['video-id', 'label'] 133 | prediction : df 134 | Data frame containing the prediction instances. 135 | Required fields: ['video-id, 'label', 'score'] 136 | 137 | Outputs 138 | ------- 139 | acc : float 140 | Top k accuracy score. 141 | """ 142 | video_ids = np.unique(ground_truth['video-id'].values) 143 | avg_hits_per_vid = np.zeros(video_ids.size) 144 | for i, vid in enumerate(video_ids): 145 | pred_idx = prediction['video-id'] == vid 146 | if not pred_idx.any(): 147 | continue 148 | this_pred = prediction.loc[pred_idx].reset_index(drop=True) 149 | # Get top K predictions sorted by decreasing score. 150 | sort_idx = this_pred['score'].values.argsort()[::-1][:top_k] 151 | this_pred = this_pred.loc[sort_idx].reset_index(drop=True) 152 | # Get labels and compare against ground truth. 153 | pred_label = this_pred['label'].tolist() 154 | gt_idx = ground_truth['video-id'] == vid 155 | gt_label = ground_truth.loc[gt_idx]['label'].tolist() 156 | avg_hits_per_vid[i] = np.mean([1 if this_label in pred_label else 0 157 | for this_label in gt_label]) 158 | return float(avg_hits_per_vid.mean()) 159 | 160 | 161 | if __name__ == '__main__': 162 | opt = parse_opts() 163 | opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path) 164 | 165 | ucf_classification = UCFclassification(opt.annotation_path, opt.prediction_path,subset='validation', verbose=True, top_k=1) 166 | ucf_classification.evaluate() 167 | print(ucf_classification.hit_at_k) 168 | -------------------------------------------------------------------------------- /utils/fps.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | 7 | if __name__=="__main__": 8 | dir_path = sys.argv[1] 9 | dst_dir_path = sys.argv[2] 10 | 11 | for file_name in os.listdir(dir_path): 12 | if '.mp4' not in file_name: 13 | continue 14 | name, ext = os.path.splitext(file_name) 15 | dst_directory_path = os.path.join(dst_dir_path, name) 16 | 17 | video_file_path = os.path.join(dir_path, file_name) 18 | p = subprocess.Popen('ffprobe {}'.format(video_file_path), 19 | shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 20 | _, res = p.communicate() 21 | res = res.decode('utf-8') 22 | 23 | duration_index = res.find('Duration:') 24 | duration_str = res[(duration_index + 10):(duration_index + 21)] 25 | hour = float(duration_str[0:2]) 26 | minute = float(duration_str[3:5]) 27 | sec = float(duration_str[6:10]) 28 | total_sec = hour * 3600 + minute * 60 + sec 29 | 30 
| n_frames = len(os.listdir(dst_directory_path)) 31 | if os.path.exists(os.path.join(dst_directory_path, 'fps')): 32 | n_frames -= 1 33 | 34 | fps = round(n_frames / total_sec, 2) 35 | 36 | print(video_file_path, os.path.exists(video_file_path), fps) 37 | with open(os.path.join(dst_directory_path, 'fps'), 'w') as fps_file: 38 | fps_file.write('{}\n'.format(fps)) 39 | -------------------------------------------------------------------------------- /utils/hmdb51_json.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import json 5 | import pandas as pd 6 | 7 | def convert_csv_to_dict(csv_dir_path, split_index): 8 | database = {} 9 | for filename in os.listdir(csv_dir_path): 10 | if 'split{}'.format(split_index) not in filename: 11 | continue 12 | 13 | data = pd.read_csv(os.path.join(csv_dir_path, filename), 14 | delimiter=' ', header=None) 15 | keys = [] 16 | subsets = [] 17 | for i in range(data.shape[0]): 18 | row = data.ix[i, :] 19 | if row[1] == 0: 20 | continue 21 | elif row[1] == 1: 22 | subset = 'training' 23 | elif row[1] == 2: 24 | subset = 'validation' 25 | 26 | keys.append(row[0].split('.')[0]) 27 | subsets.append(subset) 28 | 29 | for i in range(len(keys)): 30 | key = keys[i] 31 | database[key] = {} 32 | database[key]['subset'] = subsets[i] 33 | label = '_'.join(filename.split('_')[:-2]) 34 | database[key]['annotations'] = {'label': label} 35 | 36 | return database 37 | 38 | def get_labels(csv_dir_path): 39 | labels = [] 40 | for name in os.listdir(csv_dir_path): 41 | labels.append('_'.join(name.split('_')[:-2])) 42 | return sorted(list(set(labels))) 43 | 44 | def convert_hmdb51_csv_to_activitynet_json(csv_dir_path, split_index, dst_json_path): 45 | labels = get_labels(csv_dir_path) 46 | database = convert_csv_to_dict(csv_dir_path, split_index) 47 | 48 | dst_data = {} 49 | dst_data['labels'] = labels 50 | dst_data['database'] = {} 51 | dst_data['database'].update(database) 52 | 53 | with open(dst_json_path, 'w') as dst_file: 54 | json.dump(dst_data, dst_file) 55 | 56 | if __name__ == '__main__': 57 | csv_dir_path = sys.argv[1] 58 | 59 | for split_index in range(1, 4): 60 | dst_json_path = os.path.join(csv_dir_path, 'hmdb51_{}.json'.format(split_index)) 61 | convert_hmdb51_csv_to_activitynet_json(csv_dir_path, split_index, dst_json_path) -------------------------------------------------------------------------------- /utils/kinetics_json.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import json 5 | import pandas as pd 6 | 7 | def convert_csv_to_dict(csv_path, subset): 8 | data = pd.read_csv(csv_path) 9 | 10 | keys = [] 11 | key_labels = [] 12 | for i in range(data.shape[0]): 13 | row = data.ix[i, :] 14 | basename = '%s_%s_%s' % (row['youtube_id'], 15 | '%06d' % row['time_start'], 16 | '%06d' % row['time_end']) 17 | keys.append(basename) 18 | if subset != 'testing': 19 | key_labels.append(row['label']) 20 | 21 | database = {} 22 | for i in range(len(keys)): 23 | key = keys[i] 24 | database[key] = {} 25 | database[key]['subset'] = subset 26 | if subset != 'testing': 27 | label = key_labels[i] 28 | database[key]['annotations'] = {'label': label} 29 | else: 30 | database[key]['annotations'] = {} 31 | 32 | return database 33 | 34 | def load_labels(train_csv_path): 35 | data = pd.read_csv(train_csv_path) 36 | return data['label'].unique().tolist() 
37 | 
38 | def convert_kinetics_csv_to_activitynet_json(train_csv_path, val_csv_path, test_csv_path, dst_json_path):
39 |     labels = load_labels(train_csv_path)
40 |     train_database = convert_csv_to_dict(train_csv_path, 'training')
41 |     val_database = convert_csv_to_dict(val_csv_path, 'validation')
42 |     test_database = convert_csv_to_dict(test_csv_path, 'testing')
43 |     dst_data = {}
44 |     dst_data['labels'] = labels
45 |     dst_data['database'] = {}
46 |     dst_data['database'].update(train_database)
47 |     dst_data['database'].update(val_database)
48 |     dst_data['database'].update(test_database)
49 | 
50 |     with open(dst_json_path, 'w') as dst_file:
51 |         json.dump(dst_data, dst_file)
52 | 
53 | if __name__=="__main__":
54 |     train_csv_path = sys.argv[1]
55 |     val_csv_path = sys.argv[2]
56 |     test_csv_path = sys.argv[3]
57 |     dst_json_path = sys.argv[4]
58 | 
59 |     convert_kinetics_csv_to_activitynet_json(
60 |         train_csv_path, val_csv_path, test_csv_path, dst_json_path)
61 | 
--------------------------------------------------------------------------------
/utils/n_frames_kinetics.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import os
3 | import sys
4 | import subprocess
5 | 
6 | def class_process(dir_path, class_name):
7 |     class_path = os.path.join(dir_path, class_name)
8 |     if not os.path.isdir(class_path):
9 |         return
10 | 
11 |     for file_name in os.listdir(class_path):
12 |         video_dir_path = os.path.join(class_path, file_name)
13 |         image_indices = []
14 |         for image_file_name in os.listdir(video_dir_path):
15 |             if 'image' not in image_file_name:
16 |                 continue
17 |             image_indices.append(int(image_file_name[6:11]))
18 | 
19 |         if len(image_indices) == 0:
20 |             print('no image files', video_dir_path)
21 |             n_frames = 0
22 |         else:
23 |             image_indices.sort(reverse=True)
24 |             n_frames = image_indices[0]
25 |         print(video_dir_path, n_frames)
26 |         with open(os.path.join(video_dir_path, 'n_frames'), 'w') as dst_file:
27 |             dst_file.write(str(n_frames))
28 | 
29 | 
30 | if __name__=="__main__":
31 |     dir_path = sys.argv[1]
32 |     for class_name in os.listdir(dir_path):
33 |         class_process(dir_path, class_name)
34 | 
35 |     class_name = 'test'
36 |     class_process(dir_path, class_name)
37 | 
--------------------------------------------------------------------------------
/utils/n_frames_ucf101_hmdb51.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import os
3 | import sys
4 | import subprocess
5 | import pdb
6 | 
7 | def class_process(dir_path, class_name):
8 |     class_path = os.path.join(dir_path, class_name)
9 |     if not os.path.isdir(class_path):
10 |         return
11 | 
12 |     for file_name in os.listdir(class_path):
13 |         video_dir_path = os.path.join(class_path, file_name)
14 |         image_indices = []
15 |         # pdb.set_trace()
16 |         for image_file_name in os.listdir(video_dir_path):
17 |             if 'image' not in image_file_name:
18 |                 continue
19 |             image_indices.append(int(image_file_name[6:11]))
20 | 
21 |         if len(image_indices) == 0:
22 |             print('no image files', video_dir_path)
23 |             n_frames = 0
24 |         else:
25 |             image_indices.sort(reverse=True)
26 |             n_frames = image_indices[0]
27 |         print(video_dir_path, n_frames)
28 |         with open(os.path.join(video_dir_path, 'n_frames'), 'w') as dst_file:
29 |             dst_file.write(str(n_frames))
30 | 
31 | 
32 | if __name__=="__main__":
33 |     dir_path = sys.argv[1]
34 |     for class_name in os.listdir(dir_path):
35 |         class_process(dir_path, class_name)
36 | 
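37 | # Usage sketch (an editor's note, not part of the original script; the path below
38 | # is a placeholder): after extracting frames with utils/video_jpg_ucf101_hmdb51.py,
39 | # run `python utils/n_frames_ucf101_hmdb51.py /path/to/ucf101_or_hmdb51_jpg`
40 | # to write an 'n_frames' file into every per-video frame directory.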
--------------------------------------------------------------------------------
/utils/ucf101_json.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import os
3 | import sys
4 | import json
5 | import pandas as pd
6 | 
7 | def convert_csv_to_dict(csv_path, subset):
8 |     data = pd.read_csv(csv_path, delimiter=' ', header=None)
9 |     keys = []
10 |     key_labels = []
11 |     for i in range(data.shape[0]):
12 |         row = data.ix[i, :]
13 |         slash_rows = data.ix[i, 0].split('/')
14 |         class_name = slash_rows[0]
15 |         basename = slash_rows[1].split('.')[0]
16 | 
17 |         keys.append(basename)
18 |         key_labels.append(class_name)
19 | 
20 |     database = {}
21 |     for i in range(len(keys)):
22 |         key = keys[i]
23 |         database[key] = {}
24 |         database[key]['subset'] = subset
25 |         label = key_labels[i]
26 |         database[key]['annotations'] = {'label': label}
27 | 
28 |     return database
29 | 
30 | def load_labels(label_csv_path):
31 |     data = pd.read_csv(label_csv_path, delimiter=' ', header=None)
32 |     labels = []
33 |     for i in range(data.shape[0]):
34 |         labels.append(data.ix[i, 1])
35 |     return labels
36 | 
37 | def convert_ucf101_csv_to_activitynet_json(label_csv_path, train_csv_path,
38 |                                            val_csv_path, dst_json_path):
39 |     labels = load_labels(label_csv_path)
40 |     train_database = convert_csv_to_dict(train_csv_path, 'training')
41 |     val_database = convert_csv_to_dict(val_csv_path, 'validation')
42 | 
43 |     dst_data = {}
44 |     dst_data['labels'] = labels
45 |     dst_data['database'] = {}
46 |     dst_data['database'].update(train_database)
47 |     dst_data['database'].update(val_database)
48 | 
49 |     with open(dst_json_path, 'w') as dst_file:
50 |         json.dump(dst_data, dst_file)
51 | 
52 | if __name__ == '__main__':
53 |     csv_dir_path = sys.argv[1]
54 | 
55 |     for split_index in range(1, 4):
56 |         label_csv_path = os.path.join(csv_dir_path, 'classInd.txt')
57 |         train_csv_path = os.path.join(csv_dir_path, 'trainlist0{}.txt'.format(split_index))
58 |         val_csv_path = os.path.join(csv_dir_path, 'testlist0{}.txt'.format(split_index))
59 |         dst_json_path = os.path.join(csv_dir_path, 'ucf101_0{}.json'.format(split_index))
60 | 
61 |         convert_ucf101_csv_to_activitynet_json(label_csv_path, train_csv_path,
62 |                                                val_csv_path, dst_json_path)
63 | 
--------------------------------------------------------------------------------
/utils/video_jpg.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import os
3 | import sys
4 | import subprocess
5 | 
6 | 
7 | if __name__=="__main__":
8 |     dir_path = sys.argv[1]
9 |     dst_dir_path = sys.argv[2]
10 | 
11 |     for file_name in os.listdir(dir_path):
12 |         if '.mp4' not in file_name:
13 |             continue
14 |         name, ext = os.path.splitext(file_name)
15 |         dst_directory_path = os.path.join(dst_dir_path, name)
16 | 
17 |         video_file_path = os.path.join(dir_path, file_name)
18 |         try:
19 |             if os.path.exists(dst_directory_path):
20 |                 if not os.path.exists(os.path.join(dst_directory_path, 'image_00001.jpg')):
21 |                     subprocess.call('rm -r {}'.format(dst_directory_path), shell=True)
22 |                     print('remove {}'.format(dst_directory_path))
23 |                     os.mkdir(dst_directory_path)
24 |                 else:
25 |                     continue
26 |             else:
27 |                 os.mkdir(dst_directory_path)
28 |         except:
29 |             print(dst_directory_path)
30 |             continue
31 |         cmd = 'ffmpeg -i {} -vf scale=-1:360 {}/image_%05d.jpg'.format(video_file_path, dst_directory_path)
32 |         print(cmd)
33 |         subprocess.call(cmd, shell=True)
34 |         print('\n')
35 | 
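36 | # Usage sketch (an editor's note, not part of the original script; both paths are
37 | # placeholders): `python utils/video_jpg.py /path/to/mp4_videos /path/to/jpg_videos`
38 | # extracts JPEG frames at height 360 (width scaled to keep the aspect ratio) into
39 | # one sub-directory per video, skipping videos whose image_00001.jpg already exists.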
--------------------------------------------------------------------------------
/utils/video_jpg_diving48.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import os
3 | import sys
4 | import subprocess
5 | import pdb
6 | 
7 | def class_process(dir_path, dst_dir_path, class_name, resume_vid_idx=0):
8 |     class_path = os.path.join(dir_path, class_name)
9 |     if not os.path.isdir(class_path):
10 |         return
11 | 
12 |     dst_class_path = os.path.join(dst_dir_path, class_name)
13 |     if not os.path.exists(dst_class_path):
14 |         os.mkdir(dst_class_path)
15 | 
16 |     for file_name in os.listdir(class_path)[resume_vid_idx:]:
17 |         if '.mp4' not in file_name:
18 |             continue
19 |         name, ext = os.path.splitext(file_name)
20 |         dst_directory_path = os.path.join(dst_class_path, name)
21 | 
22 |         video_file_path = os.path.join(class_path, file_name)
23 |         try:
24 |             if os.path.exists(dst_directory_path):
25 |                 if not os.path.exists(os.path.join(dst_directory_path, 'image_00001.jpg')):
26 |                     subprocess.call('rm -r \"{}\"'.format(dst_directory_path), shell=True)
27 |                     print('remove {}'.format(dst_directory_path))
28 |                     os.mkdir(dst_directory_path)
29 |                 else:
30 |                     continue
31 |             else:
32 |                 os.mkdir(dst_directory_path)
33 |         except:
34 |             print(dst_directory_path)
35 |             continue
36 |         cmd = 'ffmpeg -i \"{}\" -vf scale=-1:240 \"{}/image_%05d.jpg\"'.format(video_file_path, dst_directory_path)
37 |         print(cmd)
38 |         subprocess.call(cmd, shell=True)
39 |         print('\n')
40 | 
41 | if __name__=="__main__":
42 |     dir_path = sys.argv[1]
43 |     dst_dir_path = sys.argv[2]
44 |     resume_vid_idx = int(sys.argv[3])
45 | 
46 |     for class_name in os.listdir(dir_path):
47 |         class_process(dir_path, dst_dir_path, class_name, resume_vid_idx=resume_vid_idx)
48 | 
--------------------------------------------------------------------------------
/utils/video_jpg_kinetics.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import os
3 | import sys
4 | import subprocess
5 | 
6 | def class_process(dir_path, dst_dir_path, class_name):
7 |     class_path = os.path.join(dir_path, class_name)
8 |     if not os.path.isdir(class_path):
9 |         return
10 | 
11 |     dst_class_path = os.path.join(dst_dir_path, class_name)
12 |     if not os.path.exists(dst_class_path):
13 |         os.mkdir(dst_class_path)
14 | 
15 |     for file_name in os.listdir(class_path):
16 |         if '.mp4' not in file_name:
17 |             continue
18 |         name, ext = os.path.splitext(file_name)
19 |         dst_directory_path = os.path.join(dst_class_path, name)
20 | 
21 |         video_file_path = os.path.join(class_path, file_name)
22 |         try:
23 |             if os.path.exists(dst_directory_path):
24 |                 if not os.path.exists(os.path.join(dst_directory_path, 'image_00001.jpg')):
25 |                     subprocess.call('rm -r \"{}\"'.format(dst_directory_path), shell=True)
26 |                     print('remove {}'.format(dst_directory_path))
27 |                     os.mkdir(dst_directory_path)
28 |                 else:
29 |                     continue
30 |             else:
31 |                 os.mkdir(dst_directory_path)
32 |         except:
33 |             print(dst_directory_path)
34 |             continue
35 |         cmd = 'ffmpeg -i \"{}\" -vf scale=-1:240 \"{}/image_%05d.jpg\"'.format(video_file_path, dst_directory_path)
36 |         print(cmd)
37 |         subprocess.call(cmd, shell=True)
38 |         print('\n')
39 | 
40 | if __name__=="__main__":
41 |     dir_path = sys.argv[1]
42 |     dst_dir_path = sys.argv[2]
43 |     start_ind = int(sys.argv[3]) #inclusive
44 |     end_ind = int(sys.argv[4]) #exclusive
45 | 
46 |     for class_name in os.listdir(dir_path)[start_ind:end_ind]:
47 |         class_process(dir_path, dst_dir_path, class_name)
48 | 
49 |     class_name = 'test'
50 |     class_process(dir_path, dst_dir_path, class_name)
51 | 
--------------------------------------------------------------------------------
/utils/video_jpg_ucf101_hmdb51.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import os
3 | import sys
4 | import subprocess
5 | 
6 | def class_process(dir_path, dst_dir_path, class_name):
7 |     class_path = os.path.join(dir_path, class_name)
8 |     if not os.path.isdir(class_path):
9 |         return
10 | 
11 |     dst_class_path = os.path.join(dst_dir_path, class_name)
12 |     if not os.path.exists(dst_class_path):
13 |         os.mkdir(dst_class_path)
14 | 
15 |     for file_name in os.listdir(class_path):
16 |         if '.avi' not in file_name:
17 |             continue
18 |         name, ext = os.path.splitext(file_name)
19 |         dst_directory_path = os.path.join(dst_class_path, name)
20 | 
21 |         video_file_path = os.path.join(class_path, file_name)
22 |         try:
23 |             if os.path.exists(dst_directory_path):
24 |                 if not os.path.exists(os.path.join(dst_directory_path, 'image_00001.jpg')):
25 |                     subprocess.call('rm -r \"{}\"'.format(dst_directory_path), shell=True)
26 |                     print('remove {}'.format(dst_directory_path))
27 |                     os.mkdir(dst_directory_path)
28 |                 else:
29 |                     continue
30 |             else:
31 |                 os.mkdir(dst_directory_path)
32 |         except:
33 |             print(dst_directory_path)
34 |             continue
35 |         cmd = 'ffmpeg -i \"{}\" -vf scale=-1:240 \"{}/image_%05d.jpg\"'.format(video_file_path, dst_directory_path)
36 |         print(cmd)
37 |         subprocess.call(cmd, shell=True)
38 |         print('\n')
39 | 
40 | if __name__=="__main__":
41 |     dir_path = sys.argv[1]
42 |     dst_dir_path = sys.argv[2]
43 | 
44 |     for class_name in os.listdir(dir_path):
45 |         class_process(dir_path, dst_dir_path, class_name)
46 | 
--------------------------------------------------------------------------------