├── .gitignore ├── LICENSE ├── README.md ├── datasets ├── activitynet.py ├── dataset.py ├── diving48.py ├── hmdb51.py ├── kinetics.py └── ucf101.py ├── libs ├── mean.py ├── opts.py ├── spatial_transforms.py ├── target_transforms.py ├── temporal_transforms.py ├── test.py ├── train_epoch.py ├── utils.py └── validation_epoch.py ├── loss ├── hloss.py └── soft_cross_entropy.py ├── models ├── densenet.py ├── grad_reversal.py ├── model.py ├── pre_act_resnet.py ├── resnet.py ├── resnext.py ├── vgg.py └── wide_resnet.py ├── sdn_packages.txt ├── train.py └── utils ├── eval_diving48.py ├── eval_hmdb51.py ├── eval_kinetics.py ├── eval_ucf101.py ├── fps.py ├── hmdb51_json.py ├── kinetics_json.py ├── n_frames_kinetics.py ├── n_frames_ucf101_hmdb51.py ├── ucf101_json.py ├── video_jpg.py ├── video_jpg_diving48.py ├── video_jpg_kinetics.py └── video_jpg_ucf101_hmdb51.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.out 2 | log/ 3 | result/ 4 | pretrain/ 5 | checkpoints/ 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | env/ 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | .hypothesis/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # pyenv 80 | .python-version 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # dotenv 89 | .env 90 | 91 | # virtualenv 92 | .venv 93 | venv/ 94 | ENV/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | .DS_Store 110 | 111 | .vscode 112 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Jinwoo Choi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SDN: Scene Debiasing Network for Action Recognition in PyTorch
2 | We release the code for the paper "Why Can't I Dance in the Mall? Learning to Mitigate Scene Bias in Action Recognition". The code is built upon the [3D-ResNets-PyTorch codebase](https://github.com/kenshohara/3D-ResNets-PyTorch).
3 |
4 | For details, visit our [project website](http://chengao.vision/SDN/) or see our [full paper](https://papers.nips.cc/paper/8372-why-cant-i-dance-in-the-mall-learning-to-mitigate-scene-bias-in-action-recognition.pdf).
5 |
6 | ## Reference
7 | [Jinwoo Choi](https://sites.google.com/site/jchoivision/), [Chen Gao](https://gaochen315.github.io/), [Joseph C. E. Messou](https://josephcmessou.weebly.com/about.html), [Jia-Bin Huang](https://filebox.ece.vt.edu/~jbhuang/index.html). Why Can't I Dance in the Mall? Learning to Mitigate Scene Bias in Action Recognition. Neural Information Processing Systems (NeurIPS) 2019.
8 |
9 | ```
10 | @inproceedings{choi2019sdn,
11 |   title = {Why Can't I Dance in the Mall? Learning to Mitigate Scene Bias in Action Recognition},
12 |   author = {Choi, Jinwoo and Gao, Chen and Messou, C. E. Joseph and Huang, Jia-Bin},
13 |   booktitle={NeurIPS},
14 |   year={2019}
15 | }
16 | ```
17 |
18 | ## Requirements
19 | This codebase was developed and tested with:
20 | - Python 3.6
21 | - PyTorch 0.4.1
22 | - torchvision 0.2.1
23 | - CUDA 9.0
24 | - CUDNN 7.1
25 | - GPU: 2x P100
26 |
27 | You can find the full list of dependencies in `sdn_packages.txt`.
28 |
29 | You can install them with:
30 | ```
31 | pip install -r sdn_packages.txt
32 | ```
33 |
34 | ## Datasets
35 | ### Prepare your dataset
36 | **1. Download and pre-process the data**
37 | - Follow the [3D-ResNets-PyTorch instructions](https://github.com/kenshohara/3D-ResNets-PyTorch#preparation).
38 |
39 | **2. Download the scene and human detection data (numpy files)**
40 | - [Download the Mini-Kinetics scene pseudo labels](https://filebox.ece.vt.edu/~jinchoi/files/sdn/places_data.zip)
41 | - [Download the Mini-Kinetics human detections](https://filebox.ece.vt.edu/~jinchoi/files/sdn/detections.zip)
42 |
43 | ## Train
44 | ### Training on a source dataset (mini-Kinetics)
45 | **- Baseline model without any debiasing**
46 | ```
47 | python train.py \
48 | --video_path \
49 | --annotation_path /kinetics.json \
50 | --result_path \
51 | --root_path \
52 | --dataset kinetics \
53 | --n_classes 200 \
54 | --n_finetune_classes 200 \
55 | --model resnet \
56 | --model_depth 18 \
57 | --resnet_shortcut A \
58 | --batch_size 32 \
59 | --val_batch_size 16 \
60 | --n_threads 16 \
61 | --checkpoint 1 \
62 | --ft_begin_index 0 \
63 | --is_mask_adv \
64 | --learning_rate 0.0001 \
65 | --weight_decay 1e-5 \
66 | --n_epochs 100 \
67 | --pretrain_path
68 | ```
69 |
70 | **- SDN model with scene adversarial loss only**
71 | ```
72 | python train.py \
73 | --video_path \
74 | --annotation_path /kinetics.json \
75 | --result_path \
76 | --root_path \
77 | --dataset kinetics_adv \
78 | --n_classes 200 \
79 | --n_finetune_classes 200 \
80 | --model resnet \
81 | --model_depth 18 \
82 | --resnet_shortcut A \
83 | --batch_size 32 \
84 | --val_batch_size 16 \
85 | --n_threads 16 \
86 | --checkpoint 1 \
87 | --ft_begin_index 0 \
88 | --num_place_hidden_layers 3 \
89 | --new_layer_lr 1e-2 \
90 | --learning_rate 1e-4 \
91 | --warm_up_epochs 5 \
92 | --weight_decay 1e-5 \
93 | --n_epochs 100 \
94 | --place_pred_path \
95 | --is_place_adv \
96 | --is_place_soft \
97 | --alpha 1.0 \
98 | --is_mask_adv \
99 | --num_places_classes 365 \
100 | --pretrain_path
101 | ```
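Conceptually, the scene adversarial loss trains an auxiliary scene classifier on top of the action features through a gradient reversal layer (see `models/grad_reversal.py`): the scene head learns to predict the Places365 pseudo label while the reversed gradient pushes the 3D-CNN backbone to discard scene information. `--alpha` scales the reversed gradient, and `--is_place_soft` trains the head against the soft Places365 distribution (cf. `loss/soft_cross_entropy.py`). The snippet below is a minimal sketch of this idea, not the exact implementation in this repository; class and variable names are illustrative.

```python
# Sketch of scene-adversarial training with a gradient reversal layer (GRL).
# Illustrates what --is_place_adv / --alpha do conceptually; the actual code
# lives in models/grad_reversal.py and the training loop. Names are hypothetical.
import torch
import torch.nn as nn
from torch.autograd import Function


class GradReverse(Function):
    """Identity in the forward pass; multiplies the gradient by -alpha in the backward pass."""

    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        # No gradient is needed for alpha, hence the trailing None.
        return -ctx.alpha * grad_output, None


def grad_reverse(x, alpha=1.0):
    return GradReverse.apply(x, alpha)


class SceneAdversarialHead(nn.Module):
    """Scene (Places365) classifier attached to the action feature through a GRL."""

    def __init__(self, feat_dim, num_places_classes=365, alpha=1.0):
        super().__init__()
        self.alpha = alpha
        self.mlp = nn.Sequential(
            nn.Linear(feat_dim, feat_dim), nn.ReLU(inplace=True),
            nn.Linear(feat_dim, num_places_classes))

    def forward(self, feat):
        # The head still learns to predict the scene, but the reversed gradient
        # pushes the backbone to remove scene-discriminative information.
        return self.mlp(grad_reverse(feat, self.alpha))
```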
102 |
103 | **- Full SDN model with 1) scene adversarial loss and 2) human mask confusion loss**
104 | ```
105 | python train.py \
106 | --video_path \
107 | --annotation_path /kinetics.json \
108 | --result_path \
109 | --root_path \
110 | --dataset kinetics_adv_msk \
111 | --n_classes 200 \
112 | --n_finetune_classes 200 \
113 | --model resnet \
114 | --model_depth 18 \
115 | --resnet_shortcut A \
116 | --batch_size 32 \
117 | --val_batch_size 16 \
118 | --n_threads 16 \
119 | --checkpoint 1 \
120 | --ft_begin_index 0 \
121 | --num_place_hidden_layers 3 \
122 | --num_human_mask_adv_hidden_layers 1 \
123 | --new_layer_lr 1e-4 \
124 | --learning_rate 1e-4 \
125 | --warm_up_epochs 0 \
126 | --weight_decay 1e-5 \
127 | --n_epochs 100 \
128 | --place_pred_path \
129 | --is_place_adv \
130 | --is_place_soft \
131 | --is_mask_entropy \
132 | --alpha 0.5 \
133 | --mask_ratio 1.0 \
134 | --slower_place_mlp \
135 | --not_replace_last_fc \
136 | --num_places_classes 365 \
137 | --human_dets_path \
138 | --pretrain_path
139 | ```
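The human mask confusion loss (`--is_mask_entropy`, weighted by `--weight_entropy_loss`) penalizes confident action predictions on clips whose human regions have been masked out, so the network cannot rely on the background alone. Below is a minimal sketch of such an entropy-maximization loss (cf. `loss/hloss.py`); the variable names and exact formulation in this repository may differ.

```python
# Sketch of the human mask confusion loss: maximize the entropy of the action
# prediction on clips whose human regions have been masked out.
import torch
import torch.nn.functional as F


def entropy_confusion_loss(logits_masked):
    """Negative entropy of the softmax over action classes.

    Minimizing this loss maximizes the entropy, i.e., it pushes the prediction
    on human-masked clips toward a uniform distribution over actions.
    """
    log_p = F.log_softmax(logits_masked, dim=1)
    p = log_p.exp()
    entropy = -(p * log_p).sum(dim=1).mean()
    return -entropy


# Hypothetical usage inside a training step:
# logits_masked = model(masked_clip)   # clip with humans masked out
# loss = action_ce + alpha * scene_adv_loss + w_ent * entropy_confusion_loss(logits_masked)
```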
140 |
141 | ### Finetuning on target datasets
142 | #### [Diving48](http://www.svcl.ucsd.edu/projects/resound/dataset.html) as an example
143 | ```
144 | python train.py \
145 | --dataset diving48 \
146 | --root_path \
147 | --video_path \
148 | --n_classes 200 \
149 | --n_finetune_classes 48 \
150 | --model resnet \
151 | --model_depth 18 \
152 | --resnet_shortcut A \
153 | --ft_begin_index 0 \
154 | --batch_size 32 \
155 | --val_batch_size 16 \
156 | --n_threads 4 \
157 | --checkpoint 1 \
158 | --learning_rate 0.005 \
159 | --weight_decay 1e-5 \
160 | --n_epochs $epoch_ft \
161 | --is_mask_adv \
162 | --annotation_path $anno_path \
163 | --result_path \
164 | --pretrain_path
165 | ```
166 |
167 | ## Test
168 | ```
169 | python train.py \
170 | --dataset diving48 \
171 | --root_path \
172 | --video_path \
173 | --n_finetune_classes 48 \
174 | --n_classes 48 \
175 | --model resnet \
176 | --model_depth 18 \
177 | --resnet_shortcut A \
178 | --batch_size 32 \
179 | --val_batch_size 16 \
180 | --n_threads 4 \
181 | --test \
182 | --test_subset val \
183 | --no_train \
184 | --no_val \
185 | --is_mask_adv \
186 | --annotation_path $anno_path \
187 | --result_path \
188 | --resume_path
189 | ```
190 | This step generates a `val.json` file under `$result_path`.
191 |
192 | ## Evaluation
193 | ```
194 | python utils/eval_diving48.py \
195 | --annotation_path $anno_path \
196 | --prediction_path
197 | ```
198 |
199 | ## Pre-trained model weights
200 | [Download the pre-trained weights](https://www.dropbox.com/scl/fi/j2pgucu8gvpz3jp5ygl91/pre-trained_weights.tar?rlkey=gicecxrpj2o7ipjmhmx0hlcrl&dl=0)
201 |
202 | ## Acknowledgments
203 | This code is built upon the [3D-ResNets-PyTorch codebase](https://github.com/kenshohara/3D-ResNets-PyTorch). We thank Kensho Hara.
204 |
--------------------------------------------------------------------------------
/datasets/activitynet.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.utils.data as data
3 | from PIL import Image
4 | import os
5 | import functools
6 | import json
7 | import copy
8 | import math
9 |
10 | from libs.utils import load_value_file
11 |
12 |
13 | def pil_loader(path):
14 |     # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
15 |     with open(path, 'rb') as f:
16 |         with Image.open(f) as img:
17 |             return img.convert('RGB')
18 |
19 |
20 | def accimage_loader(path):
21 |     try:
22 |         import accimage
23 |         return accimage.Image(path)
24 |     except IOError:
25 |         # Potentially a decoding problem, fall back to PIL.Image
26 |         return pil_loader(path)
27 |
28 |
29 | def get_default_image_loader():
30 |     from torchvision import get_image_backend
31 |     if get_image_backend() == 'accimage':
32 |         return accimage_loader
33 |     else:
34 |         return pil_loader
35 |
36 |
37 | def video_loader(video_dir_path, frame_indices, image_loader):
38 |     video = []
39 |     for i in frame_indices:
40 |         image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i))
41 |         if os.path.exists(image_path):
42 |             video.append(image_loader(image_path))
43 |         else:
44 |             return video
45 |
46 |     return video
47 |
48 |
49 | def get_default_video_loader():
50 |     image_loader = get_default_image_loader()
51 |     return functools.partial(video_loader, image_loader=image_loader)
52 |
53 |
54 | def load_annotation_data(data_file_path):
55 |     with open(data_file_path, 'r') as data_file:
56 |         return json.load(data_file)
57 |
58 |
59 | def get_class_labels(data):
60 |     class_names = []
61 |     index = 0
62 |     for node1 in data['taxonomy']:
63 |         is_leaf = True
64 |         for node2 in data['taxonomy']:
65 |             if node2['parentId'] == node1['nodeId']:
66 |                 is_leaf = False
67 |                 break
68 |         if is_leaf:
69 |             class_names.append(node1['nodeName'])
70 |
71 |     class_labels_map = {}
72 |
73 |     for i, class_name in enumerate(class_names):
74 |         class_labels_map[class_name] = i
75 |
76 |     return class_labels_map
77 |
78 |
79 | def get_video_names_and_annotations(data, subset):
80 |     video_names = []
81 |     annotations = []
82 |
83 |     for key, value in data['database'].items():
84 |         this_subset = value['subset']
85 |         if this_subset == subset:
86 |             if subset == 'testing':
87 |                 video_names.append('v_{}'.format(key))
88 |             else:
89 |                 video_names.append('v_{}'.format(key))
90
| annotations.append(value['annotations']) 91 | 92 | return video_names, annotations 93 | 94 | 95 | def modify_frame_indices(video_dir_path, frame_indices): 96 | modified_indices = [] 97 | for i in frame_indices: 98 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 99 | if not os.path.exists(image_path): 100 | return modified_indices 101 | modified_indices.append(i) 102 | return modified_indices 103 | 104 | 105 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 106 | sample_duration): 107 | data = load_annotation_data(annotation_path) 108 | video_names, annotations = get_video_names_and_annotations(data, subset) 109 | class_to_idx = get_class_labels(data) 110 | idx_to_class = {} 111 | for name, label in class_to_idx.items(): 112 | idx_to_class[label] = name 113 | 114 | dataset = [] 115 | for i in range(len(video_names)): 116 | if i % 1000 == 0: 117 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 118 | 119 | video_path = os.path.join(root_path, video_names[i]) 120 | if not os.path.exists(video_path): 121 | continue 122 | 123 | fps_file_path = os.path.join(video_path, 'fps') 124 | fps = load_value_file(fps_file_path) 125 | 126 | for annotation in annotations[i]: 127 | begin_t = math.ceil(annotation['segment'][0] * fps) 128 | end_t = math.ceil(annotation['segment'][1] * fps) 129 | if begin_t == 0: 130 | begin_t = 1 131 | n_frames = end_t - begin_t 132 | 133 | sample = { 134 | 'video': video_path, 135 | 'segment': [begin_t, end_t], 136 | 'fps': fps, 137 | 'video_id': video_names[i][2:] 138 | } 139 | if len(annotations) != 0: 140 | sample['label'] = class_to_idx[annotation['label']] 141 | else: 142 | sample['label'] = -1 143 | 144 | if n_samples_for_each_video == 1: 145 | frame_indices = list(range(begin_t, end_t)) 146 | frame_indices = modify_frame_indices(sample['video'], 147 | frame_indices) 148 | if len(frame_indices) < 16: 149 | continue 150 | sample['frame_indices'] = frame_indices 151 | dataset.append(sample) 152 | else: 153 | if n_samples_for_each_video > 1: 154 | step = max(1, 155 | math.ceil((n_frames - 1 - sample_duration) / 156 | (n_samples_for_each_video - 1))) 157 | else: 158 | step = sample_duration 159 | for j in range(begin_t, end_t, step): 160 | sample_j = copy.deepcopy(sample) 161 | frame_indices = list(range(j, j + sample_duration)) 162 | frame_indices = modify_frame_indices( 163 | sample_j['video'], frame_indices) 164 | if len(frame_indices) < 16: 165 | continue 166 | sample_j['frame_indices'] = frame_indices 167 | dataset.append(sample_j) 168 | 169 | return dataset, idx_to_class 170 | 171 | 172 | def get_end_t(video_path): 173 | file_names = os.listdir(video_path) 174 | image_file_names = [x for x in file_names if 'image' in x] 175 | image_file_names.sort(reverse=True) 176 | return int(image_file_names[0][6:11]) 177 | 178 | 179 | def make_untrimmed_dataset(root_path, annotation_path, subset, 180 | n_samples_for_each_video, sample_duration): 181 | data = load_annotation_data(annotation_path) 182 | video_names, _ = get_video_names_and_annotations(data, subset) 183 | class_to_idx = get_class_labels(data) 184 | idx_to_class = {} 185 | for name, label in class_to_idx.items(): 186 | idx_to_class[label] = name 187 | 188 | dataset = [] 189 | for i in range(len(video_names)): 190 | if i % 1000 == 0: 191 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 192 | 193 | video_path = os.path.join(root_path, video_names[i]) 194 | if not os.path.exists(video_path): 195 | continue 196 | 197 | fps_file_path = 
os.path.join(video_path, 'fps') 198 | fps = load_value_file(fps_file_path) 199 | 200 | begin_t = 1 201 | end_t = get_end_t(video_path) 202 | n_frames = end_t - begin_t 203 | 204 | sample = { 205 | 'video': video_path, 206 | 'segment': [begin_t, end_t], 207 | 'fps': fps, 208 | 'video_id': video_names[i][2:] 209 | } 210 | 211 | if n_samples_for_each_video >= 1: 212 | step = max(1, 213 | math.ceil((n_frames - 1 - sample_duration) / 214 | (n_samples_for_each_video - 1))) 215 | else: 216 | step = sample_duration 217 | for j in range(begin_t, end_t, step): 218 | sample_j = copy.deepcopy(sample) 219 | frame_indices = list(range(j, j + sample_duration)) 220 | frame_indices = modify_frame_indices(sample_j['video'], 221 | frame_indices) 222 | if len(frame_indices) < 16: 223 | continue 224 | sample_j['frame_indices'] = frame_indices 225 | dataset.append(sample_j) 226 | 227 | return dataset, idx_to_class 228 | 229 | 230 | class ActivityNet(data.Dataset): 231 | """ 232 | Args: 233 | root (string): Root directory path. 234 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 235 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 236 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 237 | and returns a transformed version 238 | target_transform (callable, optional): A function/transform that takes in the 239 | target and transforms it. 240 | loader (callable, optional): A function to load an video given its path and frame indices. 241 | Attributes: 242 | classes (list): List of the class names. 243 | class_to_idx (dict): Dict with items (class_name, class_index). 244 | imgs (list): List of (image path, class_index) tuples 245 | """ 246 | 247 | def __init__(self, 248 | root_path, 249 | annotation_path, 250 | subset, 251 | is_untrimmed_setting=False, 252 | n_samples_for_each_video=1, 253 | spatial_transform=None, 254 | temporal_transform=None, 255 | target_transform=None, 256 | sample_duration=16, 257 | get_loader=get_default_video_loader): 258 | if is_untrimmed_setting: 259 | self.data, self.class_names = make_untrimmed_dataset( 260 | root_path, annotation_path, subset, n_samples_for_each_video, 261 | sample_duration) 262 | else: 263 | self.data, self.class_names = make_dataset( 264 | root_path, annotation_path, subset, n_samples_for_each_video, 265 | sample_duration) 266 | 267 | self.spatial_transform = spatial_transform 268 | self.temporal_transform = temporal_transform 269 | self.target_transform = target_transform 270 | self.loader = get_loader() 271 | 272 | def __getitem__(self, index): 273 | """ 274 | Args: 275 | index (int): Index 276 | Returns: 277 | tuple: (image, target) where target is class_index of the target class. 
278 | """ 279 | path = self.data[index]['video'] 280 | 281 | frame_indices = self.data[index]['frame_indices'] 282 | if self.temporal_transform is not None: 283 | frame_indices = self.temporal_transform(frame_indices) 284 | clip = self.loader(path, frame_indices) 285 | if self.spatial_transform is not None: 286 | self.spatial_transform.randomize_parameters() 287 | clip = [self.spatial_transform(img) for img in clip] 288 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 289 | 290 | target = self.data[index] 291 | if self.target_transform is not None: 292 | target = self.target_transform(target) 293 | 294 | return clip, target 295 | 296 | def __len__(self): 297 | return len(self.data) 298 | -------------------------------------------------------------------------------- /datasets/dataset.py: -------------------------------------------------------------------------------- 1 | from datasets.kinetics import Kinetics, Kinetics_adv, Kinetics_bkgmsk, Kinetics_human_msk, Kinetics_adv_msk 2 | from datasets.activitynet import ActivityNet 3 | from datasets.ucf101 import UCF101 4 | from datasets.hmdb51 import HMDB51 5 | from datasets.diving48 import Diving48 6 | 7 | 8 | def get_training_set(opt, spatial_transform, temporal_transform, 9 | target_transform): 10 | assert opt.dataset in ['kinetics', 'kinetics_adv', 'kinetics_bkgmsk', 'kinetics_adv_msk', 'activitynet', 'ucf101', 'hmdb51', 'diving48'] 11 | 12 | if opt.dataset == 'kinetics': 13 | training_data = Kinetics( 14 | opt.video_path+'/train', 15 | opt.annotation_path, 16 | 'training', 17 | spatial_transform=spatial_transform, 18 | temporal_transform=temporal_transform, 19 | target_transform=target_transform) 20 | elif opt.dataset == 'kinetics_adv': 21 | training_data = Kinetics_adv( 22 | opt.video_path+'/train', 23 | opt.annotation_path, 24 | 'training', 25 | spatial_transform=spatial_transform, 26 | temporal_transform=temporal_transform, 27 | target_transform=target_transform, 28 | place_pred_path=opt.place_pred_path, 29 | is_place_soft_label=opt.is_place_soft) 30 | elif opt.dataset == 'kinetics_bkgmsk': 31 | training_data = Kinetics_bkgmsk( 32 | opt.video_path+'/train', 33 | opt.annotation_path, 34 | 'training', 35 | spatial_transform=spatial_transform, 36 | temporal_transform=temporal_transform, 37 | target_transform=target_transform, 38 | detection_path=opt.human_dets_path, 39 | mask_ratio=opt.mask_ratio) 40 | elif opt.dataset == 'kinetics_adv_msk': 41 | training_data_1 = Kinetics_adv( 42 | opt.video_path+'/train', 43 | opt.annotation_path, 44 | 'training', 45 | spatial_transform=spatial_transform, 46 | temporal_transform=temporal_transform, 47 | target_transform=target_transform, 48 | place_pred_path=opt.place_pred_path, 49 | is_place_soft_label=opt.is_place_soft) 50 | training_data_2 = Kinetics_human_msk( 51 | opt.video_path+'/train', 52 | opt.annotation_path, 53 | 'training', 54 | spatial_transform=spatial_transform, 55 | temporal_transform=temporal_transform, 56 | target_transform=target_transform, 57 | detection_path=opt.human_dets_path, 58 | mask_ratio=opt.mask_ratio) 59 | training_data = [training_data_1, training_data_2] 60 | elif opt.dataset == 'activitynet': 61 | training_data = ActivityNet( 62 | opt.video_path, 63 | opt.annotation_path, 64 | 'training', 65 | False, 66 | spatial_transform=spatial_transform, 67 | temporal_transform=temporal_transform, 68 | target_transform=target_transform) 69 | elif opt.dataset == 'ucf101': 70 | training_data = UCF101( 71 | opt.video_path, 72 | opt.annotation_path, 73 | 'training', 74 | 
spatial_transform=spatial_transform, 75 | temporal_transform=temporal_transform, 76 | target_transform=target_transform) 77 | elif opt.dataset == 'hmdb51': 78 | training_data = HMDB51( 79 | opt.video_path, 80 | opt.annotation_path, 81 | 'training', 82 | spatial_transform=spatial_transform, 83 | temporal_transform=temporal_transform, 84 | target_transform=target_transform) 85 | elif opt.dataset == 'diving48': 86 | training_data = Diving48( 87 | opt.video_path, 88 | opt.annotation_path, 89 | 'training', 90 | spatial_transform=spatial_transform, 91 | temporal_transform=temporal_transform, 92 | target_transform=target_transform) 93 | 94 | return training_data 95 | 96 | 97 | def get_validation_set(opt, spatial_transform, temporal_transform, 98 | target_transform): 99 | assert opt.dataset in ['kinetics', 'kinetics_adv', 'kinetics_bkgmsk', 'kinetics_human_msk', 'kinetics_adv_msk', 'activitynet', 'ucf101', 'hmdb51', 'diving48'] 100 | 101 | if opt.dataset == 'kinetics': 102 | validation_data = Kinetics( 103 | opt.video_path+'/val', 104 | opt.annotation_path, 105 | 'validation', 106 | opt.n_val_samples, 107 | spatial_transform, 108 | temporal_transform, 109 | target_transform, 110 | sample_duration=opt.sample_duration) 111 | elif opt.dataset == 'kinetics_adv': 112 | validation_data = Kinetics_adv( 113 | opt.video_path+'/val', 114 | opt.annotation_path, 115 | 'validation', 116 | opt.n_val_samples, 117 | spatial_transform, 118 | temporal_transform, 119 | target_transform, 120 | sample_duration=opt.sample_duration, 121 | place_pred_path=opt.place_pred_path, 122 | is_place_soft_label=opt.is_place_soft) 123 | elif opt.dataset == 'kinetics_bkgmsk': 124 | validation_data = Kinetics_bkgmsk( 125 | opt.video_path+'/val', 126 | opt.annotation_path, 127 | 'validation', 128 | opt.n_val_samples, 129 | spatial_transform, 130 | temporal_transform, 131 | target_transform, 132 | sample_duration=opt.sample_duration, 133 | detection_path=opt.human_dets_path, 134 | mask_ratio=opt.mask_ratio) 135 | elif opt.dataset == 'kinetics_adv_msk': 136 | validation_data_1 = Kinetics_adv( 137 | opt.video_path+'/val', 138 | opt.annotation_path, 139 | 'validation', 140 | opt.n_val_samples, 141 | spatial_transform, 142 | temporal_transform, 143 | target_transform, 144 | sample_duration=opt.sample_duration, 145 | place_pred_path=opt.place_pred_path, 146 | is_place_soft_label=opt.is_place_soft) 147 | validation_data_2 = Kinetics_human_msk( 148 | opt.video_path+'/val', 149 | opt.annotation_path, 150 | 'validation', 151 | opt.n_val_samples, 152 | spatial_transform, 153 | temporal_transform, 154 | target_transform, 155 | sample_duration=opt.sample_duration, 156 | detection_path=opt.human_dets_path, 157 | mask_ratio=opt.mask_ratio) 158 | validation_data = [validation_data_1, validation_data_2] 159 | elif opt.dataset == 'activitynet': 160 | validation_data = ActivityNet( 161 | opt.video_path, 162 | opt.annotation_path, 163 | 'validation', 164 | False, 165 | opt.n_val_samples, 166 | spatial_transform, 167 | temporal_transform, 168 | target_transform, 169 | sample_duration=opt.sample_duration) 170 | elif opt.dataset == 'ucf101': 171 | validation_data = UCF101( 172 | opt.video_path, 173 | opt.annotation_path, 174 | 'validation', 175 | opt.n_val_samples, 176 | spatial_transform, 177 | temporal_transform, 178 | target_transform, 179 | sample_duration=opt.sample_duration, 180 | vis=opt.vis) 181 | elif opt.dataset == 'hmdb51': 182 | validation_data = HMDB51( 183 | opt.video_path, 184 | opt.annotation_path, 185 | 'validation', 186 | 
opt.n_val_samples, 187 | spatial_transform, 188 | temporal_transform, 189 | target_transform, 190 | sample_duration=opt.sample_duration, 191 | vis=opt.vis) 192 | elif opt.dataset == 'diving48': 193 | validation_data = Diving48( 194 | opt.video_path, 195 | opt.annotation_path, 196 | 'validation', 197 | opt.n_val_samples, 198 | spatial_transform, 199 | temporal_transform, 200 | target_transform, 201 | sample_duration=opt.sample_duration, 202 | vis=opt.vis) 203 | return validation_data 204 | 205 | 206 | def get_test_set(opt, spatial_transform, temporal_transform, target_transform): 207 | assert opt.dataset in ['kinetics', 'activitynet', 'ucf101', 'hmdb51', 'diving48'] 208 | assert opt.test_subset in ['val', 'test'] 209 | 210 | if opt.test_subset == 'val': 211 | subset = 'validation' 212 | elif opt.test_subset == 'test': 213 | subset = 'testing' 214 | if opt.dataset == 'kinetics': 215 | test_data = Kinetics( 216 | opt.video_path, 217 | opt.annotation_path, 218 | subset, 219 | 0, 220 | spatial_transform, 221 | temporal_transform, 222 | target_transform, 223 | sample_duration=opt.sample_duration) 224 | elif opt.dataset == 'activitynet': 225 | test_data = ActivityNet( 226 | opt.video_path, 227 | opt.annotation_path, 228 | subset, 229 | True, 230 | 0, 231 | spatial_transform, 232 | temporal_transform, 233 | target_transform, 234 | sample_duration=opt.sample_duration) 235 | elif opt.dataset == 'ucf101': 236 | test_data = UCF101( 237 | opt.video_path, 238 | opt.annotation_path, 239 | subset, 240 | 0, 241 | spatial_transform, 242 | temporal_transform, 243 | target_transform, 244 | sample_duration=opt.sample_duration) 245 | elif opt.dataset == 'hmdb51': 246 | test_data = HMDB51( 247 | opt.video_path, 248 | opt.annotation_path, 249 | subset, 250 | 0, 251 | spatial_transform, 252 | temporal_transform, 253 | target_transform, 254 | sample_duration=opt.sample_duration) 255 | elif opt.dataset == 'diving48': 256 | test_data = Diving48( 257 | opt.video_path, 258 | opt.annotation_path, 259 | subset, 260 | 0, 261 | spatial_transform, 262 | temporal_transform, 263 | target_transform, 264 | sample_duration=opt.sample_duration) 265 | 266 | return test_data 267 | -------------------------------------------------------------------------------- /datasets/diving48.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import json 8 | import copy 9 | 10 | from libs.utils import load_value_file 11 | import pdb 12 | 13 | def pil_loader(path): 14 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 15 | with open(path, 'rb') as f: 16 | with Image.open(f) as img: 17 | return img.convert('RGB') 18 | 19 | 20 | def accimage_loader(path): 21 | try: 22 | import accimage 23 | return accimage.Image(path) 24 | except IOError: 25 | # Potentially a decoding problem, fall back to PIL.Image 26 | return pil_loader(path) 27 | 28 | 29 | def get_default_image_loader(): 30 | from torchvision import get_image_backend 31 | if get_image_backend() == 'accimage': 32 | return accimage_loader 33 | else: 34 | return pil_loader 35 | 36 | 37 | def video_loader(video_dir_path, frame_indices, image_loader): 38 | video = [] 39 | for i in frame_indices: 40 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 41 | if os.path.exists(image_path): 42 | video.append(image_loader(image_path)) 43 | else: 44 | return video 45 | 46 | 
return video 47 | 48 | 49 | def get_default_video_loader(): 50 | image_loader = get_default_image_loader() 51 | return functools.partial(video_loader, image_loader=image_loader) 52 | 53 | 54 | def load_annotation_data(data_file_path): 55 | with open(data_file_path, 'r') as data_file: 56 | return json.load(data_file) 57 | 58 | 59 | def get_class_labels(data_file_path): 60 | with open(data_file_path, 'r') as data_file: 61 | data = json.load(data_file) 62 | data = ['_'.join(row) for row in data] 63 | class_labels_map = {} 64 | index = 0 65 | for class_label in data: 66 | class_labels_map[class_label] = index 67 | index += 1 68 | return class_labels_map 69 | 70 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 71 | sample_duration): 72 | if subset == 'training': 73 | postfix = 'train' 74 | else: 75 | postfix = 'test' 76 | annotation_file_path = os.path.join(annotation_path, 'Diving48_{}.json'.format(postfix)) 77 | data = load_annotation_data(annotation_file_path) 78 | 79 | class_file_path = os.path.join(annotation_path, 'Diving48_vocab.json') 80 | class_to_idx = get_class_labels(class_file_path) 81 | idx_to_class = {} 82 | for name, label in class_to_idx.items(): 83 | idx_to_class[label] = name 84 | 85 | dataset = [] 86 | for i in range(len(data)): 87 | if i % 1000 == 0: 88 | print('dataset loading [{}/{}]'.format(i, len(data))) 89 | 90 | video_path = os.path.join(root_path, data[i]['vid_name']) 91 | if not os.path.exists(video_path): 92 | continue 93 | 94 | n_frames = data[i]['end_frame'] - data[i]['start_frame'] + 1 95 | if n_frames <= 0: 96 | continue 97 | 98 | begin_t = 1 99 | end_t = n_frames 100 | sample = { 101 | 'video': video_path, 102 | 'segment': [begin_t, end_t], 103 | 'n_frames': n_frames, 104 | 'video_id': data[i]['vid_name'] 105 | } 106 | if len(data) != 0: 107 | sample['label'] = data[i]['label'] 108 | else: 109 | sample['label'] = -1 110 | 111 | if n_samples_for_each_video == 1: 112 | sample['frame_indices'] = list(range(1, n_frames + 1)) 113 | dataset.append(sample) 114 | else: 115 | if n_samples_for_each_video > 1: 116 | step = max(1, 117 | math.ceil((n_frames - 1 - sample_duration) / 118 | (n_samples_for_each_video - 1))) 119 | else: 120 | step = sample_duration 121 | for j in range(1, n_frames, step): 122 | sample_j = copy.deepcopy(sample) 123 | sample_j['frame_indices'] = list( 124 | range(j, min(n_frames + 1, j + sample_duration))) 125 | dataset.append(sample_j) 126 | 127 | return dataset, idx_to_class 128 | 129 | 130 | class Diving48(data.Dataset): 131 | """ 132 | Args: 133 | root (string): Root directory path. 134 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 135 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 136 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 137 | and returns a transformed version 138 | target_transform (callable, optional): A function/transform that takes in the 139 | target and transforms it. 140 | loader (callable, optional): A function to load an video given its path and frame indices. 141 | Attributes: 142 | classes (list): List of the class names. 143 | class_to_idx (dict): Dict with items (class_name, class_index). 
144 | imgs (list): List of (image path, class_index) tuples 145 | """ 146 | 147 | def __init__(self, 148 | root_path, 149 | annotation_path, 150 | subset, 151 | n_samples_for_each_video=1, 152 | spatial_transform=None, 153 | temporal_transform=None, 154 | target_transform=None, 155 | sample_duration=16, 156 | get_loader=get_default_video_loader, 157 | vis=False): 158 | self.data, self.class_names = make_dataset( 159 | root_path, annotation_path, subset, n_samples_for_each_video, 160 | sample_duration) 161 | 162 | self.spatial_transform = spatial_transform 163 | self.temporal_transform = temporal_transform 164 | self.target_transform = target_transform 165 | self.vis = vis 166 | self.loader = get_loader() 167 | 168 | def __getitem__(self, index): 169 | """ 170 | Args: 171 | index (int): Index 172 | Returns: 173 | tuple: (image, target) where target is class_index of the target class. 174 | """ 175 | path = self.data[index]['video'] 176 | 177 | frame_indices = self.data[index]['frame_indices'] 178 | if self.temporal_transform is not None: 179 | frame_indices = self.temporal_transform(frame_indices) 180 | clip = self.loader(path, frame_indices) 181 | if self.spatial_transform is not None: 182 | self.spatial_transform.randomize_parameters() 183 | clip = [self.spatial_transform(img) for img in clip] 184 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 185 | 186 | target = self.data[index] 187 | if self.target_transform is not None: 188 | target = self.target_transform(target) 189 | 190 | if self.vis: 191 | return clip, target, path, frame_indices 192 | else: 193 | return clip, target 194 | 195 | def __len__(self): 196 | return len(self.data) 197 | -------------------------------------------------------------------------------- /datasets/hmdb51.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import json 8 | import copy 9 | 10 | from libs.utils import load_value_file 11 | 12 | 13 | def pil_loader(path): 14 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 15 | with open(path, 'rb') as f: 16 | with Image.open(f) as img: 17 | return img.convert('RGB') 18 | 19 | 20 | def accimage_loader(path): 21 | try: 22 | import accimage 23 | return accimage.Image(path) 24 | except IOError: 25 | # Potentially a decoding problem, fall back to PIL.Image 26 | return pil_loader(path) 27 | 28 | 29 | def get_default_image_loader(): 30 | from torchvision import get_image_backend 31 | if get_image_backend() == 'accimage': 32 | return accimage_loader 33 | else: 34 | return pil_loader 35 | 36 | 37 | def video_loader(video_dir_path, frame_indices, image_loader): 38 | video = [] 39 | for i in frame_indices: 40 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 41 | if os.path.exists(image_path): 42 | video.append(image_loader(image_path)) 43 | else: 44 | return video 45 | 46 | return video 47 | 48 | 49 | def get_default_video_loader(): 50 | image_loader = get_default_image_loader() 51 | return functools.partial(video_loader, image_loader=image_loader) 52 | 53 | 54 | def load_annotation_data(data_file_path): 55 | with open(data_file_path, 'r') as data_file: 56 | return json.load(data_file) 57 | 58 | 59 | def get_class_labels(data): 60 | class_labels_map = {} 61 | index = 0 62 | for class_label in data['labels']: 63 | class_labels_map[class_label] = index 64 | index += 1 65 | return 
class_labels_map 66 | 67 | 68 | def get_video_names_and_annotations(data, subset): 69 | video_names = [] 70 | annotations = [] 71 | 72 | for key, value in data['database'].items(): 73 | this_subset = value['subset'] 74 | if this_subset == subset: 75 | label = value['annotations']['label'] 76 | video_names.append('{}/{}'.format(label, key)) 77 | annotations.append(value['annotations']) 78 | 79 | return video_names, annotations 80 | 81 | 82 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 83 | sample_duration): 84 | data = load_annotation_data(annotation_path) 85 | video_names, annotations = get_video_names_and_annotations(data, subset) 86 | class_to_idx = get_class_labels(data) 87 | idx_to_class = {} 88 | for name, label in class_to_idx.items(): 89 | idx_to_class[label] = name 90 | 91 | dataset = [] 92 | for i in range(len(video_names)): 93 | if i % 1000 == 0: 94 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 95 | 96 | video_path = os.path.join(root_path, video_names[i]) 97 | if not os.path.exists(video_path): 98 | continue 99 | 100 | n_frames_file_path = os.path.join(video_path, 'n_frames') 101 | n_frames = int(load_value_file(n_frames_file_path)) 102 | if n_frames <= 0: 103 | continue 104 | 105 | begin_t = 1 106 | end_t = n_frames 107 | sample = { 108 | 'video': video_path, 109 | 'segment': [begin_t, end_t], 110 | 'n_frames': n_frames, 111 | 'video_id': video_names[i].split('/')[1] 112 | } 113 | if len(annotations) != 0: 114 | sample['label'] = class_to_idx[annotations[i]['label']] 115 | else: 116 | sample['label'] = -1 117 | 118 | if n_samples_for_each_video == 1: 119 | sample['frame_indices'] = list(range(1, n_frames + 1)) 120 | dataset.append(sample) 121 | else: 122 | if n_samples_for_each_video > 1: 123 | step = max(1, 124 | math.ceil((n_frames - 1 - sample_duration) / 125 | (n_samples_for_each_video - 1))) 126 | else: 127 | step = sample_duration 128 | for j in range(1, n_frames, step): 129 | sample_j = copy.deepcopy(sample) 130 | sample_j['frame_indices'] = list( 131 | range(j, min(n_frames + 1, j + sample_duration))) 132 | dataset.append(sample_j) 133 | 134 | return dataset, idx_to_class 135 | 136 | 137 | class HMDB51(data.Dataset): 138 | """ 139 | Args: 140 | root (string): Root directory path. 141 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 142 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 143 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 144 | and returns a transformed version 145 | target_transform (callable, optional): A function/transform that takes in the 146 | target and transforms it. 147 | loader (callable, optional): A function to load an video given its path and frame indices. 148 | Attributes: 149 | classes (list): List of the class names. 150 | class_to_idx (dict): Dict with items (class_name, class_index). 
151 | imgs (list): List of (image path, class_index) tuples 152 | """ 153 | 154 | def __init__(self, 155 | root_path, 156 | annotation_path, 157 | subset, 158 | n_samples_for_each_video=1, 159 | spatial_transform=None, 160 | temporal_transform=None, 161 | target_transform=None, 162 | sample_duration=16, 163 | get_loader=get_default_video_loader, 164 | vis=False): 165 | self.data, self.class_names = make_dataset( 166 | root_path, annotation_path, subset, n_samples_for_each_video, 167 | sample_duration) 168 | 169 | self.spatial_transform = spatial_transform 170 | self.temporal_transform = temporal_transform 171 | self.target_transform = target_transform 172 | self.vis = vis 173 | self.loader = get_loader() 174 | 175 | def __getitem__(self, index): 176 | """ 177 | Args: 178 | index (int): Index 179 | Returns: 180 | tuple: (image, target) where target is class_index of the target class. 181 | """ 182 | path = self.data[index]['video'] 183 | 184 | frame_indices = self.data[index]['frame_indices'] 185 | if self.temporal_transform is not None: 186 | frame_indices = self.temporal_transform(frame_indices) 187 | clip = self.loader(path, frame_indices) 188 | if self.spatial_transform is not None: 189 | self.spatial_transform.randomize_parameters() 190 | clip = [self.spatial_transform(img) for img in clip] 191 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 192 | 193 | target = self.data[index] 194 | if self.target_transform is not None: 195 | target = self.target_transform(target) 196 | 197 | if self.vis: 198 | return clip, target, path, frame_indices 199 | else: 200 | return clip, target 201 | 202 | def __len__(self): 203 | return len(self.data) 204 | -------------------------------------------------------------------------------- /datasets/ucf101.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import json 8 | import copy 9 | 10 | from libs.utils import load_value_file 11 | 12 | 13 | def pil_loader(path): 14 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 15 | with open(path, 'rb') as f: 16 | with Image.open(f) as img: 17 | return img.convert('RGB') 18 | 19 | 20 | def accimage_loader(path): 21 | try: 22 | import accimage 23 | return accimage.Image(path) 24 | except IOError: 25 | # Potentially a decoding problem, fall back to PIL.Image 26 | return pil_loader(path) 27 | 28 | 29 | def get_default_image_loader(): 30 | from torchvision import get_image_backend 31 | if get_image_backend() == 'accimage': 32 | return accimage_loader 33 | else: 34 | return pil_loader 35 | 36 | 37 | def video_loader(video_dir_path, frame_indices, image_loader): 38 | video = [] 39 | for i in frame_indices: 40 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 41 | if os.path.exists(image_path): 42 | video.append(image_loader(image_path)) 43 | else: 44 | return video 45 | 46 | return video 47 | 48 | 49 | def get_default_video_loader(): 50 | image_loader = get_default_image_loader() 51 | return functools.partial(video_loader, image_loader=image_loader) 52 | 53 | 54 | def load_annotation_data(data_file_path): 55 | with open(data_file_path, 'r') as data_file: 56 | return json.load(data_file) 57 | 58 | 59 | def get_class_labels(data): 60 | class_labels_map = {} 61 | index = 0 62 | for class_label in data['labels']: 63 | class_labels_map[class_label] = index 64 | index += 1 65 | return 
class_labels_map 66 | 67 | 68 | def get_video_names_and_annotations(data, subset): 69 | video_names = [] 70 | annotations = [] 71 | 72 | for key, value in data['database'].items(): 73 | this_subset = value['subset'] 74 | if this_subset == subset: 75 | label = value['annotations']['label'] 76 | video_names.append('{}/{}'.format(label, key)) 77 | annotations.append(value['annotations']) 78 | 79 | return video_names, annotations 80 | 81 | 82 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 83 | sample_duration): 84 | data = load_annotation_data(annotation_path) 85 | video_names, annotations = get_video_names_and_annotations(data, subset) 86 | class_to_idx = get_class_labels(data) 87 | idx_to_class = {} 88 | for name, label in class_to_idx.items(): 89 | idx_to_class[label] = name 90 | 91 | dataset = [] 92 | for i in range(len(video_names)): 93 | if i % 1000 == 0: 94 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 95 | 96 | video_path = os.path.join(root_path, video_names[i]) 97 | if not os.path.exists(video_path): 98 | continue 99 | 100 | n_frames_file_path = os.path.join(video_path, 'n_frames') 101 | n_frames = int(load_value_file(n_frames_file_path)) 102 | if n_frames <= 0: 103 | continue 104 | 105 | begin_t = 1 106 | end_t = n_frames 107 | sample = { 108 | 'video': video_path, 109 | 'segment': [begin_t, end_t], 110 | 'n_frames': n_frames, 111 | 'video_id': video_names[i].split('/')[1] 112 | } 113 | if len(annotations) != 0: 114 | sample['label'] = class_to_idx[annotations[i]['label']] 115 | else: 116 | sample['label'] = -1 117 | 118 | if n_samples_for_each_video == 1: 119 | sample['frame_indices'] = list(range(1, n_frames + 1)) 120 | dataset.append(sample) 121 | else: 122 | if n_samples_for_each_video > 1: 123 | step = max(1, 124 | math.ceil((n_frames - 1 - sample_duration) / 125 | (n_samples_for_each_video - 1))) 126 | else: 127 | step = sample_duration 128 | for j in range(1, n_frames, step): 129 | sample_j = copy.deepcopy(sample) 130 | sample_j['frame_indices'] = list( 131 | range(j, min(n_frames + 1, j + sample_duration))) 132 | dataset.append(sample_j) 133 | 134 | return dataset, idx_to_class 135 | 136 | 137 | class UCF101(data.Dataset): 138 | """ 139 | Args: 140 | root (string): Root directory path. 141 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 142 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 143 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 144 | and returns a transformed version 145 | target_transform (callable, optional): A function/transform that takes in the 146 | target and transforms it. 147 | loader (callable, optional): A function to load an video given its path and frame indices. 148 | Attributes: 149 | classes (list): List of the class names. 150 | class_to_idx (dict): Dict with items (class_name, class_index). 
151 | imgs (list): List of (image path, class_index) tuples 152 | """ 153 | 154 | def __init__(self, 155 | root_path, 156 | annotation_path, 157 | subset, 158 | n_samples_for_each_video=1, 159 | spatial_transform=None, 160 | temporal_transform=None, 161 | target_transform=None, 162 | sample_duration=16, 163 | get_loader=get_default_video_loader, 164 | vis=False): 165 | self.data, self.class_names = make_dataset( 166 | root_path, annotation_path, subset, n_samples_for_each_video, 167 | sample_duration) 168 | 169 | self.spatial_transform = spatial_transform 170 | self.temporal_transform = temporal_transform 171 | self.target_transform = target_transform 172 | self.vis = vis 173 | self.loader = get_loader() 174 | 175 | def __getitem__(self, index): 176 | """ 177 | Args: 178 | index (int): Index 179 | Returns: 180 | tuple: (image, target) where target is class_index of the target class. 181 | """ 182 | path = self.data[index]['video'] 183 | 184 | frame_indices = self.data[index]['frame_indices'] 185 | if self.temporal_transform is not None: 186 | frame_indices = self.temporal_transform(frame_indices) 187 | clip = self.loader(path, frame_indices) 188 | if self.spatial_transform is not None: 189 | self.spatial_transform.randomize_parameters() 190 | clip = [self.spatial_transform(img) for img in clip] 191 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 192 | 193 | target = self.data[index] 194 | if self.target_transform is not None: 195 | target = self.target_transform(target) 196 | 197 | if self.vis: 198 | return clip, target, path, frame_indices 199 | else: 200 | return clip, target 201 | 202 | def __len__(self): 203 | return len(self.data) 204 | -------------------------------------------------------------------------------- /libs/mean.py: -------------------------------------------------------------------------------- 1 | def get_mean(norm_value=255, dataset='activitynet'): 2 | assert dataset in ['activitynet', 'kinetics'] 3 | 4 | if dataset == 'activitynet': 5 | return [ 6 | 114.7748 / norm_value, 107.7354 / norm_value, 99.4750 / norm_value 7 | ] 8 | elif dataset == 'kinetics': 9 | # Kinetics (10 videos for each class) 10 | return [ 11 | 110.63666788 / norm_value, 103.16065604 / norm_value, 12 | 96.29023126 / norm_value 13 | ] 14 | 15 | 16 | def get_std(norm_value=255): 17 | # Kinetics (10 videos for each class) 18 | return [ 19 | 38.7568578 / norm_value, 37.88248729 / norm_value, 20 | 40.02898126 / norm_value 21 | ] 22 | -------------------------------------------------------------------------------- /libs/opts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def parse_opts(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument( 7 | '--root_path', 8 | default='/root/data/ActivityNet', 9 | type=str, 10 | help='Root directory path of data') 11 | parser.add_argument( 12 | '--video_path', 13 | default='video_kinetics_jpg', 14 | type=str, 15 | help='Directory path of Videos') 16 | parser.add_argument( 17 | '--annotation_path', 18 | default='kinetics.json', 19 | type=str, 20 | help='Annotation file path') 21 | parser.add_argument( 22 | '--prediction_path', 23 | default='kinetics.json', 24 | type=str, 25 | help='Prediction file path') 26 | parser.add_argument( 27 | '--result_path', 28 | default='results', 29 | type=str, 30 | help='Result directory path') 31 | parser.add_argument( 32 | '--place_pred_path', 33 | default='place', 34 | type=str, 35 | help='place prediction directory full path') 36 | parser.add_argument( 
37 | '--human_dets_path', 38 | default='dets', 39 | type=str, 40 | help='human detection directory full path') 41 | parser.add_argument( 42 | '--mask_ratio', 43 | default=0.5, 44 | type=float, 45 | help='mask out background ratio, higher measn mask out more') 46 | 47 | parser.add_argument( 48 | '--dataset', 49 | default='kinetics', 50 | type=str, 51 | help='Used dataset (activitynet | kinetics | ucf101 | hmdb51)') 52 | parser.add_argument( 53 | '--n_classes', 54 | default=400, 55 | type=int, 56 | help= 57 | 'Number of classes (activitynet: 200, kinetics: 400, ucf101: 101, hmdb51: 51)' 58 | ) 59 | parser.add_argument( 60 | '--n_finetune_classes', 61 | default=400, 62 | type=int, 63 | help= 64 | 'Number of classes for fine-tuning. n_classes is set to the number when pretraining.' 65 | ) 66 | parser.add_argument( 67 | '--sample_size', 68 | default=112, 69 | type=int, 70 | help='Height and width of inputs') 71 | parser.add_argument( 72 | '--sample_duration', 73 | default=16, 74 | type=int, 75 | help='Temporal duration of inputs') 76 | parser.add_argument( 77 | '--initial_scale', 78 | default=1.0, 79 | type=float, 80 | help='Initial scale for multiscale cropping') 81 | parser.add_argument( 82 | '--n_scales', 83 | default=5, 84 | type=int, 85 | help='Number of scales for multiscale cropping') 86 | parser.add_argument( 87 | '--scale_step', 88 | default=0.84089641525, 89 | type=float, 90 | help='Scale step for multiscale cropping') 91 | parser.add_argument( 92 | '--train_crop', 93 | default='corner', 94 | type=str, 95 | help= 96 | 'Spatial cropping method in training. random is uniform. corner is selection from 4 corners and 1 center. (random | corner | center)' 97 | ) 98 | parser.add_argument( 99 | '--learning_rate', 100 | default=0.1, 101 | type=float, 102 | help= 103 | 'Initial learning rate (divided by 10 while training by lr scheduler)') 104 | parser.add_argument( 105 | '--new_layer_lr', 106 | default=0.1, 107 | type=float, 108 | help= 109 | 'Initial learning rate for new layers (divided by 10 while training by lr scheduler)') 110 | parser.add_argument( 111 | '--warm_up_epochs', 112 | default=10, 113 | type=int, 114 | help='number of epochs need to warm up the new layers') 115 | parser.add_argument('--momentum', default=0.9, type=float, help='Momentum') 116 | parser.add_argument( 117 | '--dampening', default=0.9, type=float, help='dampening of SGD') 118 | parser.add_argument( 119 | '--weight_decay', default=1e-3, type=float, help='Weight Decay') 120 | parser.add_argument( 121 | '--mean_dataset', 122 | default='activitynet', 123 | type=str, 124 | help= 125 | 'dataset for mean values of mean subtraction (activitynet | kinetics)') 126 | parser.add_argument( 127 | '--no_mean_norm', 128 | action='store_true', 129 | help='If true, inputs are not normalized by mean.') 130 | parser.set_defaults(no_mean_norm=False) 131 | parser.add_argument( 132 | '--std_norm', 133 | action='store_true', 134 | help='If true, inputs are normalized by standard deviation.') 135 | parser.set_defaults(std_norm=False) 136 | parser.add_argument( 137 | '--nesterov', action='store_true', help='Nesterov momentum') 138 | parser.set_defaults(nesterov=False) 139 | parser.add_argument( 140 | '--optimizer', 141 | default='sgd', 142 | type=str, 143 | help='Currently only support SGD') 144 | parser.add_argument( 145 | '--lr_patience', 146 | default=10, 147 | type=int, 148 | help='Patience of LR scheduler. See documentation of ReduceLROnPlateau.' 
149 | )
150 | parser.add_argument(
151 | '--batch_size', default=128, type=int, help='Batch Size')
152 | parser.add_argument(
153 | '--val_batch_size', default=16, type=int, help='Batch Size for Validation')
154 | parser.add_argument(
155 | '--n_epochs',
156 | default=200,
157 | type=int,
158 | help='Number of total epochs to run')
159 | parser.add_argument(
160 | '--begin_epoch',
161 | default=1,
162 | type=int,
163 | help=
164 | 'Training begins at this epoch. Previously trained model indicated by resume_path is loaded.'
165 | )
166 | parser.add_argument(
167 | '--n_val_samples',
168 | default=3,
169 | type=int,
170 | help='Number of validation samples for each activity')
171 | parser.add_argument(
172 | '--resume_path',
173 | default='',
174 | type=str,
175 | help='Saved data (.pth) of previous training')
176 | parser.add_argument(
177 | '--pretrain_path', default='', type=str, help='Pretrained model (.pth)')
178 | parser.add_argument(
179 | '--vis',
180 | action='store_true',
181 | help='If true, visualization is enabled.')
182 | parser.set_defaults(vis=False)
183 | parser.add_argument(
184 | '--is_place_adv',
185 | action='store_true',
186 | help='If true, using place adversarial training.')
187 | parser.set_defaults(is_place_adv=False)
188 | parser.add_argument(
189 | '--is_place_soft',
190 | action='store_true',
191 | help='If true, using placenet soft label.')
192 | parser.set_defaults(is_place_soft=False)
193 | parser.add_argument(
194 | '--is_place_entropy',
195 | action='store_true',
196 | help='If true, using place entropy loss for training.')
197 | parser.set_defaults(is_place_entropy=False)
198 | parser.add_argument(
199 | '--is_entropy_max',
200 | action='store_true',
201 | help='If true, using place entropy maximization training.')
202 | parser.set_defaults(is_entropy_max=False)
203 | parser.add_argument(
204 | '--is_mask_adv',
205 | action='store_false',
206 | help='If true, using human mask branch for training.')
207 | parser.set_defaults(is_mask_adv=True)
208 | parser.add_argument(
209 | '--is_mask_cross_entropy',
210 | action='store_true',
211 | help='If true, using human mask cross entropy loss.')
212 | parser.set_defaults(is_mask_cross_entropy=False)
213 | parser.add_argument(
214 | '--is_mask_entropy',
215 | action='store_true',
216 | help='If true, using human mask entropy loss.')
217 | parser.set_defaults(is_mask_entropy=False)
218 | parser.add_argument(
219 | '--is_mask_conf_dual_loader',
220 | action='store_true',
221 | help='If true, using two data loaders for human mask action confusion loss.')
222 | parser.set_defaults(is_mask_conf_dual_loader=False)
223 | parser.add_argument(
224 | '--slower_place_mlp',
225 | action='store_true',
226 | help='If true, using slower learning rate for place mlp')
227 | parser.set_defaults(slower_place_mlp=False)
228 | parser.add_argument(
229 | '--slower_hm_mlp',
230 | action='store_true',
231 | help='If true, using slower learning rate for human mask mlp')
232 | parser.set_defaults(slower_hm_mlp=False)
233 | parser.add_argument(
234 | '--weight_entropy_loss',
235 | default=1.0,
236 | type=float,
237 | help='weight of the entropy loss')
238 | parser.add_argument(
239 | '--num_place_hidden_layers',
240 | default=1,
241 | type=int,
242 | help='Number of hidden layers in the place prediction MLP')
243 | parser.add_argument(
244 | '--num_human_mask_adv_hidden_layers',
245 | default=1,
246 | type=int,
247 | help='Number of hidden layers in the human masked prediction MLP')
248 | parser.add_argument(
249 | '--alpha',
250 | default=1.0,
251 | type=float,
252 | help='lambda of the grad reversal layer, higher means a higher impact of the adversarial training'
253 | )
254 | parser.add_argument(
255 | '--alpha_hm',
256 | default=1.0,
257 | type=float,
258 | help='lambda of the grad reversal layer for human mask confusion loss branch, higher means a higher impact of the adversarial training'
259 | )
260 | parser.add_argument(
261 | '--num_places_classes',
262 | default=0,
263 | type=int,
264 | help='Number of place classes')
265 | parser.add_argument(
266 | '--ft_begin_index',
267 | default=0,
268 | type=int,
269 | help='Begin block index of fine-tuning')
270 | parser.add_argument(
271 | '--not_replace_last_fc',
272 | action='store_true',
273 | help='If true, DO NOT replace the last fc layer (classifier) of a network with a new one; if false, replace the last fc layer')
274 | parser.set_defaults(not_replace_last_fc=False)
275 | parser.add_argument(
276 | '--no_train',
277 | action='store_true',
278 | help='If true, training is not performed.')
279 | parser.set_defaults(no_train=False)
280 | parser.add_argument(
281 | '--no_val',
282 | action='store_true',
283 | help='If true, validation is not performed.')
284 | parser.set_defaults(no_val=False)
285 | parser.add_argument(
286 | '--test', action='store_true', help='If true, test is performed.')
287 | parser.set_defaults(test=False)
288 | parser.add_argument(
289 | '--test_subset',
290 | default='val',
291 | type=str,
292 | help='Used subset in test (val | test)')
293 | parser.add_argument(
294 | '--scale_in_test',
295 | default=1.0,
296 | type=float,
297 | help='Spatial scale in test')
298 | parser.add_argument(
299 | '--crop_position_in_test',
300 | default='c',
301 | type=str,
302 | help='Cropping method (c | tl | tr | bl | br) in test')
303 | parser.add_argument(
304 | '--no_softmax_in_test',
305 | action='store_true',
306 | help='If true, output for each clip is not normalized using softmax.')
307 | parser.set_defaults(no_softmax_in_test=False)
308 | parser.add_argument(
309 | '--no_cuda', action='store_true', help='If true, cuda is not used.')
310 | parser.set_defaults(no_cuda=False)
311 | parser.add_argument(
312 | '--n_threads',
313 | default=4,
314 | type=int,
315 | help='Number of threads for multi-thread loading')
316 | parser.add_argument(
317 | '--checkpoint',
318 | default=10,
319 | type=int,
320 | help='Trained model is saved every this many epochs.')
321 | parser.add_argument(
322 | '--no_hflip',
323 | action='store_true',
324 | help='If true, horizontal flipping is not performed.')
325 | parser.set_defaults(no_hflip=False)
326 | parser.add_argument(
327 | '--norm_value',
328 | default=1,
329 | type=int,
330 | help=
331 | 'If 1, range of inputs is [0-255]. 
If 255, range of inputs is [0-1].') 332 | parser.add_argument( 333 | '--model', 334 | default='resnet', 335 | type=str, 336 | help='(resnet | preresnet | wideresnet | resnext | densenet | ') 337 | parser.add_argument( 338 | '--model_depth', 339 | default=18, 340 | type=int, 341 | help='Depth of resnet (10 | 18 | 34 | 50 | 101)') 342 | parser.add_argument( 343 | '--resnet_shortcut', 344 | default='B', 345 | type=str, 346 | help='Shortcut type of resnet (A | B)') 347 | parser.add_argument( 348 | '--wide_resnet_k', default=2, type=int, help='Wide resnet k') 349 | parser.add_argument( 350 | '--resnext_cardinality', 351 | default=32, 352 | type=int, 353 | help='ResNeXt cardinality') 354 | parser.add_argument( 355 | '--manual_seed', default=1, type=int, help='Manually set random seed') 356 | 357 | args = parser.parse_args() 358 | 359 | return args 360 | -------------------------------------------------------------------------------- /libs/spatial_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | import numbers 4 | import collections 5 | import numpy as np 6 | import torch 7 | from PIL import Image, ImageOps 8 | try: 9 | import accimage 10 | except ImportError: 11 | accimage = None 12 | 13 | 14 | class Compose(object): 15 | """Composes several transforms together. 16 | Args: 17 | transforms (list of ``Transform`` objects): list of transforms to compose. 18 | Example: 19 | >>> transforms.Compose([ 20 | >>> transforms.CenterCrop(10), 21 | >>> transforms.ToTensor(), 22 | >>> ]) 23 | """ 24 | 25 | def __init__(self, transforms): 26 | self.transforms = transforms 27 | 28 | def __call__(self, img): 29 | for t in self.transforms: 30 | img = t(img) 31 | return img 32 | 33 | def randomize_parameters(self): 34 | for t in self.transforms: 35 | t.randomize_parameters() 36 | 37 | 38 | class ToTensor(object): 39 | """Convert a ``PIL.Image`` or ``numpy.ndarray`` to tensor. 40 | Converts a PIL.Image or numpy.ndarray (H x W x C) in the range 41 | [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0]. 42 | """ 43 | 44 | def __init__(self, norm_value=255): 45 | self.norm_value = norm_value 46 | 47 | def __call__(self, pic): 48 | """ 49 | Args: 50 | pic (PIL.Image or numpy.ndarray): Image to be converted to tensor. 51 | Returns: 52 | Tensor: Converted image. 
53 | """ 54 | if isinstance(pic, np.ndarray): 55 | # handle numpy array 56 | img = torch.from_numpy(pic.transpose((2, 0, 1))) 57 | # backward compatibility 58 | return img.float().div(self.norm_value) 59 | 60 | if accimage is not None and isinstance(pic, accimage.Image): 61 | nppic = np.zeros( 62 | [pic.channels, pic.height, pic.width], dtype=np.float32) 63 | pic.copyto(nppic) 64 | return torch.from_numpy(nppic) 65 | 66 | # handle PIL Image 67 | if pic.mode == 'I': 68 | img = torch.from_numpy(np.array(pic, np.int32, copy=False)) 69 | elif pic.mode == 'I;16': 70 | img = torch.from_numpy(np.array(pic, np.int16, copy=False)) 71 | else: 72 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes())) 73 | # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK 74 | if pic.mode == 'YCbCr': 75 | nchannel = 3 76 | elif pic.mode == 'I;16': 77 | nchannel = 1 78 | else: 79 | nchannel = len(pic.mode) 80 | img = img.view(pic.size[1], pic.size[0], nchannel) 81 | # put it from HWC to CHW format 82 | # yikes, this transpose takes 80% of the loading time/CPU 83 | img = img.transpose(0, 1).transpose(0, 2).contiguous() 84 | if isinstance(img, torch.ByteTensor): 85 | return img.float().div(self.norm_value) 86 | else: 87 | return img 88 | 89 | def randomize_parameters(self): 90 | pass 91 | 92 | 93 | class Normalize(object): 94 | """Normalize an tensor image with mean and standard deviation. 95 | Given mean: (R, G, B) and std: (R, G, B), 96 | will normalize each channel of the torch.*Tensor, i.e. 97 | channel = (channel - mean) / std 98 | Args: 99 | mean (sequence): Sequence of means for R, G, B channels respecitvely. 100 | std (sequence): Sequence of standard deviations for R, G, B channels 101 | respecitvely. 102 | """ 103 | 104 | def __init__(self, mean, std): 105 | self.mean = mean 106 | self.std = std 107 | 108 | def __call__(self, tensor): 109 | """ 110 | Args: 111 | tensor (Tensor): Tensor image of size (C, H, W) to be normalized. 112 | Returns: 113 | Tensor: Normalized image. 114 | """ 115 | # TODO: make efficient 116 | for t, m, s in zip(tensor, self.mean, self.std): 117 | t.sub_(m).div_(s) 118 | return tensor 119 | 120 | def randomize_parameters(self): 121 | pass 122 | 123 | 124 | class Scale(object): 125 | """Rescale the input PIL.Image to the given size. 126 | Args: 127 | size (sequence or int): Desired output size. If size is a sequence like 128 | (w, h), output size will be matched to this. If size is an int, 129 | smaller edge of the image will be matched to this number. 130 | i.e, if height > width, then image will be rescaled to 131 | (size * height / width, size) 132 | interpolation (int, optional): Desired interpolation. Default is 133 | ``PIL.Image.BILINEAR`` 134 | """ 135 | 136 | def __init__(self, size, interpolation=Image.BILINEAR): 137 | assert isinstance(size, 138 | int) or (isinstance(size, collections.Iterable) and 139 | len(size) == 2) 140 | self.size = size 141 | self.interpolation = interpolation 142 | 143 | def __call__(self, img): 144 | """ 145 | Args: 146 | img (PIL.Image): Image to be scaled. 147 | Returns: 148 | PIL.Image: Rescaled image. 
149 | """ 150 | if isinstance(self.size, int): 151 | w, h = img.size 152 | if (w <= h and w == self.size) or (h <= w and h == self.size): 153 | return img 154 | if w < h: 155 | ow = self.size 156 | oh = int(self.size * h / w) 157 | return img.resize((ow, oh), self.interpolation) 158 | else: 159 | oh = self.size 160 | ow = int(self.size * w / h) 161 | return img.resize((ow, oh), self.interpolation) 162 | else: 163 | return img.resize(self.size, self.interpolation) 164 | 165 | def randomize_parameters(self): 166 | pass 167 | 168 | 169 | class CenterCrop(object): 170 | """Crops the given PIL.Image at the center. 171 | Args: 172 | size (sequence or int): Desired output size of the crop. If size is an 173 | int instead of sequence like (h, w), a square crop (size, size) is 174 | made. 175 | """ 176 | 177 | def __init__(self, size): 178 | if isinstance(size, numbers.Number): 179 | self.size = (int(size), int(size)) 180 | else: 181 | self.size = size 182 | 183 | def __call__(self, img): 184 | """ 185 | Args: 186 | img (PIL.Image): Image to be cropped. 187 | Returns: 188 | PIL.Image: Cropped image. 189 | """ 190 | w, h = img.size 191 | th, tw = self.size 192 | x1 = int(round((w - tw) / 2.)) 193 | y1 = int(round((h - th) / 2.)) 194 | return img.crop((x1, y1, x1 + tw, y1 + th)) 195 | 196 | def randomize_parameters(self): 197 | pass 198 | 199 | 200 | class CornerCrop(object): 201 | 202 | def __init__(self, size, crop_position=None): 203 | self.size = size 204 | if crop_position is None: 205 | self.randomize = True 206 | else: 207 | self.randomize = False 208 | self.crop_position = crop_position 209 | self.crop_positions = ['c', 'tl', 'tr', 'bl', 'br'] 210 | 211 | def __call__(self, img): 212 | image_width = img.size[0] 213 | image_height = img.size[1] 214 | 215 | if self.crop_position == 'c': 216 | th, tw = (self.size, self.size) 217 | x1 = int(round((image_width - tw) / 2.)) 218 | y1 = int(round((image_height - th) / 2.)) 219 | x2 = x1 + tw 220 | y2 = y1 + th 221 | elif self.crop_position == 'tl': 222 | x1 = 0 223 | y1 = 0 224 | x2 = self.size 225 | y2 = self.size 226 | elif self.crop_position == 'tr': 227 | x1 = image_width - self.size 228 | y1 = 0 229 | x2 = image_width 230 | y2 = self.size 231 | elif self.crop_position == 'bl': 232 | x1 = 0 233 | y1 = image_height - self.size 234 | x2 = self.size 235 | y2 = image_height 236 | elif self.crop_position == 'br': 237 | x1 = image_width - self.size 238 | y1 = image_height - self.size 239 | x2 = image_width 240 | y2 = image_height 241 | 242 | img = img.crop((x1, y1, x2, y2)) 243 | 244 | return img 245 | 246 | def randomize_parameters(self): 247 | if self.randomize: 248 | self.crop_position = self.crop_positions[random.randint( 249 | 0, 250 | len(self.crop_positions) - 1)] 251 | 252 | 253 | class RandomHorizontalFlip(object): 254 | """Horizontally flip the given PIL.Image randomly with a probability of 0.5.""" 255 | 256 | def __call__(self, img): 257 | """ 258 | Args: 259 | img (PIL.Image): Image to be flipped. 260 | Returns: 261 | PIL.Image: Randomly flipped image. 262 | """ 263 | if self.p < 0.5: 264 | return img.transpose(Image.FLIP_LEFT_RIGHT) 265 | return img 266 | 267 | def randomize_parameters(self): 268 | self.p = random.random() 269 | 270 | 271 | class MultiScaleCornerCrop(object): 272 | """Crop the given PIL.Image to randomly selected size. 273 | A crop of size is selected from scales of the original size. 274 | A position of cropping is randomly selected from 4 corners and 1 center. 275 | This crop is finally resized to given size. 
276 | Args: 277 | scales: cropping scales of the original size 278 | size: size of the smaller edge 279 | interpolation: Default: PIL.Image.BILINEAR 280 | """ 281 | 282 | def __init__(self, 283 | scales, 284 | size, 285 | interpolation=Image.BILINEAR, 286 | crop_positions=['c', 'tl', 'tr', 'bl', 'br']): 287 | self.scales = scales 288 | self.size = size 289 | self.interpolation = interpolation 290 | 291 | self.crop_positions = crop_positions 292 | 293 | def __call__(self, img): 294 | min_length = min(img.size[0], img.size[1]) 295 | crop_size = int(min_length * self.scale) 296 | 297 | image_width = img.size[0] 298 | image_height = img.size[1] 299 | 300 | if self.crop_position == 'c': 301 | center_x = image_width // 2 302 | center_y = image_height // 2 303 | box_half = crop_size // 2 304 | x1 = center_x - box_half 305 | y1 = center_y - box_half 306 | x2 = center_x + box_half 307 | y2 = center_y + box_half 308 | elif self.crop_position == 'tl': 309 | x1 = 0 310 | y1 = 0 311 | x2 = crop_size 312 | y2 = crop_size 313 | elif self.crop_position == 'tr': 314 | x1 = image_width - crop_size 315 | y1 = 0 316 | x2 = image_width 317 | y2 = crop_size 318 | elif self.crop_position == 'bl': 319 | x1 = 0 320 | y1 = image_height - crop_size 321 | x2 = crop_size 322 | y2 = image_height 323 | elif self.crop_position == 'br': 324 | x1 = image_width - crop_size 325 | y1 = image_height - crop_size 326 | x2 = image_width 327 | y2 = image_height 328 | 329 | img = img.crop((x1, y1, x2, y2)) 330 | 331 | return img.resize((self.size, self.size), self.interpolation) 332 | 333 | def randomize_parameters(self): 334 | self.scale = self.scales[random.randint(0, len(self.scales) - 1)] 335 | self.crop_position = self.crop_positions[random.randint( 336 | 0, 337 | len(self.crop_positions) - 1)] 338 | 339 | 340 | class MultiScaleRandomCrop(object): 341 | 342 | def __init__(self, scales, size, interpolation=Image.BILINEAR): 343 | self.scales = scales 344 | self.size = size 345 | self.interpolation = interpolation 346 | 347 | def __call__(self, img): 348 | min_length = min(img.size[0], img.size[1]) 349 | crop_size = int(min_length * self.scale) 350 | 351 | image_width = img.size[0] 352 | image_height = img.size[1] 353 | 354 | x1 = self.tl_x * (image_width - crop_size) 355 | y1 = self.tl_y * (image_height - crop_size) 356 | x2 = x1 + crop_size 357 | y2 = y1 + crop_size 358 | 359 | img = img.crop((x1, y1, x2, y2)) 360 | 361 | return img.resize((self.size, self.size), self.interpolation) 362 | 363 | def randomize_parameters(self): 364 | self.scale = self.scales[random.randint(0, len(self.scales) - 1)] 365 | self.tl_x = random.random() 366 | self.tl_y = random.random() 367 | -------------------------------------------------------------------------------- /libs/target_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | 4 | 5 | class Compose(object): 6 | 7 | def __init__(self, transforms): 8 | self.transforms = transforms 9 | 10 | def __call__(self, target): 11 | dst = [] 12 | for t in self.transforms: 13 | dst.append(t(target)) 14 | return dst 15 | 16 | 17 | class ClassLabel(object): 18 | 19 | def __call__(self, target): 20 | return target['label'] 21 | 22 | 23 | class VideoID(object): 24 | 25 | def __call__(self, target): 26 | return target['video_id'] 27 | -------------------------------------------------------------------------------- /libs/temporal_transforms.py: -------------------------------------------------------------------------------- 1 | 
import random 2 | import math 3 | 4 | 5 | class LoopPadding(object): 6 | 7 | def __init__(self, size): 8 | self.size = size 9 | 10 | def __call__(self, frame_indices): 11 | out = frame_indices 12 | 13 | for index in out: 14 | if len(out) >= self.size: 15 | break 16 | out.append(index) 17 | 18 | return out 19 | 20 | 21 | class TemporalBeginCrop(object): 22 | """Temporally crop the given frame indices at a beginning. 23 | 24 | If the number of frames is less than the size, 25 | loop the indices as many times as necessary to satisfy the size. 26 | 27 | Args: 28 | size (int): Desired output size of the crop. 29 | """ 30 | 31 | def __init__(self, size): 32 | self.size = size 33 | 34 | def __call__(self, frame_indices): 35 | out = frame_indices[:self.size] 36 | 37 | for index in out: 38 | if len(out) >= self.size: 39 | break 40 | out.append(index) 41 | 42 | return out 43 | 44 | 45 | class TemporalCenterCrop(object): 46 | """Temporally crop the given frame indices at a center. 47 | 48 | If the number of frames is less than the size, 49 | loop the indices as many times as necessary to satisfy the size. 50 | 51 | Args: 52 | size (int): Desired output size of the crop. 53 | """ 54 | 55 | def __init__(self, size): 56 | self.size = size 57 | 58 | def __call__(self, frame_indices): 59 | """ 60 | Args: 61 | frame_indices (list): frame indices to be cropped. 62 | Returns: 63 | list: Cropped frame indices. 64 | """ 65 | 66 | center_index = len(frame_indices) // 2 67 | begin_index = max(0, center_index - (self.size // 2)) 68 | end_index = min(begin_index + self.size, len(frame_indices)) 69 | 70 | out = frame_indices[begin_index:end_index] 71 | 72 | for index in out: 73 | if len(out) >= self.size: 74 | break 75 | out.append(index) 76 | 77 | return out 78 | 79 | 80 | class TemporalRandomCrop(object): 81 | """Temporally crop the given frame indices at a random location. 82 | 83 | If the number of frames is less than the size, 84 | loop the indices as many times as necessary to satisfy the size. 85 | 86 | Args: 87 | size (int): Desired output size of the crop. 88 | """ 89 | 90 | def __init__(self, size): 91 | self.size = size 92 | 93 | def __call__(self, frame_indices): 94 | """ 95 | Args: 96 | frame_indices (list): frame indices to be cropped. 97 | Returns: 98 | list: Cropped frame indices. 
99 | """ 100 | 101 | rand_end = max(0, len(frame_indices) - self.size - 1) 102 | begin_index = random.randint(0, rand_end) 103 | end_index = min(begin_index + self.size, len(frame_indices)) 104 | 105 | out = frame_indices[begin_index:end_index] 106 | 107 | for index in out: 108 | if len(out) >= self.size: 109 | break 110 | out.append(index) 111 | 112 | return out 113 | -------------------------------------------------------------------------------- /libs/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import torch.nn.functional as F 4 | import time 5 | import os 6 | import sys 7 | import json 8 | import pdb 9 | 10 | from libs.utils import AverageMeter 11 | 12 | # PyTorch 0.3 13 | def calculate_video_results(output_buffer, video_id, test_results, class_names): 14 | video_outputs = torch.stack(output_buffer) 15 | average_scores = torch.mean(video_outputs, dim=0) 16 | sorted_scores, locs = torch.topk(average_scores, k=10) 17 | 18 | video_results = [] 19 | for i in range(sorted_scores.size(0)): 20 | video_results.append({ 21 | 'label': class_names[locs[i]], 22 | 'score': sorted_scores[i] 23 | }) 24 | 25 | test_results['results'][video_id] = video_results 26 | 27 | # PyTorch 0.4 28 | def calculate_video_results_pt_0_4(output_buffer, video_id, test_results, class_names): 29 | video_outputs = torch.stack(output_buffer) 30 | average_scores = torch.mean(video_outputs, dim=0) 31 | sorted_scores, locs = torch.topk(average_scores, k=10) 32 | 33 | video_results = [] 34 | for i in range(sorted_scores.size(0)): 35 | video_results.append({ 36 | 'label': class_names[locs[i].item()], 37 | 'score': sorted_scores[i].cpu().numpy().item() 38 | }) 39 | 40 | test_results['results'][video_id] = video_results 41 | 42 | 43 | def test(data_loader, model, opt, class_names): 44 | print('test') 45 | 46 | model.eval() 47 | 48 | # pytroch version check 49 | torch_version = float(torch.__version__[:3]) 50 | 51 | batch_time = AverageMeter() 52 | data_time = AverageMeter() 53 | 54 | end_time = time.time() 55 | output_buffer = [] 56 | previous_video_id = '' 57 | test_results = {'results': {}} 58 | for i, (inputs, targets) in enumerate(data_loader): 59 | data_time.update(time.time() - end_time) 60 | 61 | inputs = Variable(inputs, volatile=True) 62 | outputs = model(inputs) 63 | if not opt.no_softmax_in_test: 64 | outputs = F.softmax(outputs) 65 | 66 | for j in range(outputs.size(0)): 67 | if not (i == 0 and j == 0) and targets[j] != previous_video_id: 68 | if torch_version < 0.4: 69 | calculate_video_results(output_buffer, previous_video_id, 70 | test_results, class_names) 71 | else: 72 | calculate_video_results_pt_0_4(output_buffer, previous_video_id, 73 | test_results, class_names) 74 | output_buffer = [] 75 | output_buffer.append(outputs[j].data.cpu()) 76 | previous_video_id = targets[j] 77 | 78 | if (i % 100) == 0: 79 | with open( 80 | os.path.join(opt.result_path, '{}.json'.format( 81 | opt.test_subset)), 'w') as f: 82 | json.dump(test_results, f) 83 | 84 | batch_time.update(time.time() - end_time) 85 | end_time = time.time() 86 | 87 | print('[{}/{}]\t' 88 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 89 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'.format( 90 | i + 1, 91 | len(data_loader), 92 | batch_time=batch_time, 93 | data_time=data_time)) 94 | with open( 95 | os.path.join(opt.result_path, '{}.json'.format(opt.test_subset)), 96 | 'w') as f: 97 | json.dump(test_results, f) 98 | 
-------------------------------------------------------------------------------- /libs/utils.py: --------------------------------------------------------------------------------
1 | import csv
2 | 
3 | class AverageMeter(object):
4 | """Computes and stores the average and current value"""
5 | 
6 | def __init__(self):
7 | self.reset()
8 | 
9 | def reset(self):
10 | self.val = 0
11 | self.avg = 0
12 | self.sum = 0
13 | self.count = 0
14 | 
15 | def update(self, val, n=1):
16 | self.val = val
17 | self.sum += val * n
18 | self.count += n
19 | self.avg = self.sum / self.count
20 | 
21 | 
22 | class Logger(object):
23 | 
24 | def __init__(self, path, header):
25 | self.log_file = open(path, 'w')
26 | self.logger = csv.writer(self.log_file, delimiter='\t')
27 | 
28 | self.logger.writerow(header)
29 | self.header = header
30 | 
31 | def __del__(self):
32 | self.log_file.close()
33 | 
34 | def log(self, values):
35 | write_values = []
36 | for col in self.header:
37 | assert col in values
38 | write_values.append(values[col])
39 | 
40 | self.logger.writerow(write_values)
41 | self.log_file.flush()
42 | 
43 | 
44 | def load_value_file(file_path):
45 | with open(file_path, 'r') as input_file:
46 | value = float(input_file.read().rstrip('\n\r'))
47 | 
48 | return value
49 | 
50 | 
51 | def calculate_accuracy(outputs, targets):
52 | batch_size = targets.size(0)
53 | 
54 | _, pred = outputs.topk(1, 1, True)
55 | pred = pred.t()
56 | correct = pred.eq(targets.view(1, -1))
57 | n_correct_elems = correct.float().sum().data[0]  # PyTorch 0.3-style indexing; see calculate_accuracy_pt_0_4 below for PyTorch >= 0.4
58 | 
59 | return n_correct_elems / batch_size
60 | 
61 | def calculate_accuracy_pt_0_4(outputs, targets):
62 | batch_size = targets.size(0)
63 | 
64 | _, pred = outputs.topk(1, 1, True)
65 | pred = pred.t()
66 | correct = pred.eq(targets.view(1, -1))
67 | n_correct_elems = correct.float().sum().item()  # PyTorch >= 0.4 uses .item()
68 | 
69 | return n_correct_elems / batch_size
-------------------------------------------------------------------------------- /loss/hloss.py: --------------------------------------------------------------------------------
1 | ## code from https://discuss.pytorch.org/t/calculating-the-entropy-loss/14510
2 | 
3 | from torch import nn
4 | import torch.nn.functional as F
5 | 
6 | class HLoss(nn.Module):
7 | """
8 | Entropy loss: returns the entropy of the input logits, or the negative entropy when is_maximization=True (so that minimizing the loss maximizes the entropy)
9 | """
10 | def __init__(self, is_maximization=False):
11 | super(HLoss, self).__init__()
12 | self.is_neg = is_maximization
13 | 
14 | def forward(self, x):
15 | b = F.softmax(x, dim=1) * F.log_softmax(x, dim=1)
16 | if self.is_neg:
17 | # b = 1.0 * b.sum() # summation over batches
18 | b = 1.0 * b.sum(dim=1).mean() # sum over classes, mean over the batch
19 | else:
20 | # b = -1.0 * b.sum()
21 | b = -1.0 * b.sum(dim=1).mean() # sum over classes, mean over the batch
22 | return b
-------------------------------------------------------------------------------- /loss/soft_cross_entropy.py: --------------------------------------------------------------------------------
1 | ## code from https://discuss.pytorch.org/t/cross-entropy-for-soft-label/16093 and https://discuss.pytorch.org/t/how-should-i-implement-cross-entropy-loss-with-continuous-target-outputs/10720/21
2 | 
3 | import torch
4 | from torch import nn
5 | 
6 | class SoftCrossEntropy(nn.Module):
7 | def __init__(self):
8 | super(SoftCrossEntropy, self).__init__()
9 | return
10 | 
11 | def forward(self, inputs, target):
12 | """
13 | :param inputs: predictions
14 | :param target: soft target label distribution
15 | :return: loss
16 | """
17 | logsoftmax = nn.LogSoftmax(dim=1)
18 | 
19 | return 
torch.mean(torch.sum(- target * logsoftmax(inputs), 1)) -------------------------------------------------------------------------------- /models/densenet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from collections import OrderedDict 5 | import math 6 | 7 | __all__ = [ 8 | 'DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet264' 9 | ] 10 | 11 | 12 | def densenet121(**kwargs): 13 | model = DenseNet( 14 | num_init_features=64, 15 | growth_rate=32, 16 | block_config=(6, 12, 24, 16), 17 | **kwargs) 18 | return model 19 | 20 | 21 | def densenet169(**kwargs): 22 | model = DenseNet( 23 | num_init_features=64, 24 | growth_rate=32, 25 | block_config=(6, 12, 32, 32), 26 | **kwargs) 27 | return model 28 | 29 | 30 | def densenet201(**kwargs): 31 | model = DenseNet( 32 | num_init_features=64, 33 | growth_rate=32, 34 | block_config=(6, 12, 48, 32), 35 | **kwargs) 36 | return model 37 | 38 | 39 | def densenet264(**kwargs): 40 | model = DenseNet( 41 | num_init_features=64, 42 | growth_rate=32, 43 | block_config=(6, 12, 64, 48), 44 | **kwargs) 45 | return model 46 | 47 | 48 | def get_fine_tuning_parameters(model, ft_begin_index): 49 | if ft_begin_index == 0: 50 | return model.parameters() 51 | 52 | ft_module_names = [] 53 | for i in range(ft_begin_index, 5): 54 | ft_module_names.append('denseblock{}'.format(i)) 55 | ft_module_names.append('transition{}'.format(i)) 56 | ft_module_names.append('norm5') 57 | ft_module_names.append('classifier') 58 | 59 | parameters = [] 60 | for k, v in model.named_parameters(): 61 | for ft_module in ft_module_names: 62 | if ft_module in k: 63 | parameters.append({'params': v}) 64 | break 65 | else: 66 | parameters.append({'params': v, 'lr': 0.0}) 67 | 68 | return parameters 69 | 70 | 71 | class _DenseLayer(nn.Sequential): 72 | 73 | def __init__(self, num_input_features, growth_rate, bn_size, drop_rate): 74 | super(_DenseLayer, self).__init__() 75 | self.add_module('norm.1', nn.BatchNorm3d(num_input_features)) 76 | self.add_module('relu.1', nn.ReLU(inplace=True)) 77 | self.add_module('conv.1', 78 | nn.Conv3d( 79 | num_input_features, 80 | bn_size * growth_rate, 81 | kernel_size=1, 82 | stride=1, 83 | bias=False)) 84 | self.add_module('norm.2', nn.BatchNorm3d(bn_size * growth_rate)) 85 | self.add_module('relu.2', nn.ReLU(inplace=True)) 86 | self.add_module('conv.2', 87 | nn.Conv3d( 88 | bn_size * growth_rate, 89 | growth_rate, 90 | kernel_size=3, 91 | stride=1, 92 | padding=1, 93 | bias=False)) 94 | self.drop_rate = drop_rate 95 | 96 | def forward(self, x): 97 | new_features = super(_DenseLayer, self).forward(x) 98 | if self.drop_rate > 0: 99 | new_features = F.dropout( 100 | new_features, p=self.drop_rate, training=self.training) 101 | return torch.cat([x, new_features], 1) 102 | 103 | 104 | class _DenseBlock(nn.Sequential): 105 | 106 | def __init__(self, num_layers, num_input_features, bn_size, growth_rate, 107 | drop_rate): 108 | super(_DenseBlock, self).__init__() 109 | for i in range(num_layers): 110 | layer = _DenseLayer(num_input_features + i * growth_rate, 111 | growth_rate, bn_size, drop_rate) 112 | self.add_module('denselayer%d' % (i + 1), layer) 113 | 114 | 115 | class _Transition(nn.Sequential): 116 | 117 | def __init__(self, num_input_features, num_output_features): 118 | super(_Transition, self).__init__() 119 | self.add_module('norm', nn.BatchNorm3d(num_input_features)) 120 | self.add_module('relu', nn.ReLU(inplace=True)) 121 
| self.add_module('conv', 122 | nn.Conv3d( 123 | num_input_features, 124 | num_output_features, 125 | kernel_size=1, 126 | stride=1, 127 | bias=False)) 128 | self.add_module('pool', nn.AvgPool3d(kernel_size=2, stride=2)) 129 | 130 | 131 | class DenseNet(nn.Module): 132 | """Densenet-BC model class 133 | Args: 134 | growth_rate (int) - how many filters to add each layer (k in paper) 135 | block_config (list of 4 ints) - how many layers in each pooling block 136 | num_init_features (int) - the number of filters to learn in the first convolution layer 137 | bn_size (int) - multiplicative factor for number of bottle neck layers 138 | (i.e. bn_size * k features in the bottleneck layer) 139 | drop_rate (float) - dropout rate after each dense layer 140 | num_classes (int) - number of classification classes 141 | """ 142 | 143 | def __init__(self, 144 | sample_size, 145 | sample_duration, 146 | growth_rate=32, 147 | block_config=(6, 12, 24, 16), 148 | num_init_features=64, 149 | bn_size=4, 150 | drop_rate=0, 151 | num_classes=1000): 152 | 153 | super(DenseNet, self).__init__() 154 | 155 | self.sample_size = sample_size 156 | self.sample_duration = sample_duration 157 | 158 | # First convolution 159 | self.features = nn.Sequential( 160 | OrderedDict([ 161 | ('conv0', 162 | nn.Conv3d( 163 | 3, 164 | num_init_features, 165 | kernel_size=7, 166 | stride=(1, 2, 2), 167 | padding=(3, 3, 3), 168 | bias=False)), 169 | ('norm0', nn.BatchNorm3d(num_init_features)), 170 | ('relu0', nn.ReLU(inplace=True)), 171 | ('pool0', nn.MaxPool3d(kernel_size=3, stride=2, padding=1)), 172 | ])) 173 | 174 | # Each denseblock 175 | num_features = num_init_features 176 | for i, num_layers in enumerate(block_config): 177 | block = _DenseBlock( 178 | num_layers=num_layers, 179 | num_input_features=num_features, 180 | bn_size=bn_size, 181 | growth_rate=growth_rate, 182 | drop_rate=drop_rate) 183 | self.features.add_module('denseblock%d' % (i + 1), block) 184 | num_features = num_features + num_layers * growth_rate 185 | if i != len(block_config) - 1: 186 | trans = _Transition( 187 | num_input_features=num_features, 188 | num_output_features=num_features // 2) 189 | self.features.add_module('transition%d' % (i + 1), trans) 190 | num_features = num_features // 2 191 | 192 | # Final batch norm 193 | self.features.add_module('norm5', nn.BatchNorm2d(num_features)) 194 | 195 | for m in self.modules(): 196 | if isinstance(m, nn.Conv3d): 197 | m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out') 198 | elif isinstance(m, nn.BatchNorm3d) or isinstance(m, nn.BatchNorm2d): 199 | m.weight.data.fill_(1) 200 | m.bias.data.zero_() 201 | 202 | # Linear layer 203 | self.classifier = nn.Linear(num_features, num_classes) 204 | 205 | def forward(self, x): 206 | features = self.features(x) 207 | out = F.relu(features, inplace=True) 208 | last_duration = int(math.ceil(self.sample_duration / 16)) 209 | last_size = int(math.floor(self.sample_size / 32)) 210 | out = F.avg_pool3d( 211 | out, kernel_size=(last_duration, last_size, last_size)).view( 212 | features.size(0), -1) 213 | out = self.classifier(out) 214 | return out 215 | -------------------------------------------------------------------------------- /models/grad_reversal.py: -------------------------------------------------------------------------------- 1 | ## code from https://github.com/jindongwang/transferlearning/tree/master/code/deep/DANN(RevGrad) 2 | ## original paper: Ganin Y, Lempitsky V. Unsupervised domain adaptation by backpropagation. ICML 2015. 
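## Usage note (illustrative; not part of the original file): ReverseLayerF below acts as
## the identity in the forward pass, and in the backward pass it multiplies the incoming
## gradient by -alpha. It is applied functionally inside the networks, e.g.
##     reversed_features = ReverseLayerF.apply(features, alpha)
## so the backbone that produced `features` is updated to confuse whatever classifier
## consumes `reversed_features` (the place / human-mask MLP heads in models/resnet.py).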
3 | 4 | from torch.autograd import Function 5 | 6 | class ReverseLayerF(Function): 7 | @staticmethod 8 | def forward(ctx, x, alpha): 9 | ctx.alpha = alpha 10 | return x.view_as(x) 11 | 12 | @staticmethod 13 | def backward(ctx, grad_output): 14 | output = grad_output.neg() * ctx.alpha 15 | return output, None 16 | -------------------------------------------------------------------------------- /models/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from models import resnet, pre_act_resnet, wide_resnet, resnext, densenet, vgg 5 | import pdb 6 | 7 | def generate_model(opt): 8 | assert opt.model in [ 9 | 'resnet', 'preresnet', 'wideresnet', 'resnext', 'densenet', 'vgg' 10 | ] 11 | 12 | if opt.model == 'resnet': 13 | assert opt.model_depth in [10, 18, 34, 50, 101, 152, 200] 14 | 15 | from models.resnet import get_fine_tuning_parameters, get_adv_fine_tuning_parameters 16 | 17 | if opt.model_depth == 10: 18 | model = resnet.resnet10( 19 | num_classes=opt.n_classes, 20 | shortcut_type=opt.resnet_shortcut, 21 | sample_size=opt.sample_size, 22 | sample_duration=opt.sample_duration, 23 | is_adv=opt.is_place_adv, 24 | is_human_mask_adv=opt.is_mask_adv, 25 | alpha=opt.alpha, 26 | alpha_hm=opt.alpha_hm, 27 | num_places_classes=opt.num_places_classes, 28 | num_place_hidden_layers=opt.num_place_hidden_layers, 29 | num_human_mask_adv_hidden_layers=opt.num_human_mask_adv_hidden_layers) 30 | elif opt.model_depth == 18: 31 | model = resnet.resnet18( 32 | num_classes=opt.n_classes, 33 | shortcut_type=opt.resnet_shortcut, 34 | sample_size=opt.sample_size, 35 | sample_duration=opt.sample_duration, 36 | is_adv=opt.is_place_adv, 37 | is_human_mask_adv=opt.is_mask_adv, 38 | alpha=opt.alpha, 39 | alpha_hm=opt.alpha_hm, 40 | num_places_classes=opt.num_places_classes, 41 | num_place_hidden_layers=opt.num_place_hidden_layers, 42 | num_human_mask_adv_hidden_layers=opt.num_human_mask_adv_hidden_layers) 43 | elif opt.model_depth == 34: 44 | model = resnet.resnet34( 45 | num_classes=opt.n_classes, 46 | shortcut_type=opt.resnet_shortcut, 47 | sample_size=opt.sample_size, 48 | sample_duration=opt.sample_duration, 49 | is_adv=opt.is_place_adv, 50 | is_human_mask_adv=opt.is_mask_adv, 51 | alpha=opt.alpha, 52 | alpha_hm=opt.alpha_hm, 53 | num_places_classes=opt.num_places_classes, 54 | num_place_hidden_layers=opt.num_place_hidden_layers, 55 | num_human_mask_adv_hidden_layers=opt.num_human_mask_adv_hidden_layers) 56 | elif opt.model_depth == 50: 57 | model = resnet.resnet50( 58 | num_classes=opt.n_classes, 59 | shortcut_type=opt.resnet_shortcut, 60 | sample_size=opt.sample_size, 61 | sample_duration=opt.sample_duration, 62 | is_adv=opt.is_place_adv, 63 | is_human_mask_adv=opt.is_mask_adv, 64 | alpha=opt.alpha, 65 | alpha_hm=opt.alpha_hm, 66 | num_places_classes=opt.num_places_classes, 67 | num_place_hidden_layers=opt.num_place_hidden_layers, 68 | num_human_mask_adv_hidden_layers=opt.num_human_mask_adv_hidden_layers) 69 | elif opt.model_depth == 101: 70 | model = resnet.resnet101( 71 | num_classes=opt.n_classes, 72 | shortcut_type=opt.resnet_shortcut, 73 | sample_size=opt.sample_size, 74 | sample_duration=opt.sample_duration, 75 | is_adv=opt.is_place_adv, 76 | is_human_mask_adv=opt.is_mask_adv, 77 | alpha=opt.alpha, 78 | alpha_hm=opt.alpha_hm, 79 | num_places_classes=opt.num_places_classes, 80 | num_place_hidden_layers=opt.num_place_hidden_layers, 81 | num_human_mask_adv_hidden_layers=opt.num_human_mask_adv_hidden_layers) 82 | elif 
opt.model_depth == 152: 83 | model = resnet.resnet152( 84 | num_classes=opt.n_classes, 85 | shortcut_type=opt.resnet_shortcut, 86 | sample_size=opt.sample_size, 87 | sample_duration=opt.sample_duration, 88 | is_adv=opt.is_place_adv, 89 | is_human_mask_adv=opt.is_mask_adv, 90 | alpha=opt.alpha, 91 | alpha_hm=opt.alpha_hm, 92 | num_places_classes=opt.num_places_classes, 93 | num_place_hidden_layers=opt.num_place_hidden_layers, 94 | num_human_mask_adv_hidden_layers=opt.num_human_mask_adv_hidden_layers) 95 | elif opt.model_depth == 200: 96 | model = resnet.resnet200( 97 | num_classes=opt.n_classes, 98 | shortcut_type=opt.resnet_shortcut, 99 | sample_size=opt.sample_size, 100 | sample_duration=opt.sample_duration, 101 | is_adv=opt.is_place_adv, 102 | is_human_mask_adv=opt.is_mask_adv, 103 | alpha=opt.alpha, 104 | alpha_hm=opt.alpha_hm, 105 | num_places_classes=opt.num_places_classes, 106 | num_place_hidden_layers=opt.num_place_hidden_layers, 107 | num_human_mask_adv_hidden_layers=opt.num_human_mask_adv_hidden_layers) 108 | elif opt.model == 'wideresnet': 109 | assert opt.model_depth in [50] 110 | 111 | from models.wide_resnet import get_fine_tuning_parameters 112 | 113 | if opt.model_depth == 50: 114 | model = wide_resnet.resnet50( 115 | num_classes=opt.n_classes, 116 | shortcut_type=opt.resnet_shortcut, 117 | k=opt.wide_resnet_k, 118 | sample_size=opt.sample_size, 119 | sample_duration=opt.sample_duration) 120 | elif opt.model == 'resnext': 121 | assert opt.model_depth in [50, 101, 152] 122 | 123 | from models.resnext import get_fine_tuning_parameters 124 | 125 | if opt.model_depth == 50: 126 | model = resnext.resnet50( 127 | num_classes=opt.n_classes, 128 | shortcut_type=opt.resnet_shortcut, 129 | cardinality=opt.resnext_cardinality, 130 | sample_size=opt.sample_size, 131 | sample_duration=opt.sample_duration) 132 | elif opt.model_depth == 101: 133 | model = resnext.resnet101( 134 | num_classes=opt.n_classes, 135 | shortcut_type=opt.resnet_shortcut, 136 | cardinality=opt.resnext_cardinality, 137 | sample_size=opt.sample_size, 138 | sample_duration=opt.sample_duration) 139 | elif opt.model_depth == 152: 140 | model = resnext.resnet152( 141 | num_classes=opt.n_classes, 142 | shortcut_type=opt.resnet_shortcut, 143 | cardinality=opt.resnext_cardinality, 144 | sample_size=opt.sample_size, 145 | sample_duration=opt.sample_duration) 146 | elif opt.model == 'preresnet': 147 | assert opt.model_depth in [18, 34, 50, 101, 152, 200] 148 | 149 | from models.pre_act_resnet import get_fine_tuning_parameters 150 | 151 | if opt.model_depth == 18: 152 | model = pre_act_resnet.resnet18( 153 | num_classes=opt.n_classes, 154 | shortcut_type=opt.resnet_shortcut, 155 | sample_size=opt.sample_size, 156 | sample_duration=opt.sample_duration) 157 | elif opt.model_depth == 34: 158 | model = pre_act_resnet.resnet34( 159 | num_classes=opt.n_classes, 160 | shortcut_type=opt.resnet_shortcut, 161 | sample_size=opt.sample_size, 162 | sample_duration=opt.sample_duration) 163 | elif opt.model_depth == 50: 164 | model = pre_act_resnet.resnet50( 165 | num_classes=opt.n_classes, 166 | shortcut_type=opt.resnet_shortcut, 167 | sample_size=opt.sample_size, 168 | sample_duration=opt.sample_duration) 169 | elif opt.model_depth == 101: 170 | model = pre_act_resnet.resnet101( 171 | num_classes=opt.n_classes, 172 | shortcut_type=opt.resnet_shortcut, 173 | sample_size=opt.sample_size, 174 | sample_duration=opt.sample_duration) 175 | elif opt.model_depth == 152: 176 | model = pre_act_resnet.resnet152( 177 | num_classes=opt.n_classes, 178 | 
shortcut_type=opt.resnet_shortcut, 179 | sample_size=opt.sample_size, 180 | sample_duration=opt.sample_duration) 181 | elif opt.model_depth == 200: 182 | model = pre_act_resnet.resnet200( 183 | num_classes=opt.n_classes, 184 | shortcut_type=opt.resnet_shortcut, 185 | sample_size=opt.sample_size, 186 | sample_duration=opt.sample_duration) 187 | elif opt.model == 'densenet': 188 | assert opt.model_depth in [121, 169, 201, 264] 189 | 190 | from models.densenet import get_fine_tuning_parameters 191 | 192 | if opt.model_depth == 121: 193 | model = densenet.densenet121( 194 | num_classes=opt.n_classes, 195 | sample_size=opt.sample_size, 196 | sample_duration=opt.sample_duration) 197 | elif opt.model_depth == 169: 198 | model = densenet.densenet169( 199 | num_classes=opt.n_classes, 200 | sample_size=opt.sample_size, 201 | sample_duration=opt.sample_duration) 202 | elif opt.model_depth == 201: 203 | model = densenet.densenet201( 204 | num_classes=opt.n_classes, 205 | sample_size=opt.sample_size, 206 | sample_duration=opt.sample_duration) 207 | elif opt.model_depth == 264: 208 | model = densenet.densenet264( 209 | num_classes=opt.n_classes, 210 | sample_size=opt.sample_size, 211 | sample_duration=opt.sample_duration) 212 | elif opt.model == 'vgg': 213 | 214 | from models.vgg import get_fine_tuning_parameters, get_adv_fine_tuning_parameters 215 | 216 | model = vgg.build_vgg( 217 | num_classes=opt.n_classes, 218 | is_adv=opt.is_place_adv, 219 | is_human_mask_adv=opt.is_mask_adv, 220 | alpha=opt.alpha, 221 | alpha_hm=opt.alpha_hm, 222 | num_places_classes=opt.num_places_classes, 223 | num_place_hidden_layers=opt.num_place_hidden_layers, 224 | num_human_mask_adv_hidden_layers=opt.num_human_mask_adv_hidden_layers) 225 | 226 | if not opt.no_cuda: 227 | model = model.cuda() 228 | model = nn.DataParallel(model, device_ids=None) 229 | 230 | if opt.pretrain_path: 231 | print('loading pretrained model {}'.format(opt.pretrain_path)) 232 | pretrain = torch.load(opt.pretrain_path) 233 | 234 | if opt.model != 'vgg': 235 | assert opt.arch == pretrain['arch'] 236 | # else: 237 | # pdb.set_trace() 238 | # pdb.set_trace() 239 | 240 | # model.load_state_dict(pretrain['state_dict']) 241 | model_dict = model.state_dict() 242 | 243 | # 1. filter out unnecessary keys and the last fc layers' weights 244 | pretrained_dict = dict() 245 | if 'state_dict' in pretrain: 246 | for k,v in pretrain['state_dict'].items(): 247 | if ((k in model_dict) and (v.shape == model_dict[k].shape)): 248 | pretrained_dict[k] = v 249 | else: 250 | for k,v in pretrain.items(): 251 | new_k = 'module.vgg.'+ k 252 | if ((new_k in model_dict) and (v.shape == model_dict[new_k].shape)): 253 | pretrained_dict[new_k] = v 254 | # 2. overwrite entries in the existing state dict 255 | model_dict.update(pretrained_dict) 256 | # 3. 
load the new state dict 257 | model.load_state_dict(model_dict) 258 | 259 | if not opt.not_replace_last_fc: 260 | if opt.model == 'densenet': 261 | model.module.classifier = nn.Linear( 262 | model.module.classifier.in_features, opt.n_finetune_classes) 263 | model.module.classifier = model.module.classifier.cuda() 264 | else: 265 | model.module.fc = nn.Linear(model.module.fc.in_features, 266 | opt.n_finetune_classes) 267 | model.module.fc = model.module.fc.cuda() 268 | 269 | if opt.is_place_adv or opt.is_mask_cross_entropy or opt.is_mask_entropy: 270 | # pdb.set_trace() 271 | parameters = get_adv_fine_tuning_parameters(model, opt.ft_begin_index, opt.new_layer_lr, not_replace_last_fc=opt.not_replace_last_fc, is_human_mask_adv=opt.is_mask_adv, slower_place_mlp=opt.slower_place_mlp, slower_hm_mlp=opt.slower_hm_mlp) 272 | else: 273 | parameters = get_fine_tuning_parameters(model, opt.ft_begin_index) 274 | 275 | return model, parameters 276 | else: 277 | if opt.pretrain_path: 278 | print('loading pretrained model {}'.format(opt.pretrain_path)) 279 | pretrain = torch.load(opt.pretrain_path) 280 | 281 | if opt.model != 'vgg': 282 | assert opt.arch == pretrain['arch'] 283 | # else: 284 | # pdb.set_trace() 285 | 286 | # model.load_state_dict(pretrain['state_dict']) 287 | model_dict = model.state_dict() 288 | 289 | # 1. filter out unnecessary keys and the last fc layers' weights 290 | pretrained_dict = dict() 291 | if 'state_dict' in pretrain: 292 | for k,v in pretrain['state_dict'].items(): 293 | if ((k in model_dict) and (v.shape == model_dict[k].shape)): 294 | pretrained_dict[k] = v 295 | else: 296 | for k,v in pretrain.items(): 297 | new_k = 'module.vgg.'+ k 298 | if ((new_k in model_dict) and (v.shape == model_dict[new_k].shape)): 299 | pretrained_dict[new_k] = v 300 | # 2. overwrite entries in the existing state dict 301 | model_dict.update(pretrained_dict) 302 | # 3. 
load the new state dict 303 | model.load_state_dict(model_dict) 304 | 305 | if not opt.not_replace_last_fc: 306 | if opt.model == 'densenet': 307 | model.classifier = nn.Linear( 308 | model.classifier.in_features, opt.n_finetune_classes) 309 | else: 310 | model.fc = nn.Linear(model.fc.in_features, 311 | opt.n_finetune_classes) 312 | 313 | if opt.is_place_adv: 314 | parameters = get_adv_fine_tuning_parameters(model, opt.ft_begin_index, opt.new_layer_lr, not_replace_last_fc=opt.not_replace_last_fc, is_human_mask_adv=opt.is_mask_adv, slower_place_mlp=opt.slower_place_mlp, slower_hm_mlp=opt.slower_hm_mlp) 315 | else: 316 | parameters = get_fine_tuning_parameters(model, opt.ft_begin_index) 317 | return model, parameters 318 | 319 | return model, model.parameters() 320 | -------------------------------------------------------------------------------- /models/pre_act_resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = [ 9 | 'PreActivationResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 10 | 'resnet152', 'resnet200' 11 | ] 12 | 13 | 14 | def conv3x3x3(in_planes, out_planes, stride=1): 15 | # 3x3x3 convolution with padding 16 | return nn.Conv3d( 17 | in_planes, 18 | out_planes, 19 | kernel_size=3, 20 | stride=stride, 21 | padding=1, 22 | bias=False) 23 | 24 | 25 | def downsample_basic_block(x, planes, stride): 26 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 27 | zero_pads = torch.Tensor( 28 | out.size(0), planes - out.size(1), out.size(2), out.size(3), 29 | out.size(4)).zero_() 30 | if isinstance(out.data, torch.cuda.FloatTensor): 31 | zero_pads = zero_pads.cuda() 32 | 33 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 34 | 35 | return out 36 | 37 | 38 | class PreActivationBasicBlock(nn.Module): 39 | expansion = 1 40 | 41 | def __init__(self, inplanes, planes, stride=1, downsample=None): 42 | super(PreActivationBasicBlock, self).__init__() 43 | self.bn1 = nn.BatchNorm3d(inplanes) 44 | self.conv1 = conv3x3x3(inplanes, planes, stride) 45 | self.bn2 = nn.BatchNorm3d(planes) 46 | self.conv2 = conv3x3x3(planes, planes) 47 | self.relu = nn.ReLU(inplace=True) 48 | self.downsample = downsample 49 | self.stride = stride 50 | 51 | def forward(self, x): 52 | residual = x 53 | 54 | out = self.bn1(x) 55 | out = self.relu(out) 56 | out = self.conv1(out) 57 | 58 | out = self.bn2(out) 59 | out = self.relu(out) 60 | out = self.conv2(out) 61 | 62 | if self.downsample is not None: 63 | residual = self.downsample(x) 64 | 65 | out += residual 66 | 67 | return out 68 | 69 | 70 | class PreActivationBottleneck(nn.Module): 71 | expansion = 4 72 | 73 | def __init__(self, inplanes, planes, stride=1, downsample=None): 74 | super(PreActivationBottleneck, self).__init__() 75 | self.bn1 = nn.BatchNorm3d(inplanes) 76 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 77 | self.bn2 = nn.BatchNorm3d(planes) 78 | self.conv2 = nn.Conv3d( 79 | planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 80 | self.bn3 = nn.BatchNorm3d(planes) 81 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False) 82 | self.relu = nn.ReLU(inplace=True) 83 | self.downsample = downsample 84 | self.stride = stride 85 | 86 | def forward(self, x): 87 | residual = x 88 | 89 | out = self.bn1(x) 90 | out = self.relu(out) 91 | out = self.conv1(out) 92 | 93 | out = 
self.bn2(out) 94 | out = self.relu(out) 95 | out = self.conv2(out) 96 | 97 | out = self.bn3(out) 98 | out = self.relu(out) 99 | out = self.conv3(out) 100 | 101 | if self.downsample is not None: 102 | residual = self.downsample(x) 103 | 104 | out += residual 105 | 106 | return out 107 | 108 | 109 | class PreActivationResNet(nn.Module): 110 | 111 | def __init__(self, 112 | block, 113 | layers, 114 | sample_size, 115 | sample_duration, 116 | shortcut_type='B', 117 | num_classes=400): 118 | self.inplanes = 64 119 | super(PreActivationResNet, self).__init__() 120 | self.conv1 = nn.Conv3d( 121 | 3, 122 | 64, 123 | kernel_size=7, 124 | stride=(1, 2, 2), 125 | padding=(3, 3, 3), 126 | bias=False) 127 | self.bn1 = nn.BatchNorm3d(64) 128 | self.relu = nn.ReLU(inplace=True) 129 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 130 | self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type) 131 | self.layer2 = self._make_layer( 132 | block, 128, layers[1], shortcut_type, stride=2) 133 | self.layer3 = self._make_layer( 134 | block, 256, layers[2], shortcut_type, stride=2) 135 | self.layer4 = self._make_layer( 136 | block, 512, layers[3], shortcut_type, stride=2) 137 | last_duration = int(math.ceil(sample_duration / 16)) 138 | last_size = int(math.ceil(sample_size / 32)) 139 | self.avgpool = nn.AvgPool3d( 140 | (last_duration, last_size, last_size), stride=1) 141 | self.fc = nn.Linear(512 * block.expansion, num_classes) 142 | 143 | for m in self.modules(): 144 | if isinstance(m, nn.Conv3d): 145 | m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out') 146 | elif isinstance(m, nn.BatchNorm3d): 147 | m.weight.data.fill_(1) 148 | m.bias.data.zero_() 149 | 150 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 151 | downsample = None 152 | if stride != 1 or self.inplanes != planes * block.expansion: 153 | if shortcut_type == 'A': 154 | downsample = partial( 155 | downsample_basic_block, 156 | planes=planes * block.expansion, 157 | stride=stride) 158 | else: 159 | downsample = nn.Sequential( 160 | nn.Conv3d( 161 | self.inplanes, 162 | planes * block.expansion, 163 | kernel_size=1, 164 | stride=stride, 165 | bias=False), nn.BatchNorm3d(planes * block.expansion)) 166 | 167 | layers = [] 168 | layers.append(block(self.inplanes, planes, stride, downsample)) 169 | self.inplanes = planes * block.expansion 170 | for i in range(1, blocks): 171 | layers.append(block(self.inplanes, planes)) 172 | 173 | return nn.Sequential(*layers) 174 | 175 | def forward(self, x): 176 | x = self.conv1(x) 177 | x = self.bn1(x) 178 | x = self.relu(x) 179 | x = self.maxpool(x) 180 | 181 | x = self.layer1(x) 182 | x = self.layer2(x) 183 | x = self.layer3(x) 184 | x = self.layer4(x) 185 | 186 | x = self.avgpool(x) 187 | 188 | x = x.view(x.size(0), -1) 189 | x = self.fc(x) 190 | 191 | return x 192 | 193 | 194 | def get_fine_tuning_parameters(model, ft_begin_index): 195 | if ft_begin_index == 0: 196 | return model.parameters() 197 | 198 | ft_module_names = [] 199 | for i in range(ft_begin_index, 5): 200 | ft_module_names.append('layer{}'.format(i)) 201 | ft_module_names.append('fc') 202 | 203 | parameters = [] 204 | for k, v in model.named_parameters(): 205 | for ft_module in ft_module_names: 206 | if ft_module in k: 207 | parameters.append({'params': v}) 208 | break 209 | else: 210 | parameters.append({'params': v, 'lr': 0.0}) 211 | 212 | return parameters 213 | 214 | 215 | def resnet18(**kwargs): 216 | """Constructs a ResNet-18 model. 
217 | """ 218 | model = PreActivationResNet(PreActivationBasicBlock, [2, 2, 2, 2], **kwargs) 219 | return model 220 | 221 | 222 | def resnet34(**kwargs): 223 | """Constructs a ResNet-34 model. 224 | """ 225 | model = PreActivationResNet(PreActivationBasicBlock, [3, 4, 6, 3], **kwargs) 226 | return model 227 | 228 | 229 | def resnet50(**kwargs): 230 | """Constructs a ResNet-50 model. 231 | """ 232 | model = PreActivationResNet(PreActivationBottleneck, [3, 4, 6, 3], **kwargs) 233 | return model 234 | 235 | 236 | def resnet101(**kwargs): 237 | """Constructs a ResNet-101 model. 238 | """ 239 | model = PreActivationResNet(PreActivationBottleneck, [3, 4, 23, 3], 240 | **kwargs) 241 | return model 242 | 243 | 244 | def resnet152(**kwargs): 245 | """Constructs a ResNet-101 model. 246 | """ 247 | model = PreActivationResNet(PreActivationBottleneck, [3, 8, 36, 3], 248 | **kwargs) 249 | return model 250 | 251 | 252 | def resnet200(**kwargs): 253 | """Constructs a ResNet-101 model. 254 | """ 255 | model = PreActivationResNet(PreActivationBottleneck, [3, 24, 36, 3], 256 | **kwargs) 257 | return model 258 | -------------------------------------------------------------------------------- /models/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | # adv 8 | from models.grad_reversal import ReverseLayerF 9 | import pdb 10 | __all__ = [ 11 | 'ResNet', 'resnet10', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 12 | 'resnet152', 'resnet200' 13 | ] 14 | 15 | def conv3x3x3(in_planes, out_planes, stride=1): 16 | # 3x3x3 convolution with padding 17 | return nn.Conv3d( 18 | in_planes, 19 | out_planes, 20 | kernel_size=3, 21 | stride=stride, 22 | padding=1, 23 | bias=False) 24 | 25 | 26 | def downsample_basic_block(x, planes, stride): 27 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 28 | zero_pads = torch.Tensor( 29 | out.size(0), planes - out.size(1), out.size(2), out.size(3), 30 | out.size(4)).zero_() 31 | if isinstance(out.data, torch.cuda.FloatTensor): 32 | zero_pads = zero_pads.cuda() 33 | 34 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 35 | 36 | return out 37 | 38 | 39 | class BasicBlock(nn.Module): 40 | expansion = 1 41 | 42 | def __init__(self, inplanes, planes, stride=1, downsample=None): 43 | super(BasicBlock, self).__init__() 44 | self.conv1 = conv3x3x3(inplanes, planes, stride) 45 | self.bn1 = nn.BatchNorm3d(planes) 46 | self.relu = nn.ReLU(inplace=True) 47 | self.conv2 = conv3x3x3(planes, planes) 48 | self.bn2 = nn.BatchNorm3d(planes) 49 | self.downsample = downsample 50 | self.stride = stride 51 | 52 | def forward(self, x): 53 | residual = x 54 | 55 | out = self.conv1(x) 56 | out = self.bn1(out) 57 | out = self.relu(out) 58 | 59 | out = self.conv2(out) 60 | out = self.bn2(out) 61 | 62 | if self.downsample is not None: 63 | residual = self.downsample(x) 64 | 65 | out += residual 66 | out = self.relu(out) 67 | 68 | return out 69 | 70 | 71 | class Bottleneck(nn.Module): 72 | expansion = 4 73 | 74 | def __init__(self, inplanes, planes, stride=1, downsample=None): 75 | super(Bottleneck, self).__init__() 76 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 77 | self.bn1 = nn.BatchNorm3d(planes) 78 | self.conv2 = nn.Conv3d( 79 | planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 80 | self.bn2 = nn.BatchNorm3d(planes) 81 | self.conv3 = 
nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False) 82 | self.bn3 = nn.BatchNorm3d(planes * 4) 83 | self.relu = nn.ReLU(inplace=True) 84 | self.downsample = downsample 85 | self.stride = stride 86 | 87 | def forward(self, x): 88 | residual = x 89 | 90 | out = self.conv1(x) 91 | out = self.bn1(out) 92 | out = self.relu(out) 93 | 94 | out = self.conv2(out) 95 | out = self.bn2(out) 96 | out = self.relu(out) 97 | 98 | out = self.conv3(out) 99 | out = self.bn3(out) 100 | 101 | if self.downsample is not None: 102 | residual = self.downsample(x) 103 | 104 | out += residual 105 | out = self.relu(out) 106 | 107 | return out 108 | 109 | 110 | class MLP_Block(nn.Module): 111 | expansion = 1 112 | 113 | def __init__(self, inplanes, planes): 114 | super(MLP_Block, self).__init__() 115 | self.fc = nn.Linear(inplanes, planes) 116 | self.bn = nn.BatchNorm1d(planes) 117 | self.relu = nn.ReLU(inplace=True) 118 | 119 | def forward(self, x): 120 | out = self.fc(x) 121 | out = self.bn(out) 122 | out = self.relu(out) 123 | 124 | return out 125 | 126 | 127 | class ResNet(nn.Module): 128 | 129 | def __init__(self, 130 | block, 131 | layers, 132 | sample_size, 133 | sample_duration, 134 | shortcut_type='B', 135 | num_classes=400, 136 | is_adv=False, 137 | is_human_mask_adv=False, 138 | alpha=0.0, 139 | alpha_hm=0.0, 140 | num_places_classes=365, 141 | num_place_hidden_layers=1, 142 | num_human_mask_adv_hidden_layers=1): 143 | self.inplanes = 64 144 | 145 | # adv 146 | self.is_adv = is_adv 147 | self.is_human_mask_adv = is_human_mask_adv 148 | self.alpha = alpha 149 | self.alpha_hm = alpha_hm 150 | self.num_places_classes = num_places_classes 151 | 152 | super(ResNet, self).__init__() 153 | self.conv1 = nn.Conv3d( 154 | 3, 155 | 64, 156 | kernel_size=7, 157 | stride=(1, 2, 2), 158 | padding=(3, 3, 3), 159 | bias=False) 160 | self.bn1 = nn.BatchNorm3d(64) 161 | self.relu = nn.ReLU(inplace=True) 162 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 163 | self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type) 164 | self.layer2 = self._make_layer( 165 | block, 128, layers[1], shortcut_type, stride=2) 166 | self.layer3 = self._make_layer( 167 | block, 256, layers[2], shortcut_type, stride=2) 168 | self.layer4 = self._make_layer( 169 | block, 512, layers[3], shortcut_type, stride=2) 170 | last_duration = int(math.ceil(sample_duration / 16)) 171 | last_size = int(math.ceil(sample_size / 32)) 172 | self.avgpool = nn.AvgPool3d( 173 | (last_duration, last_size, last_size), stride=1) 174 | self.fc = nn.Linear(512 * block.expansion, num_classes) 175 | 176 | # human mask adv 177 | if self.is_human_mask_adv: 178 | self.hm_mlp = nn.Sequential() 179 | self.hm_mlp = self._make_mlp_layer(MLP_Block, 512 * block.expansion, 512 * block.expansion, num_human_mask_adv_hidden_layers) 180 | self.hm_mlp.add_module('hm_last_fc', nn.Linear(512 * block.expansion, num_classes)) 181 | 182 | # adv 183 | if self.is_adv: 184 | self.place_mlp = nn.Sequential() 185 | self.place_mlp = self._make_mlp_layer(MLP_Block, 512 * block.expansion, 512 * block.expansion, num_place_hidden_layers) 186 | self.place_mlp.add_module('p_last_fc', nn.Linear(512 * block.expansion, self.num_places_classes)) 187 | 188 | for m in self.modules(): 189 | if isinstance(m, nn.Conv3d): 190 | m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out') 191 | elif isinstance(m, nn.BatchNorm3d): 192 | m.weight.data.fill_(1) 193 | m.bias.data.zero_() 194 | 195 | def _make_mlp_layer(self, block, inplanes, planes, blocks): 196 | layers = [] 197 
| layers.append(block(inplanes, planes)) 198 | for i in range(1, blocks): 199 | layers.append(block(inplanes, planes)) 200 | 201 | return nn.Sequential(*layers) 202 | 203 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 204 | downsample = None 205 | if stride != 1 or self.inplanes != planes * block.expansion: 206 | if shortcut_type == 'A': 207 | downsample = partial( 208 | downsample_basic_block, 209 | planes=planes * block.expansion, 210 | stride=stride) 211 | else: 212 | downsample = nn.Sequential( 213 | nn.Conv3d( 214 | self.inplanes, 215 | planes * block.expansion, 216 | kernel_size=1, 217 | stride=stride, 218 | bias=False), nn.BatchNorm3d(planes * block.expansion)) 219 | 220 | layers = [] 221 | layers.append(block(self.inplanes, planes, stride, downsample)) 222 | self.inplanes = planes * block.expansion 223 | for i in range(1, blocks): 224 | layers.append(block(self.inplanes, planes)) 225 | 226 | return nn.Sequential(*layers) 227 | 228 | def forward(self, x): 229 | x = self.conv1(x) 230 | x = self.bn1(x) 231 | x = self.relu(x) 232 | x = self.maxpool(x) 233 | 234 | x = self.layer1(x) 235 | x = self.layer2(x) 236 | x = self.layer3(x) 237 | x = self.layer4(x) 238 | 239 | x = self.avgpool(x) 240 | 241 | x = x.view(x.size(0), -1) 242 | 243 | # adv 244 | if self.is_human_mask_adv: 245 | rev_x_hm = ReverseLayerF.apply(x, self.alpha_hm) 246 | if self.is_adv: 247 | rev_x = ReverseLayerF.apply(x, self.alpha) 248 | dom_x = self.place_mlp(rev_x) 249 | 250 | x = self.fc(x) 251 | 252 | if self.is_human_mask_adv and self.is_adv: 253 | hm_x = self.hm_mlp(rev_x_hm) 254 | return x, dom_x, hm_x 255 | elif self.is_adv: 256 | return x, dom_x 257 | elif self.is_human_mask_adv: 258 | hm_x = self.hm_mlp(rev_x_hm) 259 | return x, hm_x 260 | else: 261 | return x 262 | 263 | def get_fine_tuning_parameters(model, ft_begin_index): 264 | if ft_begin_index == 0: 265 | return model.parameters() 266 | 267 | ft_module_names = [] 268 | for i in range(ft_begin_index, 5): 269 | ft_module_names.append('layer{}'.format(i)) 270 | ft_module_names.append('fc') 271 | 272 | parameters = [] 273 | for k, v in model.named_parameters(): 274 | for ft_module in ft_module_names: 275 | if ft_module in k: 276 | parameters.append({'params': v}) 277 | break 278 | else: 279 | parameters.append({'params': v, 'lr': 0.0}) 280 | 281 | return parameters 282 | 283 | def get_adv_fine_tuning_parameters(model, ft_begin_index, new_layer_lr, not_replace_last_fc=False, is_human_mask_adv=False, slower_place_mlp=False, slower_hm_mlp=False): 284 | ft_module_names, frozen_module_names = [], [] 285 | 286 | for i in range(0, ft_begin_index): 287 | frozen_module_names.append('layer{}'.format(i)) 288 | for i in range(ft_begin_index, 5): 289 | ft_module_names.append('layer{}'.format(i)) 290 | 291 | new_module_names = [] 292 | 293 | if not slower_place_mlp: 294 | new_module_names.append('place_mlp') 295 | else: 296 | ft_module_names.append('place_mlp') 297 | if is_human_mask_adv: 298 | if not slower_hm_mlp: 299 | new_module_names.append('hm_mlp') 300 | else: 301 | ft_module_names.append('hm_mlp') 302 | if not not_replace_last_fc: 303 | new_module_names.append('fc') 304 | 305 | pretrained_parameters, new_parameters = [], [] 306 | for k, v in model.named_parameters(): 307 | for ft_module in ft_module_names: 308 | if ft_module in k: 309 | print('finetune params:{}'.format(k)) 310 | pretrained_parameters.append(v) 311 | break 312 | else: 313 | for new_module in new_module_names: 314 | if new_module in k: 315 | print('new 
params:{}'.format(k)) 316 | new_parameters.append(v) 317 | break 318 | else: 319 | for frozen_module in frozen_module_names: 320 | if frozen_module in k: 321 | print('frozen:{}'.format(k)) 322 | pretrained_parameters.append(v) 323 | break 324 | else: 325 | print('finetune params:{}'.format(k)) 326 | pretrained_parameters.append(v) 327 | 328 | return [pretrained_parameters, new_parameters] 329 | 330 | def resnet10(**kwargs): 331 | """Constructs a ResNet-18 model. 332 | """ 333 | model = ResNet(BasicBlock, [1, 1, 1, 1], **kwargs) 334 | return model 335 | 336 | 337 | def resnet18(**kwargs): 338 | """Constructs a ResNet-18 model. 339 | """ 340 | model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) 341 | return model 342 | 343 | 344 | def resnet34(**kwargs): 345 | """Constructs a ResNet-34 model. 346 | """ 347 | model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) 348 | return model 349 | 350 | 351 | def resnet50(**kwargs): 352 | """Constructs a ResNet-50 model. 353 | """ 354 | model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) 355 | return model 356 | 357 | 358 | def resnet101(**kwargs): 359 | """Constructs a ResNet-101 model. 360 | """ 361 | model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) 362 | return model 363 | 364 | 365 | def resnet152(**kwargs): 366 | """Constructs a ResNet-101 model. 367 | """ 368 | model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) 369 | return model 370 | 371 | 372 | def resnet200(**kwargs): 373 | """Constructs a ResNet-101 model. 374 | """ 375 | model = ResNet(Bottleneck, [3, 24, 36, 3], **kwargs) 376 | return model -------------------------------------------------------------------------------- /models/resnext.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = ['ResNeXt', 'resnet50', 'resnet101'] 9 | 10 | 11 | def conv3x3x3(in_planes, out_planes, stride=1): 12 | # 3x3x3 convolution with padding 13 | return nn.Conv3d( 14 | in_planes, 15 | out_planes, 16 | kernel_size=3, 17 | stride=stride, 18 | padding=1, 19 | bias=False) 20 | 21 | 22 | def downsample_basic_block(x, planes, stride): 23 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 24 | zero_pads = torch.Tensor( 25 | out.size(0), planes - out.size(1), out.size(2), out.size(3), 26 | out.size(4)).zero_() 27 | if isinstance(out.data, torch.cuda.FloatTensor): 28 | zero_pads = zero_pads.cuda() 29 | 30 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 31 | 32 | return out 33 | 34 | 35 | class ResNeXtBottleneck(nn.Module): 36 | expansion = 2 37 | 38 | def __init__(self, inplanes, planes, cardinality, stride=1, 39 | downsample=None): 40 | super(ResNeXtBottleneck, self).__init__() 41 | mid_planes = cardinality * int(planes / 32) 42 | self.conv1 = nn.Conv3d(inplanes, mid_planes, kernel_size=1, bias=False) 43 | self.bn1 = nn.BatchNorm3d(mid_planes) 44 | self.conv2 = nn.Conv3d( 45 | mid_planes, 46 | mid_planes, 47 | kernel_size=3, 48 | stride=stride, 49 | padding=1, 50 | groups=cardinality, 51 | bias=False) 52 | self.bn2 = nn.BatchNorm3d(mid_planes) 53 | self.conv3 = nn.Conv3d( 54 | mid_planes, planes * self.expansion, kernel_size=1, bias=False) 55 | self.bn3 = nn.BatchNorm3d(planes * self.expansion) 56 | self.relu = nn.ReLU(inplace=True) 57 | self.downsample = downsample 58 | self.stride = stride 59 | 60 | def forward(self, x): 61 | residual = x 62 | 63 | out = self.conv1(x) 64 | out = 
self.bn1(out) 65 | out = self.relu(out) 66 | 67 | out = self.conv2(out) 68 | out = self.bn2(out) 69 | out = self.relu(out) 70 | 71 | out = self.conv3(out) 72 | out = self.bn3(out) 73 | 74 | if self.downsample is not None: 75 | residual = self.downsample(x) 76 | 77 | out += residual 78 | out = self.relu(out) 79 | 80 | return out 81 | 82 | 83 | class ResNeXt(nn.Module): 84 | 85 | def __init__(self, 86 | block, 87 | layers, 88 | sample_size, 89 | sample_duration, 90 | shortcut_type='B', 91 | cardinality=32, 92 | num_classes=400): 93 | self.inplanes = 64 94 | super(ResNeXt, self).__init__() 95 | self.conv1 = nn.Conv3d( 96 | 3, 97 | 64, 98 | kernel_size=7, 99 | stride=(1, 2, 2), 100 | padding=(3, 3, 3), 101 | bias=False) 102 | self.bn1 = nn.BatchNorm3d(64) 103 | self.relu = nn.ReLU(inplace=True) 104 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 105 | self.layer1 = self._make_layer(block, 128, layers[0], shortcut_type, 106 | cardinality) 107 | self.layer2 = self._make_layer( 108 | block, 256, layers[1], shortcut_type, cardinality, stride=2) 109 | self.layer3 = self._make_layer( 110 | block, 512, layers[2], shortcut_type, cardinality, stride=2) 111 | self.layer4 = self._make_layer( 112 | block, 1024, layers[3], shortcut_type, cardinality, stride=2) 113 | last_duration = int(math.ceil(sample_duration / 16)) 114 | last_size = int(math.ceil(sample_size / 32)) 115 | self.avgpool = nn.AvgPool3d( 116 | (last_duration, last_size, last_size), stride=1) 117 | self.fc = nn.Linear(cardinality * 32 * block.expansion, num_classes) 118 | 119 | for m in self.modules(): 120 | if isinstance(m, nn.Conv3d): 121 | m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out') 122 | elif isinstance(m, nn.BatchNorm3d): 123 | m.weight.data.fill_(1) 124 | m.bias.data.zero_() 125 | 126 | def _make_layer(self, 127 | block, 128 | planes, 129 | blocks, 130 | shortcut_type, 131 | cardinality, 132 | stride=1): 133 | downsample = None 134 | if stride != 1 or self.inplanes != planes * block.expansion: 135 | if shortcut_type == 'A': 136 | downsample = partial( 137 | downsample_basic_block, 138 | planes=planes * block.expansion, 139 | stride=stride) 140 | else: 141 | downsample = nn.Sequential( 142 | nn.Conv3d( 143 | self.inplanes, 144 | planes * block.expansion, 145 | kernel_size=1, 146 | stride=stride, 147 | bias=False), nn.BatchNorm3d(planes * block.expansion)) 148 | 149 | layers = [] 150 | layers.append( 151 | block(self.inplanes, planes, cardinality, stride, downsample)) 152 | self.inplanes = planes * block.expansion 153 | for i in range(1, blocks): 154 | layers.append(block(self.inplanes, planes, cardinality)) 155 | 156 | return nn.Sequential(*layers) 157 | 158 | def forward(self, x): 159 | x = self.conv1(x) 160 | x = self.bn1(x) 161 | x = self.relu(x) 162 | x = self.maxpool(x) 163 | 164 | x = self.layer1(x) 165 | x = self.layer2(x) 166 | x = self.layer3(x) 167 | x = self.layer4(x) 168 | 169 | x = self.avgpool(x) 170 | 171 | x = x.view(x.size(0), -1) 172 | x = self.fc(x) 173 | 174 | return x 175 | 176 | 177 | def get_fine_tuning_parameters(model, ft_begin_index): 178 | if ft_begin_index == 0: 179 | return model.parameters() 180 | 181 | ft_module_names = [] 182 | for i in range(ft_begin_index, 5): 183 | ft_module_names.append('layer{}'.format(i)) 184 | ft_module_names.append('fc') 185 | 186 | parameters = [] 187 | for k, v in model.named_parameters(): 188 | for ft_module in ft_module_names: 189 | if ft_module in k: 190 | parameters.append({'params': v}) 191 | break 192 | else: 193 | 
parameters.append({'params': v, 'lr': 0.0}) 194 | 195 | return parameters 196 | 197 | 198 | def resnet50(**kwargs): 199 | """Constructs a ResNet-50 model. 200 | """ 201 | model = ResNeXt(ResNeXtBottleneck, [3, 4, 6, 3], **kwargs) 202 | return model 203 | 204 | 205 | def resnet101(**kwargs): 206 | """Constructs a ResNet-101 model. 207 | """ 208 | model = ResNeXt(ResNeXtBottleneck, [3, 4, 23, 3], **kwargs) 209 | return model 210 | 211 | 212 | def resnet152(**kwargs): 213 | """Constructs a ResNet-101 model. 214 | """ 215 | model = ResNeXt(ResNeXtBottleneck, [3, 8, 36, 3], **kwargs) 216 | return model 217 | -------------------------------------------------------------------------------- /models/vgg.py: -------------------------------------------------------------------------------- 1 | """ VGG16 network Class 2 | Adapted from Gurkirt Singh's code: https://github.com/gurkirt/realtime-action-detection/blob/master/ssd.py 3 | """ 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.autograd import Variable 9 | import os 10 | # adv 11 | from models.grad_reversal import ReverseLayerF 12 | import pdb 13 | 14 | class MLP_Block(nn.Module): 15 | def __init__(self, inplanes, planes): 16 | super(MLP_Block, self).__init__() 17 | self.fc = nn.Linear(inplanes, planes) 18 | # self.bn = nn.BatchNorm1d(planes) 19 | self.relu = nn.ReLU(inplace=True) 20 | 21 | def forward(self, x): 22 | out = self.fc(x) 23 | # out = self.bn(out) 24 | out = self.relu(out) 25 | 26 | return out 27 | 28 | class VGG16(nn.Module): 29 | def __init__(self, 30 | base, 31 | num_classes, 32 | is_adv=False, 33 | is_human_mask_adv=False, 34 | alpha=0.0, 35 | alpha_hm=0.0, 36 | num_places_classes=365, 37 | num_place_hidden_layers=1, 38 | num_human_mask_adv_hidden_layers=1): 39 | super(VGG16, self).__init__() 40 | self.num_classes = num_classes 41 | self.size = 300 42 | self.is_adv = is_adv 43 | self.is_human_mask_adv = is_human_mask_adv 44 | self.alpha = alpha 45 | self.alpha_hm = alpha_hm 46 | self.num_places_classes = num_places_classes 47 | 48 | self.vgg = nn.ModuleList(base) 49 | self.avgpool = nn.AdaptiveAvgPool2d((7, 7)) 50 | self.mlp = nn.Sequential( 51 | nn.Linear(1024 * 7 * 7, 4096), 52 | nn.ReLU(True), 53 | nn.Dropout(), 54 | nn.Linear(4096, 4096), 55 | nn.ReLU(True), 56 | nn.Dropout(), 57 | ) 58 | self.fc = nn.Linear(4096, self.num_classes) 59 | 60 | # human mask adv 61 | if self.is_human_mask_adv: 62 | self.hm_mlp = nn.Sequential() 63 | self.hm_mlp = self._make_mlp_layer(MLP_Block, 4096, 4096, num_human_mask_adv_hidden_layers) 64 | self.hm_mlp.add_module('hm_last_fc', nn.Linear(4096, num_classes)) 65 | 66 | # adv 67 | if self.is_adv: 68 | self.place_mlp = nn.Sequential() 69 | self.place_mlp = self._make_mlp_layer(MLP_Block, 4096, 4096, num_place_hidden_layers) 70 | self.place_mlp.add_module('p_last_fc', nn.Linear(4096, self.num_places_classes)) 71 | 72 | def forward(self, x): 73 | for k in range(len(self.vgg)): 74 | x = self.vgg[k](x) 75 | 76 | x = self.avgpool(x) 77 | x = x.view(x.size(0), -1) 78 | x = self.mlp(x) 79 | 80 | # adv 81 | if self.is_human_mask_adv: 82 | rev_x_hm = ReverseLayerF.apply(x, self.alpha_hm) 83 | if self.is_adv: 84 | rev_x = ReverseLayerF.apply(x, self.alpha) 85 | dom_x = self.place_mlp(rev_x) 86 | 87 | x = self.fc(x) 88 | 89 | if self.is_human_mask_adv and self.is_adv: 90 | hm_x = self.hm_mlp(rev_x_hm) 91 | return x, dom_x, hm_x 92 | elif self.is_adv: 93 | return x, dom_x 94 | elif self.is_human_mask_adv: 95 | hm_x = self.hm_mlp(rev_x_hm) 96 | return x, hm_x 97 | 
else: 98 | return x 99 | 100 | def _make_mlp_layer(self, block, inplanes, planes, blocks): 101 | layers = [] 102 | layers.append(block(inplanes, planes)) 103 | for i in range(1, blocks): 104 | layers.append(block(inplanes, planes)) 105 | 106 | return nn.Sequential(*layers) 107 | 108 | def load_weights(self, base_file): 109 | other, ext = os.path.splitext(base_file) 110 | if ext == '.pkl' or '.pth': 111 | print('Loading weights into state dict...') 112 | self.load_state_dict(torch.load(base_file, map_location=lambda storage, loc: storage)) 113 | print('Finished!') 114 | else: 115 | print('Sorry only .pth and .pkl files supported.') 116 | 117 | 118 | # This function is derived from torchvision VGG make_layers() 119 | # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 120 | def vgg(cfg, i, batch_norm=False): 121 | layers = [] 122 | in_channels = i 123 | for v in cfg: 124 | if v == 'M': 125 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 126 | elif v == 'C': 127 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 128 | else: 129 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 130 | if batch_norm: 131 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 132 | else: 133 | layers += [conv2d, nn.ReLU(inplace=True)] 134 | in_channels = v 135 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 136 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 137 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 138 | layers += [pool5, conv6, 139 | nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] 140 | return layers 141 | 142 | base = { 143 | '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 144 | 512, 512, 512], 145 | '512': [], 146 | } 147 | 148 | def build_vgg(**kwargs): 149 | # def build_vgg(size=300, num_classes=24): 150 | # if size != 300: 151 | # print("Error: Sorry only SSD300 is supported currently!") 152 | # return 153 | model = VGG16(vgg(base['300'], 3), **kwargs) 154 | return model 155 | 156 | def get_fine_tuning_parameters(model, ft_begin_index): 157 | if ft_begin_index > 0: 158 | print('Finetuing only partial layers is not supported') 159 | return 160 | 161 | ft_module_names, new_module_names = [], [] 162 | ft_module_names.append('vgg') 163 | 164 | new_module_names.append('mlp') 165 | new_module_names.append('fc') 166 | 167 | pretrained_parameters, new_parameters = [], [] 168 | for k, v in model.named_parameters(): 169 | for ft_module in ft_module_names: 170 | if ft_module in k: 171 | print('finetune params:{}'.format(k)) 172 | pretrained_parameters.append(v) 173 | break 174 | else: 175 | for new_module in new_module_names: 176 | if new_module in k: 177 | print('new params:{}'.format(k)) 178 | new_parameters.append(v) 179 | break 180 | 181 | return [pretrained_parameters, new_parameters] 182 | 183 | def get_adv_fine_tuning_parameters(model, ft_begin_index, new_layer_lr, not_replace_last_fc=False, is_human_mask_adv=False, slower_place_mlp=False, slower_hm_mlp=False): 184 | if ft_begin_index > 0: 185 | print('Finetuing only partial layers is not supported') 186 | return 187 | 188 | ft_module_names, new_module_names = [], [] 189 | ft_module_names.append('vgg') 190 | 191 | new_module_names.append('mlp') 192 | if not slower_place_mlp: 193 | new_module_names.append('place_mlp') 194 | else: 195 | ft_module_names.append('place_mlp') 196 | if is_human_mask_adv: 197 | if not slower_hm_mlp: 198 | new_module_names.append('hm_mlp') 199 | else: 200 | ft_module_names.append('hm_mlp') 201 | if 
not not_replace_last_fc: 202 | new_module_names.append('fc') 203 | 204 | 205 | pretrained_parameters, new_parameters = [], [] 206 | for k, v in model.named_parameters(): 207 | for ft_module in ft_module_names: 208 | if ft_module in k: 209 | print('finetune params:{}'.format(k)) 210 | pretrained_parameters.append(v) 211 | break 212 | else: 213 | for new_module in new_module_names: 214 | if new_module in k: 215 | print('new params:{}'.format(k)) 216 | new_parameters.append(v) 217 | break 218 | 219 | return [pretrained_parameters, new_parameters] 220 | -------------------------------------------------------------------------------- /models/wide_resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = ['WideResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101'] 9 | 10 | 11 | def conv3x3x3(in_planes, out_planes, stride=1): 12 | # 3x3x3 convolution with padding 13 | return nn.Conv3d( 14 | in_planes, 15 | out_planes, 16 | kernel_size=3, 17 | stride=stride, 18 | padding=1, 19 | bias=False) 20 | 21 | 22 | def downsample_basic_block(x, planes, stride): 23 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 24 | zero_pads = torch.Tensor( 25 | out.size(0), planes - out.size(1), out.size(2), out.size(3), 26 | out.size(4)).zero_() 27 | if isinstance(out.data, torch.cuda.FloatTensor): 28 | zero_pads = zero_pads.cuda() 29 | 30 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 31 | 32 | return out 33 | 34 | 35 | class WideBottleneck(nn.Module): 36 | expansion = 2 37 | 38 | def __init__(self, inplanes, planes, stride=1, downsample=None): 39 | super(WideBottleneck, self).__init__() 40 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 41 | self.bn1 = nn.BatchNorm3d(planes) 42 | self.conv2 = nn.Conv3d( 43 | planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 44 | self.bn2 = nn.BatchNorm3d(planes) 45 | self.conv3 = nn.Conv3d( 46 | planes, planes * self.expansion, kernel_size=1, bias=False) 47 | self.bn3 = nn.BatchNorm3d(planes * self.expansion) 48 | self.relu = nn.ReLU(inplace=True) 49 | self.downsample = downsample 50 | self.stride = stride 51 | 52 | def forward(self, x): 53 | residual = x 54 | 55 | out = self.conv1(x) 56 | out = self.bn1(out) 57 | out = self.relu(out) 58 | 59 | out = self.conv2(out) 60 | out = self.bn2(out) 61 | out = self.relu(out) 62 | 63 | out = self.conv3(out) 64 | out = self.bn3(out) 65 | 66 | if self.downsample is not None: 67 | residual = self.downsample(x) 68 | 69 | out += residual 70 | out = self.relu(out) 71 | 72 | return out 73 | 74 | 75 | class WideResNet(nn.Module): 76 | 77 | def __init__(self, 78 | block, 79 | layers, 80 | sample_size, 81 | sample_duration, 82 | k=1, 83 | shortcut_type='B', 84 | num_classes=400): 85 | self.inplanes = 64 86 | super(WideResNet, self).__init__() 87 | self.conv1 = nn.Conv3d( 88 | 3, 89 | 64, 90 | kernel_size=7, 91 | stride=(1, 2, 2), 92 | padding=(3, 3, 3), 93 | bias=False) 94 | self.bn1 = nn.BatchNorm3d(64) 95 | self.relu = nn.ReLU(inplace=True) 96 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 97 | self.layer1 = self._make_layer(block, 64 * k, layers[0], shortcut_type) 98 | self.layer2 = self._make_layer( 99 | block, 128 * k, layers[1], shortcut_type, stride=2) 100 | self.layer3 = self._make_layer( 101 | block, 256 * k, layers[2], shortcut_type, 
stride=2) 102 | self.layer4 = self._make_layer( 103 | block, 512 * k, layers[3], shortcut_type, stride=2) 104 | last_duration = int(math.ceil(sample_duration / 16)) 105 | last_size = int(math.ceil(sample_size / 32)) 106 | self.avgpool = nn.AvgPool3d( 107 | (last_duration, last_size, last_size), stride=1) 108 | self.fc = nn.Linear(512 * k * block.expansion, num_classes) 109 | 110 | for m in self.modules(): 111 | if isinstance(m, nn.Conv3d): 112 | m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out') 113 | elif isinstance(m, nn.BatchNorm3d): 114 | m.weight.data.fill_(1) 115 | m.bias.data.zero_() 116 | 117 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 118 | downsample = None 119 | if stride != 1 or self.inplanes != planes * block.expansion: 120 | if shortcut_type == 'A': 121 | downsample = partial( 122 | downsample_basic_block, 123 | planes=planes * block.expansion, 124 | stride=stride) 125 | else: 126 | downsample = nn.Sequential( 127 | nn.Conv3d( 128 | self.inplanes, 129 | planes * block.expansion, 130 | kernel_size=1, 131 | stride=stride, 132 | bias=False), nn.BatchNorm3d(planes * block.expansion)) 133 | 134 | layers = [] 135 | layers.append(block(self.inplanes, planes, stride, downsample)) 136 | self.inplanes = planes * block.expansion 137 | for i in range(1, blocks): 138 | layers.append(block(self.inplanes, planes)) 139 | 140 | return nn.Sequential(*layers) 141 | 142 | def forward(self, x): 143 | x = self.conv1(x) 144 | x = self.bn1(x) 145 | x = self.relu(x) 146 | x = self.maxpool(x) 147 | 148 | x = self.layer1(x) 149 | x = self.layer2(x) 150 | x = self.layer3(x) 151 | x = self.layer4(x) 152 | 153 | x = self.avgpool(x) 154 | 155 | x = x.view(x.size(0), -1) 156 | x = self.fc(x) 157 | 158 | return x 159 | 160 | 161 | def get_fine_tuning_parameters(model, ft_begin_index): 162 | if ft_begin_index == 0: 163 | return model.parameters() 164 | 165 | ft_module_names = [] 166 | for i in range(ft_begin_index, 5): 167 | ft_module_names.append('layer{}'.format(i)) 168 | ft_module_names.append('fc') 169 | 170 | parameters = [] 171 | for k, v in model.named_parameters(): 172 | for ft_module in ft_module_names: 173 | if ft_module in k: 174 | parameters.append({'params': v}) 175 | break 176 | else: 177 | parameters.append({'params': v, 'lr': 0.0}) 178 | 179 | return parameters 180 | 181 | 182 | def resnet50(**kwargs): 183 | """Constructs a ResNet-50 model. 
184 | """ 185 | model = WideResNet(WideBottleneck, [3, 4, 6, 3], **kwargs) 186 | return model 187 | -------------------------------------------------------------------------------- /sdn_packages.txt: -------------------------------------------------------------------------------- 1 | # Name Version Build Channel 2 | _libgcc_mutex 0.1 main 3 | absl-py 0.7.1 4 | asn1crypto 0.24.0 py36_0 5 | astor 0.7.1 6 | attrs 19.1.0 py36_1 7 | backcall 0.1.0 py36_0 8 | blas 1.0 mkl 9 | bleach 1.5.0 10 | bleach 3.1.0 py36_0 11 | bzip2 1.0.6 h14c3975_5 12 | ca-certificates 2019.11.28 hecc5488_0 conda-forge 13 | cairo 1.14.12 h8948797_3 14 | certifi 2019.11.28 py36_0 conda-forge 15 | cffi 1.12.3 py36h2e261b9_0 16 | chardet 3.0.4 py36_1 17 | cryptography 2.6.1 py36h1ba5d50_0 18 | cuda90 1.0 h6433d27_0 pytorch 19 | cudatoolkit 9.0 h13b8566_0 20 | cudnn 7.6.0 cuda9.0_0 21 | cycler 0.10.0 py36_0 22 | cython 0.29.6 py36he6710b0_0 23 | dbus 1.13.6 h746ee38_0 24 | decorator 4.4.0 py36_1 25 | defusedxml 0.5.0 py36_1 26 | entrypoints 0.3 py36_0 27 | enum34 1.1.6 28 | expat 2.2.6 he6710b0_0 29 | ffmpeg 4.0 hcdf2ecd_0 30 | fontconfig 2.13.0 h9420a91_0 31 | freeglut 3.0.0 hf484d3e_5 32 | freetype 2.9.1 h8a8886c_1 33 | gast 0.2.2 34 | glib 2.56.2 hd408876_0 35 | gmp 6.1.2 h6c8ec71_1 36 | graphite2 1.3.13 h23475e2_0 37 | grpcio 1.20.0 38 | gst-plugins-base 1.14.0 hbbd80ab_1 39 | gstreamer 1.14.0 hb453b48_1 40 | h5py 2.9.0 41 | harfbuzz 1.8.8 hffaf4a1_0 42 | hdf5 1.10.2 hba1933b_1 43 | html5lib 0.9999999 44 | icu 58.2 h9c2bf20_1 45 | idna 2.8 py36_0 46 | intel-openmp 2019.3 199 47 | ipykernel 5.1.0 py36h39e3cac_0 48 | ipython 7.4.0 py36h39e3cac_0 49 | ipython_genutils 0.2.0 py36_0 50 | jasper 2.0.14 h07fcdf6_1 51 | jedi 0.13.3 py36_0 52 | jinja2 2.10.1 py36_0 53 | joblib 0.13.2 py36_0 54 | jpeg 9b h024ee3a_2 55 | jsonschema 3.0.1 py36_0 56 | jupyter_client 5.2.4 py36_0 57 | jupyter_core 4.4.0 py36_0 58 | jupyterlab 0.35.4 py36hf63ae98_0 59 | jupyterlab_server 0.2.0 py36_0 60 | Keras-Applications 1.0.7 61 | Keras-Preprocessing 1.0.9 62 | kiwisolver 1.0.1 py36hf484d3e_0 63 | libedit 3.1.20181209 hc058e9b_0 64 | libffi 3.2.1 hd88cf55_4 65 | libgcc-ng 9.1.0 hdf63c60_0 66 | libgfortran-ng 7.3.0 hdf63c60_0 67 | libglu 9.0.0 hf484d3e_1 68 | libopencv 3.4.2 hb342d67_1 69 | libopus 1.3 h7b6447c_0 70 | libpng 1.6.36 hbc83047_0 71 | libsodium 1.0.16 h1bed415_0 72 | libstdcxx-ng 8.2.0 hdf63c60_1 73 | libtiff 4.0.10 h2733197_2 74 | libuuid 1.0.3 h1bed415_2 75 | libvpx 1.7.0 h439df22_0 76 | libxcb 1.13 h1bed415_1 77 | libxml2 2.9.9 he19cac6_0 78 | Markdown 3.1 79 | markupsafe 1.1.1 py36h7b6447c_0 80 | matplotlib 3.0.3 py36h5429711_0 81 | mistune 0.8.4 py36h7b6447c_0 82 | mkl 2018.0.3 1 83 | mkl-service 1.1.2 py36h90e4bf4_5 84 | mkl_fft 1.0.6 py36h7dd41cf_0 85 | mkl_random 1.0.1 py36h4414c95_1 86 | mock 2.0.0 87 | nbconvert 5.4.1 py36_3 88 | nbformat 4.4.0 py36_0 89 | nccl 1.3.5 cuda9.0_0 90 | ncurses 6.1 he6710b0_1 91 | ninja 1.9.0 py36hfd86e86_0 92 | notebook 5.7.8 py36_0 93 | numpy 1.15.4 py36h1d66e8a_0 94 | numpy-base 1.15.4 py36h81de0dd_0 95 | olefile 0.46 py36_0 96 | opencv 3.4.2 py36h6fd60c2_1 97 | openssl 1.1.1d h7b6447c_3 98 | packaging 19.0 py36_0 99 | pandas 0.24.2 py36he6710b0_0 100 | pandoc 2.2.3.2 0 101 | pandocfilters 1.4.2 py36_1 102 | parso 0.4.0 py_0 103 | pbr 5.1.3 104 | pcre 8.43 he6710b0_0 105 | pexpect 4.7.0 py36_0 106 | pickleshare 0.7.5 py36_0 107 | pillow 6.1.0 py36h34e0f95_0 108 | pip 19.1.1 py36_0 109 | pixman 0.38.0 h7b6447c_0 110 | prometheus_client 0.6.0 py36_0 111 | prompt_toolkit 2.0.9 py36_0 112 | 
protobuf 3.7.1 113 | ptyprocess 0.6.0 py36_0 114 | py-opencv 3.4.2 py36hb342d67_1 115 | pycocotools 2.0.0 116 | pycparser 2.19 py36_0 117 | pygments 2.3.1 py36_0 118 | pyopenssl 19.0.0 py36_0 119 | pyparsing 2.4.0 py_0 120 | pyqt 5.9.2 py36h05f1152_2 121 | pyrsistent 0.14.11 py36h7b6447c_0 122 | pysocks 1.7.0 py36_0 123 | python 3.6.8 h0371630_0 124 | python-dateutil 2.8.0 py36_0 125 | pytorch 0.4.1 py36ha74772b_0 126 | pytz 2019.1 py_0 127 | pyyaml 3.13 py36h14c3975_0 128 | pyzmq 18.0.0 py36he6710b0_0 129 | qt 5.9.7 h5867ecd_1 130 | readline 7.0 h7b6447c_5 131 | requests 2.21.0 py36_0 132 | scikit-learn 0.20.1 py36h4989274_0 133 | scikit-video 1.1.11 134 | scipy 1.1.0 py36hfa4b5c9_1 135 | send2trash 1.5.0 py36_0 136 | setuptools 41.0.1 py36_0 137 | sip 4.19.8 py36hf484d3e_0 138 | six 1.12.0 py36_0 139 | sqlite 3.29.0 h7b6447c_0 140 | termcolor 1.1.0 141 | terminado 0.8.2 py36_0 142 | testpath 0.4.2 py36_0 143 | tk 8.6.8 hbc83047_0 144 | torchvision 0.2.1 py36_0 145 | tornado 6.0.2 py36h7b6447c_0 146 | traitlets 4.3.2 py36_0 147 | urllib3 1.24.2 py36_0 148 | wcwidth 0.1.7 py36_0 149 | webencodings 0.5.1 py36_1 150 | Werkzeug 0.15.2 151 | wheel 0.33.4 py36_0 152 | x264 1!152.20180806 h14c3975_0 conda-forge 153 | xz 5.2.4 h14c3975_4 154 | yaml 0.1.7 had09818_2 155 | zeromq 4.3.1 he6710b0_3 156 | zlib 1.2.11 h7b6447c_3 157 | zstd 1.3.7 h0b5b093_0 158 | -------------------------------------------------------------------------------- /utils/eval_diving48.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import sys, os 6 | sys.path.append('/home/jinchoi/src/3D-ResNets-PyTorch') 7 | from opts import parse_opts 8 | import pdb 9 | 10 | class Diving48classification(object): 11 | def __init__(self, ground_truth_filename=None, class_def_filename=None, prediction_filename=None, 12 | subset='validation', verbose=False, top_k=1): 13 | if not ground_truth_filename: 14 | raise IOError('Please input a valid ground truth file.') 15 | if not prediction_filename: 16 | raise IOError('Please input a valid prediction file.') 17 | self.subset = subset 18 | self.verbose = verbose 19 | self.top_k = top_k 20 | self.ap = None 21 | self.hit_at_k = None 22 | # Import ground truth and predictions. 23 | self.ground_truth = self._import_ground_truth( 24 | ground_truth_filename) 25 | self.activity_index = self._get_class_labels(class_def_filename) 26 | self.prediction = self._import_prediction(prediction_filename) 27 | 28 | if self.verbose: 29 | print ('[INIT] Loaded annotations from {} subset.'.format(subset)) 30 | nr_gt = len(self.ground_truth) 31 | print ('\tNumber of ground truth instances: {}'.format(nr_gt)) 32 | nr_pred = len(self.prediction) 33 | print ('\tNumber of predictions: {}'.format(nr_pred)) 34 | 35 | def _get_class_labels(self, data_file_path): 36 | with open(data_file_path, 'r') as data_file: 37 | data = json.load(data_file) 38 | data = ['_'.join(row) for row in data] 39 | class_labels_map = {} 40 | index = 0 41 | for class_label in data: 42 | class_labels_map[class_label] = index 43 | index += 1 44 | return class_labels_map 45 | 46 | def _import_ground_truth(self, ground_truth_filename): 47 | """Reads ground truth file, checks if it is well formatted, and returns 48 | the ground truth instances and the activity classes. 49 | 50 | Parameters 51 | ---------- 52 | ground_truth_filename : str 53 | Full path to the ground truth json file. 
54 | 55 | Outputs 56 | ------- 57 | ground_truth : df 58 | Data frame containing the ground truth instances. 59 | activity_index : dict 60 | Dictionary containing class index. 61 | """ 62 | with open(ground_truth_filename, 'r') as fobj: 63 | data = json.load(fobj) 64 | 65 | # pdb.set_trace() 66 | # Checking format 67 | # if not all([field in data.keys() for field in self.gt_fields]): 68 | # raise IOError('Please input a valid ground truth file.') 69 | 70 | # Initialize data frame 71 | video_lst, label_lst = [], [] 72 | for cur_data in data: 73 | video_lst.append(cur_data['vid_name']) 74 | label_lst.append(cur_data['label']) 75 | ground_truth = pd.DataFrame({'video-id': video_lst, 76 | 'label': label_lst}) 77 | ground_truth = ground_truth.drop_duplicates().reset_index(drop=True) 78 | 79 | return ground_truth 80 | 81 | def _import_prediction(self, prediction_filename): 82 | """Reads prediction file, checks if it is well formatted, and returns 83 | the prediction instances. 84 | 85 | Parameters 86 | ---------- 87 | prediction_filename : str 88 | Full path to the prediction json file. 89 | 90 | Outputs 91 | ------- 92 | prediction : df 93 | Data frame containing the prediction instances. 94 | """ 95 | with open(prediction_filename, 'r') as fobj: 96 | data = json.load(fobj) 97 | # Checking format... 98 | # if not all([field in data.keys() for field in self.pred_fields]): 99 | # raise IOError('Please input a valid prediction file.') 100 | 101 | # Initialize data frame 102 | video_lst, label_lst, score_lst = [], [], [] 103 | for videoid, v in data['results'].items(): 104 | for result in v: 105 | label = self.activity_index[result['label']] 106 | video_lst.append(videoid) 107 | label_lst.append(label) 108 | score_lst.append(result['score']) 109 | prediction = pd.DataFrame({'video-id': video_lst, 110 | 'label': label_lst, 111 | 'score': score_lst}) 112 | return prediction 113 | 114 | def evaluate(self): 115 | """Evaluates a prediction file. For the detection task we measure the 116 | interpolated mean average precision to measure the performance of a 117 | method. 118 | """ 119 | hit_at_k = compute_video_hit_at_k(self.ground_truth, 120 | self.prediction, top_k=self.top_k) 121 | if self.verbose: 122 | print ('[RESULTS] Performance on Diving48 video ' 123 | 'classification task.') 124 | print ('\tError@{}: {}'.format(self.top_k, 1.0 - hit_at_k)) 125 | #print '\tAvg Hit@{}: {}'.format(self.top_k, avg_hit_at_k) 126 | self.hit_at_k = hit_at_k 127 | 128 | ################################################################################ 129 | # Metrics 130 | ################################################################################ 131 | def compute_video_hit_at_k(ground_truth, prediction, top_k=3): 132 | """Compute accuracy at k prediction between ground truth and 133 | predictions data frames. This code is greatly inspired by evaluation 134 | performed in Karpathy et al. CVPR14. 135 | 136 | Parameters 137 | ---------- 138 | ground_truth : df 139 | Data frame containing the ground truth instances. 140 | Required fields: ['video-id', 'label'] 141 | prediction : df 142 | Data frame containing the prediction instances. 143 | Required fields: ['video-id, 'label', 'score'] 144 | 145 | Outputs 146 | ------- 147 | acc : float 148 | Top k accuracy score. 
149 | """ 150 | video_ids = np.unique(ground_truth['video-id'].values) 151 | avg_hits_per_vid = np.zeros(video_ids.size) 152 | for i, vid in enumerate(video_ids): 153 | pred_idx = prediction['video-id'] == vid 154 | if not pred_idx.any(): 155 | continue 156 | this_pred = prediction.loc[pred_idx].reset_index(drop=True) 157 | # Get top K predictions sorted by decreasing score. 158 | sort_idx = this_pred['score'].values.argsort()[::-1][:top_k] 159 | this_pred = this_pred.loc[sort_idx].reset_index(drop=True) 160 | # Get labels and compare against ground truth. 161 | pred_label = this_pred['label'].tolist() 162 | gt_idx = ground_truth['video-id'] == vid 163 | gt_label = ground_truth.loc[gt_idx]['label'].tolist() 164 | avg_hits_per_vid[i] = np.mean([1 if this_label in pred_label else 0 165 | for this_label in gt_label]) 166 | return float(avg_hits_per_vid.mean()) 167 | 168 | 169 | if __name__ == '__main__': 170 | opt = parse_opts() 171 | opt.class_def_path = os.path.join(opt.root_path, opt.annotation_path, 'Diving48_vocab.json') 172 | 173 | opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path, 'Diving48_test.json') 174 | 175 | # pdb.set_trace() 176 | diving48_classification = Diving48classification(opt.annotation_path, opt.class_def_path, opt.prediction_path, subset='val', verbose=True, top_k=1) 177 | diving48_classification.evaluate() 178 | print(diving48_classification.hit_at_k) 179 | -------------------------------------------------------------------------------- /utils/eval_hmdb51.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pdb 6 | import sys, os 7 | sys.path.append('/home/jinchoi/src/3D-ResNets-PyTorch') 8 | from opts import parse_opts 9 | 10 | 11 | class HMDBclassification(object): 12 | 13 | def __init__(self, ground_truth_filename=None, prediction_filename=None, 14 | subset='validation', verbose=False, top_k=1): 15 | if not ground_truth_filename: 16 | raise IOError('Please input a valid ground truth file.') 17 | if not prediction_filename: 18 | raise IOError('Please input a valid prediction file.') 19 | self.subset = subset 20 | self.verbose = verbose 21 | self.top_k = top_k 22 | self.ap = None 23 | self.hit_at_k = None 24 | # Import ground truth and predictions. 25 | self.ground_truth, self.activity_index = self._import_ground_truth( 26 | ground_truth_filename) 27 | self.prediction = self._import_prediction(prediction_filename) 28 | 29 | if self.verbose: 30 | print('[INIT] Loaded annotations from {} subset.'.format(subset)) 31 | nr_gt = len(self.ground_truth) 32 | print ('\tNumber of ground truth instances: {}'.format(nr_gt)) 33 | nr_pred = len(self.prediction) 34 | print ('\tNumber of predictions: {}'.format(nr_pred)) 35 | 36 | def _import_ground_truth(self, ground_truth_filename): 37 | """Reads ground truth file, checks if it is well formatted, and returns 38 | the ground truth instances and the activity classes. 39 | 40 | Parameters 41 | ---------- 42 | ground_truth_filename : str 43 | Full path to the ground truth json file. 44 | 45 | Outputs 46 | ------- 47 | ground_truth : df 48 | Data frame containing the ground truth instances. 49 | activity_index : dict 50 | Dictionary containing class index. 
51 | """ 52 | with open(ground_truth_filename, 'r') as fobj: 53 | data = json.load(fobj) 54 | # Checking format 55 | # if not all([field in data.keys() for field in self.gt_fields]): 56 | # raise IOError('Please input a valid ground truth file.') 57 | 58 | # Initialize data frame 59 | activity_index, cidx = {}, 0 60 | video_lst, label_lst = [], [] 61 | for videoid, v in data['database'].items(): 62 | if self.subset != v['subset']: 63 | continue 64 | this_label = v['annotations']['label'] 65 | if this_label not in activity_index: 66 | activity_index[this_label] = cidx 67 | cidx += 1 68 | video_lst.append(videoid) 69 | label_lst.append(activity_index[this_label]) 70 | ground_truth = pd.DataFrame({'video-id': video_lst, 71 | 'label': label_lst}) 72 | ground_truth = ground_truth.drop_duplicates().reset_index(drop=True) 73 | return ground_truth, activity_index 74 | 75 | def _import_prediction(self, prediction_filename): 76 | """Reads prediction file, checks if it is well formatted, and returns 77 | the prediction instances. 78 | 79 | Parameters 80 | ---------- 81 | prediction_filename : str 82 | Full path to the prediction json file. 83 | 84 | Outputs 85 | ------- 86 | prediction : df 87 | Data frame containing the prediction instances. 88 | """ 89 | with open(prediction_filename, 'r') as fobj: 90 | data = json.load(fobj) 91 | # Checking format... 92 | # if not all([field in data.keys() for field in self.pred_fields]): 93 | # raise IOError('Please input a valid prediction file.') 94 | 95 | # Initialize data frame 96 | video_lst, label_lst, score_lst = [], [], [] 97 | for videoid, v in data['results'].items(): 98 | for result in v: 99 | label = self.activity_index[result['label']] 100 | video_lst.append(videoid) 101 | label_lst.append(label) 102 | score_lst.append(result['score']) 103 | prediction = pd.DataFrame({'video-id': video_lst, 104 | 'label': label_lst, 105 | 'score': score_lst}) 106 | return prediction 107 | 108 | def evaluate(self): 109 | """Evaluates a prediction file. For the detection task we measure the 110 | interpolated mean average precision to measure the performance of a 111 | method. 112 | """ 113 | hit_at_k = compute_video_hit_at_k(self.ground_truth, 114 | self.prediction, top_k=self.top_k) 115 | if self.verbose: 116 | print ('[RESULTS] Performance on HMDB-51 video ' 117 | 'classification task.') 118 | print ('\tError@{}: {}'.format(self.top_k, 1.0 - hit_at_k)) 119 | #print '\tAvg Hit@{}: {}'.format(self.top_k, avg_hit_at_k) 120 | self.hit_at_k = hit_at_k 121 | 122 | ################################################################################ 123 | # Metrics 124 | ################################################################################ 125 | def compute_video_hit_at_k(ground_truth, prediction, top_k=3): 126 | """Compute accuracy at k prediction between ground truth and 127 | predictions data frames. This code is greatly inspired by evaluation 128 | performed in Karpathy et al. CVPR14. 129 | 130 | Parameters 131 | ---------- 132 | ground_truth : df 133 | Data frame containing the ground truth instances. 134 | Required fields: ['video-id', 'label'] 135 | prediction : df 136 | Data frame containing the prediction instances. 137 | Required fields: ['video-id, 'label', 'score'] 138 | 139 | Outputs 140 | ------- 141 | acc : float 142 | Top k accuracy score. 
143 | """ 144 | video_ids = np.unique(ground_truth['video-id'].values) 145 | avg_hits_per_vid = np.zeros(video_ids.size) 146 | for i, vid in enumerate(video_ids): 147 | pred_idx = prediction['video-id'] == vid 148 | if not pred_idx.any(): 149 | continue 150 | this_pred = prediction.loc[pred_idx].reset_index(drop=True) 151 | # Get top K predictions sorted by decreasing score. 152 | sort_idx = this_pred['score'].values.argsort()[::-1][:top_k] 153 | this_pred = this_pred.loc[sort_idx].reset_index(drop=True) 154 | # Get labels and compare against ground truth. 155 | pred_label = this_pred['label'].tolist() 156 | gt_idx = ground_truth['video-id'] == vid 157 | gt_label = ground_truth.loc[gt_idx]['label'].tolist() 158 | avg_hits_per_vid[i] = np.mean([1 if this_label in pred_label else 0 159 | for this_label in gt_label]) 160 | return float(avg_hits_per_vid.mean()) 161 | 162 | if __name__ == '__main__': 163 | opt = parse_opts() 164 | opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path) 165 | 166 | hmdb_classification = HMDBclassification(opt.annotation_path, opt.prediction_path, subset='validation', verbose=True, top_k=1) 167 | hmdb_classification.evaluate() 168 | print(hmdb_classification.hit_at_k) 169 | -------------------------------------------------------------------------------- /utils/eval_kinetics.py: -------------------------------------------------------------------------------- 1 | import json 2 | import urllib2 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | API = 'http://ec2-52-11-11-89.us-west-2.compute.amazonaws.com/challenge17/api.py' 8 | 9 | def get_blocked_videos(api=API): 10 | api_url = '{}?action=get_blocked'.format(api) 11 | req = urllib2.Request(api_url) 12 | response = urllib2.urlopen(req) 13 | return json.loads(response.read()) 14 | 15 | class KINETICSclassification(object): 16 | GROUND_TRUTH_FIELDS = ['database', 'labels'] 17 | PREDICTION_FIELDS = ['results', 'version', 'external_data'] 18 | 19 | def __init__(self, ground_truth_filename=None, prediction_filename=None, 20 | ground_truth_fields=GROUND_TRUTH_FIELDS, 21 | prediction_fields=PREDICTION_FIELDS, 22 | subset='validation', verbose=False, top_k=1, 23 | check_status=True): 24 | if not ground_truth_filename: 25 | raise IOError('Please input a valid ground truth file.') 26 | if not prediction_filename: 27 | raise IOError('Please input a valid prediction file.') 28 | self.subset = subset 29 | self.verbose = verbose 30 | self.gt_fields = ground_truth_fields 31 | self.pred_fields = prediction_fields 32 | self.top_k = top_k 33 | self.ap = None 34 | self.hit_at_k = None 35 | self.check_status = check_status 36 | # Retrieve blocked videos from server. 37 | if self.check_status: 38 | self.blocked_videos = get_blocked_videos() 39 | else: 40 | self.blocked_videos = list() 41 | # Import ground truth and predictions. 42 | self.ground_truth, self.activity_index = self._import_ground_truth( 43 | ground_truth_filename) 44 | self.prediction = self._import_prediction(prediction_filename) 45 | 46 | if self.verbose: 47 | print '[INIT] Loaded annotations from {} subset.'.format(subset) 48 | nr_gt = len(self.ground_truth) 49 | print '\tNumber of ground truth instances: {}'.format(nr_gt) 50 | nr_pred = len(self.prediction) 51 | print '\tNumber of predictions: {}'.format(nr_pred) 52 | 53 | def _import_ground_truth(self, ground_truth_filename): 54 | """Reads ground truth file, checks if it is well formatted, and returns 55 | the ground truth instances and the activity classes. 
56 | 57 | Parameters 58 | ---------- 59 | ground_truth_filename : str 60 | Full path to the ground truth json file. 61 | 62 | Outputs 63 | ------- 64 | ground_truth : df 65 | Data frame containing the ground truth instances. 66 | activity_index : dict 67 | Dictionary containing class index. 68 | """ 69 | with open(ground_truth_filename, 'r') as fobj: 70 | data = json.load(fobj) 71 | # Checking format 72 | # if not all([field in data.keys() for field in self.gt_fields]): 73 | # raise IOError('Please input a valid ground truth file.') 74 | 75 | # Initialize data frame 76 | activity_index, cidx = {}, 0 77 | video_lst, label_lst = [], [] 78 | for videoid, v in data['database'].iteritems(): 79 | if self.subset != v['subset']: 80 | continue 81 | if videoid in self.blocked_videos: 82 | continue 83 | this_label = v['annotations']['label'] 84 | if this_label not in activity_index: 85 | activity_index[this_label] = cidx 86 | cidx += 1 87 | video_lst.append(videoid[:-14]) 88 | label_lst.append(activity_index[this_label]) 89 | ground_truth = pd.DataFrame({'video-id': video_lst, 90 | 'label': label_lst}) 91 | ground_truth = ground_truth.drop_duplicates().reset_index(drop=True) 92 | return ground_truth, activity_index 93 | 94 | def _import_prediction(self, prediction_filename): 95 | """Reads prediction file, checks if it is well formatted, and returns 96 | the prediction instances. 97 | 98 | Parameters 99 | ---------- 100 | prediction_filename : str 101 | Full path to the prediction json file. 102 | 103 | Outputs 104 | ------- 105 | prediction : df 106 | Data frame containing the prediction instances. 107 | """ 108 | with open(prediction_filename, 'r') as fobj: 109 | data = json.load(fobj) 110 | # Checking format... 111 | # if not all([field in data.keys() for field in self.pred_fields]): 112 | # raise IOError('Please input a valid prediction file.') 113 | 114 | # Initialize data frame 115 | video_lst, label_lst, score_lst = [], [], [] 116 | for videoid, v in data['results'].iteritems(): 117 | if videoid in self.blocked_videos: 118 | continue 119 | for result in v: 120 | label = self.activity_index[result['label']] 121 | video_lst.append(videoid) 122 | label_lst.append(label) 123 | score_lst.append(result['score']) 124 | prediction = pd.DataFrame({'video-id': video_lst, 125 | 'label': label_lst, 126 | 'score': score_lst}) 127 | return prediction 128 | 129 | def evaluate(self): 130 | """Evaluates a prediction file. For the detection task we measure the 131 | interpolated mean average precision to measure the performance of a 132 | method. 
133 | """ 134 | hit_at_k = compute_video_hit_at_k(self.ground_truth, 135 | self.prediction, top_k=self.top_k) 136 | # avg_hit_at_k = compute_video_hit_at_k( 137 | # self.ground_truth, self.prediction, top_k=self.top_k, avg=True) 138 | if self.verbose: 139 | print ('[RESULTS] Performance on Kinetics video ' 140 | 'classification task.') 141 | # print '\tMean Average Precision: {}'.format(ap.mean()) 142 | print '\tError@{}: {}'.format(self.top_k, 1.0 - hit_at_k) 143 | #print '\tAvg Hit@{}: {}'.format(self.top_k, avg_hit_at_k) 144 | # self.ap = ap 145 | self.hit_at_k = hit_at_k 146 | # self.avg_hit_at_k = avg_hit_at_k 147 | 148 | ################################################################################ 149 | # Metrics 150 | ################################################################################ 151 | 152 | def compute_video_hit_at_k(ground_truth, prediction, top_k=3, avg=False): 153 | """Compute accuracy at k prediction between ground truth and 154 | predictions data frames. This code is greatly inspired by evaluation 155 | performed in Karpathy et al. CVPR14. 156 | 157 | Parameters 158 | ---------- 159 | ground_truth : df 160 | Data frame containing the ground truth instances. 161 | Required fields: ['video-id', 'label'] 162 | prediction : df 163 | Data frame containing the prediction instances. 164 | Required fields: ['video-id, 'label', 'score'] 165 | 166 | Outputs 167 | ------- 168 | acc : float 169 | Top k accuracy score. 170 | """ 171 | video_ids = np.unique(ground_truth['video-id'].values) 172 | avg_hits_per_vid = np.zeros(video_ids.size) 173 | for i, vid in enumerate(video_ids): 174 | pred_idx = prediction['video-id'] == vid 175 | if not pred_idx.any(): 176 | continue 177 | this_pred = prediction.loc[pred_idx].reset_index(drop=True) 178 | # Get top K predictions sorted by decreasing score. 179 | sort_idx = this_pred['score'].values.argsort()[::-1][:top_k] 180 | this_pred = this_pred.loc[sort_idx].reset_index(drop=True) 181 | # Get labels and compare against ground truth. 182 | pred_label = this_pred['label'].tolist() 183 | gt_idx = ground_truth['video-id'] == vid 184 | gt_label = ground_truth.loc[gt_idx]['label'].tolist() 185 | avg_hits_per_vid[i] = np.mean([1 if this_label in pred_label else 0 186 | for this_label in gt_label]) 187 | if not avg: 188 | avg_hits_per_vid[i] = np.ceil(avg_hits_per_vid[i]) 189 | return float(avg_hits_per_vid.mean()) 190 | -------------------------------------------------------------------------------- /utils/eval_ucf101.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import sys, os 6 | sys.path.append('/home/jinchoi/src/3D-ResNets-PyTorch') 7 | from opts import parse_opts 8 | 9 | class UCFclassification(object): 10 | 11 | def __init__(self, ground_truth_filename=None, prediction_filename=None, 12 | subset='validation', verbose=False, top_k=1): 13 | if not ground_truth_filename: 14 | raise IOError('Please input a valid ground truth file.') 15 | if not prediction_filename: 16 | raise IOError('Please input a valid prediction file.') 17 | self.subset = subset 18 | self.verbose = verbose 19 | self.top_k = top_k 20 | self.ap = None 21 | self.hit_at_k = None 22 | # Import ground truth and predictions. 
23 | self.ground_truth, self.activity_index = self._import_ground_truth( 24 | ground_truth_filename) 25 | self.prediction = self._import_prediction(prediction_filename) 26 | 27 | if self.verbose: 28 | print ('[INIT] Loaded annotations from {} subset.'.format(subset)) 29 | nr_gt = len(self.ground_truth) 30 | print ('\tNumber of ground truth instances: {}'.format(nr_gt)) 31 | nr_pred = len(self.prediction) 32 | print ('\tNumber of predictions: {}'.format(nr_pred)) 33 | 34 | def _import_ground_truth(self, ground_truth_filename): 35 | """Reads ground truth file, checks if it is well formatted, and returns 36 | the ground truth instances and the activity classes. 37 | 38 | Parameters 39 | ---------- 40 | ground_truth_filename : str 41 | Full path to the ground truth json file. 42 | 43 | Outputs 44 | ------- 45 | ground_truth : df 46 | Data frame containing the ground truth instances. 47 | activity_index : dict 48 | Dictionary containing class index. 49 | """ 50 | with open(ground_truth_filename, 'r') as fobj: 51 | data = json.load(fobj) 52 | # Checking format 53 | # if not all([field in data.keys() for field in self.gt_fields]): 54 | # raise IOError('Please input a valid ground truth file.') 55 | 56 | # Initialize data frame 57 | activity_index, cidx = {}, 0 58 | video_lst, label_lst = [], [] 59 | for videoid, v in data['database'].items(): 60 | if self.subset != v['subset']: 61 | continue 62 | this_label = v['annotations']['label'] 63 | if this_label not in activity_index: 64 | activity_index[this_label] = cidx 65 | cidx += 1 66 | video_lst.append(videoid) 67 | label_lst.append(activity_index[this_label]) 68 | ground_truth = pd.DataFrame({'video-id': video_lst, 69 | 'label': label_lst}) 70 | ground_truth = ground_truth.drop_duplicates().reset_index(drop=True) 71 | return ground_truth, activity_index 72 | 73 | def _import_prediction(self, prediction_filename): 74 | """Reads prediction file, checks if it is well formatted, and returns 75 | the prediction instances. 76 | 77 | Parameters 78 | ---------- 79 | prediction_filename : str 80 | Full path to the prediction json file. 81 | 82 | Outputs 83 | ------- 84 | prediction : df 85 | Data frame containing the prediction instances. 86 | """ 87 | with open(prediction_filename, 'r') as fobj: 88 | data = json.load(fobj) 89 | # Checking format... 90 | # if not all([field in data.keys() for field in self.pred_fields]): 91 | # raise IOError('Please input a valid prediction file.') 92 | 93 | # Initialize data frame 94 | video_lst, label_lst, score_lst = [], [], [] 95 | for videoid, v in data['results'].items(): 96 | for result in v: 97 | label = self.activity_index[result['label']] 98 | video_lst.append(videoid) 99 | label_lst.append(label) 100 | score_lst.append(result['score']) 101 | prediction = pd.DataFrame({'video-id': video_lst, 102 | 'label': label_lst, 103 | 'score': score_lst}) 104 | return prediction 105 | 106 | def evaluate(self): 107 | """Evaluates a prediction file. For the detection task we measure the 108 | interpolated mean average precision to measure the performance of a 109 | method. 
110 | """ 111 | hit_at_k = compute_video_hit_at_k(self.ground_truth, 112 | self.prediction, top_k=self.top_k) 113 | if self.verbose: 114 | print ('[RESULTS] Performance on UCF101 video ' 115 | 'classification task.') 116 | print ('\tError@{}: {}'.format(self.top_k, 1.0 - hit_at_k)) 117 | #print '\tAvg Hit@{}: {}'.format(self.top_k, avg_hit_at_k) 118 | self.hit_at_k = hit_at_k 119 | 120 | ################################################################################ 121 | # Metrics 122 | ################################################################################ 123 | def compute_video_hit_at_k(ground_truth, prediction, top_k=3): 124 | """Compute accuracy at k prediction between ground truth and 125 | predictions data frames. This code is greatly inspired by evaluation 126 | performed in Karpathy et al. CVPR14. 127 | 128 | Parameters 129 | ---------- 130 | ground_truth : df 131 | Data frame containing the ground truth instances. 132 | Required fields: ['video-id', 'label'] 133 | prediction : df 134 | Data frame containing the prediction instances. 135 | Required fields: ['video-id, 'label', 'score'] 136 | 137 | Outputs 138 | ------- 139 | acc : float 140 | Top k accuracy score. 141 | """ 142 | video_ids = np.unique(ground_truth['video-id'].values) 143 | avg_hits_per_vid = np.zeros(video_ids.size) 144 | for i, vid in enumerate(video_ids): 145 | pred_idx = prediction['video-id'] == vid 146 | if not pred_idx.any(): 147 | continue 148 | this_pred = prediction.loc[pred_idx].reset_index(drop=True) 149 | # Get top K predictions sorted by decreasing score. 150 | sort_idx = this_pred['score'].values.argsort()[::-1][:top_k] 151 | this_pred = this_pred.loc[sort_idx].reset_index(drop=True) 152 | # Get labels and compare against ground truth. 153 | pred_label = this_pred['label'].tolist() 154 | gt_idx = ground_truth['video-id'] == vid 155 | gt_label = ground_truth.loc[gt_idx]['label'].tolist() 156 | avg_hits_per_vid[i] = np.mean([1 if this_label in pred_label else 0 157 | for this_label in gt_label]) 158 | return float(avg_hits_per_vid.mean()) 159 | 160 | 161 | if __name__ == '__main__': 162 | opt = parse_opts() 163 | opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path) 164 | 165 | ucf_classification = UCFclassification(opt.annotation_path, opt.prediction_path,subset='validation', verbose=True, top_k=1) 166 | ucf_classification.evaluate() 167 | print(ucf_classification.hit_at_k) 168 | -------------------------------------------------------------------------------- /utils/fps.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | 7 | if __name__=="__main__": 8 | dir_path = sys.argv[1] 9 | dst_dir_path = sys.argv[2] 10 | 11 | for file_name in os.listdir(dir_path): 12 | if '.mp4' not in file_name: 13 | continue 14 | name, ext = os.path.splitext(file_name) 15 | dst_directory_path = os.path.join(dst_dir_path, name) 16 | 17 | video_file_path = os.path.join(dir_path, file_name) 18 | p = subprocess.Popen('ffprobe {}'.format(video_file_path), 19 | shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 20 | _, res = p.communicate() 21 | res = res.decode('utf-8') 22 | 23 | duration_index = res.find('Duration:') 24 | duration_str = res[(duration_index + 10):(duration_index + 21)] 25 | hour = float(duration_str[0:2]) 26 | minute = float(duration_str[3:5]) 27 | sec = float(duration_str[6:10]) 28 | total_sec = hour * 3600 + minute * 60 + sec 29 | 30 
| n_frames = len(os.listdir(dst_directory_path)) 31 | if os.path.exists(os.path.join(dst_directory_path, 'fps')): 32 | n_frames -= 1 33 | 34 | fps = round(n_frames / total_sec, 2) 35 | 36 | print(video_file_path, os.path.exists(video_file_path), fps) 37 | with open(os.path.join(dst_directory_path, 'fps'), 'w') as fps_file: 38 | fps_file.write('{}\n'.format(fps)) 39 | -------------------------------------------------------------------------------- /utils/hmdb51_json.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import json 5 | import pandas as pd 6 | 7 | def convert_csv_to_dict(csv_dir_path, split_index): 8 | database = {} 9 | for filename in os.listdir(csv_dir_path): 10 | if 'split{}'.format(split_index) not in filename: 11 | continue 12 | 13 | data = pd.read_csv(os.path.join(csv_dir_path, filename), 14 | delimiter=' ', header=None) 15 | keys = [] 16 | subsets = [] 17 | for i in range(data.shape[0]): 18 | row = data.ix[i, :] 19 | if row[1] == 0: 20 | continue 21 | elif row[1] == 1: 22 | subset = 'training' 23 | elif row[1] == 2: 24 | subset = 'validation' 25 | 26 | keys.append(row[0].split('.')[0]) 27 | subsets.append(subset) 28 | 29 | for i in range(len(keys)): 30 | key = keys[i] 31 | database[key] = {} 32 | database[key]['subset'] = subsets[i] 33 | label = '_'.join(filename.split('_')[:-2]) 34 | database[key]['annotations'] = {'label': label} 35 | 36 | return database 37 | 38 | def get_labels(csv_dir_path): 39 | labels = [] 40 | for name in os.listdir(csv_dir_path): 41 | labels.append('_'.join(name.split('_')[:-2])) 42 | return sorted(list(set(labels))) 43 | 44 | def convert_hmdb51_csv_to_activitynet_json(csv_dir_path, split_index, dst_json_path): 45 | labels = get_labels(csv_dir_path) 46 | database = convert_csv_to_dict(csv_dir_path, split_index) 47 | 48 | dst_data = {} 49 | dst_data['labels'] = labels 50 | dst_data['database'] = {} 51 | dst_data['database'].update(database) 52 | 53 | with open(dst_json_path, 'w') as dst_file: 54 | json.dump(dst_data, dst_file) 55 | 56 | if __name__ == '__main__': 57 | csv_dir_path = sys.argv[1] 58 | 59 | for split_index in range(1, 4): 60 | dst_json_path = os.path.join(csv_dir_path, 'hmdb51_{}.json'.format(split_index)) 61 | convert_hmdb51_csv_to_activitynet_json(csv_dir_path, split_index, dst_json_path) -------------------------------------------------------------------------------- /utils/kinetics_json.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import json 5 | import pandas as pd 6 | 7 | def convert_csv_to_dict(csv_path, subset): 8 | data = pd.read_csv(csv_path) 9 | 10 | keys = [] 11 | key_labels = [] 12 | for i in range(data.shape[0]): 13 | row = data.ix[i, :] 14 | basename = '%s_%s_%s' % (row['youtube_id'], 15 | '%06d' % row['time_start'], 16 | '%06d' % row['time_end']) 17 | keys.append(basename) 18 | if subset != 'testing': 19 | key_labels.append(row['label']) 20 | 21 | database = {} 22 | for i in range(len(keys)): 23 | key = keys[i] 24 | database[key] = {} 25 | database[key]['subset'] = subset 26 | if subset != 'testing': 27 | label = key_labels[i] 28 | database[key]['annotations'] = {'label': label} 29 | else: 30 | database[key]['annotations'] = {} 31 | 32 | return database 33 | 34 | def load_labels(train_csv_path): 35 | data = pd.read_csv(train_csv_path) 36 | return data['label'].unique().tolist() 
37 | 
38 | def convert_kinetics_csv_to_activitynet_json(train_csv_path, val_csv_path, test_csv_path, dst_json_path):
39 |     labels = load_labels(train_csv_path)
40 |     train_database = convert_csv_to_dict(train_csv_path, 'training')
41 |     val_database = convert_csv_to_dict(val_csv_path, 'validation')
42 |     test_database = convert_csv_to_dict(test_csv_path, 'testing')
43 |     dst_data = {}
44 |     dst_data['labels'] = labels
45 |     dst_data['database'] = {}
46 |     dst_data['database'].update(train_database)
47 |     dst_data['database'].update(val_database)
48 |     dst_data['database'].update(test_database)
49 | 
50 |     with open(dst_json_path, 'w') as dst_file:
51 |         json.dump(dst_data, dst_file)
52 | 
53 | if __name__=="__main__":
54 |     train_csv_path = sys.argv[1]
55 |     val_csv_path = sys.argv[2]
56 |     test_csv_path = sys.argv[3]
57 |     dst_json_path = sys.argv[4]
58 | 
59 |     convert_kinetics_csv_to_activitynet_json(
60 |         train_csv_path, val_csv_path, test_csv_path, dst_json_path)
61 | 
--------------------------------------------------------------------------------
/utils/n_frames_kinetics.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import os
3 | import sys
4 | import subprocess
5 | 
6 | def class_process(dir_path, class_name):
7 |     class_path = os.path.join(dir_path, class_name)
8 |     if not os.path.isdir(class_path):
9 |         return
10 | 
11 |     for file_name in os.listdir(class_path):
12 |         video_dir_path = os.path.join(class_path, file_name)
13 |         image_indices = []
14 |         for image_file_name in os.listdir(video_dir_path):
15 |             if 'image' not in image_file_name:
16 |                 continue
17 |             image_indices.append(int(image_file_name[6:11]))
18 | 
19 |         if len(image_indices) == 0:
20 |             print('no image files', video_dir_path)
21 |             n_frames = 0
22 |         else:
23 |             image_indices.sort(reverse=True)
24 |             n_frames = image_indices[0]
25 |         print(video_dir_path, n_frames)
26 |         with open(os.path.join(video_dir_path, 'n_frames'), 'w') as dst_file:
27 |             dst_file.write(str(n_frames))
28 | 
29 | 
30 | if __name__=="__main__":
31 |     dir_path = sys.argv[1]
32 |     for class_name in os.listdir(dir_path):
33 |         class_process(dir_path, class_name)
34 | 
35 |     class_name = 'test'
36 |     class_process(dir_path, class_name)
37 | 
--------------------------------------------------------------------------------
/utils/n_frames_ucf101_hmdb51.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import os
3 | import sys
4 | import subprocess
5 | import pdb
6 | 
7 | def class_process(dir_path, class_name):
8 |     class_path = os.path.join(dir_path, class_name)
9 |     if not os.path.isdir(class_path):
10 |         return
11 | 
12 |     for file_name in os.listdir(class_path):
13 |         video_dir_path = os.path.join(class_path, file_name)
14 |         image_indices = []
15 |         # pdb.set_trace()
16 |         for image_file_name in os.listdir(video_dir_path):
17 |             if 'image' not in image_file_name:
18 |                 continue
19 |             image_indices.append(int(image_file_name[6:11]))
20 | 
21 |         if len(image_indices) == 0:
22 |             print('no image files', video_dir_path)
23 |             n_frames = 0
24 |         else:
25 |             image_indices.sort(reverse=True)
26 |             n_frames = image_indices[0]
27 |         print(video_dir_path, n_frames)
28 |         with open(os.path.join(video_dir_path, 'n_frames'), 'w') as dst_file:
29 |             dst_file.write(str(n_frames))
30 | 
31 | 
32 | if __name__=="__main__":
33 |     dir_path = sys.argv[1]
34 |     for class_name in os.listdir(dir_path):
35 |         class_process(dir_path, class_name)
36 | 
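37 | # Usage sketch (an editor's note, not part of the original script; the path below
38 | # is a placeholder): after extracting frames with utils/video_jpg_ucf101_hmdb51.py,
39 | # run `python utils/n_frames_ucf101_hmdb51.py /path/to/ucf101_or_hmdb51_jpg`
40 | # to write an 'n_frames' file into every per-video frame directory.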
--------------------------------------------------------------------------------
/utils/ucf101_json.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import os
3 | import sys
4 | import json
5 | import pandas as pd
6 | 
7 | def convert_csv_to_dict(csv_path, subset):
8 |     data = pd.read_csv(csv_path, delimiter=' ', header=None)
9 |     keys = []
10 |     key_labels = []
11 |     for i in range(data.shape[0]):
12 |         row = data.ix[i, :]
13 |         slash_rows = data.ix[i, 0].split('/')
14 |         class_name = slash_rows[0]
15 |         basename = slash_rows[1].split('.')[0]
16 | 
17 |         keys.append(basename)
18 |         key_labels.append(class_name)
19 | 
20 |     database = {}
21 |     for i in range(len(keys)):
22 |         key = keys[i]
23 |         database[key] = {}
24 |         database[key]['subset'] = subset
25 |         label = key_labels[i]
26 |         database[key]['annotations'] = {'label': label}
27 | 
28 |     return database
29 | 
30 | def load_labels(label_csv_path):
31 |     data = pd.read_csv(label_csv_path, delimiter=' ', header=None)
32 |     labels = []
33 |     for i in range(data.shape[0]):
34 |         labels.append(data.ix[i, 1])
35 |     return labels
36 | 
37 | def convert_ucf101_csv_to_activitynet_json(label_csv_path, train_csv_path,
38 |                                            val_csv_path, dst_json_path):
39 |     labels = load_labels(label_csv_path)
40 |     train_database = convert_csv_to_dict(train_csv_path, 'training')
41 |     val_database = convert_csv_to_dict(val_csv_path, 'validation')
42 | 
43 |     dst_data = {}
44 |     dst_data['labels'] = labels
45 |     dst_data['database'] = {}
46 |     dst_data['database'].update(train_database)
47 |     dst_data['database'].update(val_database)
48 | 
49 |     with open(dst_json_path, 'w') as dst_file:
50 |         json.dump(dst_data, dst_file)
51 | 
52 | if __name__ == '__main__':
53 |     csv_dir_path = sys.argv[1]
54 | 
55 |     for split_index in range(1, 4):
56 |         label_csv_path = os.path.join(csv_dir_path, 'classInd.txt')
57 |         train_csv_path = os.path.join(csv_dir_path, 'trainlist0{}.txt'.format(split_index))
58 |         val_csv_path = os.path.join(csv_dir_path, 'testlist0{}.txt'.format(split_index))
59 |         dst_json_path = os.path.join(csv_dir_path, 'ucf101_0{}.json'.format(split_index))
60 | 
61 |         convert_ucf101_csv_to_activitynet_json(label_csv_path, train_csv_path,
62 |                                                val_csv_path, dst_json_path)
63 | 
--------------------------------------------------------------------------------
/utils/video_jpg.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import os
3 | import sys
4 | import subprocess
5 | 
6 | 
7 | if __name__=="__main__":
8 |     dir_path = sys.argv[1]
9 |     dst_dir_path = sys.argv[2]
10 | 
11 |     for file_name in os.listdir(dir_path):
12 |         if '.mp4' not in file_name:
13 |             continue
14 |         name, ext = os.path.splitext(file_name)
15 |         dst_directory_path = os.path.join(dst_dir_path, name)
16 | 
17 |         video_file_path = os.path.join(dir_path, file_name)
18 |         try:
19 |             if os.path.exists(dst_directory_path):
20 |                 if not os.path.exists(os.path.join(dst_directory_path, 'image_00001.jpg')):
21 |                     subprocess.call('rm -r {}'.format(dst_directory_path), shell=True)
22 |                     print('remove {}'.format(dst_directory_path))
23 |                     os.mkdir(dst_directory_path)
24 |                 else:
25 |                     continue
26 |             else:
27 |                 os.mkdir(dst_directory_path)
28 |         except:
29 |             print(dst_directory_path)
30 |             continue
31 |         cmd = 'ffmpeg -i {} -vf scale=-1:360 {}/image_%05d.jpg'.format(video_file_path, dst_directory_path)
32 |         print(cmd)
33 |         subprocess.call(cmd, shell=True)
34 |         print('\n')
35 | 
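36 | # Usage sketch (an editor's note, not part of the original script; both paths are
37 | # placeholders): `python utils/video_jpg.py /path/to/mp4_videos /path/to/jpg_videos`
38 | # extracts JPEG frames at height 360 (width scaled to keep the aspect ratio) into
39 | # one sub-directory per video, skipping videos whose image_00001.jpg already exists.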
--------------------------------------------------------------------------------
/utils/video_jpg_diving48.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import os
3 | import sys
4 | import subprocess
5 | import pdb
6 | 
7 | def class_process(dir_path, dst_dir_path, class_name, resume_vid_idx=0):
8 |     class_path = os.path.join(dir_path, class_name)
9 |     if not os.path.isdir(class_path):
10 |         return
11 | 
12 |     dst_class_path = os.path.join(dst_dir_path, class_name)
13 |     if not os.path.exists(dst_class_path):
14 |         os.mkdir(dst_class_path)
15 | 
16 |     for file_name in os.listdir(class_path)[resume_vid_idx:]:
17 |         if '.mp4' not in file_name:
18 |             continue
19 |         name, ext = os.path.splitext(file_name)
20 |         dst_directory_path = os.path.join(dst_class_path, name)
21 | 
22 |         video_file_path = os.path.join(class_path, file_name)
23 |         try:
24 |             if os.path.exists(dst_directory_path):
25 |                 if not os.path.exists(os.path.join(dst_directory_path, 'image_00001.jpg')):
26 |                     subprocess.call('rm -r \"{}\"'.format(dst_directory_path), shell=True)
27 |                     print('remove {}'.format(dst_directory_path))
28 |                     os.mkdir(dst_directory_path)
29 |                 else:
30 |                     continue
31 |             else:
32 |                 os.mkdir(dst_directory_path)
33 |         except:
34 |             print(dst_directory_path)
35 |             continue
36 |         cmd = 'ffmpeg -i \"{}\" -vf scale=-1:240 \"{}/image_%05d.jpg\"'.format(video_file_path, dst_directory_path)
37 |         print(cmd)
38 |         subprocess.call(cmd, shell=True)
39 |         print('\n')
40 | 
41 | if __name__=="__main__":
42 |     dir_path = sys.argv[1]
43 |     dst_dir_path = sys.argv[2]
44 |     resume_vid_idx = int(sys.argv[3])
45 | 
46 |     for class_name in os.listdir(dir_path):
47 |         class_process(dir_path, dst_dir_path, class_name, resume_vid_idx=resume_vid_idx)
48 | 
--------------------------------------------------------------------------------
/utils/video_jpg_kinetics.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import os
3 | import sys
4 | import subprocess
5 | 
6 | def class_process(dir_path, dst_dir_path, class_name):
7 |     class_path = os.path.join(dir_path, class_name)
8 |     if not os.path.isdir(class_path):
9 |         return
10 | 
11 |     dst_class_path = os.path.join(dst_dir_path, class_name)
12 |     if not os.path.exists(dst_class_path):
13 |         os.mkdir(dst_class_path)
14 | 
15 |     for file_name in os.listdir(class_path):
16 |         if '.mp4' not in file_name:
17 |             continue
18 |         name, ext = os.path.splitext(file_name)
19 |         dst_directory_path = os.path.join(dst_class_path, name)
20 | 
21 |         video_file_path = os.path.join(class_path, file_name)
22 |         try:
23 |             if os.path.exists(dst_directory_path):
24 |                 if not os.path.exists(os.path.join(dst_directory_path, 'image_00001.jpg')):
25 |                     subprocess.call('rm -r \"{}\"'.format(dst_directory_path), shell=True)
26 |                     print('remove {}'.format(dst_directory_path))
27 |                     os.mkdir(dst_directory_path)
28 |                 else:
29 |                     continue
30 |             else:
31 |                 os.mkdir(dst_directory_path)
32 |         except:
33 |             print(dst_directory_path)
34 |             continue
35 |         cmd = 'ffmpeg -i \"{}\" -vf scale=-1:240 \"{}/image_%05d.jpg\"'.format(video_file_path, dst_directory_path)
36 |         print(cmd)
37 |         subprocess.call(cmd, shell=True)
38 |         print('\n')
39 | 
40 | if __name__=="__main__":
41 |     dir_path = sys.argv[1]
42 |     dst_dir_path = sys.argv[2]
43 |     start_ind = int(sys.argv[3]) #inclusive
44 |     end_ind = int(sys.argv[4]) #exclusive
45 | 
46 |     for class_name in os.listdir(dir_path)[start_ind:end_ind]:
47 |         class_process(dir_path, dst_dir_path, class_name)
48 | 
49 |     class_name = 'test'
50 |     class_process(dir_path, dst_dir_path, class_name)
51 | 
--------------------------------------------------------------------------------
/utils/video_jpg_ucf101_hmdb51.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import os
3 | import sys
4 | import subprocess
5 | 
6 | def class_process(dir_path, dst_dir_path, class_name):
7 |     class_path = os.path.join(dir_path, class_name)
8 |     if not os.path.isdir(class_path):
9 |         return
10 | 
11 |     dst_class_path = os.path.join(dst_dir_path, class_name)
12 |     if not os.path.exists(dst_class_path):
13 |         os.mkdir(dst_class_path)
14 | 
15 |     for file_name in os.listdir(class_path):
16 |         if '.avi' not in file_name:
17 |             continue
18 |         name, ext = os.path.splitext(file_name)
19 |         dst_directory_path = os.path.join(dst_class_path, name)
20 | 
21 |         video_file_path = os.path.join(class_path, file_name)
22 |         try:
23 |             if os.path.exists(dst_directory_path):
24 |                 if not os.path.exists(os.path.join(dst_directory_path, 'image_00001.jpg')):
25 |                     subprocess.call('rm -r \"{}\"'.format(dst_directory_path), shell=True)
26 |                     print('remove {}'.format(dst_directory_path))
27 |                     os.mkdir(dst_directory_path)
28 |                 else:
29 |                     continue
30 |             else:
31 |                 os.mkdir(dst_directory_path)
32 |         except:
33 |             print(dst_directory_path)
34 |             continue
35 |         cmd = 'ffmpeg -i \"{}\" -vf scale=-1:240 \"{}/image_%05d.jpg\"'.format(video_file_path, dst_directory_path)
36 |         print(cmd)
37 |         subprocess.call(cmd, shell=True)
38 |         print('\n')
39 | 
40 | if __name__=="__main__":
41 |     dir_path = sys.argv[1]
42 |     dst_dir_path = sys.argv[2]
43 | 
44 |     for class_name in os.listdir(dir_path):
45 |         class_process(dir_path, dst_dir_path, class_name)
46 | 
--------------------------------------------------------------------------------