├── .gitignore ├── 3D_experiment ├── .gitignore ├── LICENSE ├── README.md ├── dataset.py ├── datasets │ ├── activitynet.py │ ├── hmdb51.py │ ├── kinetics.py │ └── ucf101.py ├── main.py ├── mean.py ├── models │ ├── non_local.py │ └── resnet3D.py ├── opts.py ├── run.sh ├── spatial_transforms.py ├── target_transforms.py ├── temporal_transforms.py ├── test.py ├── train.py ├── utils.py ├── utils │ ├── eval_hmdb51.py │ ├── eval_kinetics.py │ ├── eval_ucf101.py │ ├── fps.py │ ├── hmdb51_json.py │ ├── kinetics_json.py │ ├── n_frames_kinetics.py │ ├── n_frames_ucf101_hmdb51.py │ ├── ucf101_json.py │ ├── video_jpg.py │ ├── video_jpg_kinetics.py │ └── video_jpg_ucf101_hmdb51.py └── validation.py ├── LICENSE ├── README.md ├── figure ├── Figure2.jpg ├── Table1.jpg └── resnet56_cifar.jpg ├── main.py ├── models ├── __init__.py ├── non_local.py ├── resnet2D.py └── resnet3D.py ├── run.sh └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | data/ 3 | checkpoint/ 4 | *.txt 5 | plot_loss.ipynb 6 | -------------------------------------------------------------------------------- /3D_experiment/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | data/ 7 | __pycache__ 8 | *.txt 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # dotenv 87 | .env 88 | 89 | # virtualenv 90 | .venv 91 | venv/ 92 | ENV/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | 107 | .DS_Store 108 | 109 | .vscode 110 | -------------------------------------------------------------------------------- /3D_experiment/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Kensho Hara 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /3D_experiment/README.md: -------------------------------------------------------------------------------- 1 | # 3D ResNets for Action Recognition 2 | 3 | ## TL:DR 4 | Run `run.sh` to start training using C2D model. If you wish to run other models, please refer to the original repository. 5 | Most of the code is borrowed from https://github.com/kenshohara/3D-ResNets-PyTorch except for the model architecture. 6 | 7 | ## Summary 8 | 9 | This is the PyTorch code for the following papers: 10 | 11 | [ 12 | Kensho Hara, Hirokatsu Kataoka, and Yutaka Satoh, 13 | "Can Spatiotemporal 3D CNNs Retrace the History of 2D CNNs and ImageNet?", 14 | Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6546-6555, 2018. 
15 | ](http://openaccess.thecvf.com/content_cvpr_2018/html/Hara_Can_Spatiotemporal_3D_CVPR_2018_paper.html) 16 | 17 | [ 18 | Kensho Hara, Hirokatsu Kataoka, and Yutaka Satoh, 19 | "Learning Spatio-Temporal Features with 3D Residual Networks for Action Recognition", 20 | Proceedings of the ICCV Workshop on Action, Gesture, and Emotion Recognition, 2017. 21 | ](http://openaccess.thecvf.com/content_ICCV_2017_workshops/papers/w44/Hara_Learning_Spatio-Temporal_Features_ICCV_2017_paper.pdf) 22 | 23 | This code includes training, fine-tuning and testing on Kinetics, ActivityNet, UCF-101, and HMDB-51. 24 | 25 | ## Citation 26 | 27 | If you use this code or pre-trained models, please cite the following: 28 | 29 | ```bibtex 30 | @inproceedings{hara3dcnns, 31 | author={Kensho Hara and Hirokatsu Kataoka and Yutaka Satoh}, 32 | title={Can Spatiotemporal 3D CNNs Retrace the History of 2D CNNs and ImageNet?}, 33 | booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, 34 | pages={6546--6555}, 35 | year={2018}, 36 | } 37 | ``` 38 | 39 | ## Requirements 40 | 41 | * [PyTorch](http://pytorch.org/) 42 | 43 | ```bash 44 | conda install pytorch torchvision cuda80 -c soumith 45 | ``` 46 | 47 | * FFmpeg, FFprobe 48 | 49 | ```bash 50 | wget http://johnvansickle.com/ffmpeg/releases/ffmpeg-release-64bit-static.tar.xz 51 | tar xvf ffmpeg-release-64bit-static.tar.xz 52 | cd ./ffmpeg-3.3.3-64bit-static/; sudo cp ffmpeg ffprobe /usr/local/bin; 53 | ``` 54 | 55 | * Python 3 56 | 57 | ## Preparation 58 | 59 | ### ActivityNet 60 | 61 | * Download videos using [the official crawler](https://github.com/activitynet/ActivityNet/tree/master/Crawler). 62 | * Convert from avi to jpg files using ```utils/video_jpg.py``` 63 | 64 | ```bash 65 | python utils/video_jpg.py avi_video_directory jpg_video_directory 66 | ``` 67 | 68 | * Generate fps files using ```utils/fps.py``` 69 | 70 | ```bash 71 | python utils/fps.py avi_video_directory jpg_video_directory 72 | ``` 73 | 74 | ### Kinetics 75 | 76 | * Download videos using [the official crawler](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics). 77 | * Locate test set in ```video_directory/test```. 78 | * Convert from avi to jpg files using ```utils/video_jpg_kinetics.py``` 79 | 80 | ```bash 81 | python utils/video_jpg_kinetics.py avi_video_directory jpg_video_directory 82 | ``` 83 | 84 | * Generate n_frames files using ```utils/n_frames_kinetics.py``` 85 | 86 | ```bash 87 | python utils/n_frames_kinetics.py jpg_video_directory 88 | ``` 89 | 90 | * Generate annotation file in json format similar to ActivityNet using ```utils/kinetics_json.py``` 91 | * The CSV files (kinetics_{train, val, test}.csv) are included in the crawler. 92 | 93 | ```bash 94 | python utils/kinetics_json.py train_csv_path val_csv_path test_csv_path dst_json_path 95 | ``` 96 | 97 | ### UCF-101 98 | 99 | * Download videos and train/test splits [here](http://crcv.ucf.edu/data/UCF101.php). 
100 | * Convert from avi to jpg files using ```utils/video_jpg_ucf101_hmdb51.py```
101 | 
102 | ```bash
103 | python utils/video_jpg_ucf101_hmdb51.py avi_video_directory jpg_video_directory
104 | ```
105 | 
106 | * Generate n_frames files using ```utils/n_frames_ucf101_hmdb51.py```
107 | 
108 | ```bash
109 | python utils/n_frames_ucf101_hmdb51.py jpg_video_directory
110 | ```
111 | 
112 | * Generate annotation file in json format similar to ActivityNet using ```utils/ucf101_json.py```
113 | * ```annotation_dir_path``` includes classInd.txt, trainlist0{1, 2, 3}.txt, testlist0{1, 2, 3}.txt
114 | 
115 | ```bash
116 | python utils/ucf101_json.py annotation_dir_path
117 | ```
118 | 
119 | ### HMDB-51
120 | 
121 | * Download videos and train/test splits [here](http://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/).
122 | * Convert from avi to jpg files using ```utils/video_jpg_ucf101_hmdb51.py```
123 | 
124 | ```bash
125 | python utils/video_jpg_ucf101_hmdb51.py avi_video_directory jpg_video_directory
126 | ```
127 | 
128 | * Generate n_frames files using ```utils/n_frames_ucf101_hmdb51.py```
129 | 
130 | ```bash
131 | python utils/n_frames_ucf101_hmdb51.py jpg_video_directory
132 | ```
133 | 
134 | * Generate annotation file in json format similar to ActivityNet using ```utils/hmdb51_json.py```
135 | * ```annotation_dir_path``` includes brush_hair_test_split1.txt, ...
136 | 
137 | ```bash
138 | python utils/hmdb51_json.py annotation_dir_path
139 | ```
140 | 
141 | ## Running the code
142 | 
143 | Assume the structure of data directories is the following:
144 | 
145 | ```misc
146 | ~/
147 |   data/
148 |     kinetics_videos/
149 |       jpg/
150 |         .../ (directories of class names)
151 |           .../ (directories of video names)
152 |             ... (jpg files)
153 |     results/
154 |       save_100.pth
155 |     kinetics.json
156 | ```
157 | 
158 | Confirm all options.
159 | 
160 | ```bash
161 | python main.py -h
162 | ```
163 | 
164 | Train ResNet-34 on the Kinetics dataset (400 classes) with 4 CPU threads (for data loading).
165 | Batch size is 128.
166 | Save models at every 5 epochs.
167 | All GPUs are used for training.
168 | If you want to use only a subset of GPUs, set ```CUDA_VISIBLE_DEVICES=...```.
169 | 
170 | ```bash
171 | python main.py --root_path ~/data --video_path kinetics_videos/jpg --annotation_path kinetics.json \
172 | --result_path results --dataset kinetics --model resnet \
173 | --model_depth 34 --n_classes 400 --batch_size 128 --n_threads 4 --checkpoint 5
174 | ```
175 | 
176 | Continue training from epoch 101 (~/data/results/save_100.pth is loaded).
177 | 178 | ```bash 179 | python main.py --root_path ~/data --video_path kinetics_videos/jpg --annotation_path kinetics.json \ 180 | --result_path results --dataset kinetics --resume_path results/save_100.pth \ 181 | --model_depth 34 --n_classes 400 --batch_size 128 --n_threads 4 --checkpoint 5 182 | ``` 183 | 184 | 185 | -------------------------------------------------------------------------------- /3D_experiment/dataset.py: -------------------------------------------------------------------------------- 1 | from datasets.kinetics import Kinetics 2 | from datasets.activitynet import ActivityNet 3 | from datasets.ucf101 import UCF101 4 | from datasets.hmdb51 import HMDB51 5 | 6 | 7 | def get_training_set(opt, spatial_transform, temporal_transform, 8 | target_transform): 9 | assert opt.dataset in ['kinetics', 'activitynet', 'ucf101', 'hmdb51'] 10 | 11 | if opt.dataset == 'kinetics': 12 | training_data = Kinetics( 13 | opt.video_path, 14 | opt.annotation_path, 15 | 'training', 16 | spatial_transform=spatial_transform, 17 | temporal_transform=temporal_transform, 18 | target_transform=target_transform) 19 | elif opt.dataset == 'activitynet': 20 | training_data = ActivityNet( 21 | opt.video_path, 22 | opt.annotation_path, 23 | 'training', 24 | False, 25 | spatial_transform=spatial_transform, 26 | temporal_transform=temporal_transform, 27 | target_transform=target_transform) 28 | elif opt.dataset == 'ucf101': 29 | training_data = UCF101( 30 | opt.video_path, 31 | opt.annotation_path, 32 | 'training', 33 | spatial_transform=spatial_transform, 34 | temporal_transform=temporal_transform, 35 | target_transform=target_transform) 36 | elif opt.dataset == 'hmdb51': 37 | training_data = HMDB51( 38 | opt.video_path, 39 | opt.annotation_path, 40 | 'training', 41 | spatial_transform=spatial_transform, 42 | temporal_transform=temporal_transform, 43 | target_transform=target_transform) 44 | 45 | return training_data 46 | 47 | 48 | def get_validation_set(opt, spatial_transform, temporal_transform, 49 | target_transform): 50 | assert opt.dataset in ['kinetics', 'activitynet', 'ucf101', 'hmdb51'] 51 | 52 | if opt.dataset == 'kinetics': 53 | validation_data = Kinetics( 54 | opt.video_path, 55 | opt.annotation_path, 56 | 'validation', 57 | opt.n_val_samples, 58 | spatial_transform, 59 | temporal_transform, 60 | target_transform, 61 | sample_duration=opt.sample_duration) 62 | elif opt.dataset == 'activitynet': 63 | validation_data = ActivityNet( 64 | opt.video_path, 65 | opt.annotation_path, 66 | 'validation', 67 | False, 68 | opt.n_val_samples, 69 | spatial_transform, 70 | temporal_transform, 71 | target_transform, 72 | sample_duration=opt.sample_duration) 73 | elif opt.dataset == 'ucf101': 74 | validation_data = UCF101( 75 | opt.video_path, 76 | opt.annotation_path, 77 | 'validation', 78 | opt.n_val_samples, 79 | spatial_transform, 80 | temporal_transform, 81 | target_transform, 82 | sample_duration=opt.sample_duration) 83 | elif opt.dataset == 'hmdb51': 84 | validation_data = HMDB51( 85 | opt.video_path, 86 | opt.annotation_path, 87 | 'validation', 88 | opt.n_val_samples, 89 | spatial_transform, 90 | temporal_transform, 91 | target_transform, 92 | sample_duration=opt.sample_duration) 93 | return validation_data 94 | 95 | 96 | def get_test_set(opt, spatial_transform, temporal_transform, target_transform): 97 | assert opt.dataset in ['kinetics', 'activitynet', 'ucf101', 'hmdb51'] 98 | assert opt.test_subset in ['val', 'test'] 99 | 100 | if opt.test_subset == 'val': 101 | subset = 'validation' 102 | elif 
opt.test_subset == 'test': 103 | subset = 'testing' 104 | if opt.dataset == 'kinetics': 105 | test_data = Kinetics( 106 | opt.video_path, 107 | opt.annotation_path, 108 | subset, 109 | 0, 110 | spatial_transform, 111 | temporal_transform, 112 | target_transform, 113 | sample_duration=opt.sample_duration) 114 | elif opt.dataset == 'activitynet': 115 | test_data = ActivityNet( 116 | opt.video_path, 117 | opt.annotation_path, 118 | subset, 119 | True, 120 | 0, 121 | spatial_transform, 122 | temporal_transform, 123 | target_transform, 124 | sample_duration=opt.sample_duration) 125 | elif opt.dataset == 'ucf101': 126 | test_data = UCF101( 127 | opt.video_path, 128 | opt.annotation_path, 129 | subset, 130 | 0, 131 | spatial_transform, 132 | temporal_transform, 133 | target_transform, 134 | sample_duration=opt.sample_duration) 135 | elif opt.dataset == 'hmdb51': 136 | test_data = HMDB51( 137 | opt.video_path, 138 | opt.annotation_path, 139 | subset, 140 | 0, 141 | spatial_transform, 142 | temporal_transform, 143 | target_transform, 144 | sample_duration=opt.sample_duration) 145 | 146 | return test_data 147 | -------------------------------------------------------------------------------- /3D_experiment/datasets/activitynet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import functools 6 | import json 7 | import copy 8 | import math 9 | 10 | from utils import load_value_file 11 | 12 | 13 | def pil_loader(path): 14 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 15 | with open(path, 'rb') as f: 16 | with Image.open(f) as img: 17 | return img.convert('RGB') 18 | 19 | 20 | def accimage_loader(path): 21 | try: 22 | import accimage 23 | return accimage.Image(path) 24 | except IOError: 25 | # Potentially a decoding problem, fall back to PIL.Image 26 | return pil_loader(path) 27 | 28 | 29 | def get_default_image_loader(): 30 | from torchvision import get_image_backend 31 | if get_image_backend() == 'accimage': 32 | return accimage_loader 33 | else: 34 | return pil_loader 35 | 36 | 37 | def video_loader(video_dir_path, frame_indices, image_loader): 38 | video = [] 39 | for i in frame_indices: 40 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 41 | if os.path.exists(image_path): 42 | video.append(image_loader(image_path)) 43 | else: 44 | return video 45 | 46 | return video 47 | 48 | 49 | def get_default_video_loader(): 50 | image_loader = get_default_image_loader() 51 | return functools.partial(video_loader, image_loader=image_loader) 52 | 53 | 54 | def load_annotation_data(data_file_path): 55 | with open(data_file_path, 'r') as data_file: 56 | return json.load(data_file) 57 | 58 | 59 | def get_class_labels(data): 60 | class_names = [] 61 | index = 0 62 | for node1 in data['taxonomy']: 63 | is_leaf = True 64 | for node2 in data['taxonomy']: 65 | if node2['parentId'] == node1['nodeId']: 66 | is_leaf = False 67 | break 68 | if is_leaf: 69 | class_names.append(node1['nodeName']) 70 | 71 | class_labels_map = {} 72 | 73 | for i, class_name in enumerate(class_names): 74 | class_labels_map[class_name] = i 75 | 76 | return class_labels_map 77 | 78 | 79 | def get_video_names_and_annotations(data, subset): 80 | video_names = [] 81 | annotations = [] 82 | 83 | for key, value in data['database'].items(): 84 | this_subset = value['subset'] 85 | if this_subset == subset: 86 | if subset == 'testing': 87 | 
video_names.append('v_{}'.format(key)) 88 | else: 89 | video_names.append('v_{}'.format(key)) 90 | annotations.append(value['annotations']) 91 | 92 | return video_names, annotations 93 | 94 | 95 | def modify_frame_indices(video_dir_path, frame_indices): 96 | modified_indices = [] 97 | for i in frame_indices: 98 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 99 | if not os.path.exists(image_path): 100 | return modified_indices 101 | modified_indices.append(i) 102 | return modified_indices 103 | 104 | 105 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 106 | sample_duration): 107 | data = load_annotation_data(annotation_path) 108 | video_names, annotations = get_video_names_and_annotations(data, subset) 109 | class_to_idx = get_class_labels(data) 110 | idx_to_class = {} 111 | for name, label in class_to_idx.items(): 112 | idx_to_class[label] = name 113 | 114 | dataset = [] 115 | for i in range(len(video_names)): 116 | if i % 1000 == 0: 117 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 118 | 119 | video_path = os.path.join(root_path, video_names[i]) 120 | if not os.path.exists(video_path): 121 | continue 122 | 123 | fps_file_path = os.path.join(video_path, 'fps') 124 | fps = load_value_file(fps_file_path) 125 | 126 | for annotation in annotations[i]: 127 | begin_t = math.ceil(annotation['segment'][0] * fps) 128 | end_t = math.ceil(annotation['segment'][1] * fps) 129 | if begin_t == 0: 130 | begin_t = 1 131 | n_frames = end_t - begin_t 132 | 133 | sample = { 134 | 'video': video_path, 135 | 'segment': [begin_t, end_t], 136 | 'fps': fps, 137 | 'video_id': video_names[i][2:] 138 | } 139 | if len(annotations) != 0: 140 | sample['label'] = class_to_idx[annotation['label']] 141 | else: 142 | sample['label'] = -1 143 | 144 | if n_samples_for_each_video == 1: 145 | frame_indices = list(range(begin_t, end_t)) 146 | frame_indices = modify_frame_indices(sample['video'], 147 | frame_indices) 148 | if len(frame_indices) < 16: 149 | continue 150 | sample['frame_indices'] = frame_indices 151 | dataset.append(sample) 152 | else: 153 | if n_samples_for_each_video > 1: 154 | step = max(1, 155 | math.ceil((n_frames - 1 - sample_duration) / 156 | (n_samples_for_each_video - 1))) 157 | else: 158 | step = sample_duration 159 | for j in range(begin_t, end_t, step): 160 | sample_j = copy.deepcopy(sample) 161 | frame_indices = list(range(j, j + sample_duration)) 162 | frame_indices = modify_frame_indices( 163 | sample_j['video'], frame_indices) 164 | if len(frame_indices) < 16: 165 | continue 166 | sample_j['frame_indices'] = frame_indices 167 | dataset.append(sample_j) 168 | 169 | return dataset, idx_to_class 170 | 171 | 172 | def get_end_t(video_path): 173 | file_names = os.listdir(video_path) 174 | image_file_names = [x for x in file_names if 'image' in x] 175 | image_file_names.sort(reverse=True) 176 | return int(image_file_names[0][6:11]) 177 | 178 | 179 | def make_untrimmed_dataset(root_path, annotation_path, subset, 180 | n_samples_for_each_video, sample_duration): 181 | data = load_annotation_data(annotation_path) 182 | video_names, _ = get_video_names_and_annotations(data, subset) 183 | class_to_idx = get_class_labels(data) 184 | idx_to_class = {} 185 | for name, label in class_to_idx.items(): 186 | idx_to_class[label] = name 187 | 188 | dataset = [] 189 | for i in range(len(video_names)): 190 | if i % 1000 == 0: 191 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 192 | 193 | video_path = os.path.join(root_path, 
video_names[i]) 194 | if not os.path.exists(video_path): 195 | continue 196 | 197 | fps_file_path = os.path.join(video_path, 'fps') 198 | fps = load_value_file(fps_file_path) 199 | 200 | begin_t = 1 201 | end_t = get_end_t(video_path) 202 | n_frames = end_t - begin_t 203 | 204 | sample = { 205 | 'video': video_path, 206 | 'segment': [begin_t, end_t], 207 | 'fps': fps, 208 | 'video_id': video_names[i][2:] 209 | } 210 | 211 | if n_samples_for_each_video >= 1: 212 | step = max(1, 213 | math.ceil((n_frames - 1 - sample_duration) / 214 | (n_samples_for_each_video - 1))) 215 | else: 216 | step = sample_duration 217 | for j in range(begin_t, end_t, step): 218 | sample_j = copy.deepcopy(sample) 219 | frame_indices = list(range(j, j + sample_duration)) 220 | frame_indices = modify_frame_indices(sample_j['video'], 221 | frame_indices) 222 | if len(frame_indices) < 16: 223 | continue 224 | sample_j['frame_indices'] = frame_indices 225 | dataset.append(sample_j) 226 | 227 | return dataset, idx_to_class 228 | 229 | 230 | class ActivityNet(data.Dataset): 231 | """ 232 | Args: 233 | root (string): Root directory path. 234 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 235 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 236 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 237 | and returns a transformed version 238 | target_transform (callable, optional): A function/transform that takes in the 239 | target and transforms it. 240 | loader (callable, optional): A function to load an video given its path and frame indices. 241 | Attributes: 242 | classes (list): List of the class names. 243 | class_to_idx (dict): Dict with items (class_name, class_index). 244 | imgs (list): List of (image path, class_index) tuples 245 | """ 246 | 247 | def __init__(self, 248 | root_path, 249 | annotation_path, 250 | subset, 251 | is_untrimmed_setting=False, 252 | n_samples_for_each_video=1, 253 | spatial_transform=None, 254 | temporal_transform=None, 255 | target_transform=None, 256 | sample_duration=16, 257 | get_loader=get_default_video_loader): 258 | if is_untrimmed_setting: 259 | self.data, self.class_names = make_untrimmed_dataset( 260 | root_path, annotation_path, subset, n_samples_for_each_video, 261 | sample_duration) 262 | else: 263 | self.data, self.class_names = make_dataset( 264 | root_path, annotation_path, subset, n_samples_for_each_video, 265 | sample_duration) 266 | 267 | self.spatial_transform = spatial_transform 268 | self.temporal_transform = temporal_transform 269 | self.target_transform = target_transform 270 | self.loader = get_loader() 271 | 272 | def __getitem__(self, index): 273 | """ 274 | Args: 275 | index (int): Index 276 | Returns: 277 | tuple: (image, target) where target is class_index of the target class. 
278 | """ 279 | path = self.data[index]['video'] 280 | 281 | frame_indices = self.data[index]['frame_indices'] 282 | if self.temporal_transform is not None: 283 | frame_indices = self.temporal_transform(frame_indices) 284 | clip = self.loader(path, frame_indices) 285 | if self.spatial_transform is not None: 286 | self.spatial_transform.randomize_parameters() 287 | clip = [self.spatial_transform(img) for img in clip] 288 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 289 | 290 | target = self.data[index] 291 | if self.target_transform is not None: 292 | target = self.target_transform(target) 293 | 294 | return clip, target 295 | 296 | def __len__(self): 297 | return len(self.data) 298 | -------------------------------------------------------------------------------- /3D_experiment/datasets/hmdb51.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import json 8 | import copy 9 | 10 | from utils import load_value_file 11 | 12 | 13 | def pil_loader(path): 14 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 15 | with open(path, 'rb') as f: 16 | with Image.open(f) as img: 17 | return img.convert('RGB') 18 | 19 | 20 | def accimage_loader(path): 21 | try: 22 | import accimage 23 | return accimage.Image(path) 24 | except IOError: 25 | # Potentially a decoding problem, fall back to PIL.Image 26 | return pil_loader(path) 27 | 28 | 29 | def get_default_image_loader(): 30 | from torchvision import get_image_backend 31 | if get_image_backend() == 'accimage': 32 | return accimage_loader 33 | else: 34 | return pil_loader 35 | 36 | 37 | def video_loader(video_dir_path, frame_indices, image_loader): 38 | video = [] 39 | for i in frame_indices: 40 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 41 | if os.path.exists(image_path): 42 | video.append(image_loader(image_path)) 43 | else: 44 | return video 45 | 46 | return video 47 | 48 | 49 | def get_default_video_loader(): 50 | image_loader = get_default_image_loader() 51 | return functools.partial(video_loader, image_loader=image_loader) 52 | 53 | 54 | def load_annotation_data(data_file_path): 55 | with open(data_file_path, 'r') as data_file: 56 | return json.load(data_file) 57 | 58 | 59 | def get_class_labels(data): 60 | class_labels_map = {} 61 | index = 0 62 | for class_label in data['labels']: 63 | class_labels_map[class_label] = index 64 | index += 1 65 | return class_labels_map 66 | 67 | 68 | def get_video_names_and_annotations(data, subset): 69 | video_names = [] 70 | annotations = [] 71 | 72 | for key, value in data['database'].items(): 73 | this_subset = value['subset'] 74 | if this_subset == subset: 75 | label = value['annotations']['label'] 76 | video_names.append('{}/{}'.format(label, key)) 77 | annotations.append(value['annotations']) 78 | 79 | return video_names, annotations 80 | 81 | 82 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 83 | sample_duration): 84 | data = load_annotation_data(annotation_path) 85 | video_names, annotations = get_video_names_and_annotations(data, subset) 86 | class_to_idx = get_class_labels(data) 87 | idx_to_class = {} 88 | for name, label in class_to_idx.items(): 89 | idx_to_class[label] = name 90 | 91 | dataset = [] 92 | for i in range(len(video_names)): 93 | if i % 1000 == 0: 94 | print('dataset loading [{}/{}]'.format(i, 
len(video_names))) 95 | 96 | video_path = os.path.join(root_path, video_names[i]) 97 | if not os.path.exists(video_path): 98 | continue 99 | 100 | n_frames_file_path = os.path.join(video_path, 'n_frames') 101 | n_frames = int(load_value_file(n_frames_file_path)) 102 | if n_frames <= 0: 103 | continue 104 | 105 | begin_t = 1 106 | end_t = n_frames 107 | sample = { 108 | 'video': video_path, 109 | 'segment': [begin_t, end_t], 110 | 'n_frames': n_frames, 111 | 'video_id': video_names[i].split('/')[1] 112 | } 113 | if len(annotations) != 0: 114 | sample['label'] = class_to_idx[annotations[i]['label']] 115 | else: 116 | sample['label'] = -1 117 | 118 | if n_samples_for_each_video == 1: 119 | sample['frame_indices'] = list(range(1, n_frames + 1)) 120 | dataset.append(sample) 121 | else: 122 | if n_samples_for_each_video > 1: 123 | step = max(1, 124 | math.ceil((n_frames - 1 - sample_duration) / 125 | (n_samples_for_each_video - 1))) 126 | else: 127 | step = sample_duration 128 | for j in range(1, n_frames, step): 129 | sample_j = copy.deepcopy(sample) 130 | sample_j['frame_indices'] = list( 131 | range(j, min(n_frames + 1, j + sample_duration))) 132 | dataset.append(sample_j) 133 | 134 | return dataset, idx_to_class 135 | 136 | 137 | class HMDB51(data.Dataset): 138 | """ 139 | Args: 140 | root (string): Root directory path. 141 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 142 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 143 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 144 | and returns a transformed version 145 | target_transform (callable, optional): A function/transform that takes in the 146 | target and transforms it. 147 | loader (callable, optional): A function to load an video given its path and frame indices. 148 | Attributes: 149 | classes (list): List of the class names. 150 | class_to_idx (dict): Dict with items (class_name, class_index). 151 | imgs (list): List of (image path, class_index) tuples 152 | """ 153 | 154 | def __init__(self, 155 | root_path, 156 | annotation_path, 157 | subset, 158 | n_samples_for_each_video=1, 159 | spatial_transform=None, 160 | temporal_transform=None, 161 | target_transform=None, 162 | sample_duration=16, 163 | get_loader=get_default_video_loader): 164 | self.data, self.class_names = make_dataset( 165 | root_path, annotation_path, subset, n_samples_for_each_video, 166 | sample_duration) 167 | 168 | self.spatial_transform = spatial_transform 169 | self.temporal_transform = temporal_transform 170 | self.target_transform = target_transform 171 | self.loader = get_loader() 172 | 173 | def __getitem__(self, index): 174 | """ 175 | Args: 176 | index (int): Index 177 | Returns: 178 | tuple: (image, target) where target is class_index of the target class. 
179 | """ 180 | path = self.data[index]['video'] 181 | 182 | frame_indices = self.data[index]['frame_indices'] 183 | if self.temporal_transform is not None: 184 | frame_indices = self.temporal_transform(frame_indices) 185 | clip = self.loader(path, frame_indices) 186 | if self.spatial_transform is not None: 187 | self.spatial_transform.randomize_parameters() 188 | clip = [self.spatial_transform(img) for img in clip] 189 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 190 | 191 | target = self.data[index] 192 | if self.target_transform is not None: 193 | target = self.target_transform(target) 194 | 195 | return clip, target 196 | 197 | def __len__(self): 198 | return len(self.data) 199 | -------------------------------------------------------------------------------- /3D_experiment/datasets/kinetics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import json 8 | import copy 9 | 10 | from utils import load_value_file 11 | 12 | 13 | def pil_loader(path): 14 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 15 | with open(path, 'rb') as f: 16 | with Image.open(f) as img: 17 | return img.convert('RGB') 18 | 19 | 20 | def accimage_loader(path): 21 | try: 22 | import accimage 23 | return accimage.Image(path) 24 | except IOError: 25 | # Potentially a decoding problem, fall back to PIL.Image 26 | return pil_loader(path) 27 | 28 | 29 | def get_default_image_loader(): 30 | from torchvision import get_image_backend 31 | if get_image_backend() == 'accimage': 32 | return accimage_loader 33 | else: 34 | return pil_loader 35 | 36 | 37 | def video_loader(video_dir_path, frame_indices, image_loader): 38 | video = [] 39 | for i in frame_indices: 40 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 41 | if os.path.exists(image_path): 42 | video.append(image_loader(image_path)) 43 | else: 44 | return video 45 | 46 | return video 47 | 48 | 49 | def get_default_video_loader(): 50 | image_loader = get_default_image_loader() 51 | return functools.partial(video_loader, image_loader=image_loader) 52 | 53 | 54 | def load_annotation_data(data_file_path): 55 | with open(data_file_path, 'r') as data_file: 56 | return json.load(data_file) 57 | 58 | 59 | def get_class_labels(data): 60 | class_labels_map = {} 61 | index = 0 62 | for class_label in data['labels']: 63 | class_labels_map[class_label] = index 64 | index += 1 65 | return class_labels_map 66 | 67 | 68 | def get_video_names_and_annotations(data, subset): 69 | video_names = [] 70 | annotations = [] 71 | 72 | for key, value in data['database'].items(): 73 | this_subset = value['subset'] 74 | if this_subset == subset: 75 | if subset == 'testing': 76 | video_names.append('test/{}'.format(key)) 77 | else: 78 | label = value['annotations']['label'] 79 | video_names.append('{}/{}'.format(label, key)) 80 | annotations.append(value['annotations']) 81 | 82 | return video_names, annotations 83 | 84 | 85 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 86 | sample_duration): 87 | data = load_annotation_data(annotation_path) 88 | video_names, annotations = get_video_names_and_annotations(data, subset) 89 | class_to_idx = get_class_labels(data) 90 | idx_to_class = {} 91 | for name, label in class_to_idx.items(): 92 | idx_to_class[label] = name 93 | 94 | dataset = [] 95 | for i in 
range(len(video_names)): 96 | if i % 1000 == 0: 97 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 98 | 99 | video_path = os.path.join(root_path, video_names[i]) 100 | if not os.path.exists(video_path): 101 | continue 102 | 103 | n_frames_file_path = os.path.join(video_path, 'n_frames') 104 | n_frames = int(load_value_file(n_frames_file_path)) 105 | if n_frames <= 0: 106 | continue 107 | 108 | begin_t = 1 109 | end_t = n_frames 110 | sample = { 111 | 'video': video_path, 112 | 'segment': [begin_t, end_t], 113 | 'n_frames': n_frames, 114 | 'video_id': video_names[i][:-14].split('/')[1] 115 | } 116 | if len(annotations) != 0: 117 | sample['label'] = class_to_idx[annotations[i]['label']] 118 | else: 119 | sample['label'] = -1 120 | 121 | if n_samples_for_each_video == 1: 122 | sample['frame_indices'] = list(range(1, n_frames + 1)) 123 | dataset.append(sample) 124 | else: 125 | if n_samples_for_each_video > 1: 126 | step = max(1, 127 | math.ceil((n_frames - 1 - sample_duration) / 128 | (n_samples_for_each_video - 1))) 129 | else: 130 | step = sample_duration 131 | for j in range(1, n_frames, step): 132 | sample_j = copy.deepcopy(sample) 133 | sample_j['frame_indices'] = list( 134 | range(j, min(n_frames + 1, j + sample_duration))) 135 | dataset.append(sample_j) 136 | 137 | return dataset, idx_to_class 138 | 139 | 140 | class Kinetics(data.Dataset): 141 | """ 142 | Args: 143 | root (string): Root directory path. 144 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 145 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 146 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 147 | and returns a transformed version 148 | target_transform (callable, optional): A function/transform that takes in the 149 | target and transforms it. 150 | loader (callable, optional): A function to load an video given its path and frame indices. 151 | Attributes: 152 | classes (list): List of the class names. 153 | class_to_idx (dict): Dict with items (class_name, class_index). 154 | imgs (list): List of (image path, class_index) tuples 155 | """ 156 | 157 | def __init__(self, 158 | root_path, 159 | annotation_path, 160 | subset, 161 | n_samples_for_each_video=1, 162 | spatial_transform=None, 163 | temporal_transform=None, 164 | target_transform=None, 165 | sample_duration=16, 166 | get_loader=get_default_video_loader): 167 | self.data, self.class_names = make_dataset( 168 | root_path, annotation_path, subset, n_samples_for_each_video, 169 | sample_duration) 170 | 171 | self.spatial_transform = spatial_transform 172 | self.temporal_transform = temporal_transform 173 | self.target_transform = target_transform 174 | self.loader = get_loader() 175 | 176 | def __getitem__(self, index): 177 | """ 178 | Args: 179 | index (int): Index 180 | Returns: 181 | tuple: (image, target) where target is class_index of the target class. 
182 | """ 183 | path = self.data[index]['video'] 184 | 185 | frame_indices = self.data[index]['frame_indices'] 186 | if self.temporal_transform is not None: 187 | frame_indices = self.temporal_transform(frame_indices) 188 | clip = self.loader(path, frame_indices) 189 | if self.spatial_transform is not None: 190 | self.spatial_transform.randomize_parameters() 191 | clip = [self.spatial_transform(img) for img in clip] 192 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 193 | 194 | target = self.data[index] 195 | if self.target_transform is not None: 196 | target = self.target_transform(target) 197 | 198 | return clip, target 199 | 200 | def __len__(self): 201 | return len(self.data) 202 | -------------------------------------------------------------------------------- /3D_experiment/datasets/ucf101.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import json 8 | import copy 9 | 10 | from utils import load_value_file 11 | 12 | 13 | def pil_loader(path): 14 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 15 | with open(path, 'rb') as f: 16 | with Image.open(f) as img: 17 | return img.convert('RGB') 18 | 19 | 20 | def accimage_loader(path): 21 | try: 22 | import accimage 23 | return accimage.Image(path) 24 | except IOError: 25 | # Potentially a decoding problem, fall back to PIL.Image 26 | return pil_loader(path) 27 | 28 | 29 | def get_default_image_loader(): 30 | from torchvision import get_image_backend 31 | if get_image_backend() == 'accimage': 32 | return accimage_loader 33 | else: 34 | return pil_loader 35 | 36 | 37 | def video_loader(video_dir_path, frame_indices, image_loader): 38 | video = [] 39 | for i in frame_indices: 40 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 41 | if os.path.exists(image_path): 42 | video.append(image_loader(image_path)) 43 | else: 44 | return video 45 | 46 | return video 47 | 48 | 49 | def get_default_video_loader(): 50 | image_loader = get_default_image_loader() 51 | return functools.partial(video_loader, image_loader=image_loader) 52 | 53 | 54 | def load_annotation_data(data_file_path): 55 | with open(data_file_path, 'r') as data_file: 56 | return json.load(data_file) 57 | 58 | 59 | def get_class_labels(data): 60 | class_labels_map = {} 61 | index = 0 62 | for class_label in data['labels']: 63 | class_labels_map[class_label] = index 64 | index += 1 65 | return class_labels_map 66 | 67 | 68 | def get_video_names_and_annotations(data, subset): 69 | video_names = [] 70 | annotations = [] 71 | 72 | for key, value in data['database'].items(): 73 | this_subset = value['subset'] 74 | if this_subset == subset: 75 | label = value['annotations']['label'] 76 | video_names.append('{}/{}'.format(label, key)) 77 | annotations.append(value['annotations']) 78 | 79 | return video_names, annotations 80 | 81 | 82 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 83 | sample_duration): 84 | data = load_annotation_data(annotation_path) 85 | video_names, annotations = get_video_names_and_annotations(data, subset) 86 | class_to_idx = get_class_labels(data) 87 | idx_to_class = {} 88 | for name, label in class_to_idx.items(): 89 | idx_to_class[label] = name 90 | 91 | dataset = [] 92 | for i in range(len(video_names)): 93 | if i % 1000 == 0: 94 | print('dataset loading [{}/{}]'.format(i, 
len(video_names))) 95 | 96 | video_path = os.path.join(root_path, video_names[i]) 97 | if not os.path.exists(video_path): 98 | continue 99 | 100 | n_frames_file_path = os.path.join(video_path, 'n_frames') 101 | n_frames = int(load_value_file(n_frames_file_path)) 102 | if n_frames <= 0: 103 | continue 104 | 105 | begin_t = 1 106 | end_t = n_frames 107 | sample = { 108 | 'video': video_path, 109 | 'segment': [begin_t, end_t], 110 | 'n_frames': n_frames, 111 | 'video_id': video_names[i].split('/')[1] 112 | } 113 | if len(annotations) != 0: 114 | sample['label'] = class_to_idx[annotations[i]['label']] 115 | else: 116 | sample['label'] = -1 117 | 118 | if n_samples_for_each_video == 1: 119 | sample['frame_indices'] = list(range(1, n_frames + 1)) 120 | dataset.append(sample) 121 | else: 122 | if n_samples_for_each_video > 1: 123 | step = max(1, 124 | math.ceil((n_frames - 1 - sample_duration) / 125 | (n_samples_for_each_video - 1))) 126 | else: 127 | step = sample_duration 128 | for j in range(1, n_frames, step): 129 | sample_j = copy.deepcopy(sample) 130 | sample_j['frame_indices'] = list( 131 | range(j, min(n_frames + 1, j + sample_duration))) 132 | dataset.append(sample_j) 133 | 134 | return dataset, idx_to_class 135 | 136 | 137 | class UCF101(data.Dataset): 138 | """ 139 | Args: 140 | root (string): Root directory path. 141 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 142 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 143 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 144 | and returns a transformed version 145 | target_transform (callable, optional): A function/transform that takes in the 146 | target and transforms it. 147 | loader (callable, optional): A function to load an video given its path and frame indices. 148 | Attributes: 149 | classes (list): List of the class names. 150 | class_to_idx (dict): Dict with items (class_name, class_index). 151 | imgs (list): List of (image path, class_index) tuples 152 | """ 153 | 154 | def __init__(self, 155 | root_path, 156 | annotation_path, 157 | subset, 158 | n_samples_for_each_video=1, 159 | spatial_transform=None, 160 | temporal_transform=None, 161 | target_transform=None, 162 | sample_duration=16, 163 | get_loader=get_default_video_loader): 164 | self.data, self.class_names = make_dataset( 165 | root_path, annotation_path, subset, n_samples_for_each_video, 166 | sample_duration) 167 | 168 | self.spatial_transform = spatial_transform 169 | self.temporal_transform = temporal_transform 170 | self.target_transform = target_transform 171 | self.loader = get_loader() 172 | 173 | def __getitem__(self, index): 174 | """ 175 | Args: 176 | index (int): Index 177 | Returns: 178 | tuple: (image, target) where target is class_index of the target class. 
179 | """ 180 | path = self.data[index]['video'] 181 | 182 | frame_indices = self.data[index]['frame_indices'] 183 | if self.temporal_transform is not None: 184 | frame_indices = self.temporal_transform(frame_indices) 185 | clip = self.loader(path, frame_indices) 186 | if self.spatial_transform is not None: 187 | self.spatial_transform.randomize_parameters() 188 | clip = [self.spatial_transform(img) for img in clip] 189 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 190 | 191 | target = self.data[index] 192 | if self.target_transform is not None: 193 | target = self.target_transform(target) 194 | 195 | return clip, target 196 | 197 | def __len__(self): 198 | return len(self.data) 199 | -------------------------------------------------------------------------------- /3D_experiment/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import numpy as np 5 | import torch 6 | from torch import nn 7 | from torch import optim 8 | from torch.optim import lr_scheduler 9 | 10 | from opts import parse_opts 11 | from mean import get_mean, get_std 12 | from spatial_transforms import ( 13 | Compose, Normalize, Scale, CenterCrop, CornerCrop, MultiScaleCornerCrop, 14 | MultiScaleRandomCrop, RandomHorizontalFlip, ToTensor) 15 | from temporal_transforms import LoopPadding, TemporalRandomCrop 16 | from target_transforms import ClassLabel, VideoID 17 | from target_transforms import Compose as TargetCompose 18 | from dataset import get_training_set, get_validation_set, get_test_set 19 | from utils import Logger 20 | from train import train_epoch 21 | from validation import val_epoch 22 | import test 23 | from models.resnet3D import resnet3D50 24 | 25 | if __name__ == '__main__': 26 | opt = parse_opts() 27 | if opt.root_path != '': 28 | opt.video_path = os.path.join(opt.root_path, opt.video_path) 29 | opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path) 30 | opt.result_path = os.path.join(opt.root_path, opt.result_path) 31 | if opt.resume_path: 32 | opt.resume_path = os.path.join(opt.root_path, opt.resume_path) 33 | if opt.pretrain_path: 34 | opt.pretrain_path = os.path.join(opt.root_path, opt.pretrain_path) 35 | opt.scales = [opt.initial_scale] 36 | for i in range(1, opt.n_scales): 37 | opt.scales.append(opt.scales[-1] * opt.scale_step) 38 | opt.arch = '{}-{}'.format(opt.model, opt.model_depth) 39 | opt.mean = get_mean(opt.norm_value, dataset=opt.mean_dataset) 40 | opt.std = get_std(opt.norm_value) 41 | print(opt) 42 | with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file: 43 | json.dump(vars(opt), opt_file) 44 | 45 | torch.manual_seed(opt.manual_seed) 46 | 47 | model = resnet3D50(non_local=True) 48 | parameters = model.parameters() 49 | 50 | if not opt.no_cuda: 51 | model = model.cuda() 52 | model = nn.DataParallel(model, device_ids=None) 53 | 54 | print(model) 55 | criterion = nn.CrossEntropyLoss() 56 | if not opt.no_cuda: 57 | criterion = criterion.cuda() 58 | 59 | if opt.no_mean_norm and not opt.std_norm: 60 | norm_method = Normalize([0, 0, 0], [1, 1, 1]) 61 | elif not opt.std_norm: 62 | norm_method = Normalize(opt.mean, [1, 1, 1]) 63 | else: 64 | norm_method = Normalize(opt.mean, opt.std) 65 | 66 | if not opt.no_train: 67 | assert opt.train_crop in ['random', 'corner', 'center'] 68 | if opt.train_crop == 'random': 69 | crop_method = MultiScaleRandomCrop(opt.scales, opt.sample_size) 70 | elif opt.train_crop == 'corner': 71 | crop_method = MultiScaleCornerCrop(opt.scales, 
opt.sample_size) 72 | elif opt.train_crop == 'center': 73 | crop_method = MultiScaleCornerCrop( 74 | opt.scales, opt.sample_size, crop_positions=['c']) 75 | spatial_transform = Compose([ 76 | crop_method, 77 | RandomHorizontalFlip(), 78 | ToTensor(opt.norm_value), norm_method 79 | ]) 80 | temporal_transform = TemporalRandomCrop(opt.sample_duration) 81 | target_transform = ClassLabel() 82 | training_data = get_training_set(opt, spatial_transform, 83 | temporal_transform, target_transform) 84 | train_loader = torch.utils.data.DataLoader( 85 | training_data, 86 | batch_size=opt.batch_size, 87 | shuffle=True, 88 | num_workers=opt.n_threads, 89 | pin_memory=True) 90 | train_logger = Logger( 91 | os.path.join(opt.result_path, 'train.log'), 92 | ['epoch', 'loss', 'acc', 'lr']) 93 | train_batch_logger = Logger( 94 | os.path.join(opt.result_path, 'train_batch.log'), 95 | ['epoch', 'batch', 'iter', 'loss', 'acc', 'lr']) 96 | 97 | if opt.nesterov: 98 | dampening = 0 99 | else: 100 | dampening = opt.dampening 101 | optimizer = optim.SGD( 102 | parameters, 103 | lr=opt.learning_rate, 104 | momentum=opt.momentum, 105 | dampening=dampening, 106 | weight_decay=opt.weight_decay, 107 | nesterov=opt.nesterov) 108 | scheduler = lr_scheduler.ReduceLROnPlateau( 109 | optimizer, 'min', patience=opt.lr_patience) 110 | if not opt.no_val: 111 | spatial_transform = Compose([ 112 | Scale(opt.sample_size), 113 | CenterCrop(opt.sample_size), 114 | ToTensor(opt.norm_value), norm_method 115 | ]) 116 | temporal_transform = LoopPadding(opt.sample_duration) 117 | target_transform = ClassLabel() 118 | validation_data = get_validation_set( 119 | opt, spatial_transform, temporal_transform, target_transform) 120 | val_loader = torch.utils.data.DataLoader( 121 | validation_data, 122 | batch_size=opt.batch_size, 123 | shuffle=False, 124 | num_workers=opt.n_threads, 125 | pin_memory=True) 126 | val_logger = Logger( 127 | os.path.join(opt.result_path, 'val.log'), ['epoch', 'loss', 'acc']) 128 | 129 | if opt.resume_path: 130 | print('loading checkpoint {}'.format(opt.resume_path)) 131 | checkpoint = torch.load(opt.resume_path) 132 | assert opt.arch == checkpoint['arch'] 133 | 134 | opt.begin_epoch = checkpoint['epoch'] 135 | model.load_state_dict(checkpoint['state_dict']) 136 | if not opt.no_train: 137 | optimizer.load_state_dict(checkpoint['optimizer']) 138 | 139 | print('run') 140 | for i in range(opt.begin_epoch, opt.n_epochs + 1): 141 | if not opt.no_train: 142 | train_epoch(i, train_loader, model, criterion, optimizer, opt, 143 | train_logger, train_batch_logger) 144 | if not opt.no_val: 145 | validation_loss = val_epoch(i, val_loader, model, criterion, opt, 146 | val_logger) 147 | 148 | if not opt.no_train and not opt.no_val: 149 | scheduler.step(validation_loss) 150 | 151 | if opt.test: 152 | spatial_transform = Compose([ 153 | Scale(int(opt.sample_size / opt.scale_in_test)), 154 | CornerCrop(opt.sample_size, opt.crop_position_in_test), 155 | ToTensor(opt.norm_value), norm_method 156 | ]) 157 | temporal_transform = LoopPadding(opt.sample_duration) 158 | target_transform = VideoID() 159 | 160 | test_data = get_test_set(opt, spatial_transform, temporal_transform, 161 | target_transform) 162 | test_loader = torch.utils.data.DataLoader( 163 | test_data, 164 | batch_size=opt.batch_size, 165 | shuffle=False, 166 | num_workers=opt.n_threads, 167 | pin_memory=True) 168 | test.test(test_loader, model, opt, test_data.class_names) 169 | -------------------------------------------------------------------------------- 
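Before the dump of `mean.py` below, here is a brief usage sketch (not part of the repository) showing how its per-channel statistics are consumed by `main.py`: `get_mean`/`get_std` return ImageNet-style channel statistics already divided by `norm_value`, and `main.py` passes the results to the `Normalize` transform from `spatial_transforms.py`. The `norm_value = 255` below is an assumption taken from `get_mean`'s default; in `main.py` it comes from `opt.norm_value`.

```python
# Minimal sketch (not part of the repo): building the normalization transform
# the same way main.py does, from the statistics defined in mean.py.
from mean import get_mean, get_std
from spatial_transforms import Normalize

norm_value = 255                                 # assumed default; main.py uses opt.norm_value
mean = get_mean(norm_value, dataset='kinetics')  # per-channel means / norm_value
std = get_std(norm_value)                        # per-channel stds / norm_value
norm_method = Normalize(mean, std)               # same call as in main.py when std_norm is enabled
```

Note that `main.py` falls back to `Normalize([0, 0, 0], [1, 1, 1])` or `Normalize(opt.mean, [1, 1, 1])` when `--no_mean_norm` or the std-normalization option is not set.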
/3D_experiment/mean.py: -------------------------------------------------------------------------------- 1 | def get_mean(norm_value=255, dataset='activitynet'): 2 | assert dataset in ['activitynet', 'kinetics'] 3 | 4 | if dataset == 'activitynet': 5 | return [ 6 | 114.7748 / norm_value, 107.7354 / norm_value, 99.4750 / norm_value 7 | ] 8 | elif dataset == 'kinetics': 9 | # Kinetics (10 videos for each class) 10 | return [ 11 | 110.63666788 / norm_value, 103.16065604 / norm_value, 12 | 96.29023126 / norm_value 13 | ] 14 | 15 | 16 | def get_std(norm_value=255): 17 | # Kinetics (10 videos for each class) 18 | return [ 19 | 38.7568578 / norm_value, 37.88248729 / norm_value, 20 | 40.02898126 / norm_value 21 | ] 22 | -------------------------------------------------------------------------------- /3D_experiment/models/non_local.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | 6 | class NLBlockND(nn.Module): 7 | def __init__(self, in_channels, inter_channels=None, mode='embedded', 8 | dimension=3, bn_layer=True): 9 | """Implementation of Non-Local Block with 4 different pairwise functions 10 | args: 11 | in_channels: original channel size (1024 in the paper) 12 | inter_channels: channel size inside the block if not specifed reduced to half (512 in the paper) 13 | mode: supports Gaussian, Embedded Gaussian, Dot Product, and Concatenation 14 | dimension: can be 1 (temporal), 2 (spatial), 3 (spatiotemporal) 15 | bn_layer: whether to add batch norm 16 | """ 17 | super(NLBlockND, self).__init__() 18 | 19 | assert dimension in [1, 2, 3] 20 | 21 | if mode not in ['gaussian', 'embedded', 'dot', 'concatenate']: 22 | raise ValueError('`mode` must be one of `gaussian`, `embedded`, `dot` or `concatenate`') 23 | 24 | self.mode = mode 25 | self.dimension = dimension 26 | 27 | self.in_channels = in_channels 28 | self.inter_channels = inter_channels 29 | 30 | # the channel size is reduced to half inside the block 31 | if self.inter_channels is None: 32 | self.inter_channels = in_channels // 2 33 | if self.inter_channels == 0: 34 | self.inter_channels = 1 35 | 36 | # assign appropriate convolutional, max pool, and batch norm layers for different dimensions 37 | if dimension == 3: 38 | conv_nd = nn.Conv3d 39 | max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2)) 40 | bn = nn.BatchNorm3d 41 | elif dimension == 2: 42 | conv_nd = nn.Conv2d 43 | max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2)) 44 | bn = nn.BatchNorm2d 45 | else: 46 | conv_nd = nn.Conv1d 47 | max_pool_layer = nn.MaxPool1d(kernel_size=(2)) 48 | bn = nn.BatchNorm1d 49 | 50 | # function g in the paper which goes through conv. 
with kernel size 1
51 |         self.g = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, kernel_size=1)
52 | 
53 |         # add BatchNorm layer after the last conv layer
54 |         if bn_layer:
55 |             self.W_z = nn.Sequential(
56 |                 conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, kernel_size=1),
57 |                 bn(self.in_channels)
58 |             )
59 |             nn.init.constant_(self.W_z[1].weight, 0)
60 |             nn.init.constant_(self.W_z[1].bias, 0)
61 |         else:
62 |             self.W_z = conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, kernel_size=1)
63 |             nn.init.constant_(self.W_z.weight, 0)
64 |             nn.init.constant_(self.W_z.bias, 0)
65 | 
66 |         # define theta and phi for all operations except gaussian
67 |         if self.mode == "embedded" or self.mode == "dot" or self.mode == "concatenate":
68 |             self.theta = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, kernel_size=1)
69 |             self.phi = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, kernel_size=1)
70 | 
71 |         if self.mode == "concatenate":
72 |             self.W_f = nn.Sequential(
73 |                 nn.Conv2d(in_channels=self.inter_channels * 2, out_channels=1, kernel_size=1),
74 |                 nn.ReLU()
75 |             )
76 | 
77 |     def forward(self, x):
78 |         """
79 |         args
80 |             x: (N, C, T, H, W) for dimension=3; (N, C, H, W) for dimension 2; (N, C, T) for dimension 1
81 |         """
82 | 
83 |         batch_size = x.size(0)
84 | 
85 |         # (N, C, THW)
86 |         g_x = self.g(x).view(batch_size, self.inter_channels, -1)
87 |         g_x = g_x.permute(0, 2, 1)
88 | 
89 |         if self.mode == "gaussian":
90 |             theta_x = x.view(batch_size, self.in_channels, -1)
91 |             phi_x = x.view(batch_size, self.in_channels, -1)
92 |             theta_x = theta_x.permute(0, 2, 1)
93 |             f = torch.matmul(theta_x, phi_x)
94 | 
95 |         elif self.mode == "embedded" or self.mode == "dot":
96 |             theta_x = self.theta(x).view(batch_size, self.inter_channels, -1)
97 |             phi_x = self.phi(x).view(batch_size, self.inter_channels, -1)
98 |             theta_x = theta_x.permute(0, 2, 1)
99 |             f = torch.matmul(theta_x, phi_x)
100 | 
101 |         elif self.mode == "concatenate":
102 |             theta_x = self.theta(x).view(batch_size, self.inter_channels, -1, 1)
103 |             phi_x = self.phi(x).view(batch_size, self.inter_channels, 1, -1)
104 | 
105 |             h = theta_x.size(2)
106 |             w = phi_x.size(3)
107 |             theta_x = theta_x.repeat(1, 1, 1, w)
108 |             phi_x = phi_x.repeat(1, 1, h, 1)
109 | 
110 |             concat = torch.cat([theta_x, phi_x], dim=1)
111 |             f = self.W_f(concat)
112 |             f = f.view(f.size(0), f.size(2), f.size(3))
113 | 
114 |         if self.mode == "gaussian" or self.mode == "embedded":
115 |             f_div_C = F.softmax(f, dim=-1)
116 |         elif self.mode == "dot" or self.mode == "concatenate":
117 |             N = f.size(-1)  # number of positions in x
118 |             f_div_C = f / N
119 | 
120 |         y = torch.matmul(f_div_C, g_x)
121 | 
122 |         # contiguous here just allocates a contiguous chunk of memory
123 |         y = y.permute(0, 2, 1).contiguous()
124 |         y = y.view(batch_size, self.inter_channels, *x.size()[2:])
125 | 
126 |         W_y = self.W_z(y)
127 |         # residual connection
128 |         z = W_y + x
129 | 
130 |         return z
131 | 
132 | 
133 | if __name__ == '__main__':
134 |     import torch
135 | 
136 |     for bn_layer in [True, False]:
137 |         img = torch.zeros(2, 3, 20)
138 |         net = NLBlockND(in_channels=3, mode='concatenate', dimension=1, bn_layer=bn_layer)
139 |         out = net(img)
140 |         print(out.size())
141 | 
142 |         img = torch.zeros(2, 3, 20, 20)
143 |         net = NLBlockND(in_channels=3, mode='concatenate', dimension=2, bn_layer=bn_layer)
144 |         out = net(img)
145 |         print(out.size())
146 | 
147 |         img = torch.randn(2, 3, 8, 20, 20)
148 |         net = NLBlockND(in_channels=3,
mode='concatenate', dimension=3, bn_layer=bn_layer) 149 | out = net(img) 150 | print(out.size()) 151 | 152 | 153 | -------------------------------------------------------------------------------- /3D_experiment/models/resnet3D.py: -------------------------------------------------------------------------------- 1 | """ 2 | ResNet50 (C2D) for spatiotemporal task. Only ResNet50 backbone structure was implemented here. 3 | """ 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import math 9 | from functools import partial 10 | from models.non_local import NLBlockND 11 | 12 | 13 | class Bottleneck(nn.Module): 14 | """ 15 | Bottleneck block structure used in ResNet 50. 16 | As mentioned in Section 4. 2D ConvNet baseline (C2D), 17 | all convolutions are in essence 2D kernels that prcoess the input frame-by-frame 18 | (implemented as (1 x k x k) kernels). 19 | """ 20 | expansion = 4 21 | 22 | def __init__(self, inplanes, planes, stride=1, padding=(0, 1, 1), downsample=None): 23 | super(Bottleneck, self).__init__() 24 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=(1, 1, 1), bias=False) 25 | self.bn1 = nn.BatchNorm3d(planes) 26 | self.conv2 = nn.Conv3d(planes, planes, kernel_size=(1, 3, 3), stride=stride, padding=padding, bias=False) 27 | self.bn2 = nn.BatchNorm3d(planes) 28 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=(1, 1, 1), bias=False) 29 | self.bn3 = nn.BatchNorm3d(planes * 4) 30 | self.relu = nn.ReLU(inplace=True) 31 | self.downsample = downsample 32 | self.stride = stride 33 | 34 | def forward(self, x): 35 | identity = x 36 | 37 | out = self.conv1(x) 38 | out = self.bn1(out) 39 | out = self.relu(out) 40 | 41 | out = self.conv2(out) 42 | out = self.bn2(out) 43 | out = self.relu(out) 44 | 45 | out = self.conv3(out) 46 | out = self.bn3(out) 47 | 48 | if self.downsample is not None: 49 | identity = self.downsample(x) 50 | 51 | out += identity 52 | out = self.relu(out) 53 | 54 | return out 55 | 56 | 57 | class ResNet3D(nn.Module): 58 | """C2D with ResNet 50 backbone. 59 | The only operation involving the temporal domain are the pooling layer after the second residual block. 60 | For more details of the structure, refer to Table 1 from the paper. 61 | Padding was added accordingly to match the correct dimensionality. 
62 | """ 63 | def __init__(self, block, layers, num_classes=400, non_local=False): 64 | self.inplanes = 64 65 | super(ResNet3D, self).__init__() 66 | 67 | # first convolution operation has essentially 2D kernels 68 | # output: 64 x 16 x 112 x 112 69 | self.conv1 = nn.Conv3d(3, 64, kernel_size=(1, 7, 7), stride=2, padding=(0, 3, 3), bias=False) 70 | self.bn1 = nn.BatchNorm3d(64) 71 | self.relu = nn.ReLU(inplace=True) 72 | 73 | # output: 64 x 8 x 56 x 56 74 | self.pool1 = nn.MaxPool3d(kernel_size=3, stride=2) 75 | 76 | # output: 256 x 8 x 56 x 56 77 | self.layer1 = self._make_layer(block, 64, layers[0], stride=1, d_padding=0) 78 | 79 | # pooling on temporal domain 80 | # output: 256 x 4 x 56 x 56 81 | self.pool_t = nn.MaxPool3d(kernel_size=(3, 1, 1), stride=(2, 1, 1)) 82 | 83 | # output: 512 x 4 x 28 x 28 84 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2, padding=(2, 1, 1)) 85 | 86 | # add one non-local block here 87 | # output: 1024 x 4 x 14 x 14 88 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2, padding=(2, 1, 1), non_local=non_local) 89 | 90 | # output: 2048 x 4 x 7 x 7 91 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2, padding=(2, 1, 1)) 92 | 93 | # output: 2048 x 1 94 | self.avgpool = nn.AvgPool3d(kernel_size=(4, 7, 7)) 95 | self.fc = nn.Linear(512 * block.expansion, num_classes) 96 | 97 | for m in self.modules(): 98 | if isinstance(m, nn.Conv3d): 99 | m.weight = nn.init.kaiming_normal_(m.weight, mode='fan_out') 100 | elif isinstance(m, nn.BatchNorm3d): 101 | m.weight.data.fill_(1) 102 | m.bias.data.zero_() 103 | 104 | def _make_layer(self, block, planes, blocks, stride=1, padding=(0, 1, 1), d_padding=(2, 0, 0), non_local=False): 105 | downsample = nn.Sequential( 106 | nn.Conv3d(self.inplanes, planes * block.expansion, 107 | kernel_size=1, stride=stride, padding=d_padding, bias=False), 108 | nn.BatchNorm3d(planes * block.expansion) 109 | ) 110 | 111 | layers = [] 112 | layers.append(block(self.inplanes, planes, stride, padding, downsample)) 113 | self.inplanes = planes * block.expansion 114 | 115 | last_idx = blocks 116 | if non_local: 117 | last_idx = blocks - 1 118 | 119 | for i in range(1, last_idx): 120 | layers.append(block(self.inplanes, planes)) 121 | 122 | # add non-local block here 123 | if non_local: 124 | layers.append(NLBlockND(in_channels=1024, dimension=3)) 125 | layers.append(block(self.inplanes, planes)) 126 | 127 | return nn.Sequential(*layers) 128 | 129 | def forward(self, x): 130 | x = self.conv1(x) 131 | x = self.bn1(x) 132 | x = self.relu(x) 133 | x = self.pool1(x) 134 | 135 | x = self.layer1(x) 136 | x = self.pool_t(x) 137 | x = self.layer2(x) 138 | x = self.layer3(x) 139 | x = self.layer4(x) 140 | 141 | x = self.avgpool(x) 142 | 143 | x = x.view(x.size(0), -1) 144 | x = self.fc(x) 145 | 146 | return x 147 | 148 | 149 | def resnet3D50(non_local=False, **kwargs): 150 | """Constructs a C2D ResNet-50 model. 
151 | """ 152 | model = ResNet3D(Bottleneck, [3, 4, 6, 3], non_local=non_local, **kwargs) 153 | return model 154 | 155 | 156 | 157 | if __name__=='__main__': 158 | # Test case of 32 frames (224 x 224 x 3) input of batch size 1 159 | img = Variable(torch.randn(1, 3, 32, 224, 224)) 160 | net = resnet3D50(non_local=True) 161 | count = 0 162 | for name, param in net.named_parameters(): 163 | if param.requires_grad: 164 | count += 1 165 | print(name) 166 | print (count) 167 | out = net(img) 168 | print(out.size()) 169 | -------------------------------------------------------------------------------- /3D_experiment/opts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def parse_opts(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument( 7 | '--root_path', 8 | default='/root/data/ActivityNet', 9 | type=str, 10 | help='Root directory path of data') 11 | parser.add_argument( 12 | '--video_path', 13 | default='video_kinetics_jpg', 14 | type=str, 15 | help='Directory path of Videos') 16 | parser.add_argument( 17 | '--annotation_path', 18 | default='kinetics.json', 19 | type=str, 20 | help='Annotation file path') 21 | parser.add_argument( 22 | '--result_path', 23 | default='results', 24 | type=str, 25 | help='Result directory path') 26 | parser.add_argument( 27 | '--dataset', 28 | default='kinetics', 29 | type=str, 30 | help='Used dataset (activitynet | kinetics | ucf101 | hmdb51)') 31 | parser.add_argument( 32 | '--n_classes', 33 | default=400, 34 | type=int, 35 | help= 36 | 'Number of classes (activitynet: 200, kinetics: 400, ucf101: 101, hmdb51: 51)' 37 | ) 38 | parser.add_argument( 39 | '--n_finetune_classes', 40 | default=400, 41 | type=int, 42 | help= 43 | 'Number of classes for fine-tuning. n_classes is set to the number when pretraining.' 44 | ) 45 | parser.add_argument( 46 | '--sample_size', 47 | default=112, 48 | type=int, 49 | help='Height and width of inputs') 50 | parser.add_argument( 51 | '--sample_duration', 52 | default=16, 53 | type=int, 54 | help='Temporal duration of inputs') 55 | parser.add_argument( 56 | '--initial_scale', 57 | default=1.0, 58 | type=float, 59 | help='Initial scale for multiscale cropping') 60 | parser.add_argument( 61 | '--n_scales', 62 | default=5, 63 | type=int, 64 | help='Number of scales for multiscale cropping') 65 | parser.add_argument( 66 | '--scale_step', 67 | default=0.84089641525, 68 | type=float, 69 | help='Scale step for multiscale cropping') 70 | parser.add_argument( 71 | '--train_crop', 72 | default='corner', 73 | type=str, 74 | help= 75 | 'Spatial cropping method in training. random is uniform. corner is selection from 4 corners and 1 center. 
(random | corner | center)' 76 | ) 77 | parser.add_argument( 78 | '--learning_rate', 79 | default=0.1, 80 | type=float, 81 | help= 82 | 'Initial learning rate (divided by 10 while training by lr scheduler)') 83 | parser.add_argument('--momentum', default=0.9, type=float, help='Momentum') 84 | parser.add_argument( 85 | '--dampening', default=0.9, type=float, help='dampening of SGD') 86 | parser.add_argument( 87 | '--weight_decay', default=1e-3, type=float, help='Weight Decay') 88 | parser.add_argument( 89 | '--mean_dataset', 90 | default='activitynet', 91 | type=str, 92 | help= 93 | 'dataset for mean values of mean subtraction (activitynet | kinetics)') 94 | parser.add_argument( 95 | '--no_mean_norm', 96 | action='store_true', 97 | help='If true, inputs are not normalized by mean.') 98 | parser.set_defaults(no_mean_norm=False) 99 | parser.add_argument( 100 | '--std_norm', 101 | action='store_true', 102 | help='If true, inputs are normalized by standard deviation.') 103 | parser.set_defaults(std_norm=False) 104 | parser.add_argument( 105 | '--nesterov', action='store_true', help='Nesterov momentum') 106 | parser.set_defaults(nesterov=False) 107 | parser.add_argument( 108 | '--optimizer', 109 | default='sgd', 110 | type=str, 111 | help='Currently only support SGD') 112 | parser.add_argument( 113 | '--lr_patience', 114 | default=10, 115 | type=int, 116 | help='Patience of LR scheduler. See documentation of ReduceLROnPlateau.' 117 | ) 118 | parser.add_argument( 119 | '--batch_size', default=128, type=int, help='Batch Size') 120 | parser.add_argument( 121 | '--n_epochs', 122 | default=200, 123 | type=int, 124 | help='Number of total epochs to run') 125 | parser.add_argument( 126 | '--begin_epoch', 127 | default=1, 128 | type=int, 129 | help= 130 | 'Training begins at this epoch. Previous trained model indicated by resume_path is loaded.' 
131 | ) 132 | parser.add_argument( 133 | '--n_val_samples', 134 | default=3, 135 | type=int, 136 | help='Number of validation samples for each activity') 137 | parser.add_argument( 138 | '--resume_path', 139 | default='', 140 | type=str, 141 | help='Save data (.pth) of previous training') 142 | parser.add_argument( 143 | '--pretrain_path', default='', type=str, help='Pretrained model (.pth)') 144 | parser.add_argument( 145 | '--ft_begin_index', 146 | default=0, 147 | type=int, 148 | help='Begin block index of fine-tuning') 149 | parser.add_argument( 150 | '--no_train', 151 | action='store_true', 152 | help='If true, training is not performed.') 153 | parser.set_defaults(no_train=False) 154 | parser.add_argument( 155 | '--no_val', 156 | action='store_true', 157 | help='If true, validation is not performed.') 158 | parser.set_defaults(no_val=False) 159 | parser.add_argument( 160 | '--test', action='store_true', help='If true, test is performed.') 161 | parser.set_defaults(test=False) 162 | parser.add_argument( 163 | '--test_subset', 164 | default='val', 165 | type=str, 166 | help='Used subset in test (val | test)') 167 | parser.add_argument( 168 | '--scale_in_test', 169 | default=1.0, 170 | type=float, 171 | help='Spatial scale in test') 172 | parser.add_argument( 173 | '--crop_position_in_test', 174 | default='c', 175 | type=str, 176 | help='Cropping method (c | tl | tr | bl | br) in test') 177 | parser.add_argument( 178 | '--no_softmax_in_test', 179 | action='store_true', 180 | help='If true, output for each clip is not normalized using softmax.') 181 | parser.set_defaults(no_softmax_in_test=False) 182 | parser.add_argument( 183 | '--no_cuda', action='store_true', help='If true, cuda is not used.') 184 | parser.set_defaults(no_cuda=False) 185 | parser.add_argument( 186 | '--n_threads', 187 | default=4, 188 | type=int, 189 | help='Number of threads for multi-thread loading') 190 | parser.add_argument( 191 | '--checkpoint', 192 | default=10, 193 | type=int, 194 | help='Trained model is saved at every this epochs.') 195 | parser.add_argument( 196 | '--no_hflip', 197 | action='store_true', 198 | help='If true holizontal flipping is not performed.') 199 | parser.set_defaults(no_hflip=False) 200 | parser.add_argument( 201 | '--norm_value', 202 | default=1, 203 | type=int, 204 | help= 205 | 'If 1, range of inputs is [0-255]. 
If 255, range of inputs is [0-1].') 206 | parser.add_argument( 207 | '--model', 208 | default='resnet', 209 | type=str, 210 | help='(resnet | resnet_nl | preresnet | wideresnet | resnext | densenet | ') 211 | parser.add_argument( 212 | '--model_depth', 213 | default=18, 214 | type=int, 215 | help='Depth of resnet (10 | 18 | 34 | 50 | 101)') 216 | parser.add_argument( 217 | '--resnet_shortcut', 218 | default='B', 219 | type=str, 220 | help='Shortcut type of resnet (A | B)') 221 | parser.add_argument( 222 | '--wide_resnet_k', default=2, type=int, help='Wide resnet k') 223 | parser.add_argument( 224 | '--resnext_cardinality', 225 | default=32, 226 | type=int, 227 | help='ResNeXt cardinality') 228 | parser.add_argument( 229 | '--manual_seed', default=1, type=int, help='Manually set random seed') 230 | 231 | args = parser.parse_args() 232 | 233 | return args 234 | -------------------------------------------------------------------------------- /3D_experiment/run.sh: -------------------------------------------------------------------------------- 1 | python main.py --sample_size 224 --root_path ./data --video_path hmdb51/jpg --annotation_path hmdb51_1.json --result_path results --dataset hmdb51 --model resnet --model_depth 50 --n_classes 51 --batch_size 32 --n_threads 4 --checkpoint 5 2>&1 | tee output_hmdb.txt 2 | -------------------------------------------------------------------------------- /3D_experiment/spatial_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | import numbers 4 | import collections 5 | import numpy as np 6 | import torch 7 | from PIL import Image, ImageOps 8 | try: 9 | import accimage 10 | except ImportError: 11 | accimage = None 12 | 13 | 14 | class Compose(object): 15 | """Composes several transforms together. 16 | Args: 17 | transforms (list of ``Transform`` objects): list of transforms to compose. 18 | Example: 19 | >>> transforms.Compose([ 20 | >>> transforms.CenterCrop(10), 21 | >>> transforms.ToTensor(), 22 | >>> ]) 23 | """ 24 | 25 | def __init__(self, transforms): 26 | self.transforms = transforms 27 | 28 | def __call__(self, img): 29 | for t in self.transforms: 30 | img = t(img) 31 | return img 32 | 33 | def randomize_parameters(self): 34 | for t in self.transforms: 35 | t.randomize_parameters() 36 | 37 | 38 | class ToTensor(object): 39 | """Convert a ``PIL.Image`` or ``numpy.ndarray`` to tensor. 40 | Converts a PIL.Image or numpy.ndarray (H x W x C) in the range 41 | [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0]. 42 | """ 43 | 44 | def __init__(self, norm_value=255): 45 | self.norm_value = norm_value 46 | 47 | def __call__(self, pic): 48 | """ 49 | Args: 50 | pic (PIL.Image or numpy.ndarray): Image to be converted to tensor. 51 | Returns: 52 | Tensor: Converted image. 
53 | """ 54 | if isinstance(pic, np.ndarray): 55 | # handle numpy array 56 | img = torch.from_numpy(pic.transpose((2, 0, 1))) 57 | # backward compatibility 58 | return img.float().div(self.norm_value) 59 | 60 | if accimage is not None and isinstance(pic, accimage.Image): 61 | nppic = np.zeros( 62 | [pic.channels, pic.height, pic.width], dtype=np.float32) 63 | pic.copyto(nppic) 64 | return torch.from_numpy(nppic) 65 | 66 | # handle PIL Image 67 | if pic.mode == 'I': 68 | img = torch.from_numpy(np.array(pic, np.int32, copy=False)) 69 | elif pic.mode == 'I;16': 70 | img = torch.from_numpy(np.array(pic, np.int16, copy=False)) 71 | else: 72 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes())) 73 | # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK 74 | if pic.mode == 'YCbCr': 75 | nchannel = 3 76 | elif pic.mode == 'I;16': 77 | nchannel = 1 78 | else: 79 | nchannel = len(pic.mode) 80 | img = img.view(pic.size[1], pic.size[0], nchannel) 81 | # put it from HWC to CHW format 82 | # yikes, this transpose takes 80% of the loading time/CPU 83 | img = img.transpose(0, 1).transpose(0, 2).contiguous() 84 | if isinstance(img, torch.ByteTensor): 85 | return img.float().div(self.norm_value) 86 | else: 87 | return img 88 | 89 | def randomize_parameters(self): 90 | pass 91 | 92 | 93 | class Normalize(object): 94 | """Normalize an tensor image with mean and standard deviation. 95 | Given mean: (R, G, B) and std: (R, G, B), 96 | will normalize each channel of the torch.*Tensor, i.e. 97 | channel = (channel - mean) / std 98 | Args: 99 | mean (sequence): Sequence of means for R, G, B channels respecitvely. 100 | std (sequence): Sequence of standard deviations for R, G, B channels 101 | respecitvely. 102 | """ 103 | 104 | def __init__(self, mean, std): 105 | self.mean = mean 106 | self.std = std 107 | 108 | def __call__(self, tensor): 109 | """ 110 | Args: 111 | tensor (Tensor): Tensor image of size (C, H, W) to be normalized. 112 | Returns: 113 | Tensor: Normalized image. 114 | """ 115 | # TODO: make efficient 116 | for t, m, s in zip(tensor, self.mean, self.std): 117 | t.sub_(m).div_(s) 118 | return tensor 119 | 120 | def randomize_parameters(self): 121 | pass 122 | 123 | 124 | class Scale(object): 125 | """Rescale the input PIL.Image to the given size. 126 | Args: 127 | size (sequence or int): Desired output size. If size is a sequence like 128 | (w, h), output size will be matched to this. If size is an int, 129 | smaller edge of the image will be matched to this number. 130 | i.e, if height > width, then image will be rescaled to 131 | (size * height / width, size) 132 | interpolation (int, optional): Desired interpolation. Default is 133 | ``PIL.Image.BILINEAR`` 134 | """ 135 | 136 | def __init__(self, size, interpolation=Image.BILINEAR): 137 | assert isinstance(size, 138 | int) or (isinstance(size, collections.Iterable) and 139 | len(size) == 2) 140 | self.size = size 141 | self.interpolation = interpolation 142 | 143 | def __call__(self, img): 144 | """ 145 | Args: 146 | img (PIL.Image): Image to be scaled. 147 | Returns: 148 | PIL.Image: Rescaled image. 
149 | """ 150 | if isinstance(self.size, int): 151 | w, h = img.size 152 | if (w <= h and w == self.size) or (h <= w and h == self.size): 153 | return img 154 | if w < h: 155 | ow = self.size 156 | oh = int(self.size * h / w) 157 | return img.resize((ow, oh), self.interpolation) 158 | else: 159 | oh = self.size 160 | ow = int(self.size * w / h) 161 | return img.resize((ow, oh), self.interpolation) 162 | else: 163 | return img.resize(self.size, self.interpolation) 164 | 165 | def randomize_parameters(self): 166 | pass 167 | 168 | 169 | class CenterCrop(object): 170 | """Crops the given PIL.Image at the center. 171 | Args: 172 | size (sequence or int): Desired output size of the crop. If size is an 173 | int instead of sequence like (h, w), a square crop (size, size) is 174 | made. 175 | """ 176 | 177 | def __init__(self, size): 178 | if isinstance(size, numbers.Number): 179 | self.size = (int(size), int(size)) 180 | else: 181 | self.size = size 182 | 183 | def __call__(self, img): 184 | """ 185 | Args: 186 | img (PIL.Image): Image to be cropped. 187 | Returns: 188 | PIL.Image: Cropped image. 189 | """ 190 | w, h = img.size 191 | th, tw = self.size 192 | x1 = int(round((w - tw) / 2.)) 193 | y1 = int(round((h - th) / 2.)) 194 | return img.crop((x1, y1, x1 + tw, y1 + th)) 195 | 196 | def randomize_parameters(self): 197 | pass 198 | 199 | 200 | class CornerCrop(object): 201 | 202 | def __init__(self, size, crop_position=None): 203 | self.size = size 204 | if crop_position is None: 205 | self.randomize = True 206 | else: 207 | self.randomize = False 208 | self.crop_position = crop_position 209 | self.crop_positions = ['c', 'tl', 'tr', 'bl', 'br'] 210 | 211 | def __call__(self, img): 212 | image_width = img.size[0] 213 | image_height = img.size[1] 214 | 215 | if self.crop_position == 'c': 216 | th, tw = (self.size, self.size) 217 | x1 = int(round((image_width - tw) / 2.)) 218 | y1 = int(round((image_height - th) / 2.)) 219 | x2 = x1 + tw 220 | y2 = y1 + th 221 | elif self.crop_position == 'tl': 222 | x1 = 0 223 | y1 = 0 224 | x2 = self.size 225 | y2 = self.size 226 | elif self.crop_position == 'tr': 227 | x1 = image_width - self.size 228 | y1 = 0 229 | x2 = image_width 230 | y2 = self.size 231 | elif self.crop_position == 'bl': 232 | x1 = 0 233 | y1 = image_height - self.size 234 | x2 = self.size 235 | y2 = image_height 236 | elif self.crop_position == 'br': 237 | x1 = image_width - self.size 238 | y1 = image_height - self.size 239 | x2 = image_width 240 | y2 = image_height 241 | 242 | img = img.crop((x1, y1, x2, y2)) 243 | 244 | return img 245 | 246 | def randomize_parameters(self): 247 | if self.randomize: 248 | self.crop_position = self.crop_positions[random.randint( 249 | 0, 250 | len(self.crop_positions) - 1)] 251 | 252 | 253 | class RandomHorizontalFlip(object): 254 | """Horizontally flip the given PIL.Image randomly with a probability of 0.5.""" 255 | 256 | def __call__(self, img): 257 | """ 258 | Args: 259 | img (PIL.Image): Image to be flipped. 260 | Returns: 261 | PIL.Image: Randomly flipped image. 262 | """ 263 | if self.p < 0.5: 264 | return img.transpose(Image.FLIP_LEFT_RIGHT) 265 | return img 266 | 267 | def randomize_parameters(self): 268 | self.p = random.random() 269 | 270 | 271 | class MultiScaleCornerCrop(object): 272 | """Crop the given PIL.Image to randomly selected size. 273 | A crop of size is selected from scales of the original size. 274 | A position of cropping is randomly selected from 4 corners and 1 center. 275 | This crop is finally resized to given size. 
276 | Args: 277 | scales: cropping scales of the original size 278 | size: size of the smaller edge 279 | interpolation: Default: PIL.Image.BILINEAR 280 | """ 281 | 282 | def __init__(self, 283 | scales, 284 | size, 285 | interpolation=Image.BILINEAR, 286 | crop_positions=['c', 'tl', 'tr', 'bl', 'br']): 287 | self.scales = scales 288 | self.size = size 289 | self.interpolation = interpolation 290 | 291 | self.crop_positions = crop_positions 292 | 293 | def __call__(self, img): 294 | min_length = min(img.size[0], img.size[1]) 295 | crop_size = int(min_length * self.scale) 296 | 297 | image_width = img.size[0] 298 | image_height = img.size[1] 299 | 300 | if self.crop_position == 'c': 301 | center_x = image_width // 2 302 | center_y = image_height // 2 303 | box_half = crop_size // 2 304 | x1 = center_x - box_half 305 | y1 = center_y - box_half 306 | x2 = center_x + box_half 307 | y2 = center_y + box_half 308 | elif self.crop_position == 'tl': 309 | x1 = 0 310 | y1 = 0 311 | x2 = crop_size 312 | y2 = crop_size 313 | elif self.crop_position == 'tr': 314 | x1 = image_width - crop_size 315 | y1 = 0 316 | x2 = image_width 317 | y2 = crop_size 318 | elif self.crop_position == 'bl': 319 | x1 = 0 320 | y1 = image_height - crop_size 321 | x2 = crop_size 322 | y2 = image_height 323 | elif self.crop_position == 'br': 324 | x1 = image_width - crop_size 325 | y1 = image_height - crop_size 326 | x2 = image_width 327 | y2 = image_height 328 | 329 | img = img.crop((x1, y1, x2, y2)) 330 | 331 | return img.resize((self.size, self.size), self.interpolation) 332 | 333 | def randomize_parameters(self): 334 | self.scale = self.scales[random.randint(0, len(self.scales) - 1)] 335 | self.crop_position = self.crop_positions[random.randint( 336 | 0, 337 | len(self.crop_positions) - 1)] 338 | 339 | 340 | class MultiScaleRandomCrop(object): 341 | 342 | def __init__(self, scales, size, interpolation=Image.BILINEAR): 343 | self.scales = scales 344 | self.size = size 345 | self.interpolation = interpolation 346 | 347 | def __call__(self, img): 348 | min_length = min(img.size[0], img.size[1]) 349 | crop_size = int(min_length * self.scale) 350 | 351 | image_width = img.size[0] 352 | image_height = img.size[1] 353 | 354 | x1 = self.tl_x * (image_width - crop_size) 355 | y1 = self.tl_y * (image_height - crop_size) 356 | x2 = x1 + crop_size 357 | y2 = y1 + crop_size 358 | 359 | img = img.crop((x1, y1, x2, y2)) 360 | 361 | return img.resize((self.size, self.size), self.interpolation) 362 | 363 | def randomize_parameters(self): 364 | self.scale = self.scales[random.randint(0, len(self.scales) - 1)] 365 | self.tl_x = random.random() 366 | self.tl_y = random.random() 367 | -------------------------------------------------------------------------------- /3D_experiment/target_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | 4 | 5 | class Compose(object): 6 | 7 | def __init__(self, transforms): 8 | self.transforms = transforms 9 | 10 | def __call__(self, target): 11 | dst = [] 12 | for t in self.transforms: 13 | dst.append(t(target)) 14 | return dst 15 | 16 | 17 | class ClassLabel(object): 18 | 19 | def __call__(self, target): 20 | return target['label'] 21 | 22 | 23 | class VideoID(object): 24 | 25 | def __call__(self, target): 26 | return target['video_id'] 27 | -------------------------------------------------------------------------------- /3D_experiment/temporal_transforms.py: 
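Before the temporal transforms listed next, here is a minimal sketch of how the spatial, target and temporal transform classes in these three files can be composed for a single clip; the module paths match the file names above, while the scale list, crop size, clip length, normalization statistics and the dummy frame/annotation are illustrative values, not ones taken from this repository.

from PIL import Image
from spatial_transforms import Compose, MultiScaleCornerCrop, RandomHorizontalFlip, ToTensor, Normalize
from temporal_transforms import TemporalRandomCrop
from target_transforms import ClassLabel

spatial_transform = Compose([
    MultiScaleCornerCrop([1.0, 0.84, 0.71], 112),  # illustrative scales and output size
    RandomHorizontalFlip(),
    ToTensor(norm_value=255),
    Normalize(mean=[0.45, 0.42, 0.39], std=[1.0, 1.0, 1.0]),  # illustrative statistics
])
temporal_transform = TemporalRandomCrop(16)  # illustrative sample_duration
target_transform = ClassLabel()

# Parameters are randomized once per clip, so every frame receives the same crop and flip.
spatial_transform.randomize_parameters()
frame_indices = temporal_transform(list(range(1, 41)))  # 16 consecutive indices from a 40-frame video
clip = [spatial_transform(Image.new('RGB', (171, 128))) for _ in frame_indices]  # dummy frames
label = target_transform({'label': 3, 'video_id': 'v_000001'})  # dummy annotation dict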
-------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | 4 | 5 | class LoopPadding(object): 6 | 7 | def __init__(self, size): 8 | self.size = size 9 | 10 | def __call__(self, frame_indices): 11 | out = frame_indices 12 | 13 | for index in out: 14 | if len(out) >= self.size: 15 | break 16 | out.append(index) 17 | 18 | return out 19 | 20 | 21 | class TemporalBeginCrop(object): 22 | """Temporally crop the given frame indices at a beginning. 23 | 24 | If the number of frames is less than the size, 25 | loop the indices as many times as necessary to satisfy the size. 26 | 27 | Args: 28 | size (int): Desired output size of the crop. 29 | """ 30 | 31 | def __init__(self, size): 32 | self.size = size 33 | 34 | def __call__(self, frame_indices): 35 | out = frame_indices[:self.size] 36 | 37 | for index in out: 38 | if len(out) >= self.size: 39 | break 40 | out.append(index) 41 | 42 | return out 43 | 44 | 45 | class TemporalCenterCrop(object): 46 | """Temporally crop the given frame indices at a center. 47 | 48 | If the number of frames is less than the size, 49 | loop the indices as many times as necessary to satisfy the size. 50 | 51 | Args: 52 | size (int): Desired output size of the crop. 53 | """ 54 | 55 | def __init__(self, size): 56 | self.size = size 57 | 58 | def __call__(self, frame_indices): 59 | """ 60 | Args: 61 | frame_indices (list): frame indices to be cropped. 62 | Returns: 63 | list: Cropped frame indices. 64 | """ 65 | 66 | center_index = len(frame_indices) // 2 67 | begin_index = max(0, center_index - (self.size // 2)) 68 | end_index = min(begin_index + self.size, len(frame_indices)) 69 | 70 | out = frame_indices[begin_index:end_index] 71 | 72 | for index in out: 73 | if len(out) >= self.size: 74 | break 75 | out.append(index) 76 | 77 | return out 78 | 79 | 80 | class TemporalRandomCrop(object): 81 | """Temporally crop the given frame indices at a random location. 82 | 83 | If the number of frames is less than the size, 84 | loop the indices as many times as necessary to satisfy the size. 85 | 86 | Args: 87 | size (int): Desired output size of the crop. 88 | """ 89 | 90 | def __init__(self, size): 91 | self.size = size 92 | 93 | def __call__(self, frame_indices): 94 | """ 95 | Args: 96 | frame_indices (list): frame indices to be cropped. 97 | Returns: 98 | list: Cropped frame indices. 
99 | """ 100 | 101 | rand_end = max(0, len(frame_indices) - self.size - 1) 102 | begin_index = random.randint(0, rand_end) 103 | end_index = min(begin_index + self.size, len(frame_indices)) 104 | 105 | out = frame_indices[begin_index:end_index] 106 | 107 | for index in out: 108 | if len(out) >= self.size: 109 | break 110 | out.append(index) 111 | 112 | return out 113 | -------------------------------------------------------------------------------- /3D_experiment/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import torch.nn.functional as F 4 | import time 5 | import os 6 | import sys 7 | import json 8 | 9 | from utils import AverageMeter 10 | 11 | 12 | def calculate_video_results(output_buffer, video_id, test_results, class_names): 13 | video_outputs = torch.stack(output_buffer) 14 | average_scores = torch.mean(video_outputs, dim=0) 15 | sorted_scores, locs = torch.topk(average_scores, k=10) 16 | 17 | video_results = [] 18 | for i in range(sorted_scores.size(0)): 19 | video_results.append({ 20 | 'label': class_names[locs[i]], 21 | 'score': sorted_scores[i] 22 | }) 23 | 24 | test_results['results'][video_id] = video_results 25 | 26 | 27 | def test(data_loader, model, opt, class_names): 28 | print('test') 29 | 30 | model.eval() 31 | 32 | batch_time = AverageMeter() 33 | data_time = AverageMeter() 34 | 35 | end_time = time.time() 36 | output_buffer = [] 37 | previous_video_id = '' 38 | test_results = {'results': {}} 39 | for i, (inputs, targets) in enumerate(data_loader): 40 | data_time.update(time.time() - end_time) 41 | 42 | inputs = Variable(inputs, volatile=True) 43 | outputs = model(inputs) 44 | if not opt.no_softmax_in_test: 45 | outputs = F.softmax(outputs) 46 | 47 | for j in range(outputs.size(0)): 48 | if not (i == 0 and j == 0) and targets[j] != previous_video_id: 49 | calculate_video_results(output_buffer, previous_video_id, 50 | test_results, class_names) 51 | output_buffer = [] 52 | output_buffer.append(outputs[j].data.cpu()) 53 | previous_video_id = targets[j] 54 | 55 | if (i % 100) == 0: 56 | with open( 57 | os.path.join(opt.result_path, '{}.json'.format( 58 | opt.test_subset)), 'w') as f: 59 | json.dump(test_results, f) 60 | 61 | batch_time.update(time.time() - end_time) 62 | end_time = time.time() 63 | 64 | print('[{}/{}]\t' 65 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 66 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'.format( 67 | i + 1, 68 | len(data_loader), 69 | batch_time=batch_time, 70 | data_time=data_time)) 71 | with open( 72 | os.path.join(opt.result_path, '{}.json'.format(opt.test_subset)), 73 | 'w') as f: 74 | json.dump(test_results, f) 75 | -------------------------------------------------------------------------------- /3D_experiment/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import time 4 | import os 5 | import sys 6 | 7 | from utils import AverageMeter, calculate_accuracy 8 | 9 | 10 | def train_epoch(epoch, data_loader, model, criterion, optimizer, opt, 11 | epoch_logger, batch_logger): 12 | print('train at epoch {}'.format(epoch)) 13 | 14 | model.train() 15 | 16 | batch_time = AverageMeter() 17 | data_time = AverageMeter() 18 | losses = AverageMeter() 19 | accuracies = AverageMeter() 20 | 21 | end_time = time.time() 22 | for i, (inputs, targets) in enumerate(data_loader): 23 | data_time.update(time.time() - end_time) 24 | 25 | if 
not opt.no_cuda: 26 | targets = targets.cuda(async=True) 27 | inputs = Variable(inputs) 28 | targets = Variable(targets) 29 | outputs = model(inputs) 30 | loss = criterion(outputs, targets) 31 | acc = calculate_accuracy(outputs, targets) 32 | 33 | losses.update(loss.data[0], inputs.size(0)) 34 | accuracies.update(acc, inputs.size(0)) 35 | 36 | optimizer.zero_grad() 37 | loss.backward() 38 | optimizer.step() 39 | 40 | batch_time.update(time.time() - end_time) 41 | end_time = time.time() 42 | 43 | batch_logger.log({ 44 | 'epoch': epoch, 45 | 'batch': i + 1, 46 | 'iter': (epoch - 1) * len(data_loader) + (i + 1), 47 | 'loss': losses.val, 48 | 'acc': accuracies.val, 49 | 'lr': optimizer.param_groups[0]['lr'] 50 | }) 51 | 52 | print('Epoch: [{0}][{1}/{2}]\t' 53 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 54 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 55 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 56 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format( 57 | epoch, 58 | i + 1, 59 | len(data_loader), 60 | batch_time=batch_time, 61 | data_time=data_time, 62 | loss=losses, 63 | acc=accuracies)) 64 | 65 | epoch_logger.log({ 66 | 'epoch': epoch, 67 | 'loss': losses.avg, 68 | 'acc': accuracies.avg, 69 | 'lr': optimizer.param_groups[0]['lr'] 70 | }) 71 | 72 | if epoch % opt.checkpoint == 0: 73 | save_file_path = os.path.join(opt.result_path, 74 | 'save_{}.pth'.format(epoch)) 75 | states = { 76 | 'epoch': epoch + 1, 77 | 'arch': opt.arch, 78 | 'state_dict': model.state_dict(), 79 | 'optimizer': optimizer.state_dict(), 80 | } 81 | torch.save(states, save_file_path) 82 | -------------------------------------------------------------------------------- /3D_experiment/utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | 4 | class AverageMeter(object): 5 | """Computes and stores the average and current value""" 6 | 7 | def __init__(self): 8 | self.reset() 9 | 10 | def reset(self): 11 | self.val = 0 12 | self.avg = 0 13 | self.sum = 0 14 | self.count = 0 15 | 16 | def update(self, val, n=1): 17 | self.val = val 18 | self.sum += val * n 19 | self.count += n 20 | self.avg = self.sum / self.count 21 | 22 | 23 | class Logger(object): 24 | 25 | def __init__(self, path, header): 26 | self.log_file = open(path, 'w') 27 | self.logger = csv.writer(self.log_file, delimiter='\t') 28 | 29 | self.logger.writerow(header) 30 | self.header = header 31 | 32 | def __del(self): 33 | self.log_file.close() 34 | 35 | def log(self, values): 36 | write_values = [] 37 | for col in self.header: 38 | assert col in values 39 | write_values.append(values[col]) 40 | 41 | self.logger.writerow(write_values) 42 | self.log_file.flush() 43 | 44 | 45 | def load_value_file(file_path): 46 | with open(file_path, 'r') as input_file: 47 | value = float(input_file.read().rstrip('\n\r')) 48 | 49 | return value 50 | 51 | 52 | def calculate_accuracy(outputs, targets): 53 | batch_size = targets.size(0) 54 | 55 | _, pred = outputs.topk(1, 1, True) 56 | pred = pred.t() 57 | correct = pred.eq(targets.view(1, -1)) 58 | n_correct_elems = correct.float().sum().data[0] 59 | 60 | return n_correct_elems / batch_size 61 | -------------------------------------------------------------------------------- /3D_experiment/utils/eval_hmdb51.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | class HMDBclassification(object): 7 | 8 | def __init__(self, ground_truth_filename=None, 
prediction_filename=None, 9 | subset='validation', verbose=False, top_k=1): 10 | if not ground_truth_filename: 11 | raise IOError('Please input a valid ground truth file.') 12 | if not prediction_filename: 13 | raise IOError('Please input a valid prediction file.') 14 | self.subset = subset 15 | self.verbose = verbose 16 | self.top_k = top_k 17 | self.ap = None 18 | self.hit_at_k = None 19 | # Import ground truth and predictions. 20 | self.ground_truth, self.activity_index = self._import_ground_truth( 21 | ground_truth_filename) 22 | self.prediction = self._import_prediction(prediction_filename) 23 | 24 | if self.verbose: 25 | print '[INIT] Loaded annotations from {} subset.'.format(subset) 26 | nr_gt = len(self.ground_truth) 27 | print '\tNumber of ground truth instances: {}'.format(nr_gt) 28 | nr_pred = len(self.prediction) 29 | print '\tNumber of predictions: {}'.format(nr_pred) 30 | 31 | def _import_ground_truth(self, ground_truth_filename): 32 | """Reads ground truth file, checks if it is well formatted, and returns 33 | the ground truth instances and the activity classes. 34 | 35 | Parameters 36 | ---------- 37 | ground_truth_filename : str 38 | Full path to the ground truth json file. 39 | 40 | Outputs 41 | ------- 42 | ground_truth : df 43 | Data frame containing the ground truth instances. 44 | activity_index : dict 45 | Dictionary containing class index. 46 | """ 47 | with open(ground_truth_filename, 'r') as fobj: 48 | data = json.load(fobj) 49 | # Checking format 50 | # if not all([field in data.keys() for field in self.gt_fields]): 51 | # raise IOError('Please input a valid ground truth file.') 52 | 53 | # Initialize data frame 54 | activity_index, cidx = {}, 0 55 | video_lst, label_lst = [], [] 56 | for videoid, v in data['database'].iteritems(): 57 | if self.subset != v['subset']: 58 | continue 59 | this_label = v['annotations']['label'] 60 | if this_label not in activity_index: 61 | activity_index[this_label] = cidx 62 | cidx += 1 63 | video_lst.append(videoid) 64 | label_lst.append(activity_index[this_label]) 65 | ground_truth = pd.DataFrame({'video-id': video_lst, 66 | 'label': label_lst}) 67 | ground_truth = ground_truth.drop_duplicates().reset_index(drop=True) 68 | return ground_truth, activity_index 69 | 70 | def _import_prediction(self, prediction_filename): 71 | """Reads prediction file, checks if it is well formatted, and returns 72 | the prediction instances. 73 | 74 | Parameters 75 | ---------- 76 | prediction_filename : str 77 | Full path to the prediction json file. 78 | 79 | Outputs 80 | ------- 81 | prediction : df 82 | Data frame containing the prediction instances. 83 | """ 84 | with open(prediction_filename, 'r') as fobj: 85 | data = json.load(fobj) 86 | # Checking format... 87 | # if not all([field in data.keys() for field in self.pred_fields]): 88 | # raise IOError('Please input a valid prediction file.') 89 | 90 | # Initialize data frame 91 | video_lst, label_lst, score_lst = [], [], [] 92 | for videoid, v in data['results'].iteritems(): 93 | for result in v: 94 | label = self.activity_index[result['label']] 95 | video_lst.append(videoid) 96 | label_lst.append(label) 97 | score_lst.append(result['score']) 98 | prediction = pd.DataFrame({'video-id': video_lst, 99 | 'label': label_lst, 100 | 'score': score_lst}) 101 | return prediction 102 | 103 | def evaluate(self): 104 | """Evaluates a prediction file. For the detection task we measure the 105 | interpolated mean average precision to measure the performance of a 106 | method. 
107 | """ 108 | hit_at_k = compute_video_hit_at_k(self.ground_truth, 109 | self.prediction, top_k=self.top_k) 110 | if self.verbose: 111 | print ('[RESULTS] Performance on ActivityNet untrimmed video ' 112 | 'classification task.') 113 | print '\tError@{}: {}'.format(self.top_k, 1.0 - hit_at_k) 114 | #print '\tAvg Hit@{}: {}'.format(self.top_k, avg_hit_at_k) 115 | self.hit_at_k = hit_at_k 116 | 117 | ################################################################################ 118 | # Metrics 119 | ################################################################################ 120 | def compute_video_hit_at_k(ground_truth, prediction, top_k=3): 121 | """Compute accuracy at k prediction between ground truth and 122 | predictions data frames. This code is greatly inspired by evaluation 123 | performed in Karpathy et al. CVPR14. 124 | 125 | Parameters 126 | ---------- 127 | ground_truth : df 128 | Data frame containing the ground truth instances. 129 | Required fields: ['video-id', 'label'] 130 | prediction : df 131 | Data frame containing the prediction instances. 132 | Required fields: ['video-id, 'label', 'score'] 133 | 134 | Outputs 135 | ------- 136 | acc : float 137 | Top k accuracy score. 138 | """ 139 | video_ids = np.unique(ground_truth['video-id'].values) 140 | avg_hits_per_vid = np.zeros(video_ids.size) 141 | for i, vid in enumerate(video_ids): 142 | pred_idx = prediction['video-id'] == vid 143 | if not pred_idx.any(): 144 | continue 145 | this_pred = prediction.loc[pred_idx].reset_index(drop=True) 146 | # Get top K predictions sorted by decreasing score. 147 | sort_idx = this_pred['score'].values.argsort()[::-1][:top_k] 148 | this_pred = this_pred.loc[sort_idx].reset_index(drop=True) 149 | # Get labels and compare against ground truth. 
150 | pred_label = this_pred['label'].tolist() 151 | gt_idx = ground_truth['video-id'] == vid 152 | gt_label = ground_truth.loc[gt_idx]['label'].tolist() 153 | avg_hits_per_vid[i] = np.mean([1 if this_label in pred_label else 0 154 | for this_label in gt_label]) 155 | return float(avg_hits_per_vid.mean()) 156 | -------------------------------------------------------------------------------- /3D_experiment/utils/eval_kinetics.py: -------------------------------------------------------------------------------- 1 | import json 2 | import urllib2 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | API = 'http://ec2-52-11-11-89.us-west-2.compute.amazonaws.com/challenge17/api.py' 8 | 9 | def get_blocked_videos(api=API): 10 | api_url = '{}?action=get_blocked'.format(api) 11 | req = urllib2.Request(api_url) 12 | response = urllib2.urlopen(req) 13 | return json.loads(response.read()) 14 | 15 | class KINETICSclassification(object): 16 | GROUND_TRUTH_FIELDS = ['database', 'labels'] 17 | PREDICTION_FIELDS = ['results', 'version', 'external_data'] 18 | 19 | def __init__(self, ground_truth_filename=None, prediction_filename=None, 20 | ground_truth_fields=GROUND_TRUTH_FIELDS, 21 | prediction_fields=PREDICTION_FIELDS, 22 | subset='validation', verbose=False, top_k=1, 23 | check_status=True): 24 | if not ground_truth_filename: 25 | raise IOError('Please input a valid ground truth file.') 26 | if not prediction_filename: 27 | raise IOError('Please input a valid prediction file.') 28 | self.subset = subset 29 | self.verbose = verbose 30 | self.gt_fields = ground_truth_fields 31 | self.pred_fields = prediction_fields 32 | self.top_k = top_k 33 | self.ap = None 34 | self.hit_at_k = None 35 | self.check_status = check_status 36 | # Retrieve blocked videos from server. 37 | if self.check_status: 38 | self.blocked_videos = get_blocked_videos() 39 | else: 40 | self.blocked_videos = list() 41 | # Import ground truth and predictions. 42 | self.ground_truth, self.activity_index = self._import_ground_truth( 43 | ground_truth_filename) 44 | self.prediction = self._import_prediction(prediction_filename) 45 | 46 | if self.verbose: 47 | print '[INIT] Loaded annotations from {} subset.'.format(subset) 48 | nr_gt = len(self.ground_truth) 49 | print '\tNumber of ground truth instances: {}'.format(nr_gt) 50 | nr_pred = len(self.prediction) 51 | print '\tNumber of predictions: {}'.format(nr_pred) 52 | 53 | def _import_ground_truth(self, ground_truth_filename): 54 | """Reads ground truth file, checks if it is well formatted, and returns 55 | the ground truth instances and the activity classes. 56 | 57 | Parameters 58 | ---------- 59 | ground_truth_filename : str 60 | Full path to the ground truth json file. 61 | 62 | Outputs 63 | ------- 64 | ground_truth : df 65 | Data frame containing the ground truth instances. 66 | activity_index : dict 67 | Dictionary containing class index. 
68 | """ 69 | with open(ground_truth_filename, 'r') as fobj: 70 | data = json.load(fobj) 71 | # Checking format 72 | # if not all([field in data.keys() for field in self.gt_fields]): 73 | # raise IOError('Please input a valid ground truth file.') 74 | 75 | # Initialize data frame 76 | activity_index, cidx = {}, 0 77 | video_lst, label_lst = [], [] 78 | for videoid, v in data['database'].iteritems(): 79 | if self.subset != v['subset']: 80 | continue 81 | if videoid in self.blocked_videos: 82 | continue 83 | this_label = v['annotations']['label'] 84 | if this_label not in activity_index: 85 | activity_index[this_label] = cidx 86 | cidx += 1 87 | video_lst.append(videoid[:-14]) 88 | label_lst.append(activity_index[this_label]) 89 | ground_truth = pd.DataFrame({'video-id': video_lst, 90 | 'label': label_lst}) 91 | ground_truth = ground_truth.drop_duplicates().reset_index(drop=True) 92 | return ground_truth, activity_index 93 | 94 | def _import_prediction(self, prediction_filename): 95 | """Reads prediction file, checks if it is well formatted, and returns 96 | the prediction instances. 97 | 98 | Parameters 99 | ---------- 100 | prediction_filename : str 101 | Full path to the prediction json file. 102 | 103 | Outputs 104 | ------- 105 | prediction : df 106 | Data frame containing the prediction instances. 107 | """ 108 | with open(prediction_filename, 'r') as fobj: 109 | data = json.load(fobj) 110 | # Checking format... 111 | # if not all([field in data.keys() for field in self.pred_fields]): 112 | # raise IOError('Please input a valid prediction file.') 113 | 114 | # Initialize data frame 115 | video_lst, label_lst, score_lst = [], [], [] 116 | for videoid, v in data['results'].iteritems(): 117 | if videoid in self.blocked_videos: 118 | continue 119 | for result in v: 120 | label = self.activity_index[result['label']] 121 | video_lst.append(videoid) 122 | label_lst.append(label) 123 | score_lst.append(result['score']) 124 | prediction = pd.DataFrame({'video-id': video_lst, 125 | 'label': label_lst, 126 | 'score': score_lst}) 127 | return prediction 128 | 129 | def evaluate(self): 130 | """Evaluates a prediction file. For the detection task we measure the 131 | interpolated mean average precision to measure the performance of a 132 | method. 133 | """ 134 | hit_at_k = compute_video_hit_at_k(self.ground_truth, 135 | self.prediction, top_k=self.top_k) 136 | # avg_hit_at_k = compute_video_hit_at_k( 137 | # self.ground_truth, self.prediction, top_k=self.top_k, avg=True) 138 | if self.verbose: 139 | print ('[RESULTS] Performance on ActivityNet untrimmed video ' 140 | 'classification task.') 141 | # print '\tMean Average Precision: {}'.format(ap.mean()) 142 | print '\tError@{}: {}'.format(self.top_k, 1.0 - hit_at_k) 143 | #print '\tAvg Hit@{}: {}'.format(self.top_k, avg_hit_at_k) 144 | # self.ap = ap 145 | self.hit_at_k = hit_at_k 146 | # self.avg_hit_at_k = avg_hit_at_k 147 | 148 | ################################################################################ 149 | # Metrics 150 | ################################################################################ 151 | 152 | def compute_video_hit_at_k(ground_truth, prediction, top_k=3, avg=False): 153 | """Compute accuracy at k prediction between ground truth and 154 | predictions data frames. This code is greatly inspired by evaluation 155 | performed in Karpathy et al. CVPR14. 156 | 157 | Parameters 158 | ---------- 159 | ground_truth : df 160 | Data frame containing the ground truth instances. 
161 | Required fields: ['video-id', 'label'] 162 | prediction : df 163 | Data frame containing the prediction instances. 164 | Required fields: ['video-id, 'label', 'score'] 165 | 166 | Outputs 167 | ------- 168 | acc : float 169 | Top k accuracy score. 170 | """ 171 | video_ids = np.unique(ground_truth['video-id'].values) 172 | avg_hits_per_vid = np.zeros(video_ids.size) 173 | for i, vid in enumerate(video_ids): 174 | pred_idx = prediction['video-id'] == vid 175 | if not pred_idx.any(): 176 | continue 177 | this_pred = prediction.loc[pred_idx].reset_index(drop=True) 178 | # Get top K predictions sorted by decreasing score. 179 | sort_idx = this_pred['score'].values.argsort()[::-1][:top_k] 180 | this_pred = this_pred.loc[sort_idx].reset_index(drop=True) 181 | # Get labels and compare against ground truth. 182 | pred_label = this_pred['label'].tolist() 183 | gt_idx = ground_truth['video-id'] == vid 184 | gt_label = ground_truth.loc[gt_idx]['label'].tolist() 185 | avg_hits_per_vid[i] = np.mean([1 if this_label in pred_label else 0 186 | for this_label in gt_label]) 187 | if not avg: 188 | avg_hits_per_vid[i] = np.ceil(avg_hits_per_vid[i]) 189 | return float(avg_hits_per_vid.mean()) 190 | -------------------------------------------------------------------------------- /3D_experiment/utils/eval_ucf101.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | class UCFclassification(object): 7 | 8 | def __init__(self, ground_truth_filename=None, prediction_filename=None, 9 | subset='validation', verbose=False, top_k=1): 10 | if not ground_truth_filename: 11 | raise IOError('Please input a valid ground truth file.') 12 | if not prediction_filename: 13 | raise IOError('Please input a valid prediction file.') 14 | self.subset = subset 15 | self.verbose = verbose 16 | self.top_k = top_k 17 | self.ap = None 18 | self.hit_at_k = None 19 | # Import ground truth and predictions. 20 | self.ground_truth, self.activity_index = self._import_ground_truth( 21 | ground_truth_filename) 22 | self.prediction = self._import_prediction(prediction_filename) 23 | 24 | if self.verbose: 25 | print '[INIT] Loaded annotations from {} subset.'.format(subset) 26 | nr_gt = len(self.ground_truth) 27 | print '\tNumber of ground truth instances: {}'.format(nr_gt) 28 | nr_pred = len(self.prediction) 29 | print '\tNumber of predictions: {}'.format(nr_pred) 30 | 31 | def _import_ground_truth(self, ground_truth_filename): 32 | """Reads ground truth file, checks if it is well formatted, and returns 33 | the ground truth instances and the activity classes. 34 | 35 | Parameters 36 | ---------- 37 | ground_truth_filename : str 38 | Full path to the ground truth json file. 39 | 40 | Outputs 41 | ------- 42 | ground_truth : df 43 | Data frame containing the ground truth instances. 44 | activity_index : dict 45 | Dictionary containing class index. 
46 | """ 47 | with open(ground_truth_filename, 'r') as fobj: 48 | data = json.load(fobj) 49 | # Checking format 50 | # if not all([field in data.keys() for field in self.gt_fields]): 51 | # raise IOError('Please input a valid ground truth file.') 52 | 53 | # Initialize data frame 54 | activity_index, cidx = {}, 0 55 | video_lst, label_lst = [], [] 56 | for videoid, v in data['database'].iteritems(): 57 | if self.subset != v['subset']: 58 | continue 59 | this_label = v['annotations']['label'] 60 | if this_label not in activity_index: 61 | activity_index[this_label] = cidx 62 | cidx += 1 63 | video_lst.append(videoid) 64 | label_lst.append(activity_index[this_label]) 65 | ground_truth = pd.DataFrame({'video-id': video_lst, 66 | 'label': label_lst}) 67 | ground_truth = ground_truth.drop_duplicates().reset_index(drop=True) 68 | return ground_truth, activity_index 69 | 70 | def _import_prediction(self, prediction_filename): 71 | """Reads prediction file, checks if it is well formatted, and returns 72 | the prediction instances. 73 | 74 | Parameters 75 | ---------- 76 | prediction_filename : str 77 | Full path to the prediction json file. 78 | 79 | Outputs 80 | ------- 81 | prediction : df 82 | Data frame containing the prediction instances. 83 | """ 84 | with open(prediction_filename, 'r') as fobj: 85 | data = json.load(fobj) 86 | # Checking format... 87 | # if not all([field in data.keys() for field in self.pred_fields]): 88 | # raise IOError('Please input a valid prediction file.') 89 | 90 | # Initialize data frame 91 | video_lst, label_lst, score_lst = [], [], [] 92 | for videoid, v in data['results'].iteritems(): 93 | for result in v: 94 | label = self.activity_index[result['label']] 95 | video_lst.append(videoid) 96 | label_lst.append(label) 97 | score_lst.append(result['score']) 98 | prediction = pd.DataFrame({'video-id': video_lst, 99 | 'label': label_lst, 100 | 'score': score_lst}) 101 | return prediction 102 | 103 | def evaluate(self): 104 | """Evaluates a prediction file. For the detection task we measure the 105 | interpolated mean average precision to measure the performance of a 106 | method. 107 | """ 108 | hit_at_k = compute_video_hit_at_k(self.ground_truth, 109 | self.prediction, top_k=self.top_k) 110 | if self.verbose: 111 | print ('[RESULTS] Performance on ActivityNet untrimmed video ' 112 | 'classification task.') 113 | print '\tError@{}: {}'.format(self.top_k, 1.0 - hit_at_k) 114 | #print '\tAvg Hit@{}: {}'.format(self.top_k, avg_hit_at_k) 115 | self.hit_at_k = hit_at_k 116 | 117 | ################################################################################ 118 | # Metrics 119 | ################################################################################ 120 | def compute_video_hit_at_k(ground_truth, prediction, top_k=3): 121 | """Compute accuracy at k prediction between ground truth and 122 | predictions data frames. This code is greatly inspired by evaluation 123 | performed in Karpathy et al. CVPR14. 124 | 125 | Parameters 126 | ---------- 127 | ground_truth : df 128 | Data frame containing the ground truth instances. 129 | Required fields: ['video-id', 'label'] 130 | prediction : df 131 | Data frame containing the prediction instances. 132 | Required fields: ['video-id, 'label', 'score'] 133 | 134 | Outputs 135 | ------- 136 | acc : float 137 | Top k accuracy score. 
138 | """ 139 | video_ids = np.unique(ground_truth['video-id'].values) 140 | avg_hits_per_vid = np.zeros(video_ids.size) 141 | for i, vid in enumerate(video_ids): 142 | pred_idx = prediction['video-id'] == vid 143 | if not pred_idx.any(): 144 | continue 145 | this_pred = prediction.loc[pred_idx].reset_index(drop=True) 146 | # Get top K predictions sorted by decreasing score. 147 | sort_idx = this_pred['score'].values.argsort()[::-1][:top_k] 148 | this_pred = this_pred.loc[sort_idx].reset_index(drop=True) 149 | # Get labels and compare against ground truth. 150 | pred_label = this_pred['label'].tolist() 151 | gt_idx = ground_truth['video-id'] == vid 152 | gt_label = ground_truth.loc[gt_idx]['label'].tolist() 153 | avg_hits_per_vid[i] = np.mean([1 if this_label in pred_label else 0 154 | for this_label in gt_label]) 155 | return float(avg_hits_per_vid.mean()) 156 | -------------------------------------------------------------------------------- /3D_experiment/utils/fps.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | 7 | if __name__=="__main__": 8 | dir_path = sys.argv[1] 9 | dst_dir_path = sys.argv[2] 10 | 11 | for file_name in os.listdir(dir_path): 12 | if '.mp4' not in file_name: 13 | continue 14 | name, ext = os.path.splitext(file_name) 15 | dst_directory_path = os.path.join(dst_dir_path, name) 16 | 17 | video_file_path = os.path.join(dir_path, file_name) 18 | p = subprocess.Popen('ffprobe {}'.format(video_file_path), 19 | shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 20 | _, res = p.communicate() 21 | res = res.decode('utf-8') 22 | 23 | duration_index = res.find('Duration:') 24 | duration_str = res[(duration_index + 10):(duration_index + 21)] 25 | hour = float(duration_str[0:2]) 26 | minute = float(duration_str[3:5]) 27 | sec = float(duration_str[6:10]) 28 | total_sec = hour * 3600 + minute * 60 + sec 29 | 30 | n_frames = len(os.listdir(dst_directory_path)) 31 | if os.path.exists(os.path.join(dst_directory_path, 'fps')): 32 | n_frames -= 1 33 | 34 | fps = round(n_frames / total_sec, 2) 35 | 36 | print(video_file_path, os.path.exists(video_file_path), fps) 37 | with open(os.path.join(dst_directory_path, 'fps'), 'w') as fps_file: 38 | fps_file.write('{}\n'.format(fps)) 39 | -------------------------------------------------------------------------------- /3D_experiment/utils/hmdb51_json.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import json 5 | import pandas as pd 6 | 7 | def convert_csv_to_dict(csv_dir_path, split_index): 8 | database = {} 9 | for filename in os.listdir(csv_dir_path): 10 | if 'split{}'.format(split_index) not in filename: 11 | continue 12 | 13 | data = pd.read_csv(os.path.join(csv_dir_path, filename), 14 | delimiter=' ', header=None) 15 | keys = [] 16 | subsets = [] 17 | for i in range(data.shape[0]): 18 | row = data.ix[i, :] 19 | if row[1] == 0: 20 | continue 21 | elif row[1] == 1: 22 | subset = 'training' 23 | elif row[1] == 2: 24 | subset = 'validation' 25 | 26 | keys.append(row[0].split('.')[0]) 27 | subsets.append(subset) 28 | 29 | for i in range(len(keys)): 30 | key = keys[i] 31 | database[key] = {} 32 | database[key]['subset'] = subsets[i] 33 | label = '_'.join(filename.split('_')[:-2]) 34 | database[key]['annotations'] = {'label': label} 35 | 36 | return database 37 | 38 | def 
get_labels(csv_dir_path): 39 | labels = [] 40 | for name in os.listdir(csv_dir_path): 41 | labels.append('_'.join(name.split('_')[:-2])) 42 | return sorted(list(set(labels))) 43 | 44 | def convert_hmdb51_csv_to_activitynet_json(csv_dir_path, split_index, dst_json_path): 45 | labels = get_labels(csv_dir_path) 46 | database = convert_csv_to_dict(csv_dir_path, split_index) 47 | 48 | dst_data = {} 49 | dst_data['labels'] = labels 50 | dst_data['database'] = {} 51 | dst_data['database'].update(database) 52 | 53 | with open(dst_json_path, 'w') as dst_file: 54 | json.dump(dst_data, dst_file) 55 | 56 | if __name__ == '__main__': 57 | csv_dir_path = sys.argv[1] 58 | 59 | for split_index in range(1, 4): 60 | dst_json_path = os.path.join(csv_dir_path, 'hmdb51_{}.json'.format(split_index)) 61 | convert_hmdb51_csv_to_activitynet_json(csv_dir_path, split_index, dst_json_path) -------------------------------------------------------------------------------- /3D_experiment/utils/kinetics_json.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import json 5 | import pandas as pd 6 | 7 | def convert_csv_to_dict(csv_path, subset): 8 | data = pd.read_csv(csv_path) 9 | keys = [] 10 | key_labels = [] 11 | for i in range(data.shape[0]): 12 | row = data.ix[i, :] 13 | basename = '%s_%s_%s' % (row['youtube_id'], 14 | '%06d' % row['time_start'], 15 | '%06d' % row['time_end']) 16 | keys.append(basename) 17 | if subset != 'testing': 18 | key_labels.append(row['label']) 19 | 20 | database = {} 21 | for i in range(len(keys)): 22 | key = keys[i] 23 | database[key] = {} 24 | database[key]['subset'] = subset 25 | if subset != 'testing': 26 | label = key_labels[i] 27 | database[key]['annotations'] = {'label': label} 28 | else: 29 | database[key]['annotations'] = {} 30 | 31 | return database 32 | 33 | def load_labels(train_csv_path): 34 | data = pd.read_csv(train_csv_path) 35 | return data['label'].unique().tolist() 36 | 37 | def convert_kinetics_csv_to_activitynet_json(train_csv_path, val_csv_path, test_csv_path, dst_json_path): 38 | labels = load_labels(train_csv_path) 39 | train_database = convert_csv_to_dict(train_csv_path, 'training') 40 | val_database = convert_csv_to_dict(val_csv_path, 'validation') 41 | test_database = convert_csv_to_dict(test_csv_path, 'testing') 42 | 43 | dst_data = {} 44 | dst_data['labels'] = labels 45 | dst_data['database'] = {} 46 | dst_data['database'].update(train_database) 47 | dst_data['database'].update(val_database) 48 | dst_data['database'].update(test_database) 49 | 50 | with open(dst_json_path, 'w') as dst_file: 51 | json.dump(dst_data, dst_file) 52 | 53 | if __name__=="__main__": 54 | train_csv_path = sys.argv[1] 55 | val_csv_path = sys.argv[2] 56 | test_csv_path = sys.argv[3] 57 | dst_json_path = sys.argv[4] 58 | 59 | convert_kinetics_csv_to_activitynet_json( 60 | train_csv_path, val_csv_path, test_csv_path, dst_json_path) 61 | -------------------------------------------------------------------------------- /3D_experiment/utils/n_frames_kinetics.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | def class_process(dir_path, class_name): 7 | class_path = os.path.join(dir_path, class_name) 8 | if not os.path.isdir(class_path): 9 | return 10 | 11 | for file_name in os.listdir(class_path): 12 | video_dir_path = 
os.path.join(class_path, file_name) 13 | image_indices = [] 14 | for image_file_name in os.listdir(video_dir_path): 15 | if 'image' not in image_file_name: 16 | continue 17 | image_indices.append(int(image_file_name[6:11])) 18 | 19 | if len(image_indices) == 0: 20 | print('no image files', video_dir_path) 21 | n_frames = 0 22 | else: 23 | image_indices.sort(reverse=True) 24 | n_frames = image_indices[0] 25 | print(video_dir_path, n_frames) 26 | with open(os.path.join(video_dir_path, 'n_frames'), 'w') as dst_file: 27 | dst_file.write(str(n_frames)) 28 | 29 | 30 | if __name__=="__main__": 31 | dir_path = sys.argv[1] 32 | for class_name in os.listdir(dir_path): 33 | class_process(dir_path, class_name) 34 | 35 | class_name = 'test' 36 | class_process(dir_path, class_name) 37 | -------------------------------------------------------------------------------- /3D_experiment/utils/n_frames_ucf101_hmdb51.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | def class_process(dir_path, class_name): 7 | class_path = os.path.join(dir_path, class_name) 8 | if not os.path.isdir(class_path): 9 | return 10 | 11 | for file_name in os.listdir(class_path): 12 | video_dir_path = os.path.join(class_path, file_name) 13 | image_indices = [] 14 | for image_file_name in os.listdir(video_dir_path): 15 | if 'image' not in image_file_name: 16 | continue 17 | image_indices.append(int(image_file_name[6:11])) 18 | 19 | if len(image_indices) == 0: 20 | print('no image files', video_dir_path) 21 | n_frames = 0 22 | else: 23 | image_indices.sort(reverse=True) 24 | n_frames = image_indices[0] 25 | print(video_dir_path, n_frames) 26 | with open(os.path.join(video_dir_path, 'n_frames'), 'w') as dst_file: 27 | dst_file.write(str(n_frames)) 28 | 29 | 30 | if __name__=="__main__": 31 | dir_path = sys.argv[1] 32 | for class_name in os.listdir(dir_path): 33 | class_process(dir_path, class_name) 34 | -------------------------------------------------------------------------------- /3D_experiment/utils/ucf101_json.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import json 5 | import pandas as pd 6 | 7 | def convert_csv_to_dict(csv_path, subset): 8 | data = pd.read_csv(csv_path, delimiter=' ', header=None) 9 | keys = [] 10 | key_labels = [] 11 | for i in range(data.shape[0]): 12 | row = data.ix[i, :] 13 | slash_rows = data.ix[i, 0].split('/') 14 | class_name = slash_rows[0] 15 | basename = slash_rows[1].split('.')[0] 16 | 17 | keys.append(basename) 18 | key_labels.append(class_name) 19 | 20 | database = {} 21 | for i in range(len(keys)): 22 | key = keys[i] 23 | database[key] = {} 24 | database[key]['subset'] = subset 25 | label = key_labels[i] 26 | database[key]['annotations'] = {'label': label} 27 | 28 | return database 29 | 30 | def load_labels(label_csv_path): 31 | data = pd.read_csv(label_csv_path, delimiter=' ', header=None) 32 | labels = [] 33 | for i in range(data.shape[0]): 34 | labels.append(data.ix[i, 1]) 35 | return labels 36 | 37 | def convert_ucf101_csv_to_activitynet_json(label_csv_path, train_csv_path, 38 | val_csv_path, dst_json_path): 39 | labels = load_labels(label_csv_path) 40 | train_database = convert_csv_to_dict(train_csv_path, 'training') 41 | val_database = convert_csv_to_dict(val_csv_path, 'validation') 42 | 43 | dst_data = {} 44 | dst_data['labels'] 
= labels 45 | dst_data['database'] = {} 46 | dst_data['database'].update(train_database) 47 | dst_data['database'].update(val_database) 48 | 49 | with open(dst_json_path, 'w') as dst_file: 50 | json.dump(dst_data, dst_file) 51 | 52 | if __name__ == '__main__': 53 | csv_dir_path = sys.argv[1] 54 | 55 | for split_index in range(1, 4): 56 | label_csv_path = os.path.join(csv_dir_path, 'classInd.txt') 57 | train_csv_path = os.path.join(csv_dir_path, 'trainlist0{}.txt'.format(split_index)) 58 | val_csv_path = os.path.join(csv_dir_path, 'testlist0{}.txt'.format(split_index)) 59 | dst_json_path = os.path.join(csv_dir_path, 'ucf101_0{}.json'.format(split_index)) 60 | 61 | convert_ucf101_csv_to_activitynet_json(label_csv_path, train_csv_path, 62 | val_csv_path, dst_json_path) 63 | -------------------------------------------------------------------------------- /3D_experiment/utils/video_jpg.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | 7 | if __name__=="__main__": 8 | dir_path = sys.argv[1] 9 | dst_dir_path = sys.argv[2] 10 | 11 | for file_name in os.listdir(dir_path): 12 | if '.mp4' not in file_name: 13 | continue 14 | name, ext = os.path.splitext(file_name) 15 | dst_directory_path = os.path.join(dst_dir_path, name) 16 | 17 | video_file_path = os.path.join(dir_path, file_name) 18 | try: 19 | if os.path.exists(dst_directory_path): 20 | if not os.path.exists(os.path.join(dst_directory_path, 'image_00001.jpg')): 21 | subprocess.call('rm -r {}'.format(dst_directory_path), shell=True) 22 | print('remove {}'.format(dst_directory_path)) 23 | os.mkdir(dst_directory_path) 24 | else: 25 | continue 26 | else: 27 | os.mkdir(dst_directory_path) 28 | except: 29 | print(dst_directory_path) 30 | continue 31 | cmd = 'ffmpeg -i {} -vf scale=-1:360 {}/image_%05d.jpg'.format(video_file_path, dst_directory_path) 32 | print(cmd) 33 | subprocess.call(cmd, shell=True) 34 | print('\n') 35 | -------------------------------------------------------------------------------- /3D_experiment/utils/video_jpg_kinetics.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | def class_process(dir_path, dst_dir_path, class_name): 7 | class_path = os.path.join(dir_path, class_name) 8 | if not os.path.isdir(class_path): 9 | return 10 | 11 | dst_class_path = os.path.join(dst_dir_path, class_name) 12 | if not os.path.exists(dst_class_path): 13 | os.mkdir(dst_class_path) 14 | 15 | for file_name in os.listdir(class_path): 16 | if '.mp4' not in file_name: 17 | continue 18 | name, ext = os.path.splitext(file_name) 19 | dst_directory_path = os.path.join(dst_class_path, name) 20 | 21 | video_file_path = os.path.join(class_path, file_name) 22 | try: 23 | if os.path.exists(dst_directory_path): 24 | if not os.path.exists(os.path.join(dst_directory_path, 'image_00001.jpg')): 25 | subprocess.call('rm -r \"{}\"'.format(dst_directory_path), shell=True) 26 | print('remove {}'.format(dst_directory_path)) 27 | os.mkdir(dst_directory_path) 28 | else: 29 | continue 30 | else: 31 | os.mkdir(dst_directory_path) 32 | except: 33 | print(dst_directory_path) 34 | continue 35 | cmd = 'ffmpeg -i \"{}\" -vf scale=-1:240 \"{}/image_%05d.jpg\"'.format(video_file_path, dst_directory_path) 36 | print(cmd) 37 | subprocess.call(cmd, shell=True) 38 | print('\n') 39 | 40 | if 
__name__=="__main__": 41 | dir_path = sys.argv[1] 42 | dst_dir_path = sys.argv[2] 43 | 44 | for class_name in os.listdir(dir_path): 45 | class_process(dir_path, dst_dir_path, class_name) 46 | 47 | class_name = 'test' 48 | class_process(dir_path, dst_dir_path, class_name) 49 | -------------------------------------------------------------------------------- /3D_experiment/utils/video_jpg_ucf101_hmdb51.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | def class_process(dir_path, dst_dir_path, class_name): 7 | class_path = os.path.join(dir_path, class_name) 8 | if not os.path.isdir(class_path): 9 | return 10 | 11 | dst_class_path = os.path.join(dst_dir_path, class_name) 12 | if not os.path.exists(dst_class_path): 13 | os.mkdir(dst_class_path) 14 | 15 | for file_name in os.listdir(class_path): 16 | if '.avi' not in file_name: 17 | continue 18 | name, ext = os.path.splitext(file_name) 19 | dst_directory_path = os.path.join(dst_class_path, name) 20 | 21 | video_file_path = os.path.join(class_path, file_name) 22 | try: 23 | if os.path.exists(dst_directory_path): 24 | if not os.path.exists(os.path.join(dst_directory_path, 'image_00001.jpg')): 25 | subprocess.call('rm -r \"{}\"'.format(dst_directory_path), shell=True) 26 | print('remove {}'.format(dst_directory_path)) 27 | os.mkdir(dst_directory_path) 28 | else: 29 | continue 30 | else: 31 | os.mkdir(dst_directory_path) 32 | except: 33 | print(dst_directory_path) 34 | continue 35 | cmd = 'ffmpeg -i \"{}\" -vf scale=-1:240 \"{}/image_%05d.jpg\"'.format(video_file_path, dst_directory_path) 36 | print(cmd) 37 | subprocess.call(cmd, shell=True) 38 | print('\n') 39 | 40 | if __name__=="__main__": 41 | dir_path = sys.argv[1] 42 | dst_dir_path = sys.argv[2] 43 | 44 | for class_name in os.listdir(dir_path): 45 | class_process(dir_path, dst_dir_path, class_name) 46 | -------------------------------------------------------------------------------- /3D_experiment/validation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import time 4 | import sys 5 | 6 | from utils import AverageMeter, calculate_accuracy 7 | 8 | 9 | def val_epoch(epoch, data_loader, model, criterion, opt, logger): 10 | print('validation at epoch {}'.format(epoch)) 11 | 12 | model.eval() 13 | 14 | batch_time = AverageMeter() 15 | data_time = AverageMeter() 16 | losses = AverageMeter() 17 | accuracies = AverageMeter() 18 | 19 | end_time = time.time() 20 | for i, (inputs, targets) in enumerate(data_loader): 21 | data_time.update(time.time() - end_time) 22 | 23 | if not opt.no_cuda: 24 | targets = targets.cuda(async=True) 25 | inputs = Variable(inputs, volatile=True) 26 | targets = Variable(targets, volatile=True) 27 | outputs = model(inputs) 28 | loss = criterion(outputs, targets) 29 | acc = calculate_accuracy(outputs, targets) 30 | 31 | losses.update(loss.data[0], inputs.size(0)) 32 | accuracies.update(acc, inputs.size(0)) 33 | 34 | batch_time.update(time.time() - end_time) 35 | end_time = time.time() 36 | 37 | print('Epoch: [{0}][{1}/{2}]\t' 38 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 39 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 40 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 41 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format( 42 | epoch, 43 | i + 1, 44 | len(data_loader), 45 | batch_time=batch_time, 46 | data_time=data_time, 47 | 
loss=losses, 48 | acc=accuracies)) 49 | 50 | logger.log({'epoch': epoch, 'loss': losses.avg, 'acc': accuracies.avg}) 51 | 52 | return losses.avg 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Seunghwan Cha 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Implementation of Non-Local Neural Network 2 | 3 | This repository contains my implementation of [Non-Local Neural Networks (CVPR 2018)](https://arxiv.org/pdf/1711.07971.pdf). 4 | 5 | To understand more about the structure of this paper, you may refer to this [slide](https://www.youtube.com/redirect?redir_token=4Bf1C-e-Vz_0r5HbPD9meYLcyL58MTU1MTc5MjE0NEAxNTUxNzA1NzQ0&q=https%3A%2F%2Fwww.slideshare.net%2FTaeohKim4%2Fpr083-nonlocal-neural-networks&v=ZM153wo3baA&event=video_description) and [video](https://www.youtube.com/watch?v=ZM153wo3baA), which are in Korean. 6 | 7 | The experiment was run on the CIFAR-10 dataset for the sake of ensuring that the code runs without error. 8 | 9 | ## Implementation Details 10 | The original paper used ResNet-50 as its backbone structure for conducting experiments on video datasets such as Kinetics and Charades. 11 | 12 | As an initial study, I adopted the ResNet-56 structure for the CIFAR-10 dataset, which is a 2D classification task. The architecture is implemented in `models/resnet2D.py`. 13 | 14 | The original baseline model from the paper, called C2D, uses ResNet-50 as its backbone and one non-local block after the 4th residual block. This structure is implemented in `models/resnet3D.py`. The details of the architecture are shown in the figure below: 15 | 16 | 17 | 18 | The four different pairwise functions discussed in the paper are implemented accordingly in `models/non_local.py`. You can simply pass one of the operations as an argument. The details of the non-local block are shown in the figure below: 19 | 20 | 21 | 22 | Finally, the original experiment of activity recognition was similarly replicated in the `3D_experiment` folder. The necessary data preprocessing code was borrowed from https://github.com/kenshohara/3D-ResNets-PyTorch.
The training runs without error, but I didn't have enough time to compare the performance boost from the addition of the non-local block. 23 | 24 | ## Training 25 | 1) To start training for CIFAR-10 with ResNet-56, you can simply execute `run.sh`. 26 | 27 | 2) To start training on the HMDB51 dataset with C2D, you first need to prepare the HMDB51 dataset as instructed in the `3D_experiment` folder. Then, execute `run.sh`. It seems that multiple GPUs may be needed due to memory issues. 28 | 29 | ## Results 30 | Trained on CIFAR-10 for 200 epochs using the command shown in `run.sh`. The training was conducted on a single 1080 Ti GPU. 31 | The result showed that there wasn't a huge performance boost for the image classification task on CIFAR-10. The graph below illustrates the loss curves for the two networks. 32 | 33 | 34 | 35 | The Top-1 validation accuracy for ResNet-56 without the non-local block was *93.97%*, while the one with the non-local block reached *93.98%*. 36 | 37 | This could be due to two reasons: 1) the proposed method was designed mainly for video classification; 2) the input size of CIFAR-10 is too small, so spatial information may not be maintained after the second ResNet block. 38 | 39 | ## TO DO 40 | - [x] Compare the result of the baseline model and that of the non-local model for CIFAR-10 41 | - [x] Prepare video datasets (e.g. UCF-101, HMDB-51) 42 | - [x] Modify the model code to adapt to spatiotemporal settings 43 | - [x] Run test on some video datasets 44 | - [ ] Run test on an image segmentation dataset (e.g. COCO) 45 | 46 | ## Reference 47 | This repo is an adaptation of several other existing works. 48 | - https://github.com/akamaster/pytorch_resnet_cifar10 49 | - https://github.com/kuangliu/pytorch-cifar 50 | - https://github.com/facebookresearch/video-nonlocal-net 51 | - https://github.com/AlexHex7/Non-local_pytorch 52 | - https://github.com/kenshohara/3D-ResNets-PyTorch 53 | 54 | -------------------------------------------------------------------------------- /figure/Figure2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tea1528/Non-Local-NN-Pytorch/64c6047f7d801402d04340695bfbdfeee03e4797/figure/Figure2.jpg -------------------------------------------------------------------------------- /figure/Table1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tea1528/Non-Local-NN-Pytorch/64c6047f7d801402d04340695bfbdfeee03e4797/figure/Table1.jpg -------------------------------------------------------------------------------- /figure/resnet56_cifar.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tea1528/Non-Local-NN-Pytorch/64c6047f7d801402d04340695bfbdfeee03e4797/figure/resnet56_cifar.jpg -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | '''Train CIFAR10 with PyTorch.''' 2 | from __future__ import print_function 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | import torch.nn.functional as F 8 | import torch.backends.cudnn as cudnn 9 | 10 | import torchvision 11 | import torchvision.transforms as transforms 12 | 13 | import os 14 | import argparse 15 | 16 | from models.resnet2D import resnet2D56 17 | 18 | parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training') 19 | parser.add_argument('--lr', 
default=0.1, type=float, help='learning rate') 20 | parser.add_argument('--verbose', '-v', action='store_true', help='display progress bar') 21 | parser.add_argument('--resume', '-r', action='store_true', help='resume from checkpoint') 22 | parser.add_argument('--nl', '-n', action='store_true', help='add non-local block') 23 | args = parser.parse_args() 24 | 25 | if args.verbose: 26 | from utils import progress_bar 27 | 28 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 29 | best_acc = 0 # best test accuracy 30 | start_epoch = 0 # start from epoch 0 or last checkpoint epoch 31 | 32 | # Data 33 | print('==> Preparing data..') 34 | transform_train = transforms.Compose([ 35 | transforms.RandomCrop(32, padding=4), 36 | transforms.RandomHorizontalFlip(), 37 | transforms.ToTensor(), 38 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), 39 | ]) 40 | 41 | transform_test = transforms.Compose([ 42 | transforms.ToTensor(), 43 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), 44 | ]) 45 | 46 | trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train) 47 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2) 48 | 49 | testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test) 50 | testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=2) 51 | 52 | classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck') 53 | 54 | # Model 55 | print('==> Building model..') 56 | if args.nl: 57 | print("ResNet-56 with non-local block after second residual block..") 58 | net = resnet2D56(non_local=True) 59 | else: 60 | print("ResNet-56 without non-local block..") 61 | net = resnet2D56(non_local=False) 62 | 63 | 64 | 65 | net = net.to(device) 66 | 67 | if device == 'cuda': 68 | net = torch.nn.DataParallel(net) 69 | cudnn.benchmark = True 70 | 71 | if args.resume: 72 | # Load checkpoint. 73 | print('==> Resuming from checkpoint..') 74 | assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!' 
75 | checkpoint = torch.load('./checkpoint/ckpt.t7') 76 | net.load_state_dict(checkpoint['net']) 77 | best_acc = checkpoint['acc'] 78 | start_epoch = checkpoint['epoch'] 79 | 80 | criterion = nn.CrossEntropyLoss() 81 | optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4) 82 | lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 150], last_epoch=start_epoch - 1) 83 | 84 | # Training 85 | def train(epoch): 86 | print('\nEpoch: %d' % epoch) 87 | net.train() 88 | train_loss = 0 89 | correct = 0 90 | total = 0 91 | for batch_idx, (inputs, targets) in enumerate(trainloader): 92 | inputs, targets = inputs.to(device), targets.to(device) 93 | optimizer.zero_grad() 94 | outputs = net(inputs) 95 | loss = criterion(outputs, targets) 96 | loss.backward() 97 | optimizer.step() 98 | 99 | train_loss += loss.item() 100 | _, predicted = outputs.max(1) 101 | total += targets.size(0) 102 | correct += predicted.eq(targets).sum().item() 103 | 104 | if args.verbose: 105 | progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' 106 | % (train_loss/(batch_idx+1), 100.*correct/total, correct, total)) 107 | if not args.verbose: 108 | print('Loss: %.3f' % train_loss) 109 | 110 | return train_loss 111 | 112 | def test(epoch): 113 | global best_acc 114 | net.eval() 115 | test_loss = 0 116 | correct = 0 117 | total = 0 118 | with torch.no_grad(): 119 | for batch_idx, (inputs, targets) in enumerate(testloader): 120 | inputs, targets = inputs.to(device), targets.to(device) 121 | outputs = net(inputs) 122 | loss = criterion(outputs, targets) 123 | 124 | test_loss += loss.item() 125 | _, predicted = outputs.max(1) 126 | total += targets.size(0) 127 | correct += predicted.eq(targets).sum().item() 128 | 129 | if args.verbose: 130 | progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' 131 | % (test_loss/(batch_idx+1), 100.*correct/total, correct, total)) 132 | 133 | if not args.verbose: 134 | print('Loss: %.3f' % test_loss) 135 | 136 | # Save checkpoint. 
137 | acc = 100.*correct/total 138 | if acc > best_acc: 139 | print('Saving..') 140 | state = { 141 | 'net': net.state_dict(), 142 | 'acc': acc, 143 | 'epoch': epoch, 144 | } 145 | if not os.path.isdir('checkpoint'): 146 | os.mkdir('checkpoint') 147 | torch.save(state, './checkpoint/ckpt.t7') 148 | best_acc = acc 149 | return test_loss 150 | 151 | tr_loss_list = [] 152 | tst_loss_list = [] 153 | 154 | for epoch in range(start_epoch, start_epoch+200): 155 | train_l = train(epoch) 156 | lr_scheduler.step() 157 | test_l = test(epoch) 158 | tr_loss_list.append(train_l) 159 | tst_loss_list.append(test_l) 160 | 161 | print("Best Accuracy: ", best_acc) 162 | print("-----------------------------------------------") 163 | 164 | print("train loss") 165 | print(tr_loss_list) 166 | print("test loss") 167 | print(tst_loss_list) 168 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | -------------------------------------------------------------------------------- /models/non_local.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | 6 | class NLBlockND(nn.Module): 7 | def __init__(self, in_channels, inter_channels=None, mode='embedded', 8 | dimension=3, bn_layer=True): 9 | """Implementation of Non-Local Block with 4 different pairwise functions but doesn't include subsampling trick 10 | args: 11 | in_channels: original channel size (1024 in the paper) 12 | inter_channels: channel size inside the block if not specifed reduced to half (512 in the paper) 13 | mode: supports Gaussian, Embedded Gaussian, Dot Product, and Concatenation 14 | dimension: can be 1 (temporal), 2 (spatial), 3 (spatiotemporal) 15 | bn_layer: whether to add batch norm 16 | """ 17 | super(NLBlockND, self).__init__() 18 | 19 | assert dimension in [1, 2, 3] 20 | 21 | if mode not in ['gaussian', 'embedded', 'dot', 'concatenate']: 22 | raise ValueError('`mode` must be one of `gaussian`, `embedded`, `dot` or `concatenate`') 23 | 24 | self.mode = mode 25 | self.dimension = dimension 26 | 27 | self.in_channels = in_channels 28 | self.inter_channels = inter_channels 29 | 30 | # the channel size is reduced to half inside the block 31 | if self.inter_channels is None: 32 | self.inter_channels = in_channels // 2 33 | if self.inter_channels == 0: 34 | self.inter_channels = 1 35 | 36 | # assign appropriate convolutional, max pool, and batch norm layers for different dimensions 37 | if dimension == 3: 38 | conv_nd = nn.Conv3d 39 | max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2)) 40 | bn = nn.BatchNorm3d 41 | elif dimension == 2: 42 | conv_nd = nn.Conv2d 43 | max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2)) 44 | bn = nn.BatchNorm2d 45 | else: 46 | conv_nd = nn.Conv1d 47 | max_pool_layer = nn.MaxPool1d(kernel_size=(2)) 48 | bn = nn.BatchNorm1d 49 | 50 | # function g in the paper which goes through conv. 
with kernel size 1 51 | self.g = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, kernel_size=1) 52 | 53 | # add BatchNorm layer after the last conv layer 54 | if bn_layer: 55 | self.W_z = nn.Sequential( 56 | conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, kernel_size=1), 57 | bn(self.in_channels) 58 | ) 59 | # from section 4.1 of the paper, initializing params of BN ensures that the initial state of non-local block is identity mapping 60 | nn.init.constant_(self.W_z[1].weight, 0) 61 | nn.init.constant_(self.W_z[1].bias, 0) 62 | else: 63 | self.W_z = conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, kernel_size=1) 64 | 65 | # from section 3.3 of the paper by initializing Wz to 0, this block can be inserted to any existing architecture 66 | nn.init.constant_(self.W_z.weight, 0) 67 | nn.init.constant_(self.W_z.bias, 0) 68 | 69 | # define theta and phi for all operations except gaussian 70 | if self.mode == "embedded" or self.mode == "dot" or self.mode == "concatenate": 71 | self.theta = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, kernel_size=1) 72 | self.phi = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, kernel_size=1) 73 | 74 | if self.mode == "concatenate": 75 | self.W_f = nn.Sequential( 76 | nn.Conv2d(in_channels=self.inter_channels * 2, out_channels=1, kernel_size=1), 77 | nn.ReLU() 78 | ) 79 | 80 | def forward(self, x): 81 | """ 82 | args 83 | x: (N, C, T, H, W) for dimension=3; (N, C, H, W) for dimension 2; (N, C, T) for dimension 1 84 | """ 85 | 86 | batch_size = x.size(0) 87 | 88 | # (N, C, THW) 89 | # this reshaping and permutation is from the spacetime_nonlocal function in the original Caffe2 implementation 90 | g_x = self.g(x).view(batch_size, self.inter_channels, -1) 91 | g_x = g_x.permute(0, 2, 1) 92 | 93 | if self.mode == "gaussian": 94 | theta_x = x.view(batch_size, self.in_channels, -1) 95 | phi_x = x.view(batch_size, self.in_channels, -1) 96 | theta_x = theta_x.permute(0, 2, 1) 97 | f = torch.matmul(theta_x, phi_x) 98 | 99 | elif self.mode == "embedded" or self.mode == "dot": 100 | theta_x = self.theta(x).view(batch_size, self.inter_channels, -1) 101 | phi_x = self.phi(x).view(batch_size, self.inter_channels, -1) 102 | theta_x = theta_x.permute(0, 2, 1) 103 | f = torch.matmul(theta_x, phi_x) 104 | 105 | elif self.mode == "concatenate": 106 | theta_x = self.theta(x).view(batch_size, self.inter_channels, -1, 1) 107 | phi_x = self.phi(x).view(batch_size, self.inter_channels, 1, -1) 108 | 109 | h = theta_x.size(2) 110 | w = phi_x.size(3) 111 | theta_x = theta_x.repeat(1, 1, 1, w) 112 | phi_x = phi_x.repeat(1, 1, h, 1) 113 | 114 | concat = torch.cat([theta_x, phi_x], dim=1) 115 | f = self.W_f(concat) 116 | f = f.view(f.size(0), f.size(2), f.size(3)) 117 | 118 | if self.mode == "gaussian" or self.mode == "embedded": 119 | f_div_C = F.softmax(f, dim=-1) 120 | elif self.mode == "dot" or self.mode == "concatenate": 121 | N = f.size(-1) # number of position in x 122 | f_div_C = f / N 123 | 124 | y = torch.matmul(f_div_C, g_x) 125 | 126 | # contiguous here just allocates contiguous chunk of memory 127 | y = y.permute(0, 2, 1).contiguous() 128 | y = y.view(batch_size, self.inter_channels, *x.size()[2:]) 129 | 130 | W_y = self.W_z(y) 131 | # residual connection 132 | z = W_y + x 133 | 134 | return z 135 | 136 | 137 | if __name__ == '__main__': 138 | import torch 139 | 140 | for bn_layer in [True, False]: 141 | img = torch.zeros(2, 3, 20) 142 | net = 
NLBlockND(in_channels=3, mode='concatenate', dimension=1, bn_layer=bn_layer) 143 | out = net(img) 144 | print(out.size()) 145 | 146 | img = torch.zeros(2, 3, 20, 20) 147 | net = NLBlockND(in_channels=3, mode='concatenate', dimension=2, bn_layer=bn_layer) 148 | out = net(img) 149 | print(out.size()) 150 | 151 | img = torch.randn(2, 3, 8, 20, 20) 152 | net = NLBlockND(in_channels=3, mode='concatenate', dimension=3, bn_layer=bn_layer) 153 | out = net(img) 154 | print(out.size()) 155 | 156 | 157 | -------------------------------------------------------------------------------- /models/resnet2D.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Non-Local ResNet2D-50 for CIFAR-10 dataset. 3 | Most of the code is borrowed from https://github.com/akamaster/pytorch_resnet_cifar10 4 | 5 | Properly implemented ResNet-s for CIFAR10 as described in paper [1]. 6 | 7 | The implementation and structure of this file is hugely influenced by [2] 8 | which is implemented for ImageNet and doesn't have option A for identity. 9 | Moreover, most of the implementations on the web is copy-paste from 10 | torchvision's resnet and has wrong number of params. 11 | 12 | Reference: 13 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 14 | Deep Residual Learning for Image Recognition. arXiv:1512.03385 15 | [2] https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py 16 | ''' 17 | import torch 18 | import torch.nn as nn 19 | import torch.nn.functional as F 20 | import torch.nn.init as init 21 | 22 | from torch.autograd import Variable 23 | from models.non_local import NLBlockND 24 | 25 | 26 | def _weights_init(m): 27 | if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d): 28 | init.kaiming_normal_(m.weight) 29 | 30 | class LambdaLayer(nn.Module): 31 | def __init__(self, lambd): 32 | super(LambdaLayer, self).__init__() 33 | self.lambd = lambd 34 | 35 | def forward(self, x): 36 | return self.lambd(x) 37 | 38 | 39 | class BasicBlock(nn.Module): 40 | expansion = 1 41 | 42 | def __init__(self, in_planes, planes, stride=1, option='A'): 43 | super(BasicBlock, self).__init__() 44 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 45 | self.bn1 = nn.BatchNorm2d(planes) 46 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) 47 | self.bn2 = nn.BatchNorm2d(planes) 48 | 49 | self.shortcut = nn.Sequential() 50 | if stride != 1 or in_planes != planes: 51 | if option == 'A': 52 | """ 53 | For CIFAR10 ResNet paper uses option A. 
54 | """ 55 | self.shortcut = LambdaLayer(lambda x: 56 | F.pad(x[:, :, ::2, ::2], (0, 0, 0, 0, planes//4, planes//4), "constant", 0)) 57 | elif option == 'B': 58 | self.shortcut = nn.Sequential( 59 | nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False), 60 | nn.BatchNorm2d(self.expansion * planes) 61 | ) 62 | 63 | def forward(self, x): 64 | out = F.relu(self.bn1(self.conv1(x))) 65 | out = self.bn2(self.conv2(out)) 66 | out += self.shortcut(x) 67 | out = F.relu(out) 68 | return out 69 | 70 | 71 | class ResNet2D(nn.Module): 72 | def __init__(self, block, num_blocks, num_classes=10, non_local=False): 73 | super(ResNet2D, self).__init__() 74 | self.in_planes = 16 75 | 76 | self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False) 77 | self.bn1 = nn.BatchNorm2d(16) 78 | self.layer1 = self._make_layer(block, 16, num_blocks[0], stride=1) 79 | 80 | # add non-local block after layer 2 81 | self.layer2 = self._make_layer(block, 32, num_blocks[1], stride=2, non_local=non_local) 82 | self.layer3 = self._make_layer(block, 64, num_blocks[2], stride=2) 83 | self.linear = nn.Linear(64, num_classes) 84 | 85 | self.apply(_weights_init) 86 | 87 | def _make_layer(self, block, planes, num_blocks, stride, non_local=False): 88 | strides = [stride] + [1]*(num_blocks-1) 89 | layers = [] 90 | 91 | last_idx = len(strides) 92 | if non_local: 93 | last_idx = len(strides) - 1 94 | 95 | for i in range(last_idx): 96 | layers.append(block(self.in_planes, planes, strides[i])) 97 | self.in_planes = planes * block.expansion 98 | 99 | if non_local: 100 | layers.append(NLBlockND(in_channels=planes, dimension=2)) 101 | layers.append(block(self.in_planes, planes, strides[-1])) 102 | 103 | return nn.Sequential(*layers) 104 | 105 | def forward(self, x): 106 | out = F.relu(self.bn1(self.conv1(x))) 107 | out = self.layer1(out) 108 | out = self.layer2(out) 109 | out = self.layer3(out) 110 | out = F.avg_pool2d(out, out.size()[3]) 111 | out = out.view(out.size(0), -1) 112 | out = self.linear(out) 113 | return out 114 | 115 | 116 | def resnet2D56(non_local=False, **kwargs): 117 | """Constructs a ResNet-56 model. 118 | """ 119 | return ResNet2D(BasicBlock, [9, 9, 9], non_local=non_local, **kwargs) 120 | 121 | 122 | if __name__=='__main__': 123 | # Test case for (224 x 224 x 3) input of batch size 1 124 | img = Variable(torch.randn(1, 3, 224, 224)) 125 | net = resnet2D56() 126 | count = 0 127 | for name, param in net.named_parameters(): 128 | if param.requires_grad: 129 | count += 1 130 | print(name) 131 | print (count) 132 | out = net(img) 133 | print(out.size()) 134 | -------------------------------------------------------------------------------- /models/resnet3D.py: -------------------------------------------------------------------------------- 1 | """ 2 | ResNet50 (C2D) for spatiotemporal task. Only ResNet50 backbone structure was implemented here. 3 | """ 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import math 9 | from functools import partial 10 | from models.non_local import NLBlockND 11 | 12 | 13 | class Bottleneck(nn.Module): 14 | """ 15 | Bottleneck block structure used in ResNet 50. 16 | As mentioned in Section 4. 2D ConvNet baseline (C2D), 17 | all convolutions are in essence 2D kernels that prcoess the input frame-by-frame 18 | (implemented as (1 x k x k) kernels). 
19 | """ 20 | expansion = 4 21 | 22 | def __init__(self, inplanes, planes, stride=1, padding=(0, 1, 1), downsample=None): 23 | super(Bottleneck, self).__init__() 24 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=(1, 1, 1), bias=False) 25 | self.bn1 = nn.BatchNorm3d(planes) 26 | self.conv2 = nn.Conv3d(planes, planes, kernel_size=(1, 3, 3), stride=stride, padding=padding, bias=False) 27 | self.bn2 = nn.BatchNorm3d(planes) 28 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=(1, 1, 1), bias=False) 29 | self.bn3 = nn.BatchNorm3d(planes * 4) 30 | self.relu = nn.ReLU(inplace=True) 31 | self.downsample = downsample 32 | self.stride = stride 33 | 34 | def forward(self, x): 35 | identity = x 36 | 37 | out = self.conv1(x) 38 | out = self.bn1(out) 39 | out = self.relu(out) 40 | 41 | out = self.conv2(out) 42 | out = self.bn2(out) 43 | out = self.relu(out) 44 | 45 | out = self.conv3(out) 46 | out = self.bn3(out) 47 | 48 | if self.downsample is not None: 49 | identity = self.downsample(x) 50 | 51 | out += identity 52 | out = self.relu(out) 53 | 54 | return out 55 | 56 | 57 | class ResNet3D(nn.Module): 58 | """C2D with ResNet 50 backbone. 59 | The only operation involving the temporal domain are the pooling layer after the second residual block. 60 | For more details of the structure, refer to Table 1 from the paper. 61 | Padding was added accordingly to match the correct dimensionality. 62 | """ 63 | def __init__(self, block, layers, num_classes=400, non_local=False): 64 | self.inplanes = 64 65 | super(ResNet3D, self).__init__() 66 | 67 | # first convolution operation has essentially 2D kernels 68 | # output: 64 x 16 x 112 x 112 69 | self.conv1 = nn.Conv3d(3, 64, kernel_size=(1, 7, 7), stride=2, padding=(0, 3, 3), bias=False) 70 | self.bn1 = nn.BatchNorm3d(64) 71 | self.relu = nn.ReLU(inplace=True) 72 | 73 | # output: 64 x 8 x 56 x 56 74 | self.pool1 = nn.MaxPool3d(kernel_size=3, stride=2) 75 | 76 | # output: 256 x 8 x 56 x 56 77 | self.layer1 = self._make_layer(block, 64, layers[0], stride=1, d_padding=0) 78 | 79 | # pooling on temporal domain 80 | # output: 256 x 4 x 56 x 56 81 | self.pool_t = nn.MaxPool3d(kernel_size=(3, 1, 1), stride=(2, 1, 1)) 82 | 83 | # output: 512 x 4 x 28 x 28 84 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2, padding=(2, 1, 1)) 85 | 86 | # add one non-local block here 87 | # output: 1024 x 4 x 14 x 14 88 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2, padding=(2, 1, 1), non_local=non_local) 89 | 90 | # output: 2048 x 4 x 7 x 7 91 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2, padding=(2, 1, 1)) 92 | 93 | # output: 2048 x 1 94 | self.avgpool = nn.AvgPool3d(kernel_size=(4, 7, 7)) 95 | self.fc = nn.Linear(512 * block.expansion, num_classes) 96 | 97 | for m in self.modules(): 98 | if isinstance(m, nn.Conv3d): 99 | m.weight = nn.init.kaiming_normal_(m.weight, mode='fan_out') 100 | elif isinstance(m, nn.BatchNorm3d): 101 | m.weight.data.fill_(1) 102 | m.bias.data.zero_() 103 | 104 | def _make_layer(self, block, planes, blocks, stride=1, padding=(0, 1, 1), d_padding=(2, 0, 0), non_local=False): 105 | downsample = nn.Sequential( 106 | nn.Conv3d(self.inplanes, planes * block.expansion, 107 | kernel_size=1, stride=stride, padding=d_padding, bias=False), 108 | nn.BatchNorm3d(planes * block.expansion) 109 | ) 110 | 111 | layers = [] 112 | layers.append(block(self.inplanes, planes, stride, padding, downsample)) 113 | self.inplanes = planes * block.expansion 114 | 115 | last_idx = blocks 116 | if non_local: 117 | last_idx 
= blocks - 1 118 | 119 | for i in range(1, last_idx): 120 | layers.append(block(self.inplanes, planes)) 121 | 122 | # add non-local block here 123 | if non_local: 124 | layers.append(NLBlockND(in_channels=1024, dimension=3)) 125 | layers.append(block(self.inplanes, planes)) 126 | 127 | return nn.Sequential(*layers) 128 | 129 | def forward(self, x): 130 | x = self.conv1(x) 131 | x = self.bn1(x) 132 | x = self.relu(x) 133 | x = self.pool1(x) 134 | 135 | x = self.layer1(x) 136 | x = self.pool_t(x) 137 | x = self.layer2(x) 138 | x = self.layer3(x) 139 | x = self.layer4(x) 140 | 141 | x = self.avgpool(x) 142 | 143 | x = x.view(x.size(0), -1) 144 | x = self.fc(x) 145 | 146 | return x 147 | 148 | 149 | def resnet3D50(non_local=False, **kwargs): 150 | """Constructs a C2D ResNet-50 model. 151 | """ 152 | model = ResNet3D(Bottleneck, [3, 4, 6, 3], non_local=non_local, **kwargs) 153 | return model 154 | 155 | 156 | 157 | if __name__=='__main__': 158 | # Test case of 32 frames (224 x 224 x 3) input of batch size 1 159 | img = Variable(torch.randn(1, 3, 32, 224, 224)) 160 | net = resnet3D50(non_local=True) 161 | count = 0 162 | for name, param in net.named_parameters(): 163 | if param.requires_grad: 164 | count += 1 165 | print(name) 166 | print (count) 167 | out = net(img) 168 | print(out.size()) 169 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | python main.py --verbose 2>&1 | tee regular_output.txt 2 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | '''Some helper functions for PyTorch, including: 2 | - get_mean_and_std: calculate the mean and std value of dataset. 3 | - msr_init: net parameter initialization. 4 | - progress_bar: progress bar mimic xlua.progress. 5 | ''' 6 | import os 7 | import sys 8 | import time 9 | import math 10 | 11 | import torch.nn as nn 12 | import torch.nn.init as init 13 | 14 | 15 | def get_mean_and_std(dataset): 16 | '''Compute the mean and std value of dataset.''' 17 | dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True, num_workers=2) 18 | mean = torch.zeros(3) 19 | std = torch.zeros(3) 20 | print('==> Computing mean and std..') 21 | for inputs, targets in dataloader: 22 | for i in range(3): 23 | mean[i] += inputs[:,i,:,:].mean() 24 | std[i] += inputs[:,i,:,:].std() 25 | mean.div_(len(dataset)) 26 | std.div_(len(dataset)) 27 | return mean, std 28 | 29 | def init_params(net): 30 | '''Init layer parameters.''' 31 | for m in net.modules(): 32 | if isinstance(m, nn.Conv2d): 33 | init.kaiming_normal(m.weight, mode='fan_out') 34 | if m.bias: 35 | init.constant(m.bias, 0) 36 | elif isinstance(m, nn.BatchNorm2d): 37 | init.constant(m.weight, 1) 38 | init.constant(m.bias, 0) 39 | elif isinstance(m, nn.Linear): 40 | init.normal(m.weight, std=1e-3) 41 | if m.bias: 42 | init.constant(m.bias, 0) 43 | 44 | 45 | _, term_width = os.popen('stty size', 'r').read().split() 46 | term_width = int(term_width) 47 | 48 | TOTAL_BAR_LENGTH = 65. 49 | last_time = time.time() 50 | begin_time = last_time 51 | def progress_bar(current, total, msg=None): 52 | global last_time, begin_time 53 | if current == 0: 54 | begin_time = time.time() # Reset for new bar. 
55 | 56 | cur_len = int(TOTAL_BAR_LENGTH*current/total) 57 | rest_len = int(TOTAL_BAR_LENGTH - cur_len) - 1 58 | 59 | sys.stdout.write(' [') 60 | for i in range(cur_len): 61 | sys.stdout.write('=') 62 | sys.stdout.write('>') 63 | for i in range(rest_len): 64 | sys.stdout.write('.') 65 | sys.stdout.write(']') 66 | 67 | cur_time = time.time() 68 | step_time = cur_time - last_time 69 | last_time = cur_time 70 | tot_time = cur_time - begin_time 71 | 72 | L = [] 73 | L.append(' Step: %s' % format_time(step_time)) 74 | L.append(' | Tot: %s' % format_time(tot_time)) 75 | if msg: 76 | L.append(' | ' + msg) 77 | 78 | msg = ''.join(L) 79 | sys.stdout.write(msg) 80 | for i in range(term_width-int(TOTAL_BAR_LENGTH)-len(msg)-3): 81 | sys.stdout.write(' ') 82 | 83 | # Go back to the center of the bar. 84 | for i in range(term_width-int(TOTAL_BAR_LENGTH/2)+2): 85 | sys.stdout.write('\b') 86 | sys.stdout.write(' %d/%d ' % (current+1, total)) 87 | 88 | if current < total-1: 89 | sys.stdout.write('\r') 90 | else: 91 | sys.stdout.write('\n') 92 | sys.stdout.flush() 93 | 94 | def format_time(seconds): 95 | days = int(seconds / 3600/24) 96 | seconds = seconds - days*3600*24 97 | hours = int(seconds / 3600) 98 | seconds = seconds - hours*3600 99 | minutes = int(seconds / 60) 100 | seconds = seconds - minutes*60 101 | secondsf = int(seconds) 102 | seconds = seconds - secondsf 103 | millis = int(seconds*1000) 104 | 105 | f = '' 106 | i = 1 107 | if days > 0: 108 | f += str(days) + 'D' 109 | i += 1 110 | if hours > 0 and i <= 2: 111 | f += str(hours) + 'h' 112 | i += 1 113 | if minutes > 0 and i <= 2: 114 | f += str(minutes) + 'm' 115 | i += 1 116 | if secondsf > 0 and i <= 2: 117 | f += str(secondsf) + 's' 118 | i += 1 119 | if millis > 0 and i <= 2: 120 | f += str(millis) + 'ms' 121 | i += 1 122 | if f == '': 123 | f = '0ms' 124 | return f 125 | --------------------------------------------------------------------------------