├── .gitignore
├── 3D_experiment
│   ├── .gitignore
│   ├── LICENSE
│   ├── README.md
│   ├── dataset.py
│   ├── datasets
│   │   ├── activitynet.py
│   │   ├── hmdb51.py
│   │   ├── kinetics.py
│   │   └── ucf101.py
│   ├── main.py
│   ├── mean.py
│   ├── models
│   │   ├── non_local.py
│   │   └── resnet3D.py
│   ├── opts.py
│   ├── run.sh
│   ├── spatial_transforms.py
│   ├── target_transforms.py
│   ├── temporal_transforms.py
│   ├── test.py
│   ├── train.py
│   ├── utils.py
│   ├── utils
│   │   ├── eval_hmdb51.py
│   │   ├── eval_kinetics.py
│   │   ├── eval_ucf101.py
│   │   ├── fps.py
│   │   ├── hmdb51_json.py
│   │   ├── kinetics_json.py
│   │   ├── n_frames_kinetics.py
│   │   ├── n_frames_ucf101_hmdb51.py
│   │   ├── ucf101_json.py
│   │   ├── video_jpg.py
│   │   ├── video_jpg_kinetics.py
│   │   └── video_jpg_ucf101_hmdb51.py
│   └── validation.py
├── LICENSE
├── README.md
├── figure
│   ├── Figure2.jpg
│   ├── Table1.jpg
│   └── resnet56_cifar.jpg
├── main.py
├── models
│   ├── __init__.py
│   ├── non_local.py
│   ├── resnet2D.py
│   └── resnet3D.py
├── run.sh
└── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | data/
3 | checkpoint/
4 | *.txt
5 | plot_loss.ipynb
6 |
--------------------------------------------------------------------------------
/3D_experiment/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | data/
7 | __pycache__
8 | *.txt
9 |
10 | # C extensions
11 | *.so
12 |
13 | # Distribution / packaging
14 | .Python
15 | env/
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | .hypothesis/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # pyenv
78 | .python-version
79 |
80 | # celery beat schedule file
81 | celerybeat-schedule
82 |
83 | # SageMath parsed files
84 | *.sage.py
85 |
86 | # dotenv
87 | .env
88 |
89 | # virtualenv
90 | .venv
91 | venv/
92 | ENV/
93 |
94 | # Spyder project settings
95 | .spyderproject
96 | .spyproject
97 |
98 | # Rope project settings
99 | .ropeproject
100 |
101 | # mkdocs documentation
102 | /site
103 |
104 | # mypy
105 | .mypy_cache/
106 |
107 | .DS_Store
108 |
109 | .vscode
110 |
--------------------------------------------------------------------------------
/3D_experiment/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Kensho Hara
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/3D_experiment/README.md:
--------------------------------------------------------------------------------
1 | # 3D ResNets for Action Recognition
2 |
3 | ## TL;DR
4 | Run `run.sh` to start training the C2D model. If you wish to run other models, please refer to the original repository.
5 | Most of the code is borrowed from https://github.com/kenshohara/3D-ResNets-PyTorch except for the model architecture.
6 |
7 | ## Summary
8 |
9 | This is the PyTorch code for the following papers:
10 |
11 | [
12 | Kensho Hara, Hirokatsu Kataoka, and Yutaka Satoh,
13 | "Can Spatiotemporal 3D CNNs Retrace the History of 2D CNNs and ImageNet?",
14 | Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6546-6555, 2018.
15 | ](http://openaccess.thecvf.com/content_cvpr_2018/html/Hara_Can_Spatiotemporal_3D_CVPR_2018_paper.html)
16 |
17 | [
18 | Kensho Hara, Hirokatsu Kataoka, and Yutaka Satoh,
19 | "Learning Spatio-Temporal Features with 3D Residual Networks for Action Recognition",
20 | Proceedings of the ICCV Workshop on Action, Gesture, and Emotion Recognition, 2017.
21 | ](http://openaccess.thecvf.com/content_ICCV_2017_workshops/papers/w44/Hara_Learning_Spatio-Temporal_Features_ICCV_2017_paper.pdf)
22 |
23 | This code includes training, fine-tuning and testing on Kinetics, ActivityNet, UCF-101, and HMDB-51.
24 |
25 | ## Citation
26 |
27 | If you use this code or pre-trained models, please cite the following:
28 |
29 | ```bibtex
30 | @inproceedings{hara3dcnns,
31 | author={Kensho Hara and Hirokatsu Kataoka and Yutaka Satoh},
32 | title={Can Spatiotemporal 3D CNNs Retrace the History of 2D CNNs and ImageNet?},
33 | booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
34 | pages={6546--6555},
35 | year={2018},
36 | }
37 | ```
38 |
39 | ## Requirements
40 |
41 | * [PyTorch](http://pytorch.org/)
42 |
43 | ```bash
44 | conda install pytorch torchvision cuda80 -c soumith
45 | ```
46 |
47 | * FFmpeg, FFprobe
48 |
49 | ```bash
50 | wget http://johnvansickle.com/ffmpeg/releases/ffmpeg-release-64bit-static.tar.xz
51 | tar xvf ffmpeg-release-64bit-static.tar.xz
52 | cd ./ffmpeg-3.3.3-64bit-static/; sudo cp ffmpeg ffprobe /usr/local/bin;
53 | ```
54 |
55 | * Python 3
56 |
57 | ## Preparation
58 |
59 | ### ActivityNet
60 |
61 | * Download videos using [the official crawler](https://github.com/activitynet/ActivityNet/tree/master/Crawler).
62 | * Convert from avi to jpg files using ```utils/video_jpg.py```
63 |
64 | ```bash
65 | python utils/video_jpg.py avi_video_directory jpg_video_directory
66 | ```
67 |
68 | * Generate fps files using ```utils/fps.py```
69 |
70 | ```bash
71 | python utils/fps.py avi_video_directory jpg_video_directory
72 | ```
73 |
74 | ### Kinetics
75 |
76 | * Download videos using [the official crawler](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics).
77 | * Locate test set in ```video_directory/test```.
78 | * Convert from avi to jpg files using ```utils/video_jpg_kinetics.py```
79 |
80 | ```bash
81 | python utils/video_jpg_kinetics.py avi_video_directory jpg_video_directory
82 | ```
83 |
84 | * Generate n_frames files using ```utils/n_frames_kinetics.py```
85 |
86 | ```bash
87 | python utils/n_frames_kinetics.py jpg_video_directory
88 | ```
89 |
90 | * Generate annotation file in json format similar to ActivityNet using ```utils/kinetics_json.py```
91 | * The CSV files (kinetics_{train, val, test}.csv) are included in the crawler.
92 |
93 | ```bash
94 | python utils/kinetics_json.py train_csv_path val_csv_path test_csv_path dst_json_path
95 | ```
96 |
97 | ### UCF-101
98 |
99 | * Download videos and train/test splits [here](http://crcv.ucf.edu/data/UCF101.php).
100 | * Convert from avi to jpg files using ```utils/video_jpg_ucf101_hmdb51.py```
101 |
102 | ```bash
103 | python utils/video_jpg_ucf101_hmdb51.py avi_video_directory jpg_video_directory
104 | ```
105 |
106 | * Generate n_frames files using ```utils/n_frames_ucf101_hmdb51.py```
107 |
108 | ```bash
109 | python utils/n_frames_ucf101_hmdb51.py jpg_video_directory
110 | ```
111 |
112 | * Generate annotation file in json format similar to ActivityNet using ```utils/ucf101_json.py```
113 | * ```annotation_dir_path``` includes classInd.txt, trainlist0{1, 2, 3}.txt, testlist0{1, 2, 3}.txt
114 |
115 | ```bash
116 | python utils/ucf101_json.py annotation_dir_path
117 | ```
118 |
119 | ### HMDB-51
120 |
121 | * Download videos and train/test splits [here](http://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/).
122 | * Convert from avi to jpg files using ```utils/video_jpg_ucf101_hmdb51.py```
123 |
124 | ```bash
125 | python utils/video_jpg_ucf101_hmdb51.py avi_video_directory jpg_video_directory
126 | ```
127 |
128 | * Generate n_frames files using ```utils/n_frames_ucf101_hmdb51.py```
129 |
130 | ```bash
131 | python utils/n_frames_ucf101_hmdb51.py jpg_video_directory
132 | ```
133 |
134 | * Generate annotation file in json format similar to ActivityNet using ```utils/hmdb51_json.py```
135 | * ```annotation_dir_path``` includes brush_hair_test_split1.txt, ...
136 |
137 | ```bash
138 | python utils/hmdb51_json.py annotation_dir_path
139 | ```
140 |
141 | ## Running the code
142 |
143 | Assume the structure of data directories is the following:
144 |
145 | ```misc
146 | ~/
147 | data/
148 | kinetics_videos/
149 | jpg/
150 | .../ (directories of class names)
151 | .../ (directories of video names)
152 | ... (jpg files)
153 | results/
154 | save_100.pth
155 | kinetics.json
156 | ```
157 |
158 | Confirm all options.
159 |
160 | ```bash
161 | python main.py -h
162 | ```
163 |
164 | Train ResNet-34 on the Kinetics dataset (400 classes) with 4 CPU threads (for data loading).
165 | The batch size is 128.
166 | A checkpoint is saved every 5 epochs.
167 | All GPUs are used for training.
168 | If you want to use only a subset of GPUs, set ```CUDA_VISIBLE_DEVICES=...```.
169 |
170 | ```bash
171 | python main.py --root_path ~/data --video_path kinetics_videos/jpg --annotation_path kinetics.json \
172 | --result_path results --dataset kinetics --model resnet \
173 | --model_depth 34 --n_classes 400 --batch_size 128 --n_threads 4 --checkpoint 5
174 | ```
175 |
176 | Continue training from epoch 101 (~/data/results/save_100.pth is loaded).
177 |
178 | ```bash
179 | python main.py --root_path ~/data --video_path kinetics_videos/jpg --annotation_path kinetics.json \
180 | --result_path results --dataset kinetics --resume_path results/save_100.pth \
181 | --model_depth 34 --n_classes 400 --batch_size 128 --n_threads 4 --checkpoint 5
182 | ```
183 |
184 |
185 |
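186 | Evaluate a trained model on the validation set without further training (test mode).
187 | This is a sketch: the flag names below (```--no_train```, ```--no_val```, ```--test```, ```--test_subset```) follow the options referenced in ```main.py```; confirm them with ```python main.py -h```.
188 | 
189 | ```bash
190 | # assumed flags -- verify against `python main.py -h` before running
191 | python main.py --root_path ~/data --video_path kinetics_videos/jpg --annotation_path kinetics.json \
192 | --result_path results --dataset kinetics --resume_path results/save_100.pth \
193 | --model_depth 34 --n_classes 400 --batch_size 128 --n_threads 4 \
194 | --no_train --no_val --test --test_subset val
195 | ```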
--------------------------------------------------------------------------------
/3D_experiment/dataset.py:
--------------------------------------------------------------------------------
1 | from datasets.kinetics import Kinetics
2 | from datasets.activitynet import ActivityNet
3 | from datasets.ucf101 import UCF101
4 | from datasets.hmdb51 import HMDB51
5 |
6 |
7 | def get_training_set(opt, spatial_transform, temporal_transform,
8 | target_transform):
9 | assert opt.dataset in ['kinetics', 'activitynet', 'ucf101', 'hmdb51']
10 |
11 | if opt.dataset == 'kinetics':
12 | training_data = Kinetics(
13 | opt.video_path,
14 | opt.annotation_path,
15 | 'training',
16 | spatial_transform=spatial_transform,
17 | temporal_transform=temporal_transform,
18 | target_transform=target_transform)
19 | elif opt.dataset == 'activitynet':
20 | training_data = ActivityNet(
21 | opt.video_path,
22 | opt.annotation_path,
23 | 'training',
24 | False,
25 | spatial_transform=spatial_transform,
26 | temporal_transform=temporal_transform,
27 | target_transform=target_transform)
28 | elif opt.dataset == 'ucf101':
29 | training_data = UCF101(
30 | opt.video_path,
31 | opt.annotation_path,
32 | 'training',
33 | spatial_transform=spatial_transform,
34 | temporal_transform=temporal_transform,
35 | target_transform=target_transform)
36 | elif opt.dataset == 'hmdb51':
37 | training_data = HMDB51(
38 | opt.video_path,
39 | opt.annotation_path,
40 | 'training',
41 | spatial_transform=spatial_transform,
42 | temporal_transform=temporal_transform,
43 | target_transform=target_transform)
44 |
45 | return training_data
46 |
47 |
48 | def get_validation_set(opt, spatial_transform, temporal_transform,
49 | target_transform):
50 | assert opt.dataset in ['kinetics', 'activitynet', 'ucf101', 'hmdb51']
51 |
52 | if opt.dataset == 'kinetics':
53 | validation_data = Kinetics(
54 | opt.video_path,
55 | opt.annotation_path,
56 | 'validation',
57 | opt.n_val_samples,
58 | spatial_transform,
59 | temporal_transform,
60 | target_transform,
61 | sample_duration=opt.sample_duration)
62 | elif opt.dataset == 'activitynet':
63 | validation_data = ActivityNet(
64 | opt.video_path,
65 | opt.annotation_path,
66 | 'validation',
67 | False,
68 | opt.n_val_samples,
69 | spatial_transform,
70 | temporal_transform,
71 | target_transform,
72 | sample_duration=opt.sample_duration)
73 | elif opt.dataset == 'ucf101':
74 | validation_data = UCF101(
75 | opt.video_path,
76 | opt.annotation_path,
77 | 'validation',
78 | opt.n_val_samples,
79 | spatial_transform,
80 | temporal_transform,
81 | target_transform,
82 | sample_duration=opt.sample_duration)
83 | elif opt.dataset == 'hmdb51':
84 | validation_data = HMDB51(
85 | opt.video_path,
86 | opt.annotation_path,
87 | 'validation',
88 | opt.n_val_samples,
89 | spatial_transform,
90 | temporal_transform,
91 | target_transform,
92 | sample_duration=opt.sample_duration)
93 | return validation_data
94 |
95 |
96 | def get_test_set(opt, spatial_transform, temporal_transform, target_transform):
97 | assert opt.dataset in ['kinetics', 'activitynet', 'ucf101', 'hmdb51']
98 | assert opt.test_subset in ['val', 'test']
99 |
100 | if opt.test_subset == 'val':
101 | subset = 'validation'
102 | elif opt.test_subset == 'test':
103 | subset = 'testing'
104 | if opt.dataset == 'kinetics':
105 | test_data = Kinetics(
106 | opt.video_path,
107 | opt.annotation_path,
108 | subset,
109 | 0,
110 | spatial_transform,
111 | temporal_transform,
112 | target_transform,
113 | sample_duration=opt.sample_duration)
114 | elif opt.dataset == 'activitynet':
115 | test_data = ActivityNet(
116 | opt.video_path,
117 | opt.annotation_path,
118 | subset,
119 | True,
120 | 0,
121 | spatial_transform,
122 | temporal_transform,
123 | target_transform,
124 | sample_duration=opt.sample_duration)
125 | elif opt.dataset == 'ucf101':
126 | test_data = UCF101(
127 | opt.video_path,
128 | opt.annotation_path,
129 | subset,
130 | 0,
131 | spatial_transform,
132 | temporal_transform,
133 | target_transform,
134 | sample_duration=opt.sample_duration)
135 | elif opt.dataset == 'hmdb51':
136 | test_data = HMDB51(
137 | opt.video_path,
138 | opt.annotation_path,
139 | subset,
140 | 0,
141 | spatial_transform,
142 | temporal_transform,
143 | target_transform,
144 | sample_duration=opt.sample_duration)
145 |
146 | return test_data
147 |
--------------------------------------------------------------------------------
/3D_experiment/datasets/activitynet.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.utils.data as data
3 | from PIL import Image
4 | import os
5 | import functools
6 | import json
7 | import copy
8 | import math
9 |
10 | from utils import load_value_file
11 |
12 |
13 | def pil_loader(path):
14 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
15 | with open(path, 'rb') as f:
16 | with Image.open(f) as img:
17 | return img.convert('RGB')
18 |
19 |
20 | def accimage_loader(path):
21 | try:
22 | import accimage
23 | return accimage.Image(path)
24 | except IOError:
25 | # Potentially a decoding problem, fall back to PIL.Image
26 | return pil_loader(path)
27 |
28 |
29 | def get_default_image_loader():
30 | from torchvision import get_image_backend
31 | if get_image_backend() == 'accimage':
32 | return accimage_loader
33 | else:
34 | return pil_loader
35 |
36 |
37 | def video_loader(video_dir_path, frame_indices, image_loader):
38 | video = []
39 | for i in frame_indices:
40 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i))
41 | if os.path.exists(image_path):
42 | video.append(image_loader(image_path))
43 | else:
44 | return video
45 |
46 | return video
47 |
48 |
49 | def get_default_video_loader():
50 | image_loader = get_default_image_loader()
51 | return functools.partial(video_loader, image_loader=image_loader)
52 |
53 |
54 | def load_annotation_data(data_file_path):
55 | with open(data_file_path, 'r') as data_file:
56 | return json.load(data_file)
57 |
58 |
59 | def get_class_labels(data):
60 | class_names = []
61 | index = 0
62 | for node1 in data['taxonomy']:
63 | is_leaf = True
64 | for node2 in data['taxonomy']:
65 | if node2['parentId'] == node1['nodeId']:
66 | is_leaf = False
67 | break
68 | if is_leaf:
69 | class_names.append(node1['nodeName'])
70 |
71 | class_labels_map = {}
72 |
73 | for i, class_name in enumerate(class_names):
74 | class_labels_map[class_name] = i
75 |
76 | return class_labels_map
77 |
78 |
79 | def get_video_names_and_annotations(data, subset):
80 | video_names = []
81 | annotations = []
82 |
83 | for key, value in data['database'].items():
84 | this_subset = value['subset']
85 | if this_subset == subset:
86 | if subset == 'testing':
87 | video_names.append('v_{}'.format(key))
88 | else:
89 | video_names.append('v_{}'.format(key))
90 | annotations.append(value['annotations'])
91 |
92 | return video_names, annotations
93 |
94 |
95 | def modify_frame_indices(video_dir_path, frame_indices):
96 | modified_indices = []
97 | for i in frame_indices:
98 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i))
99 | if not os.path.exists(image_path):
100 | return modified_indices
101 | modified_indices.append(i)
102 | return modified_indices
103 |
104 |
105 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video,
106 | sample_duration):
107 | data = load_annotation_data(annotation_path)
108 | video_names, annotations = get_video_names_and_annotations(data, subset)
109 | class_to_idx = get_class_labels(data)
110 | idx_to_class = {}
111 | for name, label in class_to_idx.items():
112 | idx_to_class[label] = name
113 |
114 | dataset = []
115 | for i in range(len(video_names)):
116 | if i % 1000 == 0:
117 | print('dataset loading [{}/{}]'.format(i, len(video_names)))
118 |
119 | video_path = os.path.join(root_path, video_names[i])
120 | if not os.path.exists(video_path):
121 | continue
122 |
123 | fps_file_path = os.path.join(video_path, 'fps')
124 | fps = load_value_file(fps_file_path)
125 |
126 | for annotation in annotations[i]:
127 | begin_t = math.ceil(annotation['segment'][0] * fps)
128 | end_t = math.ceil(annotation['segment'][1] * fps)
129 | if begin_t == 0:
130 | begin_t = 1
131 | n_frames = end_t - begin_t
132 |
133 | sample = {
134 | 'video': video_path,
135 | 'segment': [begin_t, end_t],
136 | 'fps': fps,
137 | 'video_id': video_names[i][2:]
138 | }
139 | if len(annotations) != 0:
140 | sample['label'] = class_to_idx[annotation['label']]
141 | else:
142 | sample['label'] = -1
143 |
144 | if n_samples_for_each_video == 1:
145 | frame_indices = list(range(begin_t, end_t))
146 | frame_indices = modify_frame_indices(sample['video'],
147 | frame_indices)
148 | if len(frame_indices) < 16:
149 | continue
150 | sample['frame_indices'] = frame_indices
151 | dataset.append(sample)
152 | else:
153 | if n_samples_for_each_video > 1:
154 | step = max(1,
155 | math.ceil((n_frames - 1 - sample_duration) /
156 | (n_samples_for_each_video - 1)))
157 | else:
158 | step = sample_duration
159 | for j in range(begin_t, end_t, step):
160 | sample_j = copy.deepcopy(sample)
161 | frame_indices = list(range(j, j + sample_duration))
162 | frame_indices = modify_frame_indices(
163 | sample_j['video'], frame_indices)
164 | if len(frame_indices) < 16:
165 | continue
166 | sample_j['frame_indices'] = frame_indices
167 | dataset.append(sample_j)
168 |
169 | return dataset, idx_to_class
170 |
171 |
172 | def get_end_t(video_path):
173 | file_names = os.listdir(video_path)
174 | image_file_names = [x for x in file_names if 'image' in x]
175 | image_file_names.sort(reverse=True)
176 | return int(image_file_names[0][6:11])
177 |
178 |
179 | def make_untrimmed_dataset(root_path, annotation_path, subset,
180 | n_samples_for_each_video, sample_duration):
181 | data = load_annotation_data(annotation_path)
182 | video_names, _ = get_video_names_and_annotations(data, subset)
183 | class_to_idx = get_class_labels(data)
184 | idx_to_class = {}
185 | for name, label in class_to_idx.items():
186 | idx_to_class[label] = name
187 |
188 | dataset = []
189 | for i in range(len(video_names)):
190 | if i % 1000 == 0:
191 | print('dataset loading [{}/{}]'.format(i, len(video_names)))
192 |
193 | video_path = os.path.join(root_path, video_names[i])
194 | if not os.path.exists(video_path):
195 | continue
196 |
197 | fps_file_path = os.path.join(video_path, 'fps')
198 | fps = load_value_file(fps_file_path)
199 |
200 | begin_t = 1
201 | end_t = get_end_t(video_path)
202 | n_frames = end_t - begin_t
203 |
204 | sample = {
205 | 'video': video_path,
206 | 'segment': [begin_t, end_t],
207 | 'fps': fps,
208 | 'video_id': video_names[i][2:]
209 | }
210 |
211 | if n_samples_for_each_video >= 1:
212 | step = max(1,
213 | math.ceil((n_frames - 1 - sample_duration) /
214 | (n_samples_for_each_video - 1)))
215 | else:
216 | step = sample_duration
217 | for j in range(begin_t, end_t, step):
218 | sample_j = copy.deepcopy(sample)
219 | frame_indices = list(range(j, j + sample_duration))
220 | frame_indices = modify_frame_indices(sample_j['video'],
221 | frame_indices)
222 | if len(frame_indices) < 16:
223 | continue
224 | sample_j['frame_indices'] = frame_indices
225 | dataset.append(sample_j)
226 |
227 | return dataset, idx_to_class
228 |
229 |
230 | class ActivityNet(data.Dataset):
231 | """
232 | Args:
233 | root (string): Root directory path.
234 | spatial_transform (callable, optional): A function/transform that takes in an PIL image
235 | and returns a transformed version. E.g, ``transforms.RandomCrop``
236 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices
237 | and returns a transformed version
238 | target_transform (callable, optional): A function/transform that takes in the
239 | target and transforms it.
240 | loader (callable, optional): A function to load an video given its path and frame indices.
241 | Attributes:
242 | classes (list): List of the class names.
243 | class_to_idx (dict): Dict with items (class_name, class_index).
244 | imgs (list): List of (image path, class_index) tuples
245 | """
246 |
247 | def __init__(self,
248 | root_path,
249 | annotation_path,
250 | subset,
251 | is_untrimmed_setting=False,
252 | n_samples_for_each_video=1,
253 | spatial_transform=None,
254 | temporal_transform=None,
255 | target_transform=None,
256 | sample_duration=16,
257 | get_loader=get_default_video_loader):
258 | if is_untrimmed_setting:
259 | self.data, self.class_names = make_untrimmed_dataset(
260 | root_path, annotation_path, subset, n_samples_for_each_video,
261 | sample_duration)
262 | else:
263 | self.data, self.class_names = make_dataset(
264 | root_path, annotation_path, subset, n_samples_for_each_video,
265 | sample_duration)
266 |
267 | self.spatial_transform = spatial_transform
268 | self.temporal_transform = temporal_transform
269 | self.target_transform = target_transform
270 | self.loader = get_loader()
271 |
272 | def __getitem__(self, index):
273 | """
274 | Args:
275 | index (int): Index
276 | Returns:
277 | tuple: (image, target) where target is class_index of the target class.
278 | """
279 | path = self.data[index]['video']
280 |
281 | frame_indices = self.data[index]['frame_indices']
282 | if self.temporal_transform is not None:
283 | frame_indices = self.temporal_transform(frame_indices)
284 | clip = self.loader(path, frame_indices)
285 | if self.spatial_transform is not None:
286 | self.spatial_transform.randomize_parameters()
287 | clip = [self.spatial_transform(img) for img in clip]
288 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3)
289 |
290 | target = self.data[index]
291 | if self.target_transform is not None:
292 | target = self.target_transform(target)
293 |
294 | return clip, target
295 |
296 | def __len__(self):
297 | return len(self.data)
298 |
--------------------------------------------------------------------------------
/3D_experiment/datasets/hmdb51.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.utils.data as data
3 | from PIL import Image
4 | import os
5 | import math
6 | import functools
7 | import json
8 | import copy
9 |
10 | from utils import load_value_file
11 |
12 |
13 | def pil_loader(path):
14 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
15 | with open(path, 'rb') as f:
16 | with Image.open(f) as img:
17 | return img.convert('RGB')
18 |
19 |
20 | def accimage_loader(path):
21 | try:
22 | import accimage
23 | return accimage.Image(path)
24 | except IOError:
25 | # Potentially a decoding problem, fall back to PIL.Image
26 | return pil_loader(path)
27 |
28 |
29 | def get_default_image_loader():
30 | from torchvision import get_image_backend
31 | if get_image_backend() == 'accimage':
32 | return accimage_loader
33 | else:
34 | return pil_loader
35 |
36 |
37 | def video_loader(video_dir_path, frame_indices, image_loader):
38 | video = []
39 | for i in frame_indices:
40 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i))
41 | if os.path.exists(image_path):
42 | video.append(image_loader(image_path))
43 | else:
44 | return video
45 |
46 | return video
47 |
48 |
49 | def get_default_video_loader():
50 | image_loader = get_default_image_loader()
51 | return functools.partial(video_loader, image_loader=image_loader)
52 |
53 |
54 | def load_annotation_data(data_file_path):
55 | with open(data_file_path, 'r') as data_file:
56 | return json.load(data_file)
57 |
58 |
59 | def get_class_labels(data):
60 | class_labels_map = {}
61 | index = 0
62 | for class_label in data['labels']:
63 | class_labels_map[class_label] = index
64 | index += 1
65 | return class_labels_map
66 |
67 |
68 | def get_video_names_and_annotations(data, subset):
69 | video_names = []
70 | annotations = []
71 |
72 | for key, value in data['database'].items():
73 | this_subset = value['subset']
74 | if this_subset == subset:
75 | label = value['annotations']['label']
76 | video_names.append('{}/{}'.format(label, key))
77 | annotations.append(value['annotations'])
78 |
79 | return video_names, annotations
80 |
81 |
82 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video,
83 | sample_duration):
84 | data = load_annotation_data(annotation_path)
85 | video_names, annotations = get_video_names_and_annotations(data, subset)
86 | class_to_idx = get_class_labels(data)
87 | idx_to_class = {}
88 | for name, label in class_to_idx.items():
89 | idx_to_class[label] = name
90 |
91 | dataset = []
92 | for i in range(len(video_names)):
93 | if i % 1000 == 0:
94 | print('dataset loading [{}/{}]'.format(i, len(video_names)))
95 |
96 | video_path = os.path.join(root_path, video_names[i])
97 | if not os.path.exists(video_path):
98 | continue
99 |
100 | n_frames_file_path = os.path.join(video_path, 'n_frames')
101 | n_frames = int(load_value_file(n_frames_file_path))
102 | if n_frames <= 0:
103 | continue
104 |
105 | begin_t = 1
106 | end_t = n_frames
107 | sample = {
108 | 'video': video_path,
109 | 'segment': [begin_t, end_t],
110 | 'n_frames': n_frames,
111 | 'video_id': video_names[i].split('/')[1]
112 | }
113 | if len(annotations) != 0:
114 | sample['label'] = class_to_idx[annotations[i]['label']]
115 | else:
116 | sample['label'] = -1
117 |
118 | if n_samples_for_each_video == 1:
119 | sample['frame_indices'] = list(range(1, n_frames + 1))
120 | dataset.append(sample)
121 | else:
122 | if n_samples_for_each_video > 1:
123 | step = max(1,
124 | math.ceil((n_frames - 1 - sample_duration) /
125 | (n_samples_for_each_video - 1)))
126 | else:
127 | step = sample_duration
128 | for j in range(1, n_frames, step):
129 | sample_j = copy.deepcopy(sample)
130 | sample_j['frame_indices'] = list(
131 | range(j, min(n_frames + 1, j + sample_duration)))
132 | dataset.append(sample_j)
133 |
134 | return dataset, idx_to_class
135 |
136 |
137 | class HMDB51(data.Dataset):
138 | """
139 | Args:
140 | root (string): Root directory path.
141 | spatial_transform (callable, optional): A function/transform that takes in an PIL image
142 | and returns a transformed version. E.g, ``transforms.RandomCrop``
143 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices
144 | and returns a transformed version
145 | target_transform (callable, optional): A function/transform that takes in the
146 | target and transforms it.
147 | loader (callable, optional): A function to load an video given its path and frame indices.
148 | Attributes:
149 | classes (list): List of the class names.
150 | class_to_idx (dict): Dict with items (class_name, class_index).
151 | imgs (list): List of (image path, class_index) tuples
152 | """
153 |
154 | def __init__(self,
155 | root_path,
156 | annotation_path,
157 | subset,
158 | n_samples_for_each_video=1,
159 | spatial_transform=None,
160 | temporal_transform=None,
161 | target_transform=None,
162 | sample_duration=16,
163 | get_loader=get_default_video_loader):
164 | self.data, self.class_names = make_dataset(
165 | root_path, annotation_path, subset, n_samples_for_each_video,
166 | sample_duration)
167 |
168 | self.spatial_transform = spatial_transform
169 | self.temporal_transform = temporal_transform
170 | self.target_transform = target_transform
171 | self.loader = get_loader()
172 |
173 | def __getitem__(self, index):
174 | """
175 | Args:
176 | index (int): Index
177 | Returns:
178 | tuple: (image, target) where target is class_index of the target class.
179 | """
180 | path = self.data[index]['video']
181 |
182 | frame_indices = self.data[index]['frame_indices']
183 | if self.temporal_transform is not None:
184 | frame_indices = self.temporal_transform(frame_indices)
185 | clip = self.loader(path, frame_indices)
186 | if self.spatial_transform is not None:
187 | self.spatial_transform.randomize_parameters()
188 | clip = [self.spatial_transform(img) for img in clip]
189 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3)
190 |
191 | target = self.data[index]
192 | if self.target_transform is not None:
193 | target = self.target_transform(target)
194 |
195 | return clip, target
196 |
197 | def __len__(self):
198 | return len(self.data)
199 |
--------------------------------------------------------------------------------
/3D_experiment/datasets/kinetics.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.utils.data as data
3 | from PIL import Image
4 | import os
5 | import math
6 | import functools
7 | import json
8 | import copy
9 |
10 | from utils import load_value_file
11 |
12 |
13 | def pil_loader(path):
14 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
15 | with open(path, 'rb') as f:
16 | with Image.open(f) as img:
17 | return img.convert('RGB')
18 |
19 |
20 | def accimage_loader(path):
21 | try:
22 | import accimage
23 | return accimage.Image(path)
24 | except IOError:
25 | # Potentially a decoding problem, fall back to PIL.Image
26 | return pil_loader(path)
27 |
28 |
29 | def get_default_image_loader():
30 | from torchvision import get_image_backend
31 | if get_image_backend() == 'accimage':
32 | return accimage_loader
33 | else:
34 | return pil_loader
35 |
36 |
37 | def video_loader(video_dir_path, frame_indices, image_loader):
38 | video = []
39 | for i in frame_indices:
40 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i))
41 | if os.path.exists(image_path):
42 | video.append(image_loader(image_path))
43 | else:
44 | return video
45 |
46 | return video
47 |
48 |
49 | def get_default_video_loader():
50 | image_loader = get_default_image_loader()
51 | return functools.partial(video_loader, image_loader=image_loader)
52 |
53 |
54 | def load_annotation_data(data_file_path):
55 | with open(data_file_path, 'r') as data_file:
56 | return json.load(data_file)
57 |
58 |
59 | def get_class_labels(data):
60 | class_labels_map = {}
61 | index = 0
62 | for class_label in data['labels']:
63 | class_labels_map[class_label] = index
64 | index += 1
65 | return class_labels_map
66 |
67 |
68 | def get_video_names_and_annotations(data, subset):
69 | video_names = []
70 | annotations = []
71 |
72 | for key, value in data['database'].items():
73 | this_subset = value['subset']
74 | if this_subset == subset:
75 | if subset == 'testing':
76 | video_names.append('test/{}'.format(key))
77 | else:
78 | label = value['annotations']['label']
79 | video_names.append('{}/{}'.format(label, key))
80 | annotations.append(value['annotations'])
81 |
82 | return video_names, annotations
83 |
84 |
85 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video,
86 | sample_duration):
87 | data = load_annotation_data(annotation_path)
88 | video_names, annotations = get_video_names_and_annotations(data, subset)
89 | class_to_idx = get_class_labels(data)
90 | idx_to_class = {}
91 | for name, label in class_to_idx.items():
92 | idx_to_class[label] = name
93 |
94 | dataset = []
95 | for i in range(len(video_names)):
96 | if i % 1000 == 0:
97 | print('dataset loading [{}/{}]'.format(i, len(video_names)))
98 |
99 | video_path = os.path.join(root_path, video_names[i])
100 | if not os.path.exists(video_path):
101 | continue
102 |
103 | n_frames_file_path = os.path.join(video_path, 'n_frames')
104 | n_frames = int(load_value_file(n_frames_file_path))
105 | if n_frames <= 0:
106 | continue
107 |
108 | begin_t = 1
109 | end_t = n_frames
110 | sample = {
111 | 'video': video_path,
112 | 'segment': [begin_t, end_t],
113 | 'n_frames': n_frames,
114 | 'video_id': video_names[i][:-14].split('/')[1]
115 | }
116 | if len(annotations) != 0:
117 | sample['label'] = class_to_idx[annotations[i]['label']]
118 | else:
119 | sample['label'] = -1
120 |
121 | if n_samples_for_each_video == 1:
122 | sample['frame_indices'] = list(range(1, n_frames + 1))
123 | dataset.append(sample)
124 | else:
125 | if n_samples_for_each_video > 1:
126 | step = max(1,
127 | math.ceil((n_frames - 1 - sample_duration) /
128 | (n_samples_for_each_video - 1)))
129 | else:
130 | step = sample_duration
131 | for j in range(1, n_frames, step):
132 | sample_j = copy.deepcopy(sample)
133 | sample_j['frame_indices'] = list(
134 | range(j, min(n_frames + 1, j + sample_duration)))
135 | dataset.append(sample_j)
136 |
137 | return dataset, idx_to_class
138 |
139 |
140 | class Kinetics(data.Dataset):
141 | """
142 | Args:
143 | root (string): Root directory path.
144 | spatial_transform (callable, optional): A function/transform that takes in an PIL image
145 | and returns a transformed version. E.g, ``transforms.RandomCrop``
146 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices
147 | and returns a transformed version
148 | target_transform (callable, optional): A function/transform that takes in the
149 | target and transforms it.
150 | loader (callable, optional): A function to load an video given its path and frame indices.
151 | Attributes:
152 | classes (list): List of the class names.
153 | class_to_idx (dict): Dict with items (class_name, class_index).
154 | imgs (list): List of (image path, class_index) tuples
155 | """
156 |
157 | def __init__(self,
158 | root_path,
159 | annotation_path,
160 | subset,
161 | n_samples_for_each_video=1,
162 | spatial_transform=None,
163 | temporal_transform=None,
164 | target_transform=None,
165 | sample_duration=16,
166 | get_loader=get_default_video_loader):
167 | self.data, self.class_names = make_dataset(
168 | root_path, annotation_path, subset, n_samples_for_each_video,
169 | sample_duration)
170 |
171 | self.spatial_transform = spatial_transform
172 | self.temporal_transform = temporal_transform
173 | self.target_transform = target_transform
174 | self.loader = get_loader()
175 |
176 | def __getitem__(self, index):
177 | """
178 | Args:
179 | index (int): Index
180 | Returns:
181 | tuple: (image, target) where target is class_index of the target class.
182 | """
183 | path = self.data[index]['video']
184 |
185 | frame_indices = self.data[index]['frame_indices']
186 | if self.temporal_transform is not None:
187 | frame_indices = self.temporal_transform(frame_indices)
188 | clip = self.loader(path, frame_indices)
189 | if self.spatial_transform is not None:
190 | self.spatial_transform.randomize_parameters()
191 | clip = [self.spatial_transform(img) for img in clip]
192 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3)
193 |
194 | target = self.data[index]
195 | if self.target_transform is not None:
196 | target = self.target_transform(target)
197 |
198 | return clip, target
199 |
200 | def __len__(self):
201 | return len(self.data)
202 |
--------------------------------------------------------------------------------
/3D_experiment/datasets/ucf101.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.utils.data as data
3 | from PIL import Image
4 | import os
5 | import math
6 | import functools
7 | import json
8 | import copy
9 |
10 | from utils import load_value_file
11 |
12 |
13 | def pil_loader(path):
14 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
15 | with open(path, 'rb') as f:
16 | with Image.open(f) as img:
17 | return img.convert('RGB')
18 |
19 |
20 | def accimage_loader(path):
21 | try:
22 | import accimage
23 | return accimage.Image(path)
24 | except IOError:
25 | # Potentially a decoding problem, fall back to PIL.Image
26 | return pil_loader(path)
27 |
28 |
29 | def get_default_image_loader():
30 | from torchvision import get_image_backend
31 | if get_image_backend() == 'accimage':
32 | return accimage_loader
33 | else:
34 | return pil_loader
35 |
36 |
37 | def video_loader(video_dir_path, frame_indices, image_loader):
38 | video = []
39 | for i in frame_indices:
40 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i))
41 | if os.path.exists(image_path):
42 | video.append(image_loader(image_path))
43 | else:
44 | return video
45 |
46 | return video
47 |
48 |
49 | def get_default_video_loader():
50 | image_loader = get_default_image_loader()
51 | return functools.partial(video_loader, image_loader=image_loader)
52 |
53 |
54 | def load_annotation_data(data_file_path):
55 | with open(data_file_path, 'r') as data_file:
56 | return json.load(data_file)
57 |
58 |
59 | def get_class_labels(data):
60 | class_labels_map = {}
61 | index = 0
62 | for class_label in data['labels']:
63 | class_labels_map[class_label] = index
64 | index += 1
65 | return class_labels_map
66 |
67 |
68 | def get_video_names_and_annotations(data, subset):
69 | video_names = []
70 | annotations = []
71 |
72 | for key, value in data['database'].items():
73 | this_subset = value['subset']
74 | if this_subset == subset:
75 | label = value['annotations']['label']
76 | video_names.append('{}/{}'.format(label, key))
77 | annotations.append(value['annotations'])
78 |
79 | return video_names, annotations
80 |
81 |
82 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video,
83 | sample_duration):
84 | data = load_annotation_data(annotation_path)
85 | video_names, annotations = get_video_names_and_annotations(data, subset)
86 | class_to_idx = get_class_labels(data)
87 | idx_to_class = {}
88 | for name, label in class_to_idx.items():
89 | idx_to_class[label] = name
90 |
91 | dataset = []
92 | for i in range(len(video_names)):
93 | if i % 1000 == 0:
94 | print('dataset loading [{}/{}]'.format(i, len(video_names)))
95 |
96 | video_path = os.path.join(root_path, video_names[i])
97 | if not os.path.exists(video_path):
98 | continue
99 |
100 | n_frames_file_path = os.path.join(video_path, 'n_frames')
101 | n_frames = int(load_value_file(n_frames_file_path))
102 | if n_frames <= 0:
103 | continue
104 |
105 | begin_t = 1
106 | end_t = n_frames
107 | sample = {
108 | 'video': video_path,
109 | 'segment': [begin_t, end_t],
110 | 'n_frames': n_frames,
111 | 'video_id': video_names[i].split('/')[1]
112 | }
113 | if len(annotations) != 0:
114 | sample['label'] = class_to_idx[annotations[i]['label']]
115 | else:
116 | sample['label'] = -1
117 |
118 | if n_samples_for_each_video == 1:
119 | sample['frame_indices'] = list(range(1, n_frames + 1))
120 | dataset.append(sample)
121 | else:
122 | if n_samples_for_each_video > 1:
123 | step = max(1,
124 | math.ceil((n_frames - 1 - sample_duration) /
125 | (n_samples_for_each_video - 1)))
126 | else:
127 | step = sample_duration
128 | for j in range(1, n_frames, step):
129 | sample_j = copy.deepcopy(sample)
130 | sample_j['frame_indices'] = list(
131 | range(j, min(n_frames + 1, j + sample_duration)))
132 | dataset.append(sample_j)
133 |
134 | return dataset, idx_to_class
135 |
136 |
137 | class UCF101(data.Dataset):
138 | """
139 | Args:
140 | root (string): Root directory path.
141 | spatial_transform (callable, optional): A function/transform that takes in an PIL image
142 | and returns a transformed version. E.g, ``transforms.RandomCrop``
143 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices
144 | and returns a transformed version
145 | target_transform (callable, optional): A function/transform that takes in the
146 | target and transforms it.
147 | loader (callable, optional): A function to load an video given its path and frame indices.
148 | Attributes:
149 | classes (list): List of the class names.
150 | class_to_idx (dict): Dict with items (class_name, class_index).
151 | imgs (list): List of (image path, class_index) tuples
152 | """
153 |
154 | def __init__(self,
155 | root_path,
156 | annotation_path,
157 | subset,
158 | n_samples_for_each_video=1,
159 | spatial_transform=None,
160 | temporal_transform=None,
161 | target_transform=None,
162 | sample_duration=16,
163 | get_loader=get_default_video_loader):
164 | self.data, self.class_names = make_dataset(
165 | root_path, annotation_path, subset, n_samples_for_each_video,
166 | sample_duration)
167 |
168 | self.spatial_transform = spatial_transform
169 | self.temporal_transform = temporal_transform
170 | self.target_transform = target_transform
171 | self.loader = get_loader()
172 |
173 | def __getitem__(self, index):
174 | """
175 | Args:
176 | index (int): Index
177 | Returns:
178 | tuple: (image, target) where target is class_index of the target class.
179 | """
180 | path = self.data[index]['video']
181 |
182 | frame_indices = self.data[index]['frame_indices']
183 | if self.temporal_transform is not None:
184 | frame_indices = self.temporal_transform(frame_indices)
185 | clip = self.loader(path, frame_indices)
186 | if self.spatial_transform is not None:
187 | self.spatial_transform.randomize_parameters()
188 | clip = [self.spatial_transform(img) for img in clip]
189 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3)
190 |
191 | target = self.data[index]
192 | if self.target_transform is not None:
193 | target = self.target_transform(target)
194 |
195 | return clip, target
196 |
197 | def __len__(self):
198 | return len(self.data)
199 |
--------------------------------------------------------------------------------
/3D_experiment/main.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import json
4 | import numpy as np
5 | import torch
6 | from torch import nn
7 | from torch import optim
8 | from torch.optim import lr_scheduler
9 |
10 | from opts import parse_opts
11 | from mean import get_mean, get_std
12 | from spatial_transforms import (
13 | Compose, Normalize, Scale, CenterCrop, CornerCrop, MultiScaleCornerCrop,
14 | MultiScaleRandomCrop, RandomHorizontalFlip, ToTensor)
15 | from temporal_transforms import LoopPadding, TemporalRandomCrop
16 | from target_transforms import ClassLabel, VideoID
17 | from target_transforms import Compose as TargetCompose
18 | from dataset import get_training_set, get_validation_set, get_test_set
19 | from utils import Logger
20 | from train import train_epoch
21 | from validation import val_epoch
22 | import test
23 | from models.resnet3D import resnet3D50
24 |
25 | if __name__ == '__main__':
26 | opt = parse_opts()
27 | if opt.root_path != '':
28 | opt.video_path = os.path.join(opt.root_path, opt.video_path)
29 | opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path)
30 | opt.result_path = os.path.join(opt.root_path, opt.result_path)
31 | if opt.resume_path:
32 | opt.resume_path = os.path.join(opt.root_path, opt.resume_path)
33 | if opt.pretrain_path:
34 | opt.pretrain_path = os.path.join(opt.root_path, opt.pretrain_path)
35 | opt.scales = [opt.initial_scale]
36 | for i in range(1, opt.n_scales):
37 | opt.scales.append(opt.scales[-1] * opt.scale_step)
38 | opt.arch = '{}-{}'.format(opt.model, opt.model_depth)
39 | opt.mean = get_mean(opt.norm_value, dataset=opt.mean_dataset)
40 | opt.std = get_std(opt.norm_value)
41 | print(opt)
42 | with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file:
43 | json.dump(vars(opt), opt_file)
44 |
45 | torch.manual_seed(opt.manual_seed)
46 |
47 | model = resnet3D50(non_local=True)
48 | parameters = model.parameters()
49 |
50 | if not opt.no_cuda:
51 | model = model.cuda()
52 | model = nn.DataParallel(model, device_ids=None)
53 |
54 | print(model)
55 | criterion = nn.CrossEntropyLoss()
56 | if not opt.no_cuda:
57 | criterion = criterion.cuda()
58 |
59 | if opt.no_mean_norm and not opt.std_norm:
60 | norm_method = Normalize([0, 0, 0], [1, 1, 1])
61 | elif not opt.std_norm:
62 | norm_method = Normalize(opt.mean, [1, 1, 1])
63 | else:
64 | norm_method = Normalize(opt.mean, opt.std)
65 |
66 | if not opt.no_train:
67 | assert opt.train_crop in ['random', 'corner', 'center']
68 | if opt.train_crop == 'random':
69 | crop_method = MultiScaleRandomCrop(opt.scales, opt.sample_size)
70 | elif opt.train_crop == 'corner':
71 | crop_method = MultiScaleCornerCrop(opt.scales, opt.sample_size)
72 | elif opt.train_crop == 'center':
73 | crop_method = MultiScaleCornerCrop(
74 | opt.scales, opt.sample_size, crop_positions=['c'])
75 | spatial_transform = Compose([
76 | crop_method,
77 | RandomHorizontalFlip(),
78 | ToTensor(opt.norm_value), norm_method
79 | ])
80 | temporal_transform = TemporalRandomCrop(opt.sample_duration)
81 | target_transform = ClassLabel()
82 | training_data = get_training_set(opt, spatial_transform,
83 | temporal_transform, target_transform)
84 | train_loader = torch.utils.data.DataLoader(
85 | training_data,
86 | batch_size=opt.batch_size,
87 | shuffle=True,
88 | num_workers=opt.n_threads,
89 | pin_memory=True)
90 | train_logger = Logger(
91 | os.path.join(opt.result_path, 'train.log'),
92 | ['epoch', 'loss', 'acc', 'lr'])
93 | train_batch_logger = Logger(
94 | os.path.join(opt.result_path, 'train_batch.log'),
95 | ['epoch', 'batch', 'iter', 'loss', 'acc', 'lr'])
96 |
97 | if opt.nesterov:
98 | dampening = 0
99 | else:
100 | dampening = opt.dampening
101 | optimizer = optim.SGD(
102 | parameters,
103 | lr=opt.learning_rate,
104 | momentum=opt.momentum,
105 | dampening=dampening,
106 | weight_decay=opt.weight_decay,
107 | nesterov=opt.nesterov)
108 | scheduler = lr_scheduler.ReduceLROnPlateau(
109 | optimizer, 'min', patience=opt.lr_patience)
110 | if not opt.no_val:
111 | spatial_transform = Compose([
112 | Scale(opt.sample_size),
113 | CenterCrop(opt.sample_size),
114 | ToTensor(opt.norm_value), norm_method
115 | ])
116 | temporal_transform = LoopPadding(opt.sample_duration)
117 | target_transform = ClassLabel()
118 | validation_data = get_validation_set(
119 | opt, spatial_transform, temporal_transform, target_transform)
120 | val_loader = torch.utils.data.DataLoader(
121 | validation_data,
122 | batch_size=opt.batch_size,
123 | shuffle=False,
124 | num_workers=opt.n_threads,
125 | pin_memory=True)
126 | val_logger = Logger(
127 | os.path.join(opt.result_path, 'val.log'), ['epoch', 'loss', 'acc'])
128 |
129 | if opt.resume_path:
130 | print('loading checkpoint {}'.format(opt.resume_path))
131 | checkpoint = torch.load(opt.resume_path)
132 | assert opt.arch == checkpoint['arch']
133 |
134 | opt.begin_epoch = checkpoint['epoch']
135 | model.load_state_dict(checkpoint['state_dict'])
136 | if not opt.no_train:
137 | optimizer.load_state_dict(checkpoint['optimizer'])
138 |
139 | print('run')
140 | for i in range(opt.begin_epoch, opt.n_epochs + 1):
141 | if not opt.no_train:
142 | train_epoch(i, train_loader, model, criterion, optimizer, opt,
143 | train_logger, train_batch_logger)
144 | if not opt.no_val:
145 | validation_loss = val_epoch(i, val_loader, model, criterion, opt,
146 | val_logger)
147 |
148 | if not opt.no_train and not opt.no_val:
149 | scheduler.step(validation_loss)
150 |
151 | if opt.test:
152 | spatial_transform = Compose([
153 | Scale(int(opt.sample_size / opt.scale_in_test)),
154 | CornerCrop(opt.sample_size, opt.crop_position_in_test),
155 | ToTensor(opt.norm_value), norm_method
156 | ])
157 | temporal_transform = LoopPadding(opt.sample_duration)
158 | target_transform = VideoID()
159 |
160 | test_data = get_test_set(opt, spatial_transform, temporal_transform,
161 | target_transform)
162 | test_loader = torch.utils.data.DataLoader(
163 | test_data,
164 | batch_size=opt.batch_size,
165 | shuffle=False,
166 | num_workers=opt.n_threads,
167 | pin_memory=True)
168 | test.test(test_loader, model, opt, test_data.class_names)
169 |
--------------------------------------------------------------------------------
/3D_experiment/mean.py:
--------------------------------------------------------------------------------
1 | def get_mean(norm_value=255, dataset='activitynet'):
2 | assert dataset in ['activitynet', 'kinetics']
3 |
4 | if dataset == 'activitynet':
5 | return [
6 | 114.7748 / norm_value, 107.7354 / norm_value, 99.4750 / norm_value
7 | ]
8 | elif dataset == 'kinetics':
9 | # Kinetics (10 videos for each class)
10 | return [
11 | 110.63666788 / norm_value, 103.16065604 / norm_value,
12 | 96.29023126 / norm_value
13 | ]
14 |
15 |
16 | def get_std(norm_value=255):
17 | # Kinetics (10 videos for each class)
18 | return [
19 | 38.7568578 / norm_value, 37.88248729 / norm_value,
20 | 40.02898126 / norm_value
21 | ]
22 |
--------------------------------------------------------------------------------
/3D_experiment/models/non_local.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from torch.nn import functional as F
4 |
5 |
6 | class NLBlockND(nn.Module):
7 | def __init__(self, in_channels, inter_channels=None, mode='embedded',
8 | dimension=3, bn_layer=True):
9 | """Implementation of Non-Local Block with 4 different pairwise functions
10 | args:
11 | in_channels: original channel size (1024 in the paper)
12 |             inter_channels: channel size inside the block; if not specified, reduced to half (512 in the paper)
13 | mode: supports Gaussian, Embedded Gaussian, Dot Product, and Concatenation
14 | dimension: can be 1 (temporal), 2 (spatial), 3 (spatiotemporal)
15 | bn_layer: whether to add batch norm
16 | """
17 | super(NLBlockND, self).__init__()
18 |
19 | assert dimension in [1, 2, 3]
20 |
21 | if mode not in ['gaussian', 'embedded', 'dot', 'concatenate']:
22 | raise ValueError('`mode` must be one of `gaussian`, `embedded`, `dot` or `concatenate`')
23 |
24 | self.mode = mode
25 | self.dimension = dimension
26 |
27 | self.in_channels = in_channels
28 | self.inter_channels = inter_channels
29 |
30 | # the channel size is reduced to half inside the block
31 | if self.inter_channels is None:
32 | self.inter_channels = in_channels // 2
33 | if self.inter_channels == 0:
34 | self.inter_channels = 1
35 |
36 | # assign appropriate convolutional, max pool, and batch norm layers for different dimensions
37 | if dimension == 3:
38 | conv_nd = nn.Conv3d
39 | max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2))
40 | bn = nn.BatchNorm3d
41 | elif dimension == 2:
42 | conv_nd = nn.Conv2d
43 | max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2))
44 | bn = nn.BatchNorm2d
45 | else:
46 | conv_nd = nn.Conv1d
47 | max_pool_layer = nn.MaxPool1d(kernel_size=(2))
48 | bn = nn.BatchNorm1d
49 |
50 | # function g in the paper which goes through conv. with kernel size 1
51 | self.g = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, kernel_size=1)
52 |
53 | # add BatchNorm layer after the last conv layer
54 | if bn_layer:
55 | self.W_z = nn.Sequential(
56 | conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, kernel_size=1),
57 | bn(self.in_channels)
58 | )
59 | nn.init.constant_(self.W_z[1].weight, 0)
60 | nn.init.constant_(self.W_z[1].bias, 0)
61 | else:
62 | self.W_z = conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, kernel_size=1)
63 | nn.init.constant_(self.W_z.weight, 0)
64 | nn.init.constant_(self.W_z.bias, 0)
65 |
66 | # define theta and phi for all operations except gaussian
67 | if self.mode == "embedded" or self.mode == "dot" or self.mode == "concatenate":
68 | self.theta = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, kernel_size=1)
69 | self.phi = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, kernel_size=1)
70 |
71 | if self.mode == "concatenate":
72 | self.W_f = nn.Sequential(
73 |                 nn.Conv2d(in_channels=self.inter_channels * 2, out_channels=1, kernel_size=1),
74 | nn.ReLU()
75 | )
76 |
77 | def forward(self, x):
78 | """
79 | args
80 | x: (N, C, T, H, W) for dimension=3; (N, C, H, W) for dimension 2; (N, C, T) for dimension 1
81 | """
82 |
83 | batch_size = x.size(0)
84 |
85 | # (N, C, THW)
86 | g_x = self.g(x).view(batch_size, self.inter_channels, -1)
87 | g_x = g_x.permute(0, 2, 1)
88 |
89 | if self.mode == "gaussian":
90 | theta_x = x.view(batch_size, self.in_channels, -1)
91 | phi_x = x.view(batch_size, self.in_channels, -1)
92 | theta_x = theta_x.permute(0, 2, 1)
93 | f = torch.matmul(theta_x, phi_x)
94 |
95 | elif self.mode == "embedded" or self.mode == "dot":
96 | theta_x = self.theta(x).view(batch_size, self.inter_channels, -1)
97 | phi_x = self.phi(x).view(batch_size, self.inter_channels, -1)
98 | theta_x = theta_x.permute(0, 2, 1)
99 | f = torch.matmul(theta_x, phi_x)
100 |
101 | elif self.mode == "concatenate":
102 | theta_x = self.theta(x).view(batch_size, self.inter_channels, -1, 1)
103 | phi_x = self.phi(x).view(batch_size, self.inter_channels, 1, -1)
104 |
105 | h = theta_x.size(2)
106 | w = phi_x.size(3)
107 | theta_x = theta_x.repeat(1, 1, 1, w)
108 | phi_x = phi_x.repeat(1, 1, h, 1)
109 |
110 | concat = torch.cat([theta_x, phi_x], dim=1)
111 | f = self.W_f(concat)
112 | f = f.view(f.size(0), f.size(2), f.size(3))
113 |
114 | if self.mode == "gaussian" or self.mode == "embedded":
115 | f_div_C = F.softmax(f, dim=-1)
116 | elif self.mode == "dot" or self.mode == "concatenate":
117 |             N = f.size(-1) # number of positions in x
118 | f_div_C = f / N
119 |
120 | y = torch.matmul(f_div_C, g_x)
121 |
122 | # contiguous here just allocates contiguous chunk of memory
123 | y = y.permute(0, 2, 1).contiguous()
124 | y = y.view(batch_size, self.inter_channels, *x.size()[2:])
125 |
126 | W_y = self.W_z(y)
127 | # residual connection
128 | z = W_y + x
129 |
130 | return z
131 |
132 |
133 | if __name__ == '__main__':
134 | import torch
135 |
136 | for bn_layer in [True, False]:
137 | img = torch.zeros(2, 3, 20)
138 | net = NLBlockND(in_channels=3, mode='concatenate', dimension=1, bn_layer=bn_layer)
139 | out = net(img)
140 | print(out.size())
141 |
142 | img = torch.zeros(2, 3, 20, 20)
143 | net = NLBlockND(in_channels=3, mode='concatenate', dimension=2, bn_layer=bn_layer)
144 | out = net(img)
145 | print(out.size())
146 |
147 | img = torch.randn(2, 3, 8, 20, 20)
148 | net = NLBlockND(in_channels=3, mode='concatenate', dimension=3, bn_layer=bn_layer)
149 | out = net(img)
150 | print(out.size())
151 |
152 |
153 |
--------------------------------------------------------------------------------
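For reference, the forward() above implements the generic non-local operation; with the softmax normalization used for the gaussian/embedded modes (and division by the number of positions N for the dot/concatenate modes) it reads

    y_i = \frac{1}{\mathcal{C}(x)} \sum_{\forall j} f(x_i, x_j)\, g(x_j), \qquad z_i = W_z\, y_i + x_i, \qquad f_{\text{embedded}}(x_i, x_j) = e^{\theta(x_i)^\top \phi(x_j)}

where \mathcal{C}(x) = \sum_{\forall j} f(x_i, x_j) for the gaussian/embedded modes and \mathcal{C}(x) = N otherwise.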
/3D_experiment/models/resnet3D.py:
--------------------------------------------------------------------------------
1 | """
2 | ResNet-50 (C2D) for spatiotemporal tasks. Only the ResNet-50 backbone structure is implemented here.
3 | """
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 | import math
9 | from functools import partial
10 | from models.non_local import NLBlockND
11 |
12 |
13 | class Bottleneck(nn.Module):
14 | """
15 | Bottleneck block structure used in ResNet 50.
16 |     As mentioned in Section 4 (2D ConvNet baseline, C2D) of the paper,
17 |     all convolutions are in essence 2D kernels that process the input frame-by-frame
18 |     (implemented as (1 x k x k) kernels).
19 | """
20 | expansion = 4
21 |
22 | def __init__(self, inplanes, planes, stride=1, padding=(0, 1, 1), downsample=None):
23 | super(Bottleneck, self).__init__()
24 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=(1, 1, 1), bias=False)
25 | self.bn1 = nn.BatchNorm3d(planes)
26 | self.conv2 = nn.Conv3d(planes, planes, kernel_size=(1, 3, 3), stride=stride, padding=padding, bias=False)
27 | self.bn2 = nn.BatchNorm3d(planes)
28 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=(1, 1, 1), bias=False)
29 | self.bn3 = nn.BatchNorm3d(planes * 4)
30 | self.relu = nn.ReLU(inplace=True)
31 | self.downsample = downsample
32 | self.stride = stride
33 |
34 | def forward(self, x):
35 | identity = x
36 |
37 | out = self.conv1(x)
38 | out = self.bn1(out)
39 | out = self.relu(out)
40 |
41 | out = self.conv2(out)
42 | out = self.bn2(out)
43 | out = self.relu(out)
44 |
45 | out = self.conv3(out)
46 | out = self.bn3(out)
47 |
48 | if self.downsample is not None:
49 | identity = self.downsample(x)
50 |
51 | out += identity
52 | out = self.relu(out)
53 |
54 | return out
55 |
56 |
57 | class ResNet3D(nn.Module):
58 | """C2D with ResNet 50 backbone.
59 |     The only operation involving the temporal domain is the pooling layer after the second residual block.
60 | For more details of the structure, refer to Table 1 from the paper.
61 | Padding was added accordingly to match the correct dimensionality.
62 | """
63 | def __init__(self, block, layers, num_classes=400, non_local=False):
64 | self.inplanes = 64
65 | super(ResNet3D, self).__init__()
66 |
67 | # first convolution operation has essentially 2D kernels
68 | # output: 64 x 16 x 112 x 112
69 | self.conv1 = nn.Conv3d(3, 64, kernel_size=(1, 7, 7), stride=2, padding=(0, 3, 3), bias=False)
70 | self.bn1 = nn.BatchNorm3d(64)
71 | self.relu = nn.ReLU(inplace=True)
72 |
73 | # output: 64 x 8 x 56 x 56
74 | self.pool1 = nn.MaxPool3d(kernel_size=3, stride=2)
75 |
76 | # output: 256 x 8 x 56 x 56
77 | self.layer1 = self._make_layer(block, 64, layers[0], stride=1, d_padding=0)
78 |
79 | # pooling on temporal domain
80 | # output: 256 x 4 x 56 x 56
81 | self.pool_t = nn.MaxPool3d(kernel_size=(3, 1, 1), stride=(2, 1, 1))
82 |
83 | # output: 512 x 4 x 28 x 28
84 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2, padding=(2, 1, 1))
85 |
86 | # add one non-local block here
87 | # output: 1024 x 4 x 14 x 14
88 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2, padding=(2, 1, 1), non_local=non_local)
89 |
90 | # output: 2048 x 4 x 7 x 7
91 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2, padding=(2, 1, 1))
92 |
93 | # output: 2048 x 1
94 | self.avgpool = nn.AvgPool3d(kernel_size=(4, 7, 7))
95 | self.fc = nn.Linear(512 * block.expansion, num_classes)
96 |
97 | for m in self.modules():
98 | if isinstance(m, nn.Conv3d):
99 |                 nn.init.kaiming_normal_(m.weight, mode='fan_out')
100 | elif isinstance(m, nn.BatchNorm3d):
101 | m.weight.data.fill_(1)
102 | m.bias.data.zero_()
103 |
104 | def _make_layer(self, block, planes, blocks, stride=1, padding=(0, 1, 1), d_padding=(2, 0, 0), non_local=False):
105 | downsample = nn.Sequential(
106 | nn.Conv3d(self.inplanes, planes * block.expansion,
107 | kernel_size=1, stride=stride, padding=d_padding, bias=False),
108 | nn.BatchNorm3d(planes * block.expansion)
109 | )
110 |
111 | layers = []
112 | layers.append(block(self.inplanes, planes, stride, padding, downsample))
113 | self.inplanes = planes * block.expansion
114 |
115 | last_idx = blocks
116 | if non_local:
117 | last_idx = blocks - 1
118 |
119 | for i in range(1, last_idx):
120 | layers.append(block(self.inplanes, planes))
121 |
122 | # add non-local block here
123 | if non_local:
124 | layers.append(NLBlockND(in_channels=1024, dimension=3))
125 | layers.append(block(self.inplanes, planes))
126 |
127 | return nn.Sequential(*layers)
128 |
129 | def forward(self, x):
130 | x = self.conv1(x)
131 | x = self.bn1(x)
132 | x = self.relu(x)
133 | x = self.pool1(x)
134 |
135 | x = self.layer1(x)
136 | x = self.pool_t(x)
137 | x = self.layer2(x)
138 | x = self.layer3(x)
139 | x = self.layer4(x)
140 |
141 | x = self.avgpool(x)
142 |
143 | x = x.view(x.size(0), -1)
144 | x = self.fc(x)
145 |
146 | return x
147 |
148 |
149 | def resnet3D50(non_local=False, **kwargs):
150 | """Constructs a C2D ResNet-50 model.
151 | """
152 | model = ResNet3D(Bottleneck, [3, 4, 6, 3], non_local=non_local, **kwargs)
153 | return model
154 |
155 |
156 |
157 | if __name__ == '__main__':
158 | # Test case of 32 frames (224 x 224 x 3) input of batch size 1
159 | img = Variable(torch.randn(1, 3, 32, 224, 224))
160 | net = resnet3D50(non_local=True)
161 | count = 0
162 | for name, param in net.named_parameters():
163 | if param.requires_grad:
164 | count += 1
165 | print(name)
166 |     print(count)
167 | out = net(img)
168 | print(out.size())
169 |
--------------------------------------------------------------------------------
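As a quick sanity check of the C2D idea described in the docstrings above (a sketch that is not part of the repo), a (1 x k x k) Conv3d is the same computation as one shared 2D convolution applied to every frame independently:

import torch
import torch.nn as nn

x = torch.randn(1, 16, 8, 32, 32)  # (N, C, T, H, W)

conv3d = nn.Conv3d(16, 32, kernel_size=(1, 3, 3), padding=(0, 1, 1), bias=False)
conv2d = nn.Conv2d(16, 32, kernel_size=3, padding=1, bias=False)
conv2d.weight.data.copy_(conv3d.weight.data.squeeze(2))  # share the same weights

out3d = conv3d(x)
# apply the 2D conv frame-by-frame and stack the results along the time axis
out2d = torch.stack([conv2d(x[:, :, t]) for t in range(x.size(2))], dim=2)
print(torch.allclose(out3d, out2d, atol=1e-5))  # True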
/3D_experiment/opts.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 |
4 | def parse_opts():
5 | parser = argparse.ArgumentParser()
6 | parser.add_argument(
7 | '--root_path',
8 | default='/root/data/ActivityNet',
9 | type=str,
10 | help='Root directory path of data')
11 | parser.add_argument(
12 | '--video_path',
13 | default='video_kinetics_jpg',
14 | type=str,
15 | help='Directory path of Videos')
16 | parser.add_argument(
17 | '--annotation_path',
18 | default='kinetics.json',
19 | type=str,
20 | help='Annotation file path')
21 | parser.add_argument(
22 | '--result_path',
23 | default='results',
24 | type=str,
25 | help='Result directory path')
26 | parser.add_argument(
27 | '--dataset',
28 | default='kinetics',
29 | type=str,
30 | help='Used dataset (activitynet | kinetics | ucf101 | hmdb51)')
31 | parser.add_argument(
32 | '--n_classes',
33 | default=400,
34 | type=int,
35 | help=
36 | 'Number of classes (activitynet: 200, kinetics: 400, ucf101: 101, hmdb51: 51)'
37 | )
38 | parser.add_argument(
39 | '--n_finetune_classes',
40 | default=400,
41 | type=int,
42 | help=
43 | 'Number of classes for fine-tuning. n_classes is set to the number when pretraining.'
44 | )
45 | parser.add_argument(
46 | '--sample_size',
47 | default=112,
48 | type=int,
49 | help='Height and width of inputs')
50 | parser.add_argument(
51 | '--sample_duration',
52 | default=16,
53 | type=int,
54 | help='Temporal duration of inputs')
55 | parser.add_argument(
56 | '--initial_scale',
57 | default=1.0,
58 | type=float,
59 | help='Initial scale for multiscale cropping')
60 | parser.add_argument(
61 | '--n_scales',
62 | default=5,
63 | type=int,
64 | help='Number of scales for multiscale cropping')
65 | parser.add_argument(
66 | '--scale_step',
67 | default=0.84089641525,
68 | type=float,
69 | help='Scale step for multiscale cropping')
70 | parser.add_argument(
71 | '--train_crop',
72 | default='corner',
73 | type=str,
74 | help=
75 | 'Spatial cropping method in training. random is uniform. corner is selection from 4 corners and 1 center. (random | corner | center)'
76 | )
77 | parser.add_argument(
78 | '--learning_rate',
79 | default=0.1,
80 | type=float,
81 | help=
82 | 'Initial learning rate (divided by 10 while training by lr scheduler)')
83 | parser.add_argument('--momentum', default=0.9, type=float, help='Momentum')
84 | parser.add_argument(
85 | '--dampening', default=0.9, type=float, help='dampening of SGD')
86 | parser.add_argument(
87 | '--weight_decay', default=1e-3, type=float, help='Weight Decay')
88 | parser.add_argument(
89 | '--mean_dataset',
90 | default='activitynet',
91 | type=str,
92 | help=
93 | 'dataset for mean values of mean subtraction (activitynet | kinetics)')
94 | parser.add_argument(
95 | '--no_mean_norm',
96 | action='store_true',
97 | help='If true, inputs are not normalized by mean.')
98 | parser.set_defaults(no_mean_norm=False)
99 | parser.add_argument(
100 | '--std_norm',
101 | action='store_true',
102 | help='If true, inputs are normalized by standard deviation.')
103 | parser.set_defaults(std_norm=False)
104 | parser.add_argument(
105 | '--nesterov', action='store_true', help='Nesterov momentum')
106 | parser.set_defaults(nesterov=False)
107 | parser.add_argument(
108 | '--optimizer',
109 | default='sgd',
110 | type=str,
111 | help='Currently only support SGD')
112 | parser.add_argument(
113 | '--lr_patience',
114 | default=10,
115 | type=int,
116 | help='Patience of LR scheduler. See documentation of ReduceLROnPlateau.'
117 | )
118 | parser.add_argument(
119 | '--batch_size', default=128, type=int, help='Batch Size')
120 | parser.add_argument(
121 | '--n_epochs',
122 | default=200,
123 | type=int,
124 | help='Number of total epochs to run')
125 | parser.add_argument(
126 | '--begin_epoch',
127 | default=1,
128 | type=int,
129 | help=
130 |         'Training begins at this epoch. The previously trained model indicated by resume_path is loaded.'
131 | )
132 | parser.add_argument(
133 | '--n_val_samples',
134 | default=3,
135 | type=int,
136 | help='Number of validation samples for each activity')
137 | parser.add_argument(
138 | '--resume_path',
139 | default='',
140 | type=str,
141 | help='Save data (.pth) of previous training')
142 | parser.add_argument(
143 | '--pretrain_path', default='', type=str, help='Pretrained model (.pth)')
144 | parser.add_argument(
145 | '--ft_begin_index',
146 | default=0,
147 | type=int,
148 | help='Begin block index of fine-tuning')
149 | parser.add_argument(
150 | '--no_train',
151 | action='store_true',
152 | help='If true, training is not performed.')
153 | parser.set_defaults(no_train=False)
154 | parser.add_argument(
155 | '--no_val',
156 | action='store_true',
157 | help='If true, validation is not performed.')
158 | parser.set_defaults(no_val=False)
159 | parser.add_argument(
160 | '--test', action='store_true', help='If true, test is performed.')
161 | parser.set_defaults(test=False)
162 | parser.add_argument(
163 | '--test_subset',
164 | default='val',
165 | type=str,
166 | help='Used subset in test (val | test)')
167 | parser.add_argument(
168 | '--scale_in_test',
169 | default=1.0,
170 | type=float,
171 | help='Spatial scale in test')
172 | parser.add_argument(
173 | '--crop_position_in_test',
174 | default='c',
175 | type=str,
176 | help='Cropping method (c | tl | tr | bl | br) in test')
177 | parser.add_argument(
178 | '--no_softmax_in_test',
179 | action='store_true',
180 | help='If true, output for each clip is not normalized using softmax.')
181 | parser.set_defaults(no_softmax_in_test=False)
182 | parser.add_argument(
183 | '--no_cuda', action='store_true', help='If true, cuda is not used.')
184 | parser.set_defaults(no_cuda=False)
185 | parser.add_argument(
186 | '--n_threads',
187 | default=4,
188 | type=int,
189 | help='Number of threads for multi-thread loading')
190 | parser.add_argument(
191 | '--checkpoint',
192 | default=10,
193 | type=int,
194 |         help='Trained model is saved every this many epochs.')
195 | parser.add_argument(
196 | '--no_hflip',
197 | action='store_true',
198 |         help='If true, horizontal flipping is not performed.')
199 | parser.set_defaults(no_hflip=False)
200 | parser.add_argument(
201 | '--norm_value',
202 | default=1,
203 | type=int,
204 | help=
205 | 'If 1, range of inputs is [0-255]. If 255, range of inputs is [0-1].')
206 | parser.add_argument(
207 | '--model',
208 | default='resnet',
209 | type=str,
210 |         help='(resnet | resnet_nl | preresnet | wideresnet | resnext | densenet)')
211 | parser.add_argument(
212 | '--model_depth',
213 | default=18,
214 | type=int,
215 | help='Depth of resnet (10 | 18 | 34 | 50 | 101)')
216 | parser.add_argument(
217 | '--resnet_shortcut',
218 | default='B',
219 | type=str,
220 | help='Shortcut type of resnet (A | B)')
221 | parser.add_argument(
222 | '--wide_resnet_k', default=2, type=int, help='Wide resnet k')
223 | parser.add_argument(
224 | '--resnext_cardinality',
225 | default=32,
226 | type=int,
227 | help='ResNeXt cardinality')
228 | parser.add_argument(
229 | '--manual_seed', default=1, type=int, help='Manually set random seed')
230 |
231 | args = parser.parse_args()
232 |
233 | return args
234 |
--------------------------------------------------------------------------------
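A small usage sketch (not in the repo; the argument values are only illustrative) for obtaining a populated option namespace outside the normal command line, e.g. in a notebook:

import sys
from opts import parse_opts

# argparse reads sys.argv, so overriding it is enough for a quick experiment
sys.argv = ['main.py', '--dataset', 'hmdb51', '--n_classes', '51', '--model_depth', '50']
opt = parse_opts()
print(opt.dataset, opt.n_classes, opt.batch_size)  # hmdb51 51 128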
/3D_experiment/run.sh:
--------------------------------------------------------------------------------
1 | python main.py --sample_size 224 --root_path ./data --video_path hmdb51/jpg --annotation_path hmdb51_1.json --result_path results --dataset hmdb51 --model resnet --model_depth 50 --n_classes 51 --batch_size 32 --n_threads 4 --checkpoint 5 2>&1 | tee output_hmdb.txt
2 |
--------------------------------------------------------------------------------
/3D_experiment/spatial_transforms.py:
--------------------------------------------------------------------------------
1 | import random
2 | import math
3 | import numbers
4 | import collections.abc
5 | import numpy as np
6 | import torch
7 | from PIL import Image, ImageOps
8 | try:
9 | import accimage
10 | except ImportError:
11 | accimage = None
12 |
13 |
14 | class Compose(object):
15 | """Composes several transforms together.
16 | Args:
17 | transforms (list of ``Transform`` objects): list of transforms to compose.
18 | Example:
19 | >>> transforms.Compose([
20 | >>> transforms.CenterCrop(10),
21 | >>> transforms.ToTensor(),
22 | >>> ])
23 | """
24 |
25 | def __init__(self, transforms):
26 | self.transforms = transforms
27 |
28 | def __call__(self, img):
29 | for t in self.transforms:
30 | img = t(img)
31 | return img
32 |
33 | def randomize_parameters(self):
34 | for t in self.transforms:
35 | t.randomize_parameters()
36 |
37 |
38 | class ToTensor(object):
39 | """Convert a ``PIL.Image`` or ``numpy.ndarray`` to tensor.
40 | Converts a PIL.Image or numpy.ndarray (H x W x C) in the range
41 | [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0].
42 | """
43 |
44 | def __init__(self, norm_value=255):
45 | self.norm_value = norm_value
46 |
47 | def __call__(self, pic):
48 | """
49 | Args:
50 | pic (PIL.Image or numpy.ndarray): Image to be converted to tensor.
51 | Returns:
52 | Tensor: Converted image.
53 | """
54 | if isinstance(pic, np.ndarray):
55 | # handle numpy array
56 | img = torch.from_numpy(pic.transpose((2, 0, 1)))
57 | # backward compatibility
58 | return img.float().div(self.norm_value)
59 |
60 | if accimage is not None and isinstance(pic, accimage.Image):
61 | nppic = np.zeros(
62 | [pic.channels, pic.height, pic.width], dtype=np.float32)
63 | pic.copyto(nppic)
64 | return torch.from_numpy(nppic)
65 |
66 | # handle PIL Image
67 | if pic.mode == 'I':
68 | img = torch.from_numpy(np.array(pic, np.int32, copy=False))
69 | elif pic.mode == 'I;16':
70 | img = torch.from_numpy(np.array(pic, np.int16, copy=False))
71 | else:
72 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes()))
73 | # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK
74 | if pic.mode == 'YCbCr':
75 | nchannel = 3
76 | elif pic.mode == 'I;16':
77 | nchannel = 1
78 | else:
79 | nchannel = len(pic.mode)
80 | img = img.view(pic.size[1], pic.size[0], nchannel)
81 | # put it from HWC to CHW format
82 | # yikes, this transpose takes 80% of the loading time/CPU
83 | img = img.transpose(0, 1).transpose(0, 2).contiguous()
84 | if isinstance(img, torch.ByteTensor):
85 | return img.float().div(self.norm_value)
86 | else:
87 | return img
88 |
89 | def randomize_parameters(self):
90 | pass
91 |
92 |
93 | class Normalize(object):
94 |     """Normalize a tensor image with mean and standard deviation.
95 | Given mean: (R, G, B) and std: (R, G, B),
96 | will normalize each channel of the torch.*Tensor, i.e.
97 | channel = (channel - mean) / std
98 | Args:
99 |         mean (sequence): Sequence of means for R, G, B channels respectively.
100 |         std (sequence): Sequence of standard deviations for R, G, B channels
101 |             respectively.
102 | """
103 |
104 | def __init__(self, mean, std):
105 | self.mean = mean
106 | self.std = std
107 |
108 | def __call__(self, tensor):
109 | """
110 | Args:
111 | tensor (Tensor): Tensor image of size (C, H, W) to be normalized.
112 | Returns:
113 | Tensor: Normalized image.
114 | """
115 | # TODO: make efficient
116 | for t, m, s in zip(tensor, self.mean, self.std):
117 | t.sub_(m).div_(s)
118 | return tensor
119 |
120 | def randomize_parameters(self):
121 | pass
122 |
123 |
124 | class Scale(object):
125 | """Rescale the input PIL.Image to the given size.
126 | Args:
127 | size (sequence or int): Desired output size. If size is a sequence like
128 | (w, h), output size will be matched to this. If size is an int,
129 | smaller edge of the image will be matched to this number.
130 | i.e, if height > width, then image will be rescaled to
131 | (size * height / width, size)
132 | interpolation (int, optional): Desired interpolation. Default is
133 | ``PIL.Image.BILINEAR``
134 | """
135 |
136 | def __init__(self, size, interpolation=Image.BILINEAR):
137 | assert isinstance(size,
138 |                           int) or (isinstance(size, collections.abc.Iterable) and
139 | len(size) == 2)
140 | self.size = size
141 | self.interpolation = interpolation
142 |
143 | def __call__(self, img):
144 | """
145 | Args:
146 | img (PIL.Image): Image to be scaled.
147 | Returns:
148 | PIL.Image: Rescaled image.
149 | """
150 | if isinstance(self.size, int):
151 | w, h = img.size
152 | if (w <= h and w == self.size) or (h <= w and h == self.size):
153 | return img
154 | if w < h:
155 | ow = self.size
156 | oh = int(self.size * h / w)
157 | return img.resize((ow, oh), self.interpolation)
158 | else:
159 | oh = self.size
160 | ow = int(self.size * w / h)
161 | return img.resize((ow, oh), self.interpolation)
162 | else:
163 | return img.resize(self.size, self.interpolation)
164 |
165 | def randomize_parameters(self):
166 | pass
167 |
168 |
169 | class CenterCrop(object):
170 | """Crops the given PIL.Image at the center.
171 | Args:
172 | size (sequence or int): Desired output size of the crop. If size is an
173 | int instead of sequence like (h, w), a square crop (size, size) is
174 | made.
175 | """
176 |
177 | def __init__(self, size):
178 | if isinstance(size, numbers.Number):
179 | self.size = (int(size), int(size))
180 | else:
181 | self.size = size
182 |
183 | def __call__(self, img):
184 | """
185 | Args:
186 | img (PIL.Image): Image to be cropped.
187 | Returns:
188 | PIL.Image: Cropped image.
189 | """
190 | w, h = img.size
191 | th, tw = self.size
192 | x1 = int(round((w - tw) / 2.))
193 | y1 = int(round((h - th) / 2.))
194 | return img.crop((x1, y1, x1 + tw, y1 + th))
195 |
196 | def randomize_parameters(self):
197 | pass
198 |
199 |
200 | class CornerCrop(object):
201 |
202 | def __init__(self, size, crop_position=None):
203 | self.size = size
204 | if crop_position is None:
205 | self.randomize = True
206 | else:
207 | self.randomize = False
208 | self.crop_position = crop_position
209 | self.crop_positions = ['c', 'tl', 'tr', 'bl', 'br']
210 |
211 | def __call__(self, img):
212 | image_width = img.size[0]
213 | image_height = img.size[1]
214 |
215 | if self.crop_position == 'c':
216 | th, tw = (self.size, self.size)
217 | x1 = int(round((image_width - tw) / 2.))
218 | y1 = int(round((image_height - th) / 2.))
219 | x2 = x1 + tw
220 | y2 = y1 + th
221 | elif self.crop_position == 'tl':
222 | x1 = 0
223 | y1 = 0
224 | x2 = self.size
225 | y2 = self.size
226 | elif self.crop_position == 'tr':
227 | x1 = image_width - self.size
228 | y1 = 0
229 | x2 = image_width
230 | y2 = self.size
231 | elif self.crop_position == 'bl':
232 | x1 = 0
233 | y1 = image_height - self.size
234 | x2 = self.size
235 | y2 = image_height
236 | elif self.crop_position == 'br':
237 | x1 = image_width - self.size
238 | y1 = image_height - self.size
239 | x2 = image_width
240 | y2 = image_height
241 |
242 | img = img.crop((x1, y1, x2, y2))
243 |
244 | return img
245 |
246 | def randomize_parameters(self):
247 | if self.randomize:
248 | self.crop_position = self.crop_positions[random.randint(
249 | 0,
250 | len(self.crop_positions) - 1)]
251 |
252 |
253 | class RandomHorizontalFlip(object):
254 | """Horizontally flip the given PIL.Image randomly with a probability of 0.5."""
255 |
256 | def __call__(self, img):
257 | """
258 | Args:
259 | img (PIL.Image): Image to be flipped.
260 | Returns:
261 | PIL.Image: Randomly flipped image.
262 | """
263 | if self.p < 0.5:
264 | return img.transpose(Image.FLIP_LEFT_RIGHT)
265 | return img
266 |
267 | def randomize_parameters(self):
268 | self.p = random.random()
269 |
270 |
271 | class MultiScaleCornerCrop(object):
272 |     """Crop the given PIL.Image to a randomly selected size.
273 |     A crop size is randomly selected from the given scales of the original size.
274 |     The crop position is randomly selected from the 4 corners and the center.
275 |     This crop is finally resized to the given size.
276 | Args:
277 | scales: cropping scales of the original size
278 | size: size of the smaller edge
279 | interpolation: Default: PIL.Image.BILINEAR
280 | """
281 |
282 | def __init__(self,
283 | scales,
284 | size,
285 | interpolation=Image.BILINEAR,
286 | crop_positions=['c', 'tl', 'tr', 'bl', 'br']):
287 | self.scales = scales
288 | self.size = size
289 | self.interpolation = interpolation
290 |
291 | self.crop_positions = crop_positions
292 |
293 | def __call__(self, img):
294 | min_length = min(img.size[0], img.size[1])
295 | crop_size = int(min_length * self.scale)
296 |
297 | image_width = img.size[0]
298 | image_height = img.size[1]
299 |
300 | if self.crop_position == 'c':
301 | center_x = image_width // 2
302 | center_y = image_height // 2
303 | box_half = crop_size // 2
304 | x1 = center_x - box_half
305 | y1 = center_y - box_half
306 | x2 = center_x + box_half
307 | y2 = center_y + box_half
308 | elif self.crop_position == 'tl':
309 | x1 = 0
310 | y1 = 0
311 | x2 = crop_size
312 | y2 = crop_size
313 | elif self.crop_position == 'tr':
314 | x1 = image_width - crop_size
315 | y1 = 0
316 | x2 = image_width
317 | y2 = crop_size
318 | elif self.crop_position == 'bl':
319 | x1 = 0
320 | y1 = image_height - crop_size
321 | x2 = crop_size
322 | y2 = image_height
323 | elif self.crop_position == 'br':
324 | x1 = image_width - crop_size
325 | y1 = image_height - crop_size
326 | x2 = image_width
327 | y2 = image_height
328 |
329 | img = img.crop((x1, y1, x2, y2))
330 |
331 | return img.resize((self.size, self.size), self.interpolation)
332 |
333 | def randomize_parameters(self):
334 | self.scale = self.scales[random.randint(0, len(self.scales) - 1)]
335 | self.crop_position = self.crop_positions[random.randint(
336 | 0,
337 | len(self.crop_positions) - 1)]
338 |
339 |
340 | class MultiScaleRandomCrop(object):
341 |
342 | def __init__(self, scales, size, interpolation=Image.BILINEAR):
343 | self.scales = scales
344 | self.size = size
345 | self.interpolation = interpolation
346 |
347 | def __call__(self, img):
348 | min_length = min(img.size[0], img.size[1])
349 | crop_size = int(min_length * self.scale)
350 |
351 | image_width = img.size[0]
352 | image_height = img.size[1]
353 |
354 | x1 = self.tl_x * (image_width - crop_size)
355 | y1 = self.tl_y * (image_height - crop_size)
356 | x2 = x1 + crop_size
357 | y2 = y1 + crop_size
358 |
359 | img = img.crop((x1, y1, x2, y2))
360 |
361 | return img.resize((self.size, self.size), self.interpolation)
362 |
363 | def randomize_parameters(self):
364 | self.scale = self.scales[random.randint(0, len(self.scales) - 1)]
365 | self.tl_x = random.random()
366 | self.tl_y = random.random()
367 |
--------------------------------------------------------------------------------
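A minimal usage sketch (not part of the repo; the mean/std values are illustrative ImageNet statistics) showing the intended calling convention: randomize_parameters() is called once per clip, so every frame of that clip receives the same crop and flip:

from PIL import Image
from spatial_transforms import (Compose, MultiScaleCornerCrop,
                                RandomHorizontalFlip, ToTensor, Normalize)

transform = Compose([
    MultiScaleCornerCrop(scales=[1.0, 0.84, 0.71], size=112),
    RandomHorizontalFlip(),
    ToTensor(norm_value=255),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

clip = [Image.new('RGB', (171, 128)) for _ in range(16)]  # dummy 16-frame clip
transform.randomize_parameters()          # fix scale / crop position / flip for this clip
clip = [transform(img) for img in clip]   # each frame -> (3, 112, 112) float tensor
print(clip[0].shape)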
/3D_experiment/target_transforms.py:
--------------------------------------------------------------------------------
1 | import random
2 | import math
3 |
4 |
5 | class Compose(object):
6 |
7 | def __init__(self, transforms):
8 | self.transforms = transforms
9 |
10 | def __call__(self, target):
11 | dst = []
12 | for t in self.transforms:
13 | dst.append(t(target))
14 | return dst
15 |
16 |
17 | class ClassLabel(object):
18 |
19 | def __call__(self, target):
20 | return target['label']
21 |
22 |
23 | class VideoID(object):
24 |
25 | def __call__(self, target):
26 | return target['video_id']
27 |
--------------------------------------------------------------------------------
/3D_experiment/temporal_transforms.py:
--------------------------------------------------------------------------------
1 | import random
2 | import math
3 |
4 |
5 | class LoopPadding(object):
6 |
7 | def __init__(self, size):
8 | self.size = size
9 |
10 | def __call__(self, frame_indices):
11 | out = frame_indices
12 |
13 | for index in out:
14 | if len(out) >= self.size:
15 | break
16 | out.append(index)
17 |
18 | return out
19 |
20 |
21 | class TemporalBeginCrop(object):
22 |     """Temporally crop the given frame indices at the beginning.
23 |
24 | If the number of frames is less than the size,
25 | loop the indices as many times as necessary to satisfy the size.
26 |
27 | Args:
28 | size (int): Desired output size of the crop.
29 | """
30 |
31 | def __init__(self, size):
32 | self.size = size
33 |
34 | def __call__(self, frame_indices):
35 | out = frame_indices[:self.size]
36 |
37 | for index in out:
38 | if len(out) >= self.size:
39 | break
40 | out.append(index)
41 |
42 | return out
43 |
44 |
45 | class TemporalCenterCrop(object):
46 |     """Temporally crop the given frame indices at the center.
47 |
48 | If the number of frames is less than the size,
49 | loop the indices as many times as necessary to satisfy the size.
50 |
51 | Args:
52 | size (int): Desired output size of the crop.
53 | """
54 |
55 | def __init__(self, size):
56 | self.size = size
57 |
58 | def __call__(self, frame_indices):
59 | """
60 | Args:
61 | frame_indices (list): frame indices to be cropped.
62 | Returns:
63 | list: Cropped frame indices.
64 | """
65 |
66 | center_index = len(frame_indices) // 2
67 | begin_index = max(0, center_index - (self.size // 2))
68 | end_index = min(begin_index + self.size, len(frame_indices))
69 |
70 | out = frame_indices[begin_index:end_index]
71 |
72 | for index in out:
73 | if len(out) >= self.size:
74 | break
75 | out.append(index)
76 |
77 | return out
78 |
79 |
80 | class TemporalRandomCrop(object):
81 | """Temporally crop the given frame indices at a random location.
82 |
83 | If the number of frames is less than the size,
84 | loop the indices as many times as necessary to satisfy the size.
85 |
86 | Args:
87 | size (int): Desired output size of the crop.
88 | """
89 |
90 | def __init__(self, size):
91 | self.size = size
92 |
93 | def __call__(self, frame_indices):
94 | """
95 | Args:
96 | frame_indices (list): frame indices to be cropped.
97 | Returns:
98 | list: Cropped frame indices.
99 | """
100 |
101 | rand_end = max(0, len(frame_indices) - self.size - 1)
102 | begin_index = random.randint(0, rand_end)
103 | end_index = min(begin_index + self.size, len(frame_indices))
104 |
105 | out = frame_indices[begin_index:end_index]
106 |
107 | for index in out:
108 | if len(out) >= self.size:
109 | break
110 | out.append(index)
111 |
112 | return out
113 |
--------------------------------------------------------------------------------
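A short example (not in the repo) of the loop-padding behaviour shared by these transforms: when a video has fewer frames than the requested size, the selected indices are repeated until the clip is long enough:

import random
from temporal_transforms import TemporalRandomCrop

random.seed(0)
crop = TemporalRandomCrop(size=16)
print(crop(list(range(1, 41))))  # 16 consecutive indices out of 40 frames
print(crop(list(range(1, 9))))   # only 8 frames available -> indices are looped to length 16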
/3D_experiment/test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.autograd import Variable
3 | import torch.nn.functional as F
4 | import time
5 | import os
6 | import sys
7 | import json
8 |
9 | from utils import AverageMeter
10 |
11 |
12 | def calculate_video_results(output_buffer, video_id, test_results, class_names):
13 | video_outputs = torch.stack(output_buffer)
14 | average_scores = torch.mean(video_outputs, dim=0)
15 | sorted_scores, locs = torch.topk(average_scores, k=10)
16 |
17 | video_results = []
18 | for i in range(sorted_scores.size(0)):
19 | video_results.append({
20 | 'label': class_names[locs[i]],
21 |             'score': float(sorted_scores[i])
22 | })
23 |
24 | test_results['results'][video_id] = video_results
25 |
26 |
27 | def test(data_loader, model, opt, class_names):
28 | print('test')
29 |
30 | model.eval()
31 |
32 | batch_time = AverageMeter()
33 | data_time = AverageMeter()
34 |
35 | end_time = time.time()
36 | output_buffer = []
37 | previous_video_id = ''
38 | test_results = {'results': {}}
39 | for i, (inputs, targets) in enumerate(data_loader):
40 | data_time.update(time.time() - end_time)
41 |
42 | inputs = Variable(inputs, volatile=True)
43 | outputs = model(inputs)
44 | if not opt.no_softmax_in_test:
45 |             outputs = F.softmax(outputs, dim=1)
46 |
47 | for j in range(outputs.size(0)):
48 | if not (i == 0 and j == 0) and targets[j] != previous_video_id:
49 | calculate_video_results(output_buffer, previous_video_id,
50 | test_results, class_names)
51 | output_buffer = []
52 | output_buffer.append(outputs[j].data.cpu())
53 | previous_video_id = targets[j]
54 |
55 | if (i % 100) == 0:
56 | with open(
57 | os.path.join(opt.result_path, '{}.json'.format(
58 | opt.test_subset)), 'w') as f:
59 | json.dump(test_results, f)
60 |
61 | batch_time.update(time.time() - end_time)
62 | end_time = time.time()
63 |
64 | print('[{}/{}]\t'
65 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
66 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'.format(
67 | i + 1,
68 | len(data_loader),
69 | batch_time=batch_time,
70 | data_time=data_time))
71 | with open(
72 | os.path.join(opt.result_path, '{}.json'.format(opt.test_subset)),
73 | 'w') as f:
74 | json.dump(test_results, f)
75 |
--------------------------------------------------------------------------------
/3D_experiment/train.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.autograd import Variable
3 | import time
4 | import os
5 | import sys
6 |
7 | from utils import AverageMeter, calculate_accuracy
8 |
9 |
10 | def train_epoch(epoch, data_loader, model, criterion, optimizer, opt,
11 | epoch_logger, batch_logger):
12 | print('train at epoch {}'.format(epoch))
13 |
14 | model.train()
15 |
16 | batch_time = AverageMeter()
17 | data_time = AverageMeter()
18 | losses = AverageMeter()
19 | accuracies = AverageMeter()
20 |
21 | end_time = time.time()
22 | for i, (inputs, targets) in enumerate(data_loader):
23 | data_time.update(time.time() - end_time)
24 |
25 | if not opt.no_cuda:
26 |             targets = targets.cuda(non_blocking=True)
27 | inputs = Variable(inputs)
28 | targets = Variable(targets)
29 | outputs = model(inputs)
30 | loss = criterion(outputs, targets)
31 | acc = calculate_accuracy(outputs, targets)
32 |
33 |         losses.update(loss.item(), inputs.size(0))
34 | accuracies.update(acc, inputs.size(0))
35 |
36 | optimizer.zero_grad()
37 | loss.backward()
38 | optimizer.step()
39 |
40 | batch_time.update(time.time() - end_time)
41 | end_time = time.time()
42 |
43 | batch_logger.log({
44 | 'epoch': epoch,
45 | 'batch': i + 1,
46 | 'iter': (epoch - 1) * len(data_loader) + (i + 1),
47 | 'loss': losses.val,
48 | 'acc': accuracies.val,
49 | 'lr': optimizer.param_groups[0]['lr']
50 | })
51 |
52 | print('Epoch: [{0}][{1}/{2}]\t'
53 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
54 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
55 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
56 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format(
57 | epoch,
58 | i + 1,
59 | len(data_loader),
60 | batch_time=batch_time,
61 | data_time=data_time,
62 | loss=losses,
63 | acc=accuracies))
64 |
65 | epoch_logger.log({
66 | 'epoch': epoch,
67 | 'loss': losses.avg,
68 | 'acc': accuracies.avg,
69 | 'lr': optimizer.param_groups[0]['lr']
70 | })
71 |
72 | if epoch % opt.checkpoint == 0:
73 | save_file_path = os.path.join(opt.result_path,
74 | 'save_{}.pth'.format(epoch))
75 | states = {
76 | 'epoch': epoch + 1,
77 | 'arch': opt.arch,
78 | 'state_dict': model.state_dict(),
79 | 'optimizer': optimizer.state_dict(),
80 | }
81 | torch.save(states, save_file_path)
82 |
--------------------------------------------------------------------------------
/3D_experiment/utils.py:
--------------------------------------------------------------------------------
1 | import csv
2 |
3 |
4 | class AverageMeter(object):
5 | """Computes and stores the average and current value"""
6 |
7 | def __init__(self):
8 | self.reset()
9 |
10 | def reset(self):
11 | self.val = 0
12 | self.avg = 0
13 | self.sum = 0
14 | self.count = 0
15 |
16 | def update(self, val, n=1):
17 | self.val = val
18 | self.sum += val * n
19 | self.count += n
20 | self.avg = self.sum / self.count
21 |
22 |
23 | class Logger(object):
24 |
25 | def __init__(self, path, header):
26 | self.log_file = open(path, 'w')
27 | self.logger = csv.writer(self.log_file, delimiter='\t')
28 |
29 | self.logger.writerow(header)
30 | self.header = header
31 |
32 |     def __del__(self):
33 | self.log_file.close()
34 |
35 | def log(self, values):
36 | write_values = []
37 | for col in self.header:
38 | assert col in values
39 | write_values.append(values[col])
40 |
41 | self.logger.writerow(write_values)
42 | self.log_file.flush()
43 |
44 |
45 | def load_value_file(file_path):
46 | with open(file_path, 'r') as input_file:
47 | value = float(input_file.read().rstrip('\n\r'))
48 |
49 | return value
50 |
51 |
52 | def calculate_accuracy(outputs, targets):
53 | batch_size = targets.size(0)
54 |
55 | _, pred = outputs.topk(1, 1, True)
56 | pred = pred.t()
57 | correct = pred.eq(targets.view(1, -1))
58 |     n_correct_elems = correct.float().sum().item()
59 |
60 | return n_correct_elems / batch_size
61 |
--------------------------------------------------------------------------------
/3D_experiment/utils/eval_hmdb51.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import numpy as np
4 | import pandas as pd
5 |
6 | class HMDBclassification(object):
7 |
8 | def __init__(self, ground_truth_filename=None, prediction_filename=None,
9 | subset='validation', verbose=False, top_k=1):
10 | if not ground_truth_filename:
11 | raise IOError('Please input a valid ground truth file.')
12 | if not prediction_filename:
13 | raise IOError('Please input a valid prediction file.')
14 | self.subset = subset
15 | self.verbose = verbose
16 | self.top_k = top_k
17 | self.ap = None
18 | self.hit_at_k = None
19 | # Import ground truth and predictions.
20 | self.ground_truth, self.activity_index = self._import_ground_truth(
21 | ground_truth_filename)
22 | self.prediction = self._import_prediction(prediction_filename)
23 |
24 | if self.verbose:
25 |             print('[INIT] Loaded annotations from {} subset.'.format(subset))
26 |             nr_gt = len(self.ground_truth)
27 |             print('\tNumber of ground truth instances: {}'.format(nr_gt))
28 |             nr_pred = len(self.prediction)
29 |             print('\tNumber of predictions: {}'.format(nr_pred))
30 |
31 | def _import_ground_truth(self, ground_truth_filename):
32 | """Reads ground truth file, checks if it is well formatted, and returns
33 | the ground truth instances and the activity classes.
34 |
35 | Parameters
36 | ----------
37 | ground_truth_filename : str
38 | Full path to the ground truth json file.
39 |
40 | Outputs
41 | -------
42 | ground_truth : df
43 | Data frame containing the ground truth instances.
44 | activity_index : dict
45 | Dictionary containing class index.
46 | """
47 | with open(ground_truth_filename, 'r') as fobj:
48 | data = json.load(fobj)
49 | # Checking format
50 | # if not all([field in data.keys() for field in self.gt_fields]):
51 | # raise IOError('Please input a valid ground truth file.')
52 |
53 | # Initialize data frame
54 | activity_index, cidx = {}, 0
55 | video_lst, label_lst = [], []
56 |         for videoid, v in data['database'].items():
57 | if self.subset != v['subset']:
58 | continue
59 | this_label = v['annotations']['label']
60 | if this_label not in activity_index:
61 | activity_index[this_label] = cidx
62 | cidx += 1
63 | video_lst.append(videoid)
64 | label_lst.append(activity_index[this_label])
65 | ground_truth = pd.DataFrame({'video-id': video_lst,
66 | 'label': label_lst})
67 | ground_truth = ground_truth.drop_duplicates().reset_index(drop=True)
68 | return ground_truth, activity_index
69 |
70 | def _import_prediction(self, prediction_filename):
71 | """Reads prediction file, checks if it is well formatted, and returns
72 | the prediction instances.
73 |
74 | Parameters
75 | ----------
76 | prediction_filename : str
77 | Full path to the prediction json file.
78 |
79 | Outputs
80 | -------
81 | prediction : df
82 | Data frame containing the prediction instances.
83 | """
84 | with open(prediction_filename, 'r') as fobj:
85 | data = json.load(fobj)
86 | # Checking format...
87 | # if not all([field in data.keys() for field in self.pred_fields]):
88 | # raise IOError('Please input a valid prediction file.')
89 |
90 | # Initialize data frame
91 | video_lst, label_lst, score_lst = [], [], []
92 |         for videoid, v in data['results'].items():
93 | for result in v:
94 | label = self.activity_index[result['label']]
95 | video_lst.append(videoid)
96 | label_lst.append(label)
97 | score_lst.append(result['score'])
98 | prediction = pd.DataFrame({'video-id': video_lst,
99 | 'label': label_lst,
100 | 'score': score_lst})
101 | return prediction
102 |
103 | def evaluate(self):
104 |         """Evaluates a prediction file. For the classification task we
105 |         measure top-k accuracy (hit@k) to assess the performance of a
106 |         method.
107 | """
108 | hit_at_k = compute_video_hit_at_k(self.ground_truth,
109 | self.prediction, top_k=self.top_k)
110 | if self.verbose:
111 |             print('[RESULTS] Performance on HMDB51 video '
112 |                   'classification task.')
113 |             print('\tError@{}: {}'.format(self.top_k, 1.0 - hit_at_k))
114 |             #print('\tAvg Hit@{}: {}'.format(self.top_k, avg_hit_at_k))
115 | self.hit_at_k = hit_at_k
116 |
117 | ################################################################################
118 | # Metrics
119 | ################################################################################
120 | def compute_video_hit_at_k(ground_truth, prediction, top_k=3):
121 | """Compute accuracy at k prediction between ground truth and
122 | predictions data frames. This code is greatly inspired by evaluation
123 | performed in Karpathy et al. CVPR14.
124 |
125 | Parameters
126 | ----------
127 | ground_truth : df
128 | Data frame containing the ground truth instances.
129 | Required fields: ['video-id', 'label']
130 | prediction : df
131 | Data frame containing the prediction instances.
132 |             Required fields: ['video-id', 'label', 'score']
133 |
134 | Outputs
135 | -------
136 | acc : float
137 | Top k accuracy score.
138 | """
139 | video_ids = np.unique(ground_truth['video-id'].values)
140 | avg_hits_per_vid = np.zeros(video_ids.size)
141 | for i, vid in enumerate(video_ids):
142 | pred_idx = prediction['video-id'] == vid
143 | if not pred_idx.any():
144 | continue
145 | this_pred = prediction.loc[pred_idx].reset_index(drop=True)
146 | # Get top K predictions sorted by decreasing score.
147 | sort_idx = this_pred['score'].values.argsort()[::-1][:top_k]
148 | this_pred = this_pred.loc[sort_idx].reset_index(drop=True)
149 | # Get labels and compare against ground truth.
150 | pred_label = this_pred['label'].tolist()
151 | gt_idx = ground_truth['video-id'] == vid
152 | gt_label = ground_truth.loc[gt_idx]['label'].tolist()
153 | avg_hits_per_vid[i] = np.mean([1 if this_label in pred_label else 0
154 | for this_label in gt_label])
155 | return float(avg_hits_per_vid.mean())
156 |
--------------------------------------------------------------------------------
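A toy example (not in the repo; assumes it is run from inside the utils directory so the module imports directly) of the hit@k computation above:

import pandas as pd
from eval_hmdb51 import compute_video_hit_at_k

gt = pd.DataFrame({'video-id': ['v1', 'v2'], 'label': [0, 1]})
pred = pd.DataFrame({'video-id': ['v1', 'v1', 'v2', 'v2'],
                     'label':    [0,    2,    2,    3],
                     'score':    [0.9,  0.1,  0.6,  0.4]})
print(compute_video_hit_at_k(gt, pred, top_k=1))  # 0.5: v1 is a hit, v2 is not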
/3D_experiment/utils/eval_kinetics.py:
--------------------------------------------------------------------------------
1 | import json
2 | import urllib.request
3 |
4 | import numpy as np
5 | import pandas as pd
6 |
7 | API = 'http://ec2-52-11-11-89.us-west-2.compute.amazonaws.com/challenge17/api.py'
8 |
9 | def get_blocked_videos(api=API):
10 | api_url = '{}?action=get_blocked'.format(api)
11 |     req = urllib.request.Request(api_url)
12 |     response = urllib.request.urlopen(req)
13 |     return json.loads(response.read())
14 |
15 | class KINETICSclassification(object):
16 | GROUND_TRUTH_FIELDS = ['database', 'labels']
17 | PREDICTION_FIELDS = ['results', 'version', 'external_data']
18 |
19 | def __init__(self, ground_truth_filename=None, prediction_filename=None,
20 | ground_truth_fields=GROUND_TRUTH_FIELDS,
21 | prediction_fields=PREDICTION_FIELDS,
22 | subset='validation', verbose=False, top_k=1,
23 | check_status=True):
24 | if not ground_truth_filename:
25 | raise IOError('Please input a valid ground truth file.')
26 | if not prediction_filename:
27 | raise IOError('Please input a valid prediction file.')
28 | self.subset = subset
29 | self.verbose = verbose
30 | self.gt_fields = ground_truth_fields
31 | self.pred_fields = prediction_fields
32 | self.top_k = top_k
33 | self.ap = None
34 | self.hit_at_k = None
35 | self.check_status = check_status
36 | # Retrieve blocked videos from server.
37 | if self.check_status:
38 | self.blocked_videos = get_blocked_videos()
39 | else:
40 | self.blocked_videos = list()
41 | # Import ground truth and predictions.
42 | self.ground_truth, self.activity_index = self._import_ground_truth(
43 | ground_truth_filename)
44 | self.prediction = self._import_prediction(prediction_filename)
45 |
46 | if self.verbose:
47 |             print('[INIT] Loaded annotations from {} subset.'.format(subset))
48 |             nr_gt = len(self.ground_truth)
49 |             print('\tNumber of ground truth instances: {}'.format(nr_gt))
50 |             nr_pred = len(self.prediction)
51 |             print('\tNumber of predictions: {}'.format(nr_pred))
52 |
53 | def _import_ground_truth(self, ground_truth_filename):
54 | """Reads ground truth file, checks if it is well formatted, and returns
55 | the ground truth instances and the activity classes.
56 |
57 | Parameters
58 | ----------
59 | ground_truth_filename : str
60 | Full path to the ground truth json file.
61 |
62 | Outputs
63 | -------
64 | ground_truth : df
65 | Data frame containing the ground truth instances.
66 | activity_index : dict
67 | Dictionary containing class index.
68 | """
69 | with open(ground_truth_filename, 'r') as fobj:
70 | data = json.load(fobj)
71 | # Checking format
72 | # if not all([field in data.keys() for field in self.gt_fields]):
73 | # raise IOError('Please input a valid ground truth file.')
74 |
75 | # Initialize data frame
76 | activity_index, cidx = {}, 0
77 | video_lst, label_lst = [], []
78 |         for videoid, v in data['database'].items():
79 | if self.subset != v['subset']:
80 | continue
81 | if videoid in self.blocked_videos:
82 | continue
83 | this_label = v['annotations']['label']
84 | if this_label not in activity_index:
85 | activity_index[this_label] = cidx
86 | cidx += 1
87 | video_lst.append(videoid[:-14])
88 | label_lst.append(activity_index[this_label])
89 | ground_truth = pd.DataFrame({'video-id': video_lst,
90 | 'label': label_lst})
91 | ground_truth = ground_truth.drop_duplicates().reset_index(drop=True)
92 | return ground_truth, activity_index
93 |
94 | def _import_prediction(self, prediction_filename):
95 | """Reads prediction file, checks if it is well formatted, and returns
96 | the prediction instances.
97 |
98 | Parameters
99 | ----------
100 | prediction_filename : str
101 | Full path to the prediction json file.
102 |
103 | Outputs
104 | -------
105 | prediction : df
106 | Data frame containing the prediction instances.
107 | """
108 | with open(prediction_filename, 'r') as fobj:
109 | data = json.load(fobj)
110 | # Checking format...
111 | # if not all([field in data.keys() for field in self.pred_fields]):
112 | # raise IOError('Please input a valid prediction file.')
113 |
114 | # Initialize data frame
115 | video_lst, label_lst, score_lst = [], [], []
116 |         for videoid, v in data['results'].items():
117 | if videoid in self.blocked_videos:
118 | continue
119 | for result in v:
120 | label = self.activity_index[result['label']]
121 | video_lst.append(videoid)
122 | label_lst.append(label)
123 | score_lst.append(result['score'])
124 | prediction = pd.DataFrame({'video-id': video_lst,
125 | 'label': label_lst,
126 | 'score': score_lst})
127 | return prediction
128 |
129 | def evaluate(self):
130 |         """Evaluates a prediction file. For the classification task we
131 |         measure top-k accuracy (hit@k) to assess the performance of a
132 |         method.
133 | """
134 | hit_at_k = compute_video_hit_at_k(self.ground_truth,
135 | self.prediction, top_k=self.top_k)
136 | # avg_hit_at_k = compute_video_hit_at_k(
137 | # self.ground_truth, self.prediction, top_k=self.top_k, avg=True)
138 | if self.verbose:
139 |             print('[RESULTS] Performance on Kinetics video '
140 |                   'classification task.')
141 |             # print('\tMean Average Precision: {}'.format(ap.mean()))
142 |             print('\tError@{}: {}'.format(self.top_k, 1.0 - hit_at_k))
143 |             #print('\tAvg Hit@{}: {}'.format(self.top_k, avg_hit_at_k))
144 | # self.ap = ap
145 | self.hit_at_k = hit_at_k
146 | # self.avg_hit_at_k = avg_hit_at_k
147 |
148 | ################################################################################
149 | # Metrics
150 | ################################################################################
151 |
152 | def compute_video_hit_at_k(ground_truth, prediction, top_k=3, avg=False):
153 | """Compute accuracy at k prediction between ground truth and
154 | predictions data frames. This code is greatly inspired by evaluation
155 | performed in Karpathy et al. CVPR14.
156 |
157 | Parameters
158 | ----------
159 | ground_truth : df
160 | Data frame containing the ground truth instances.
161 | Required fields: ['video-id', 'label']
162 | prediction : df
163 | Data frame containing the prediction instances.
164 |             Required fields: ['video-id', 'label', 'score']
165 |
166 | Outputs
167 | -------
168 | acc : float
169 | Top k accuracy score.
170 | """
171 | video_ids = np.unique(ground_truth['video-id'].values)
172 | avg_hits_per_vid = np.zeros(video_ids.size)
173 | for i, vid in enumerate(video_ids):
174 | pred_idx = prediction['video-id'] == vid
175 | if not pred_idx.any():
176 | continue
177 | this_pred = prediction.loc[pred_idx].reset_index(drop=True)
178 | # Get top K predictions sorted by decreasing score.
179 | sort_idx = this_pred['score'].values.argsort()[::-1][:top_k]
180 | this_pred = this_pred.loc[sort_idx].reset_index(drop=True)
181 | # Get labels and compare against ground truth.
182 | pred_label = this_pred['label'].tolist()
183 | gt_idx = ground_truth['video-id'] == vid
184 | gt_label = ground_truth.loc[gt_idx]['label'].tolist()
185 | avg_hits_per_vid[i] = np.mean([1 if this_label in pred_label else 0
186 | for this_label in gt_label])
187 | if not avg:
188 | avg_hits_per_vid[i] = np.ceil(avg_hits_per_vid[i])
189 | return float(avg_hits_per_vid.mean())
190 |
--------------------------------------------------------------------------------
/3D_experiment/utils/eval_ucf101.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import numpy as np
4 | import pandas as pd
5 |
6 | class UCFclassification(object):
7 |
8 | def __init__(self, ground_truth_filename=None, prediction_filename=None,
9 | subset='validation', verbose=False, top_k=1):
10 | if not ground_truth_filename:
11 | raise IOError('Please input a valid ground truth file.')
12 | if not prediction_filename:
13 | raise IOError('Please input a valid prediction file.')
14 | self.subset = subset
15 | self.verbose = verbose
16 | self.top_k = top_k
17 | self.ap = None
18 | self.hit_at_k = None
19 | # Import ground truth and predictions.
20 | self.ground_truth, self.activity_index = self._import_ground_truth(
21 | ground_truth_filename)
22 | self.prediction = self._import_prediction(prediction_filename)
23 |
24 | if self.verbose:
25 |             print('[INIT] Loaded annotations from {} subset.'.format(subset))
26 |             nr_gt = len(self.ground_truth)
27 |             print('\tNumber of ground truth instances: {}'.format(nr_gt))
28 |             nr_pred = len(self.prediction)
29 |             print('\tNumber of predictions: {}'.format(nr_pred))
30 |
31 | def _import_ground_truth(self, ground_truth_filename):
32 | """Reads ground truth file, checks if it is well formatted, and returns
33 | the ground truth instances and the activity classes.
34 |
35 | Parameters
36 | ----------
37 | ground_truth_filename : str
38 | Full path to the ground truth json file.
39 |
40 | Outputs
41 | -------
42 | ground_truth : df
43 | Data frame containing the ground truth instances.
44 | activity_index : dict
45 | Dictionary containing class index.
46 | """
47 | with open(ground_truth_filename, 'r') as fobj:
48 | data = json.load(fobj)
49 | # Checking format
50 | # if not all([field in data.keys() for field in self.gt_fields]):
51 | # raise IOError('Please input a valid ground truth file.')
52 |
53 | # Initialize data frame
54 | activity_index, cidx = {}, 0
55 | video_lst, label_lst = [], []
56 |         for videoid, v in data['database'].items():
57 | if self.subset != v['subset']:
58 | continue
59 | this_label = v['annotations']['label']
60 | if this_label not in activity_index:
61 | activity_index[this_label] = cidx
62 | cidx += 1
63 | video_lst.append(videoid)
64 | label_lst.append(activity_index[this_label])
65 | ground_truth = pd.DataFrame({'video-id': video_lst,
66 | 'label': label_lst})
67 | ground_truth = ground_truth.drop_duplicates().reset_index(drop=True)
68 | return ground_truth, activity_index
69 |
70 | def _import_prediction(self, prediction_filename):
71 | """Reads prediction file, checks if it is well formatted, and returns
72 | the prediction instances.
73 |
74 | Parameters
75 | ----------
76 | prediction_filename : str
77 | Full path to the prediction json file.
78 |
79 | Outputs
80 | -------
81 | prediction : df
82 | Data frame containing the prediction instances.
83 | """
84 | with open(prediction_filename, 'r') as fobj:
85 | data = json.load(fobj)
86 | # Checking format...
87 | # if not all([field in data.keys() for field in self.pred_fields]):
88 | # raise IOError('Please input a valid prediction file.')
89 |
90 | # Initialize data frame
91 | video_lst, label_lst, score_lst = [], [], []
92 |         for videoid, v in data['results'].items():
93 | for result in v:
94 | label = self.activity_index[result['label']]
95 | video_lst.append(videoid)
96 | label_lst.append(label)
97 | score_lst.append(result['score'])
98 | prediction = pd.DataFrame({'video-id': video_lst,
99 | 'label': label_lst,
100 | 'score': score_lst})
101 | return prediction
102 |
103 | def evaluate(self):
104 |         """Evaluates a prediction file. For the classification task we
105 |         measure top-k accuracy (hit@k) to assess the performance of a
106 |         method.
107 | """
108 | hit_at_k = compute_video_hit_at_k(self.ground_truth,
109 | self.prediction, top_k=self.top_k)
110 | if self.verbose:
111 |             print('[RESULTS] Performance on UCF101 video '
112 |                   'classification task.')
113 |             print('\tError@{}: {}'.format(self.top_k, 1.0 - hit_at_k))
114 |             #print('\tAvg Hit@{}: {}'.format(self.top_k, avg_hit_at_k))
115 | self.hit_at_k = hit_at_k
116 |
117 | ################################################################################
118 | # Metrics
119 | ################################################################################
120 | def compute_video_hit_at_k(ground_truth, prediction, top_k=3):
121 | """Compute accuracy at k prediction between ground truth and
122 | predictions data frames. This code is greatly inspired by evaluation
123 | performed in Karpathy et al. CVPR14.
124 |
125 | Parameters
126 | ----------
127 | ground_truth : df
128 | Data frame containing the ground truth instances.
129 | Required fields: ['video-id', 'label']
130 | prediction : df
131 | Data frame containing the prediction instances.
132 |             Required fields: ['video-id', 'label', 'score']
133 |
134 | Outputs
135 | -------
136 | acc : float
137 | Top k accuracy score.
138 | """
139 | video_ids = np.unique(ground_truth['video-id'].values)
140 | avg_hits_per_vid = np.zeros(video_ids.size)
141 | for i, vid in enumerate(video_ids):
142 | pred_idx = prediction['video-id'] == vid
143 | if not pred_idx.any():
144 | continue
145 | this_pred = prediction.loc[pred_idx].reset_index(drop=True)
146 | # Get top K predictions sorted by decreasing score.
147 | sort_idx = this_pred['score'].values.argsort()[::-1][:top_k]
148 | this_pred = this_pred.loc[sort_idx].reset_index(drop=True)
149 | # Get labels and compare against ground truth.
150 | pred_label = this_pred['label'].tolist()
151 | gt_idx = ground_truth['video-id'] == vid
152 | gt_label = ground_truth.loc[gt_idx]['label'].tolist()
153 | avg_hits_per_vid[i] = np.mean([1 if this_label in pred_label else 0
154 | for this_label in gt_label])
155 | return float(avg_hits_per_vid.mean())
156 |
--------------------------------------------------------------------------------
/3D_experiment/utils/fps.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import os
3 | import sys
4 | import subprocess
5 |
6 |
7 | if __name__=="__main__":
8 | dir_path = sys.argv[1]
9 | dst_dir_path = sys.argv[2]
10 |
11 | for file_name in os.listdir(dir_path):
12 | if '.mp4' not in file_name:
13 | continue
14 | name, ext = os.path.splitext(file_name)
15 | dst_directory_path = os.path.join(dst_dir_path, name)
16 |
17 | video_file_path = os.path.join(dir_path, file_name)
18 | p = subprocess.Popen('ffprobe {}'.format(video_file_path),
19 | shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
20 | _, res = p.communicate()
21 | res = res.decode('utf-8')
22 |
23 | duration_index = res.find('Duration:')
24 | duration_str = res[(duration_index + 10):(duration_index + 21)]
25 | hour = float(duration_str[0:2])
26 | minute = float(duration_str[3:5])
27 | sec = float(duration_str[6:10])
28 | total_sec = hour * 3600 + minute * 60 + sec
29 |
30 | n_frames = len(os.listdir(dst_directory_path))
31 | if os.path.exists(os.path.join(dst_directory_path, 'fps')):
32 | n_frames -= 1
33 |
34 | fps = round(n_frames / total_sec, 2)
35 |
36 | print(video_file_path, os.path.exists(video_file_path), fps)
37 | with open(os.path.join(dst_directory_path, 'fps'), 'w') as fps_file:
38 | fps_file.write('{}\n'.format(fps))
39 |
--------------------------------------------------------------------------------
/3D_experiment/utils/hmdb51_json.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import os
3 | import sys
4 | import json
5 | import pandas as pd
6 |
7 | def convert_csv_to_dict(csv_dir_path, split_index):
8 | database = {}
9 | for filename in os.listdir(csv_dir_path):
10 | if 'split{}'.format(split_index) not in filename:
11 | continue
12 |
13 | data = pd.read_csv(os.path.join(csv_dir_path, filename),
14 | delimiter=' ', header=None)
15 | keys = []
16 | subsets = []
17 | for i in range(data.shape[0]):
18 |             row = data.iloc[i, :]
19 | if row[1] == 0:
20 | continue
21 | elif row[1] == 1:
22 | subset = 'training'
23 | elif row[1] == 2:
24 | subset = 'validation'
25 |
26 | keys.append(row[0].split('.')[0])
27 | subsets.append(subset)
28 |
29 | for i in range(len(keys)):
30 | key = keys[i]
31 | database[key] = {}
32 | database[key]['subset'] = subsets[i]
33 | label = '_'.join(filename.split('_')[:-2])
34 | database[key]['annotations'] = {'label': label}
35 |
36 | return database
37 |
38 | def get_labels(csv_dir_path):
39 | labels = []
40 | for name in os.listdir(csv_dir_path):
41 | labels.append('_'.join(name.split('_')[:-2]))
42 | return sorted(list(set(labels)))
43 |
44 | def convert_hmdb51_csv_to_activitynet_json(csv_dir_path, split_index, dst_json_path):
45 | labels = get_labels(csv_dir_path)
46 | database = convert_csv_to_dict(csv_dir_path, split_index)
47 |
48 | dst_data = {}
49 | dst_data['labels'] = labels
50 | dst_data['database'] = {}
51 | dst_data['database'].update(database)
52 |
53 | with open(dst_json_path, 'w') as dst_file:
54 | json.dump(dst_data, dst_file)
55 |
56 | if __name__ == '__main__':
57 | csv_dir_path = sys.argv[1]
58 |
59 | for split_index in range(1, 4):
60 | dst_json_path = os.path.join(csv_dir_path, 'hmdb51_{}.json'.format(split_index))
61 | convert_hmdb51_csv_to_activitynet_json(csv_dir_path, split_index, dst_json_path)
--------------------------------------------------------------------------------
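The script above writes an ActivityNet-style annotation file; a small sketch of its layout for one entry (the clip key and class names here are illustrative only):

hmdb51_1_json = {
    'labels': ['brush_hair', 'cartwheel', '...'],      # sorted class names
    'database': {
        'some_clip_name_0': {                          # video file name without extension
            'subset': 'training',                      # or 'validation'
            'annotations': {'label': 'brush_hair'},
        },
    },
}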
/3D_experiment/utils/kinetics_json.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import os
3 | import sys
4 | import json
5 | import pandas as pd
6 |
7 | def convert_csv_to_dict(csv_path, subset):
8 | data = pd.read_csv(csv_path)
9 | keys = []
10 | key_labels = []
11 | for i in range(data.shape[0]):
12 |         row = data.iloc[i, :]
13 | basename = '%s_%s_%s' % (row['youtube_id'],
14 | '%06d' % row['time_start'],
15 | '%06d' % row['time_end'])
16 | keys.append(basename)
17 | if subset != 'testing':
18 | key_labels.append(row['label'])
19 |
20 | database = {}
21 | for i in range(len(keys)):
22 | key = keys[i]
23 | database[key] = {}
24 | database[key]['subset'] = subset
25 | if subset != 'testing':
26 | label = key_labels[i]
27 | database[key]['annotations'] = {'label': label}
28 | else:
29 | database[key]['annotations'] = {}
30 |
31 | return database
32 |
33 | def load_labels(train_csv_path):
34 | data = pd.read_csv(train_csv_path)
35 | return data['label'].unique().tolist()
36 |
37 | def convert_kinetics_csv_to_activitynet_json(train_csv_path, val_csv_path, test_csv_path, dst_json_path):
38 | labels = load_labels(train_csv_path)
39 | train_database = convert_csv_to_dict(train_csv_path, 'training')
40 | val_database = convert_csv_to_dict(val_csv_path, 'validation')
41 | test_database = convert_csv_to_dict(test_csv_path, 'testing')
42 |
43 | dst_data = {}
44 | dst_data['labels'] = labels
45 | dst_data['database'] = {}
46 | dst_data['database'].update(train_database)
47 | dst_data['database'].update(val_database)
48 | dst_data['database'].update(test_database)
49 |
50 | with open(dst_json_path, 'w') as dst_file:
51 | json.dump(dst_data, dst_file)
52 |
53 | if __name__=="__main__":
54 | train_csv_path = sys.argv[1]
55 | val_csv_path = sys.argv[2]
56 | test_csv_path = sys.argv[3]
57 | dst_json_path = sys.argv[4]
58 |
59 | convert_kinetics_csv_to_activitynet_json(
60 | train_csv_path, val_csv_path, test_csv_path, dst_json_path)
61 |
--------------------------------------------------------------------------------
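A small sketch of the video key built by `convert_csv_to_dict` above from one hypothetical Kinetics CSV row (the id and times are made up):

row = {'youtube_id': 'abcdefghijk', 'time_start': 5, 'time_end': 15}
basename = '%s_%s_%s' % (row['youtube_id'],
                         '%06d' % row['time_start'],
                         '%06d' % row['time_end'])
print(basename)  # abcdefghijk_000005_000015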
/3D_experiment/utils/n_frames_kinetics.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import os
3 | import sys
4 | import subprocess
5 |
6 | def class_process(dir_path, class_name):
7 | class_path = os.path.join(dir_path, class_name)
8 | if not os.path.isdir(class_path):
9 | return
10 |
11 | for file_name in os.listdir(class_path):
12 | video_dir_path = os.path.join(class_path, file_name)
13 | image_indices = []
14 | for image_file_name in os.listdir(video_dir_path):
15 | if 'image' not in image_file_name:
16 | continue
17 |             image_indices.append(int(image_file_name[6:11]))  # e.g. 'image_00016.jpg' -> 16
18 |
19 | if len(image_indices) == 0:
20 | print('no image files', video_dir_path)
21 | n_frames = 0
22 | else:
23 | image_indices.sort(reverse=True)
24 | n_frames = image_indices[0]
25 | print(video_dir_path, n_frames)
26 | with open(os.path.join(video_dir_path, 'n_frames'), 'w') as dst_file:
27 | dst_file.write(str(n_frames))
28 |
29 |
30 | if __name__=="__main__":
31 | dir_path = sys.argv[1]
32 | for class_name in os.listdir(dir_path):
33 | class_process(dir_path, class_name)
34 |
35 | class_name = 'test'
36 | class_process(dir_path, class_name)
37 |
--------------------------------------------------------------------------------
/3D_experiment/utils/n_frames_ucf101_hmdb51.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import os
3 | import sys
4 | import subprocess
5 |
6 | def class_process(dir_path, class_name):
7 | class_path = os.path.join(dir_path, class_name)
8 | if not os.path.isdir(class_path):
9 | return
10 |
11 | for file_name in os.listdir(class_path):
12 | video_dir_path = os.path.join(class_path, file_name)
13 | image_indices = []
14 | for image_file_name in os.listdir(video_dir_path):
15 | if 'image' not in image_file_name:
16 | continue
17 |             image_indices.append(int(image_file_name[6:11]))  # e.g. 'image_00016.jpg' -> 16
18 |
19 | if len(image_indices) == 0:
20 | print('no image files', video_dir_path)
21 | n_frames = 0
22 | else:
23 | image_indices.sort(reverse=True)
24 | n_frames = image_indices[0]
25 | print(video_dir_path, n_frames)
26 | with open(os.path.join(video_dir_path, 'n_frames'), 'w') as dst_file:
27 | dst_file.write(str(n_frames))
28 |
29 |
30 | if __name__=="__main__":
31 | dir_path = sys.argv[1]
32 | for class_name in os.listdir(dir_path):
33 | class_process(dir_path, class_name)
34 |
--------------------------------------------------------------------------------
/3D_experiment/utils/ucf101_json.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import os
3 | import sys
4 | import json
5 | import pandas as pd
6 |
7 | def convert_csv_to_dict(csv_path, subset):
8 | data = pd.read_csv(csv_path, delimiter=' ', header=None)
9 | keys = []
10 | key_labels = []
11 | for i in range(data.shape[0]):
12 |         row = data.iloc[i, :]
13 |         slash_rows = data.iloc[i, 0].split('/')
14 | class_name = slash_rows[0]
15 | basename = slash_rows[1].split('.')[0]
16 |
17 | keys.append(basename)
18 | key_labels.append(class_name)
19 |
20 | database = {}
21 | for i in range(len(keys)):
22 | key = keys[i]
23 | database[key] = {}
24 | database[key]['subset'] = subset
25 | label = key_labels[i]
26 | database[key]['annotations'] = {'label': label}
27 |
28 | return database
29 |
30 | def load_labels(label_csv_path):
31 | data = pd.read_csv(label_csv_path, delimiter=' ', header=None)
32 | labels = []
33 | for i in range(data.shape[0]):
34 |         labels.append(data.iloc[i, 1])
35 | return labels
36 |
37 | def convert_ucf101_csv_to_activitynet_json(label_csv_path, train_csv_path,
38 | val_csv_path, dst_json_path):
39 | labels = load_labels(label_csv_path)
40 | train_database = convert_csv_to_dict(train_csv_path, 'training')
41 | val_database = convert_csv_to_dict(val_csv_path, 'validation')
42 |
43 | dst_data = {}
44 | dst_data['labels'] = labels
45 | dst_data['database'] = {}
46 | dst_data['database'].update(train_database)
47 | dst_data['database'].update(val_database)
48 |
49 | with open(dst_json_path, 'w') as dst_file:
50 | json.dump(dst_data, dst_file)
51 |
52 | if __name__ == '__main__':
53 | csv_dir_path = sys.argv[1]
54 |
55 | for split_index in range(1, 4):
56 | label_csv_path = os.path.join(csv_dir_path, 'classInd.txt')
57 | train_csv_path = os.path.join(csv_dir_path, 'trainlist0{}.txt'.format(split_index))
58 | val_csv_path = os.path.join(csv_dir_path, 'testlist0{}.txt'.format(split_index))
59 | dst_json_path = os.path.join(csv_dir_path, 'ucf101_0{}.json'.format(split_index))
60 |
61 | convert_ucf101_csv_to_activitynet_json(label_csv_path, train_csv_path,
62 | val_csv_path, dst_json_path)
63 |
--------------------------------------------------------------------------------
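For clarity, the per-row parsing in `convert_csv_to_dict` above splits each UCF-101 list entry into a class name and a video key; a short sketch with a typical `trainlist01.txt` path (the exact clip name is illustrative):

line = 'ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c01.avi'
class_name, file_part = line.split('/')
basename = file_part.split('.')[0]
print(class_name, basename)  # ApplyEyeMakeup v_ApplyEyeMakeup_g08_c01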
/3D_experiment/utils/video_jpg.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import os
3 | import sys
4 | import subprocess
5 |
6 |
7 | if __name__=="__main__":
8 | dir_path = sys.argv[1]
9 | dst_dir_path = sys.argv[2]
10 |
11 | for file_name in os.listdir(dir_path):
12 | if '.mp4' not in file_name:
13 | continue
14 | name, ext = os.path.splitext(file_name)
15 | dst_directory_path = os.path.join(dst_dir_path, name)
16 |
17 | video_file_path = os.path.join(dir_path, file_name)
18 | try:
19 | if os.path.exists(dst_directory_path):
20 | if not os.path.exists(os.path.join(dst_directory_path, 'image_00001.jpg')):
21 |                     subprocess.call('rm -r \"{}\"'.format(dst_directory_path), shell=True)
22 | print('remove {}'.format(dst_directory_path))
23 | os.mkdir(dst_directory_path)
24 | else:
25 | continue
26 | else:
27 | os.mkdir(dst_directory_path)
28 | except:
29 | print(dst_directory_path)
30 | continue
31 |         cmd = 'ffmpeg -i \"{}\" -vf scale=-1:360 \"{}/image_%05d.jpg\"'.format(video_file_path, dst_directory_path)
32 | print(cmd)
33 | subprocess.call(cmd, shell=True)
34 | print('\n')
35 |
--------------------------------------------------------------------------------
/3D_experiment/utils/video_jpg_kinetics.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import os
3 | import sys
4 | import subprocess
5 |
6 | def class_process(dir_path, dst_dir_path, class_name):
7 | class_path = os.path.join(dir_path, class_name)
8 | if not os.path.isdir(class_path):
9 | return
10 |
11 | dst_class_path = os.path.join(dst_dir_path, class_name)
12 | if not os.path.exists(dst_class_path):
13 | os.mkdir(dst_class_path)
14 |
15 | for file_name in os.listdir(class_path):
16 | if '.mp4' not in file_name:
17 | continue
18 | name, ext = os.path.splitext(file_name)
19 | dst_directory_path = os.path.join(dst_class_path, name)
20 |
21 | video_file_path = os.path.join(class_path, file_name)
22 | try:
23 | if os.path.exists(dst_directory_path):
24 | if not os.path.exists(os.path.join(dst_directory_path, 'image_00001.jpg')):
25 | subprocess.call('rm -r \"{}\"'.format(dst_directory_path), shell=True)
26 | print('remove {}'.format(dst_directory_path))
27 | os.mkdir(dst_directory_path)
28 | else:
29 | continue
30 | else:
31 | os.mkdir(dst_directory_path)
32 | except:
33 | print(dst_directory_path)
34 | continue
35 | cmd = 'ffmpeg -i \"{}\" -vf scale=-1:240 \"{}/image_%05d.jpg\"'.format(video_file_path, dst_directory_path)
36 | print(cmd)
37 | subprocess.call(cmd, shell=True)
38 | print('\n')
39 |
40 | if __name__=="__main__":
41 | dir_path = sys.argv[1]
42 | dst_dir_path = sys.argv[2]
43 |
44 | for class_name in os.listdir(dir_path):
45 | class_process(dir_path, dst_dir_path, class_name)
46 |
47 | class_name = 'test'
48 | class_process(dir_path, dst_dir_path, class_name)
49 |
--------------------------------------------------------------------------------
/3D_experiment/utils/video_jpg_ucf101_hmdb51.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import os
3 | import sys
4 | import subprocess
5 |
6 | def class_process(dir_path, dst_dir_path, class_name):
7 | class_path = os.path.join(dir_path, class_name)
8 | if not os.path.isdir(class_path):
9 | return
10 |
11 | dst_class_path = os.path.join(dst_dir_path, class_name)
12 | if not os.path.exists(dst_class_path):
13 | os.mkdir(dst_class_path)
14 |
15 | for file_name in os.listdir(class_path):
16 | if '.avi' not in file_name:
17 | continue
18 | name, ext = os.path.splitext(file_name)
19 | dst_directory_path = os.path.join(dst_class_path, name)
20 |
21 | video_file_path = os.path.join(class_path, file_name)
22 | try:
23 | if os.path.exists(dst_directory_path):
24 | if not os.path.exists(os.path.join(dst_directory_path, 'image_00001.jpg')):
25 | subprocess.call('rm -r \"{}\"'.format(dst_directory_path), shell=True)
26 | print('remove {}'.format(dst_directory_path))
27 | os.mkdir(dst_directory_path)
28 | else:
29 | continue
30 | else:
31 | os.mkdir(dst_directory_path)
32 | except:
33 | print(dst_directory_path)
34 | continue
35 | cmd = 'ffmpeg -i \"{}\" -vf scale=-1:240 \"{}/image_%05d.jpg\"'.format(video_file_path, dst_directory_path)
36 | print(cmd)
37 | subprocess.call(cmd, shell=True)
38 | print('\n')
39 |
40 | if __name__=="__main__":
41 | dir_path = sys.argv[1]
42 | dst_dir_path = sys.argv[2]
43 |
44 | for class_name in os.listdir(dir_path):
45 | class_process(dir_path, dst_dir_path, class_name)
46 |
--------------------------------------------------------------------------------
/3D_experiment/validation.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.autograd import Variable
3 | import time
4 | import sys
5 |
6 | from utils import AverageMeter, calculate_accuracy
7 |
8 |
9 | def val_epoch(epoch, data_loader, model, criterion, opt, logger):
10 | print('validation at epoch {}'.format(epoch))
11 |
12 | model.eval()
13 |
14 | batch_time = AverageMeter()
15 | data_time = AverageMeter()
16 | losses = AverageMeter()
17 | accuracies = AverageMeter()
18 |
19 | end_time = time.time()
20 | for i, (inputs, targets) in enumerate(data_loader):
21 | data_time.update(time.time() - end_time)
22 |
23 |         if not opt.no_cuda:
24 |             targets = targets.cuda(non_blocking=True)
25 |         # inference only, so no gradient tracking is needed
26 |         with torch.no_grad():
27 |             outputs = model(inputs)
28 |             loss = criterion(outputs, targets)
29 |         acc = calculate_accuracy(outputs, targets)
30 | 
31 |         losses.update(loss.item(), inputs.size(0))
32 | accuracies.update(acc, inputs.size(0))
33 |
34 | batch_time.update(time.time() - end_time)
35 | end_time = time.time()
36 |
37 | print('Epoch: [{0}][{1}/{2}]\t'
38 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
39 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
40 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
41 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format(
42 | epoch,
43 | i + 1,
44 | len(data_loader),
45 | batch_time=batch_time,
46 | data_time=data_time,
47 | loss=losses,
48 | acc=accuracies))
49 |
50 | logger.log({'epoch': epoch, 'loss': losses.avg, 'acc': accuracies.avg})
51 |
52 | return losses.avg
53 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Seunghwan Cha
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PyTorch Implementation of Non-Local Neural Network
2 |
3 | This repository contains my implementation of [Non-Local Neural Networks (CVPR 2018)](https://arxiv.org/pdf/1711.07971.pdf).
4 | 
5 | For more background on the paper, you may refer to this [slide](https://www.youtube.com/redirect?redir_token=4Bf1C-e-Vz_0r5HbPD9meYLcyL58MTU1MTc5MjE0NEAxNTUxNzA1NzQ0&q=https%3A%2F%2Fwww.slideshare.net%2FTaeohKim4%2Fpr083-nonlocal-neural-networks&v=ZM153wo3baA&event=video_description) and [video](https://www.youtube.com/watch?v=ZM153wo3baA), both in Korean.
6 | 
7 | The experiment was run on the CIFAR-10 dataset to verify that the code runs without error.
8 |
9 | ## Implementation Details
10 | The original paper used ResNet-50 as its backbone structure for conducting experiments on video datasets such as Kinetics and Charades.
11 | 
12 | As an initial study, I adopted the ResNet-56 structure for CIFAR-10, which is a 2D classification task. The architecture is implemented in `models/resnet2D.py`.
13 | 
14 | The original baseline model from the paper, called C2D, uses ResNet-50 as its backbone with one non-local block added after the 4th residual block. This structure is implemented in `models/resnet3D.py`. The detail of the architecture is shown in the figure below:
15 |
16 | ![C2D architecture (Table 1)](./figure/Table1.jpg)
17 |
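A minimal usage sketch of this C2D model (mirroring the test case at the bottom of `models/resnet3D.py`; the input shape is `(batch, channels, frames, height, width)` and the variable names are illustrative):

```python
import torch
from models.resnet3D import resnet3D50

clip = torch.randn(1, 3, 32, 224, 224)   # one clip of 32 RGB frames at 224 x 224
model = resnet3D50(non_local=True)       # C2D ResNet-50 with one non-local block
logits = model(clip)                     # torch.Size([1, 400]) for the default 400 classes
```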
18 | The four different pairwise functions discussed in the paper are implemented accordingly in `models/non_local.py`. You can simply pass one of the operations as an argument. The details of the non-local block are shown in the figure below:
19 | ![Non-local block (Figure 2)](./figure/Figure2.jpg)
20 |
21 |
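A short sketch of selecting a pairwise function through the `mode` argument of `NLBlockND` (the tensor sizes here are arbitrary):

```python
import torch
from models.non_local import NLBlockND

x = torch.randn(1, 16, 4, 14, 14)        # (N, C, T, H, W) spatiotemporal input
for mode in ['gaussian', 'embedded', 'dot', 'concatenate']:
    block = NLBlockND(in_channels=16, mode=mode, dimension=3)
    print(mode, block(x).shape)          # output shape matches the input
```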
22 | Finally, the original activity recognition experiment was similarly replicated in the `3D_experiment` folder. The necessary data preprocessing code was borrowed from https://github.com/kenshohara/3D-ResNets-PyTorch. The training runs without error, but I did not have enough time to compare the performance boost from the addition of the non-local block.
23 |
24 | ## Training
25 | 1) To start training for CIFAR-10 with ResNet-56, you can simply execute `run.sh`.
26 |
27 | 2) To start training for the HMDB51 dataset with C2D, you first need to prepare the HMDB51 dataset as instructed in the `3D_experiment` folder. Then, execute `run.sh`. Multiple GPUs may be needed due to memory constraints.
28 |
29 | ## Results
30 | Trained on CIFAR-10 for 200 epochs using the command shown in `run.sh`. The training was conducted on a single 1080 Ti GPU.
31 | The results showed that there was not a huge performance boost for the image classification task on CIFAR-10. The graph below illustrates the loss curves for the two networks.
32 |
33 | ![Loss curves on CIFAR-10](./figure/resnet56_cifar.jpg)
34 |
35 | The Top-1 validation accuracy for ResNet-56 without a non-local block was *93.97%*, while the network with a non-local block reached *93.98%*.
36 | 
37 | This could be due to two reasons: 1) the proposed method mainly targets video classification; 2) the input size of CIFAR-10 is too small, so spatial information may not be preserved after the second residual block.
38 |
39 | ## TO DO
40 | - [x] Compare the result of baseline model and that of non-local model for CIFAR-10
41 | - [x] Prepare video dataset (e.g. UCF-101, HMDB-51)
42 | - [x] Modify the model code to adapt to spatiotemporal settings
43 | - [x] Run test on some video datasets
44 | - [ ] Run test on image segmentation dataset (e.g. COCO)
45 |
46 | ## Reference
47 | This repo is an adaptation of several other existing works.
48 | - https://github.com/akamaster/pytorch_resnet_cifar10
49 | - https://github.com/kuangliu/pytorch-cifar
50 | - https://github.com/facebookresearch/video-nonlocal-net
51 | - https://github.com/AlexHex7/Non-local_pytorch
52 | - https://github.com/kenshohara/3D-ResNets-PyTorch
53 |
54 |
--------------------------------------------------------------------------------
/figure/Figure2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tea1528/Non-Local-NN-Pytorch/64c6047f7d801402d04340695bfbdfeee03e4797/figure/Figure2.jpg
--------------------------------------------------------------------------------
/figure/Table1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tea1528/Non-Local-NN-Pytorch/64c6047f7d801402d04340695bfbdfeee03e4797/figure/Table1.jpg
--------------------------------------------------------------------------------
/figure/resnet56_cifar.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tea1528/Non-Local-NN-Pytorch/64c6047f7d801402d04340695bfbdfeee03e4797/figure/resnet56_cifar.jpg
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | '''Train CIFAR10 with PyTorch.'''
2 | from __future__ import print_function
3 |
4 | import torch
5 | import torch.nn as nn
6 | import torch.optim as optim
7 | import torch.nn.functional as F
8 | import torch.backends.cudnn as cudnn
9 |
10 | import torchvision
11 | import torchvision.transforms as transforms
12 |
13 | import os
14 | import argparse
15 |
16 | from models.resnet2D import resnet2D56
17 |
18 | parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training')
19 | parser.add_argument('--lr', default=0.1, type=float, help='learning rate')
20 | parser.add_argument('--verbose', '-v', action='store_true', help='display progress bar')
21 | parser.add_argument('--resume', '-r', action='store_true', help='resume from checkpoint')
22 | parser.add_argument('--nl', '-n', action='store_true', help='add non-local block')
23 | args = parser.parse_args()
24 |
25 | if args.verbose:
26 | from utils import progress_bar
27 |
28 | device = 'cuda' if torch.cuda.is_available() else 'cpu'
29 | best_acc = 0 # best test accuracy
30 | start_epoch = 0 # start from epoch 0 or last checkpoint epoch
31 |
32 | # Data
33 | print('==> Preparing data..')
34 | transform_train = transforms.Compose([
35 | transforms.RandomCrop(32, padding=4),
36 | transforms.RandomHorizontalFlip(),
37 | transforms.ToTensor(),
38 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
39 | ])
40 |
41 | transform_test = transforms.Compose([
42 | transforms.ToTensor(),
43 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
44 | ])
45 |
46 | trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
47 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2)
48 |
49 | testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
50 | testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=2)
51 |
52 | classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
53 |
54 | # Model
55 | print('==> Building model..')
56 | if args.nl:
57 | print("ResNet-56 with non-local block after second residual block..")
58 | net = resnet2D56(non_local=True)
59 | else:
60 | print("ResNet-56 without non-local block..")
61 | net = resnet2D56(non_local=False)
62 |
63 |
64 |
65 | net = net.to(device)
66 |
67 | if device == 'cuda':
68 | net = torch.nn.DataParallel(net)
69 | cudnn.benchmark = True
70 |
71 | if args.resume:
72 | # Load checkpoint.
73 | print('==> Resuming from checkpoint..')
74 | assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'
75 | checkpoint = torch.load('./checkpoint/ckpt.t7')
76 | net.load_state_dict(checkpoint['net'])
77 | best_acc = checkpoint['acc']
78 | start_epoch = checkpoint['epoch']
79 |
80 | criterion = nn.CrossEntropyLoss()
81 | optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
82 | lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 150], last_epoch=start_epoch - 1)
83 |
84 | # Training
85 | def train(epoch):
86 | print('\nEpoch: %d' % epoch)
87 | net.train()
88 | train_loss = 0
89 | correct = 0
90 | total = 0
91 | for batch_idx, (inputs, targets) in enumerate(trainloader):
92 | inputs, targets = inputs.to(device), targets.to(device)
93 | optimizer.zero_grad()
94 | outputs = net(inputs)
95 | loss = criterion(outputs, targets)
96 | loss.backward()
97 | optimizer.step()
98 |
99 | train_loss += loss.item()
100 | _, predicted = outputs.max(1)
101 | total += targets.size(0)
102 | correct += predicted.eq(targets).sum().item()
103 |
104 | if args.verbose:
105 | progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
106 | % (train_loss/(batch_idx+1), 100.*correct/total, correct, total))
107 | if not args.verbose:
108 | print('Loss: %.3f' % train_loss)
109 |
110 | return train_loss
111 |
112 | def test(epoch):
113 | global best_acc
114 | net.eval()
115 | test_loss = 0
116 | correct = 0
117 | total = 0
118 | with torch.no_grad():
119 | for batch_idx, (inputs, targets) in enumerate(testloader):
120 | inputs, targets = inputs.to(device), targets.to(device)
121 | outputs = net(inputs)
122 | loss = criterion(outputs, targets)
123 |
124 | test_loss += loss.item()
125 | _, predicted = outputs.max(1)
126 | total += targets.size(0)
127 | correct += predicted.eq(targets).sum().item()
128 |
129 | if args.verbose:
130 | progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
131 | % (test_loss/(batch_idx+1), 100.*correct/total, correct, total))
132 |
133 | if not args.verbose:
134 | print('Loss: %.3f' % test_loss)
135 |
136 | # Save checkpoint.
137 | acc = 100.*correct/total
138 | if acc > best_acc:
139 | print('Saving..')
140 | state = {
141 | 'net': net.state_dict(),
142 | 'acc': acc,
143 | 'epoch': epoch,
144 | }
145 | if not os.path.isdir('checkpoint'):
146 | os.mkdir('checkpoint')
147 | torch.save(state, './checkpoint/ckpt.t7')
148 | best_acc = acc
149 | return test_loss
150 |
151 | tr_loss_list = []
152 | tst_loss_list = []
153 |
154 | for epoch in range(start_epoch, start_epoch+200):
155 | train_l = train(epoch)
156 | lr_scheduler.step()
157 | test_l = test(epoch)
158 | tr_loss_list.append(train_l)
159 | tst_loss_list.append(test_l)
160 |
161 | print("Best Accuracy: ", best_acc)
162 | print("-----------------------------------------------")
163 |
164 | print("train loss")
165 | print(tr_loss_list)
166 | print("test loss")
167 | print(tst_loss_list)
168 |
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 |
--------------------------------------------------------------------------------
/models/non_local.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from torch.nn import functional as F
4 |
5 |
6 | class NLBlockND(nn.Module):
7 | def __init__(self, in_channels, inter_channels=None, mode='embedded',
8 | dimension=3, bn_layer=True):
9 |         """Implementation of the Non-Local Block with 4 different pairwise functions; the subsampling trick is not included.
10 | args:
11 | in_channels: original channel size (1024 in the paper)
12 |             inter_channels: channel size inside the block; if not specified, reduced to half (512 in the paper)
13 | mode: supports Gaussian, Embedded Gaussian, Dot Product, and Concatenation
14 | dimension: can be 1 (temporal), 2 (spatial), 3 (spatiotemporal)
15 | bn_layer: whether to add batch norm
16 | """
17 | super(NLBlockND, self).__init__()
18 |
19 | assert dimension in [1, 2, 3]
20 |
21 | if mode not in ['gaussian', 'embedded', 'dot', 'concatenate']:
22 | raise ValueError('`mode` must be one of `gaussian`, `embedded`, `dot` or `concatenate`')
23 |
24 | self.mode = mode
25 | self.dimension = dimension
26 |
27 | self.in_channels = in_channels
28 | self.inter_channels = inter_channels
29 |
30 | # the channel size is reduced to half inside the block
31 | if self.inter_channels is None:
32 | self.inter_channels = in_channels // 2
33 | if self.inter_channels == 0:
34 | self.inter_channels = 1
35 |
36 | # assign appropriate convolutional, max pool, and batch norm layers for different dimensions
37 | if dimension == 3:
38 | conv_nd = nn.Conv3d
39 | max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2))
40 | bn = nn.BatchNorm3d
41 | elif dimension == 2:
42 | conv_nd = nn.Conv2d
43 | max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2))
44 | bn = nn.BatchNorm2d
45 | else:
46 | conv_nd = nn.Conv1d
47 | max_pool_layer = nn.MaxPool1d(kernel_size=(2))
48 | bn = nn.BatchNorm1d
49 |
50 | # function g in the paper which goes through conv. with kernel size 1
51 | self.g = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, kernel_size=1)
52 |
53 | # add BatchNorm layer after the last conv layer
54 | if bn_layer:
55 | self.W_z = nn.Sequential(
56 | conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, kernel_size=1),
57 | bn(self.in_channels)
58 | )
59 | # from section 4.1 of the paper, initializing params of BN ensures that the initial state of non-local block is identity mapping
60 | nn.init.constant_(self.W_z[1].weight, 0)
61 | nn.init.constant_(self.W_z[1].bias, 0)
62 | else:
63 | self.W_z = conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, kernel_size=1)
64 |
65 | # from section 3.3 of the paper by initializing Wz to 0, this block can be inserted to any existing architecture
66 | nn.init.constant_(self.W_z.weight, 0)
67 | nn.init.constant_(self.W_z.bias, 0)
68 |
69 | # define theta and phi for all operations except gaussian
70 | if self.mode == "embedded" or self.mode == "dot" or self.mode == "concatenate":
71 | self.theta = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, kernel_size=1)
72 | self.phi = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, kernel_size=1)
73 |
74 | if self.mode == "concatenate":
75 | self.W_f = nn.Sequential(
76 | nn.Conv2d(in_channels=self.inter_channels * 2, out_channels=1, kernel_size=1),
77 | nn.ReLU()
78 | )
79 |
80 | def forward(self, x):
81 | """
82 | args
83 | x: (N, C, T, H, W) for dimension=3; (N, C, H, W) for dimension 2; (N, C, T) for dimension 1
84 | """
85 |
86 | batch_size = x.size(0)
87 |
88 | # (N, C, THW)
89 | # this reshaping and permutation is from the spacetime_nonlocal function in the original Caffe2 implementation
90 | g_x = self.g(x).view(batch_size, self.inter_channels, -1)
91 | g_x = g_x.permute(0, 2, 1)
92 |
93 | if self.mode == "gaussian":
94 | theta_x = x.view(batch_size, self.in_channels, -1)
95 | phi_x = x.view(batch_size, self.in_channels, -1)
96 | theta_x = theta_x.permute(0, 2, 1)
97 | f = torch.matmul(theta_x, phi_x)
98 |
99 | elif self.mode == "embedded" or self.mode == "dot":
100 | theta_x = self.theta(x).view(batch_size, self.inter_channels, -1)
101 | phi_x = self.phi(x).view(batch_size, self.inter_channels, -1)
102 | theta_x = theta_x.permute(0, 2, 1)
103 | f = torch.matmul(theta_x, phi_x)
104 |
105 | elif self.mode == "concatenate":
106 | theta_x = self.theta(x).view(batch_size, self.inter_channels, -1, 1)
107 | phi_x = self.phi(x).view(batch_size, self.inter_channels, 1, -1)
108 |
109 | h = theta_x.size(2)
110 | w = phi_x.size(3)
111 | theta_x = theta_x.repeat(1, 1, 1, w)
112 | phi_x = phi_x.repeat(1, 1, h, 1)
113 |
114 | concat = torch.cat([theta_x, phi_x], dim=1)
115 | f = self.W_f(concat)
116 | f = f.view(f.size(0), f.size(2), f.size(3))
117 |
118 | if self.mode == "gaussian" or self.mode == "embedded":
119 | f_div_C = F.softmax(f, dim=-1)
120 | elif self.mode == "dot" or self.mode == "concatenate":
121 |             N = f.size(-1) # number of positions in x
122 | f_div_C = f / N
123 |
124 | y = torch.matmul(f_div_C, g_x)
125 |
126 | # contiguous here just allocates contiguous chunk of memory
127 | y = y.permute(0, 2, 1).contiguous()
128 | y = y.view(batch_size, self.inter_channels, *x.size()[2:])
129 |
130 | W_y = self.W_z(y)
131 | # residual connection
132 | z = W_y + x
133 |
134 | return z
135 |
136 |
137 | if __name__ == '__main__':
138 | import torch
139 |
140 | for bn_layer in [True, False]:
141 | img = torch.zeros(2, 3, 20)
142 | net = NLBlockND(in_channels=3, mode='concatenate', dimension=1, bn_layer=bn_layer)
143 | out = net(img)
144 | print(out.size())
145 |
146 | img = torch.zeros(2, 3, 20, 20)
147 | net = NLBlockND(in_channels=3, mode='concatenate', dimension=2, bn_layer=bn_layer)
148 | out = net(img)
149 | print(out.size())
150 |
151 | img = torch.randn(2, 3, 8, 20, 20)
152 | net = NLBlockND(in_channels=3, mode='concatenate', dimension=3, bn_layer=bn_layer)
153 | out = net(img)
154 | print(out.size())
155 |
156 |
157 |
--------------------------------------------------------------------------------
/models/resnet2D.py:
--------------------------------------------------------------------------------
1 | '''
2 | Non-Local 2D ResNet (ResNet-56) for the CIFAR-10 dataset.
3 | Most of the code is borrowed from https://github.com/akamaster/pytorch_resnet_cifar10
4 |
5 | Properly implemented ResNet-s for CIFAR10 as described in paper [1].
6 |
7 | The implementation and structure of this file are hugely influenced by [2],
8 | which is implemented for ImageNet and doesn't have option A for identity.
9 | Moreover, most of the implementations on the web are copy-pasted from
10 | torchvision's resnet and have the wrong number of params.
11 |
12 | Reference:
13 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
14 | Deep Residual Learning for Image Recognition. arXiv:1512.03385
15 | [2] https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
16 | '''
17 | import torch
18 | import torch.nn as nn
19 | import torch.nn.functional as F
20 | import torch.nn.init as init
21 |
22 | from torch.autograd import Variable
23 | from models.non_local import NLBlockND
24 |
25 |
26 | def _weights_init(m):
27 | if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d):
28 | init.kaiming_normal_(m.weight)
29 |
30 | class LambdaLayer(nn.Module):
31 | def __init__(self, lambd):
32 | super(LambdaLayer, self).__init__()
33 | self.lambd = lambd
34 |
35 | def forward(self, x):
36 | return self.lambd(x)
37 |
38 |
39 | class BasicBlock(nn.Module):
40 | expansion = 1
41 |
42 | def __init__(self, in_planes, planes, stride=1, option='A'):
43 | super(BasicBlock, self).__init__()
44 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
45 | self.bn1 = nn.BatchNorm2d(planes)
46 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
47 | self.bn2 = nn.BatchNorm2d(planes)
48 |
49 | self.shortcut = nn.Sequential()
50 | if stride != 1 or in_planes != planes:
51 | if option == 'A':
52 | """
53 | For CIFAR10 ResNet paper uses option A.
54 | """
55 | self.shortcut = LambdaLayer(lambda x:
56 | F.pad(x[:, :, ::2, ::2], (0, 0, 0, 0, planes//4, planes//4), "constant", 0))
57 | elif option == 'B':
58 | self.shortcut = nn.Sequential(
59 | nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
60 | nn.BatchNorm2d(self.expansion * planes)
61 | )
62 |
63 | def forward(self, x):
64 | out = F.relu(self.bn1(self.conv1(x)))
65 | out = self.bn2(self.conv2(out))
66 | out += self.shortcut(x)
67 | out = F.relu(out)
68 | return out
69 |
70 |
71 | class ResNet2D(nn.Module):
72 | def __init__(self, block, num_blocks, num_classes=10, non_local=False):
73 | super(ResNet2D, self).__init__()
74 | self.in_planes = 16
75 |
76 | self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)
77 | self.bn1 = nn.BatchNorm2d(16)
78 | self.layer1 = self._make_layer(block, 16, num_blocks[0], stride=1)
79 |
80 | # add non-local block after layer 2
81 | self.layer2 = self._make_layer(block, 32, num_blocks[1], stride=2, non_local=non_local)
82 | self.layer3 = self._make_layer(block, 64, num_blocks[2], stride=2)
83 | self.linear = nn.Linear(64, num_classes)
84 |
85 | self.apply(_weights_init)
86 |
87 | def _make_layer(self, block, planes, num_blocks, stride, non_local=False):
88 | strides = [stride] + [1]*(num_blocks-1)
89 | layers = []
90 |
91 | last_idx = len(strides)
92 | if non_local:
93 | last_idx = len(strides) - 1
94 |
95 | for i in range(last_idx):
96 | layers.append(block(self.in_planes, planes, strides[i]))
97 | self.in_planes = planes * block.expansion
98 |
99 | if non_local:
100 | layers.append(NLBlockND(in_channels=planes, dimension=2))
101 | layers.append(block(self.in_planes, planes, strides[-1]))
102 |
103 | return nn.Sequential(*layers)
104 |
105 | def forward(self, x):
106 | out = F.relu(self.bn1(self.conv1(x)))
107 | out = self.layer1(out)
108 | out = self.layer2(out)
109 | out = self.layer3(out)
110 | out = F.avg_pool2d(out, out.size()[3])
111 | out = out.view(out.size(0), -1)
112 | out = self.linear(out)
113 | return out
114 |
115 |
116 | def resnet2D56(non_local=False, **kwargs):
117 | """Constructs a ResNet-56 model.
118 | """
119 | return ResNet2D(BasicBlock, [9, 9, 9], non_local=non_local, **kwargs)
120 |
121 |
122 | if __name__=='__main__':
123 | # Test case for (224 x 224 x 3) input of batch size 1
124 | img = Variable(torch.randn(1, 3, 224, 224))
125 | net = resnet2D56()
126 | count = 0
127 | for name, param in net.named_parameters():
128 | if param.requires_grad:
129 | count += 1
130 | print(name)
131 | print (count)
132 | out = net(img)
133 | print(out.size())
134 |
--------------------------------------------------------------------------------
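A small usage sketch of the 2D model above at the CIFAR-10 input size (complementing the 224 x 224 test case in the file; the variable names are illustrative):

import torch
from models.resnet2D import resnet2D56

x = torch.randn(1, 3, 32, 32)       # a batch of one CIFAR-10 sized image
net = resnet2D56(non_local=True)    # non-local block inserted in layer2
print(net(x).shape)                 # torch.Size([1, 10])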
/models/resnet3D.py:
--------------------------------------------------------------------------------
1 | """
2 | ResNet50 (C2D) for spatiotemporal task. Only ResNet50 backbone structure was implemented here.
3 | """
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 | import math
9 | from functools import partial
10 | from models.non_local import NLBlockND
11 |
12 |
13 | class Bottleneck(nn.Module):
14 | """
15 | Bottleneck block structure used in ResNet 50.
16 | As mentioned in Section 4. 2D ConvNet baseline (C2D),
17 |     all convolutions are in essence 2D kernels that process the input frame-by-frame
18 | (implemented as (1 x k x k) kernels).
19 | """
20 | expansion = 4
21 |
22 | def __init__(self, inplanes, planes, stride=1, padding=(0, 1, 1), downsample=None):
23 | super(Bottleneck, self).__init__()
24 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=(1, 1, 1), bias=False)
25 | self.bn1 = nn.BatchNorm3d(planes)
26 | self.conv2 = nn.Conv3d(planes, planes, kernel_size=(1, 3, 3), stride=stride, padding=padding, bias=False)
27 | self.bn2 = nn.BatchNorm3d(planes)
28 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=(1, 1, 1), bias=False)
29 | self.bn3 = nn.BatchNorm3d(planes * 4)
30 | self.relu = nn.ReLU(inplace=True)
31 | self.downsample = downsample
32 | self.stride = stride
33 |
34 | def forward(self, x):
35 | identity = x
36 |
37 | out = self.conv1(x)
38 | out = self.bn1(out)
39 | out = self.relu(out)
40 |
41 | out = self.conv2(out)
42 | out = self.bn2(out)
43 | out = self.relu(out)
44 |
45 | out = self.conv3(out)
46 | out = self.bn3(out)
47 |
48 | if self.downsample is not None:
49 | identity = self.downsample(x)
50 |
51 | out += identity
52 | out = self.relu(out)
53 |
54 | return out
55 |
56 |
57 | class ResNet3D(nn.Module):
58 | """C2D with ResNet 50 backbone.
59 |     The only operation involving the temporal domain is the pooling layer after the second residual block.
60 | For more details of the structure, refer to Table 1 from the paper.
61 | Padding was added accordingly to match the correct dimensionality.
62 | """
63 | def __init__(self, block, layers, num_classes=400, non_local=False):
64 | self.inplanes = 64
65 | super(ResNet3D, self).__init__()
66 |
67 | # first convolution operation has essentially 2D kernels
68 | # output: 64 x 16 x 112 x 112
69 | self.conv1 = nn.Conv3d(3, 64, kernel_size=(1, 7, 7), stride=2, padding=(0, 3, 3), bias=False)
70 | self.bn1 = nn.BatchNorm3d(64)
71 | self.relu = nn.ReLU(inplace=True)
72 |
73 | # output: 64 x 8 x 56 x 56
74 | self.pool1 = nn.MaxPool3d(kernel_size=3, stride=2)
75 |
76 | # output: 256 x 8 x 56 x 56
77 | self.layer1 = self._make_layer(block, 64, layers[0], stride=1, d_padding=0)
78 |
79 | # pooling on temporal domain
80 | # output: 256 x 4 x 56 x 56
81 | self.pool_t = nn.MaxPool3d(kernel_size=(3, 1, 1), stride=(2, 1, 1))
82 |
83 | # output: 512 x 4 x 28 x 28
84 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2, padding=(2, 1, 1))
85 |
86 | # add one non-local block here
87 | # output: 1024 x 4 x 14 x 14
88 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2, padding=(2, 1, 1), non_local=non_local)
89 |
90 | # output: 2048 x 4 x 7 x 7
91 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2, padding=(2, 1, 1))
92 |
93 | # output: 2048 x 1
94 | self.avgpool = nn.AvgPool3d(kernel_size=(4, 7, 7))
95 | self.fc = nn.Linear(512 * block.expansion, num_classes)
96 |
97 | for m in self.modules():
98 | if isinstance(m, nn.Conv3d):
99 | m.weight = nn.init.kaiming_normal_(m.weight, mode='fan_out')
100 | elif isinstance(m, nn.BatchNorm3d):
101 | m.weight.data.fill_(1)
102 | m.bias.data.zero_()
103 |
104 | def _make_layer(self, block, planes, blocks, stride=1, padding=(0, 1, 1), d_padding=(2, 0, 0), non_local=False):
105 | downsample = nn.Sequential(
106 | nn.Conv3d(self.inplanes, planes * block.expansion,
107 | kernel_size=1, stride=stride, padding=d_padding, bias=False),
108 | nn.BatchNorm3d(planes * block.expansion)
109 | )
110 |
111 | layers = []
112 | layers.append(block(self.inplanes, planes, stride, padding, downsample))
113 | self.inplanes = planes * block.expansion
114 |
115 | last_idx = blocks
116 | if non_local:
117 | last_idx = blocks - 1
118 |
119 | for i in range(1, last_idx):
120 | layers.append(block(self.inplanes, planes))
121 |
122 | # add non-local block here
123 | if non_local:
124 | layers.append(NLBlockND(in_channels=1024, dimension=3))
125 | layers.append(block(self.inplanes, planes))
126 |
127 | return nn.Sequential(*layers)
128 |
129 | def forward(self, x):
130 | x = self.conv1(x)
131 | x = self.bn1(x)
132 | x = self.relu(x)
133 | x = self.pool1(x)
134 |
135 | x = self.layer1(x)
136 | x = self.pool_t(x)
137 | x = self.layer2(x)
138 | x = self.layer3(x)
139 | x = self.layer4(x)
140 |
141 | x = self.avgpool(x)
142 |
143 | x = x.view(x.size(0), -1)
144 | x = self.fc(x)
145 |
146 | return x
147 |
148 |
149 | def resnet3D50(non_local=False, **kwargs):
150 | """Constructs a C2D ResNet-50 model.
151 | """
152 | model = ResNet3D(Bottleneck, [3, 4, 6, 3], non_local=non_local, **kwargs)
153 | return model
154 |
155 |
156 |
157 | if __name__=='__main__':
158 | # Test case of 32 frames (224 x 224 x 3) input of batch size 1
159 | img = Variable(torch.randn(1, 3, 32, 224, 224))
160 | net = resnet3D50(non_local=True)
161 | count = 0
162 | for name, param in net.named_parameters():
163 | if param.requires_grad:
164 | count += 1
165 | print(name)
166 | print (count)
167 | out = net(img)
168 | print(out.size())
169 |
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
1 | python main.py --verbose 2>&1 | tee regular_output.txt
2 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | '''Some helper functions for PyTorch, including:
2 | - get_mean_and_std: calculate the mean and std value of dataset.
3 | - msr_init: net parameter initialization.
4 | - progress_bar: progress bar mimic xlua.progress.
5 | '''
6 | import os
7 | import sys
8 | import time
9 | import math
10 | import torch.utils.data
11 | import torch.nn as nn
12 | import torch.nn.init as init
13 |
14 |
15 | def get_mean_and_std(dataset):
16 | '''Compute the mean and std value of dataset.'''
17 | dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True, num_workers=2)
18 | mean = torch.zeros(3)
19 | std = torch.zeros(3)
20 | print('==> Computing mean and std..')
21 | for inputs, targets in dataloader:
22 | for i in range(3):
23 | mean[i] += inputs[:,i,:,:].mean()
24 | std[i] += inputs[:,i,:,:].std()
25 | mean.div_(len(dataset))
26 | std.div_(len(dataset))
27 | return mean, std
28 |
29 | def init_params(net):
30 | '''Init layer parameters.'''
31 | for m in net.modules():
32 | if isinstance(m, nn.Conv2d):
33 |             init.kaiming_normal_(m.weight, mode='fan_out')
34 |             if m.bias is not None:
35 |                 init.constant_(m.bias, 0)
36 |         elif isinstance(m, nn.BatchNorm2d):
37 |             init.constant_(m.weight, 1)
38 |             init.constant_(m.bias, 0)
39 |         elif isinstance(m, nn.Linear):
40 |             init.normal_(m.weight, std=1e-3)
41 |             if m.bias is not None:
42 |                 init.constant_(m.bias, 0)
43 |
44 |
45 | _, term_width = os.popen('stty size', 'r').read().split()
46 | term_width = int(term_width)
47 |
48 | TOTAL_BAR_LENGTH = 65.
49 | last_time = time.time()
50 | begin_time = last_time
51 | def progress_bar(current, total, msg=None):
52 | global last_time, begin_time
53 | if current == 0:
54 | begin_time = time.time() # Reset for new bar.
55 |
56 | cur_len = int(TOTAL_BAR_LENGTH*current/total)
57 | rest_len = int(TOTAL_BAR_LENGTH - cur_len) - 1
58 |
59 | sys.stdout.write(' [')
60 | for i in range(cur_len):
61 | sys.stdout.write('=')
62 | sys.stdout.write('>')
63 | for i in range(rest_len):
64 | sys.stdout.write('.')
65 | sys.stdout.write(']')
66 |
67 | cur_time = time.time()
68 | step_time = cur_time - last_time
69 | last_time = cur_time
70 | tot_time = cur_time - begin_time
71 |
72 | L = []
73 | L.append(' Step: %s' % format_time(step_time))
74 | L.append(' | Tot: %s' % format_time(tot_time))
75 | if msg:
76 | L.append(' | ' + msg)
77 |
78 | msg = ''.join(L)
79 | sys.stdout.write(msg)
80 | for i in range(term_width-int(TOTAL_BAR_LENGTH)-len(msg)-3):
81 | sys.stdout.write(' ')
82 |
83 | # Go back to the center of the bar.
84 | for i in range(term_width-int(TOTAL_BAR_LENGTH/2)+2):
85 | sys.stdout.write('\b')
86 | sys.stdout.write(' %d/%d ' % (current+1, total))
87 |
88 | if current < total-1:
89 | sys.stdout.write('\r')
90 | else:
91 | sys.stdout.write('\n')
92 | sys.stdout.flush()
93 |
94 | def format_time(seconds):
95 | days = int(seconds / 3600/24)
96 | seconds = seconds - days*3600*24
97 | hours = int(seconds / 3600)
98 | seconds = seconds - hours*3600
99 | minutes = int(seconds / 60)
100 | seconds = seconds - minutes*60
101 | secondsf = int(seconds)
102 | seconds = seconds - secondsf
103 | millis = int(seconds*1000)
104 |
105 | f = ''
106 | i = 1
107 | if days > 0:
108 | f += str(days) + 'D'
109 | i += 1
110 | if hours > 0 and i <= 2:
111 | f += str(hours) + 'h'
112 | i += 1
113 | if minutes > 0 and i <= 2:
114 | f += str(minutes) + 'm'
115 | i += 1
116 | if secondsf > 0 and i <= 2:
117 | f += str(secondsf) + 's'
118 | i += 1
119 | if millis > 0 and i <= 2:
120 | f += str(millis) + 'ms'
121 | i += 1
122 | if f == '':
123 | f = '0ms'
124 | return f
125 |
--------------------------------------------------------------------------------
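A small usage sketch for `get_mean_and_std` above, assuming torchvision is installed and the `./data` root used in main.py:

import torchvision
import torchvision.transforms as transforms
from utils import get_mean_and_std

trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True,
                                        transform=transforms.ToTensor())
print(get_mean_and_std(trainset))   # roughly the normalization constants hard-coded in main.py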