├── .gitignore ├── 3D_experiment ├── .gitignore ├── LICENSE ├── README.md ├── dataset.py ├── datasets │ ├── activitynet.py │ ├── hmdb51.py │ ├── kinetics.py │ └── ucf101.py ├── main.py ├── mean.py ├── models │ ├── non_local.py │ └── resnet3D.py ├── opts.py ├── run.sh ├── spatial_transforms.py ├── target_transforms.py ├── temporal_transforms.py ├── test.py ├── train.py ├── utils.py ├── utils │ ├── eval_hmdb51.py │ ├── eval_kinetics.py │ ├── eval_ucf101.py │ ├── fps.py │ ├── hmdb51_json.py │ ├── kinetics_json.py │ ├── n_frames_kinetics.py │ ├── n_frames_ucf101_hmdb51.py │ ├── ucf101_json.py │ ├── video_jpg.py │ ├── video_jpg_kinetics.py │ └── video_jpg_ucf101_hmdb51.py └── validation.py ├── LICENSE ├── README.md ├── figure ├── Figure2.jpg ├── Table1.jpg └── resnet56_cifar.jpg ├── main.py ├── models ├── __init__.py ├── non_local.py ├── resnet2D.py └── resnet3D.py ├── run.sh └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | data/ 3 | checkpoint/ 4 | *.txt 5 | plot_loss.ipynb 6 | -------------------------------------------------------------------------------- /3D_experiment/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | data/ 7 | __pycache__ 8 | *.txt 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # dotenv 87 | .env 88 | 89 | # virtualenv 90 | .venv 91 | venv/ 92 | ENV/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | 107 | .DS_Store 108 | 109 | .vscode 110 | -------------------------------------------------------------------------------- /3D_experiment/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Kensho Hara 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /3D_experiment/README.md: -------------------------------------------------------------------------------- 1 | # 3D ResNets for Action Recognition 2 | 3 | ## TL:DR 4 | Run `run.sh` to start training using C2D model. If you wish to run other models, please refer to the original repository. 5 | Most of the code is borrowed from https://github.com/kenshohara/3D-ResNets-PyTorch except for the model architecture. 6 | 7 | ## Summary 8 | 9 | This is the PyTorch code for the following papers: 10 | 11 | [ 12 | Kensho Hara, Hirokatsu Kataoka, and Yutaka Satoh, 13 | "Can Spatiotemporal 3D CNNs Retrace the History of 2D CNNs and ImageNet?", 14 | Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6546-6555, 2018. 
15 | ](http://openaccess.thecvf.com/content_cvpr_2018/html/Hara_Can_Spatiotemporal_3D_CVPR_2018_paper.html) 16 | 17 | [ 18 | Kensho Hara, Hirokatsu Kataoka, and Yutaka Satoh, 19 | "Learning Spatio-Temporal Features with 3D Residual Networks for Action Recognition", 20 | Proceedings of the ICCV Workshop on Action, Gesture, and Emotion Recognition, 2017. 21 | ](http://openaccess.thecvf.com/content_ICCV_2017_workshops/papers/w44/Hara_Learning_Spatio-Temporal_Features_ICCV_2017_paper.pdf) 22 | 23 | This code includes training, fine-tuning and testing on Kinetics, ActivityNet, UCF-101, and HMDB-51. 24 | 25 | ## Citation 26 | 27 | If you use this code or pre-trained models, please cite the following: 28 | 29 | ```bibtex 30 | @inproceedings{hara3dcnns, 31 | author={Kensho Hara and Hirokatsu Kataoka and Yutaka Satoh}, 32 | title={Can Spatiotemporal 3D CNNs Retrace the History of 2D CNNs and ImageNet?}, 33 | booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, 34 | pages={6546--6555}, 35 | year={2018}, 36 | } 37 | ``` 38 | 39 | ## Requirements 40 | 41 | * [PyTorch](http://pytorch.org/) 42 | 43 | ```bash 44 | conda install pytorch torchvision cuda80 -c soumith 45 | ``` 46 | 47 | * FFmpeg, FFprobe 48 | 49 | ```bash 50 | wget http://johnvansickle.com/ffmpeg/releases/ffmpeg-release-64bit-static.tar.xz 51 | tar xvf ffmpeg-release-64bit-static.tar.xz 52 | cd ./ffmpeg-3.3.3-64bit-static/; sudo cp ffmpeg ffprobe /usr/local/bin; 53 | ``` 54 | 55 | * Python 3 56 | 57 | ## Preparation 58 | 59 | ### ActivityNet 60 | 61 | * Download videos using [the official crawler](https://github.com/activitynet/ActivityNet/tree/master/Crawler). 62 | * Convert from avi to jpg files using ```utils/video_jpg.py``` 63 | 64 | ```bash 65 | python utils/video_jpg.py avi_video_directory jpg_video_directory 66 | ``` 67 | 68 | * Generate fps files using ```utils/fps.py``` 69 | 70 | ```bash 71 | python utils/fps.py avi_video_directory jpg_video_directory 72 | ``` 73 | 74 | ### Kinetics 75 | 76 | * Download videos using [the official crawler](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics). 77 | * Locate test set in ```video_directory/test```. 78 | * Convert from avi to jpg files using ```utils/video_jpg_kinetics.py``` 79 | 80 | ```bash 81 | python utils/video_jpg_kinetics.py avi_video_directory jpg_video_directory 82 | ``` 83 | 84 | * Generate n_frames files using ```utils/n_frames_kinetics.py``` 85 | 86 | ```bash 87 | python utils/n_frames_kinetics.py jpg_video_directory 88 | ``` 89 | 90 | * Generate annotation file in json format similar to ActivityNet using ```utils/kinetics_json.py``` 91 | * The CSV files (kinetics_{train, val, test}.csv) are included in the crawler. 92 | 93 | ```bash 94 | python utils/kinetics_json.py train_csv_path val_csv_path test_csv_path dst_json_path 95 | ``` 96 | 97 | ### UCF-101 98 | 99 | * Download videos and train/test splits [here](http://crcv.ucf.edu/data/UCF101.php). 
100 | * Convert from avi to jpg files using ```utils/video_jpg_ucf101_hmdb51.py```
101 | 
102 | ```bash
103 | python utils/video_jpg_ucf101_hmdb51.py avi_video_directory jpg_video_directory
104 | ```
105 | 
106 | * Generate n_frames files using ```utils/n_frames_ucf101_hmdb51.py```
107 | 
108 | ```bash
109 | python utils/n_frames_ucf101_hmdb51.py jpg_video_directory
110 | ```
111 | 
112 | * Generate annotation file in json format similar to ActivityNet using ```utils/ucf101_json.py```
113 | * ```annotation_dir_path``` includes classInd.txt, trainlist0{1, 2, 3}.txt, testlist0{1, 2, 3}.txt
114 | 
115 | ```bash
116 | python utils/ucf101_json.py annotation_dir_path
117 | ```
118 | 
119 | ### HMDB-51
120 | 
121 | * Download videos and train/test splits [here](http://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/).
122 | * Convert from avi to jpg files using ```utils/video_jpg_ucf101_hmdb51.py```
123 | 
124 | ```bash
125 | python utils/video_jpg_ucf101_hmdb51.py avi_video_directory jpg_video_directory
126 | ```
127 | 
128 | * Generate n_frames files using ```utils/n_frames_ucf101_hmdb51.py```
129 | 
130 | ```bash
131 | python utils/n_frames_ucf101_hmdb51.py jpg_video_directory
132 | ```
133 | 
134 | * Generate annotation file in json format similar to ActivityNet using ```utils/hmdb51_json.py```
135 | * ```annotation_dir_path``` includes brush_hair_test_split1.txt, ...
136 | 
137 | ```bash
138 | python utils/hmdb51_json.py annotation_dir_path
139 | ```
140 | 
141 | ## Running the code
142 | 
143 | Assume the structure of data directories is the following:
144 | 
145 | ```misc
146 | ~/
147 |   data/
148 |     kinetics_videos/
149 |       jpg/
150 |         .../ (directories of class names)
151 |           .../ (directories of video names)
152 |             ... (jpg files)
153 |     results/
154 |       save_100.pth
155 |     kinetics.json
156 | ```
157 | 
158 | Confirm all options.
159 | 
160 | ```bash
161 | python main.py -h
162 | ```
163 | 
164 | Train ResNet-34 on the Kinetics dataset (400 classes) with 4 CPU threads (for data loading).
165 | Batch size is 128.
166 | Save models at every 5 epochs.
167 | All GPUs are used for training.
168 | If you want to use only a subset of GPUs, set ```CUDA_VISIBLE_DEVICES=...```.
169 | 
170 | ```bash
171 | python main.py --root_path ~/data --video_path kinetics_videos/jpg --annotation_path kinetics.json \
172 | --result_path results --dataset kinetics --model resnet \
173 | --model_depth 34 --n_classes 400 --batch_size 128 --n_threads 4 --checkpoint 5
174 | ```
175 | 
176 | Continue training from epoch 101 (~/data/results/save_100.pth is loaded).
177 | 178 | ```bash 179 | python main.py --root_path ~/data --video_path kinetics_videos/jpg --annotation_path kinetics.json \ 180 | --result_path results --dataset kinetics --resume_path results/save_100.pth \ 181 | --model_depth 34 --n_classes 400 --batch_size 128 --n_threads 4 --checkpoint 5 182 | ``` 183 | 184 | 185 | -------------------------------------------------------------------------------- /3D_experiment/dataset.py: -------------------------------------------------------------------------------- 1 | from datasets.kinetics import Kinetics 2 | from datasets.activitynet import ActivityNet 3 | from datasets.ucf101 import UCF101 4 | from datasets.hmdb51 import HMDB51 5 | 6 | 7 | def get_training_set(opt, spatial_transform, temporal_transform, 8 | target_transform): 9 | assert opt.dataset in ['kinetics', 'activitynet', 'ucf101', 'hmdb51'] 10 | 11 | if opt.dataset == 'kinetics': 12 | training_data = Kinetics( 13 | opt.video_path, 14 | opt.annotation_path, 15 | 'training', 16 | spatial_transform=spatial_transform, 17 | temporal_transform=temporal_transform, 18 | target_transform=target_transform) 19 | elif opt.dataset == 'activitynet': 20 | training_data = ActivityNet( 21 | opt.video_path, 22 | opt.annotation_path, 23 | 'training', 24 | False, 25 | spatial_transform=spatial_transform, 26 | temporal_transform=temporal_transform, 27 | target_transform=target_transform) 28 | elif opt.dataset == 'ucf101': 29 | training_data = UCF101( 30 | opt.video_path, 31 | opt.annotation_path, 32 | 'training', 33 | spatial_transform=spatial_transform, 34 | temporal_transform=temporal_transform, 35 | target_transform=target_transform) 36 | elif opt.dataset == 'hmdb51': 37 | training_data = HMDB51( 38 | opt.video_path, 39 | opt.annotation_path, 40 | 'training', 41 | spatial_transform=spatial_transform, 42 | temporal_transform=temporal_transform, 43 | target_transform=target_transform) 44 | 45 | return training_data 46 | 47 | 48 | def get_validation_set(opt, spatial_transform, temporal_transform, 49 | target_transform): 50 | assert opt.dataset in ['kinetics', 'activitynet', 'ucf101', 'hmdb51'] 51 | 52 | if opt.dataset == 'kinetics': 53 | validation_data = Kinetics( 54 | opt.video_path, 55 | opt.annotation_path, 56 | 'validation', 57 | opt.n_val_samples, 58 | spatial_transform, 59 | temporal_transform, 60 | target_transform, 61 | sample_duration=opt.sample_duration) 62 | elif opt.dataset == 'activitynet': 63 | validation_data = ActivityNet( 64 | opt.video_path, 65 | opt.annotation_path, 66 | 'validation', 67 | False, 68 | opt.n_val_samples, 69 | spatial_transform, 70 | temporal_transform, 71 | target_transform, 72 | sample_duration=opt.sample_duration) 73 | elif opt.dataset == 'ucf101': 74 | validation_data = UCF101( 75 | opt.video_path, 76 | opt.annotation_path, 77 | 'validation', 78 | opt.n_val_samples, 79 | spatial_transform, 80 | temporal_transform, 81 | target_transform, 82 | sample_duration=opt.sample_duration) 83 | elif opt.dataset == 'hmdb51': 84 | validation_data = HMDB51( 85 | opt.video_path, 86 | opt.annotation_path, 87 | 'validation', 88 | opt.n_val_samples, 89 | spatial_transform, 90 | temporal_transform, 91 | target_transform, 92 | sample_duration=opt.sample_duration) 93 | return validation_data 94 | 95 | 96 | def get_test_set(opt, spatial_transform, temporal_transform, target_transform): 97 | assert opt.dataset in ['kinetics', 'activitynet', 'ucf101', 'hmdb51'] 98 | assert opt.test_subset in ['val', 'test'] 99 | 100 | if opt.test_subset == 'val': 101 | subset = 'validation' 102 | elif 
opt.test_subset == 'test': 103 | subset = 'testing' 104 | if opt.dataset == 'kinetics': 105 | test_data = Kinetics( 106 | opt.video_path, 107 | opt.annotation_path, 108 | subset, 109 | 0, 110 | spatial_transform, 111 | temporal_transform, 112 | target_transform, 113 | sample_duration=opt.sample_duration) 114 | elif opt.dataset == 'activitynet': 115 | test_data = ActivityNet( 116 | opt.video_path, 117 | opt.annotation_path, 118 | subset, 119 | True, 120 | 0, 121 | spatial_transform, 122 | temporal_transform, 123 | target_transform, 124 | sample_duration=opt.sample_duration) 125 | elif opt.dataset == 'ucf101': 126 | test_data = UCF101( 127 | opt.video_path, 128 | opt.annotation_path, 129 | subset, 130 | 0, 131 | spatial_transform, 132 | temporal_transform, 133 | target_transform, 134 | sample_duration=opt.sample_duration) 135 | elif opt.dataset == 'hmdb51': 136 | test_data = HMDB51( 137 | opt.video_path, 138 | opt.annotation_path, 139 | subset, 140 | 0, 141 | spatial_transform, 142 | temporal_transform, 143 | target_transform, 144 | sample_duration=opt.sample_duration) 145 | 146 | return test_data 147 | -------------------------------------------------------------------------------- /3D_experiment/datasets/activitynet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import functools 6 | import json 7 | import copy 8 | import math 9 | 10 | from utils import load_value_file 11 | 12 | 13 | def pil_loader(path): 14 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 15 | with open(path, 'rb') as f: 16 | with Image.open(f) as img: 17 | return img.convert('RGB') 18 | 19 | 20 | def accimage_loader(path): 21 | try: 22 | import accimage 23 | return accimage.Image(path) 24 | except IOError: 25 | # Potentially a decoding problem, fall back to PIL.Image 26 | return pil_loader(path) 27 | 28 | 29 | def get_default_image_loader(): 30 | from torchvision import get_image_backend 31 | if get_image_backend() == 'accimage': 32 | return accimage_loader 33 | else: 34 | return pil_loader 35 | 36 | 37 | def video_loader(video_dir_path, frame_indices, image_loader): 38 | video = [] 39 | for i in frame_indices: 40 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 41 | if os.path.exists(image_path): 42 | video.append(image_loader(image_path)) 43 | else: 44 | return video 45 | 46 | return video 47 | 48 | 49 | def get_default_video_loader(): 50 | image_loader = get_default_image_loader() 51 | return functools.partial(video_loader, image_loader=image_loader) 52 | 53 | 54 | def load_annotation_data(data_file_path): 55 | with open(data_file_path, 'r') as data_file: 56 | return json.load(data_file) 57 | 58 | 59 | def get_class_labels(data): 60 | class_names = [] 61 | index = 0 62 | for node1 in data['taxonomy']: 63 | is_leaf = True 64 | for node2 in data['taxonomy']: 65 | if node2['parentId'] == node1['nodeId']: 66 | is_leaf = False 67 | break 68 | if is_leaf: 69 | class_names.append(node1['nodeName']) 70 | 71 | class_labels_map = {} 72 | 73 | for i, class_name in enumerate(class_names): 74 | class_labels_map[class_name] = i 75 | 76 | return class_labels_map 77 | 78 | 79 | def get_video_names_and_annotations(data, subset): 80 | video_names = [] 81 | annotations = [] 82 | 83 | for key, value in data['database'].items(): 84 | this_subset = value['subset'] 85 | if this_subset == subset: 86 | if subset == 'testing': 87 | 
video_names.append('v_{}'.format(key)) 88 | else: 89 | video_names.append('v_{}'.format(key)) 90 | annotations.append(value['annotations']) 91 | 92 | return video_names, annotations 93 | 94 | 95 | def modify_frame_indices(video_dir_path, frame_indices): 96 | modified_indices = [] 97 | for i in frame_indices: 98 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 99 | if not os.path.exists(image_path): 100 | return modified_indices 101 | modified_indices.append(i) 102 | return modified_indices 103 | 104 | 105 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 106 | sample_duration): 107 | data = load_annotation_data(annotation_path) 108 | video_names, annotations = get_video_names_and_annotations(data, subset) 109 | class_to_idx = get_class_labels(data) 110 | idx_to_class = {} 111 | for name, label in class_to_idx.items(): 112 | idx_to_class[label] = name 113 | 114 | dataset = [] 115 | for i in range(len(video_names)): 116 | if i % 1000 == 0: 117 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 118 | 119 | video_path = os.path.join(root_path, video_names[i]) 120 | if not os.path.exists(video_path): 121 | continue 122 | 123 | fps_file_path = os.path.join(video_path, 'fps') 124 | fps = load_value_file(fps_file_path) 125 | 126 | for annotation in annotations[i]: 127 | begin_t = math.ceil(annotation['segment'][0] * fps) 128 | end_t = math.ceil(annotation['segment'][1] * fps) 129 | if begin_t == 0: 130 | begin_t = 1 131 | n_frames = end_t - begin_t 132 | 133 | sample = { 134 | 'video': video_path, 135 | 'segment': [begin_t, end_t], 136 | 'fps': fps, 137 | 'video_id': video_names[i][2:] 138 | } 139 | if len(annotations) != 0: 140 | sample['label'] = class_to_idx[annotation['label']] 141 | else: 142 | sample['label'] = -1 143 | 144 | if n_samples_for_each_video == 1: 145 | frame_indices = list(range(begin_t, end_t)) 146 | frame_indices = modify_frame_indices(sample['video'], 147 | frame_indices) 148 | if len(frame_indices) < 16: 149 | continue 150 | sample['frame_indices'] = frame_indices 151 | dataset.append(sample) 152 | else: 153 | if n_samples_for_each_video > 1: 154 | step = max(1, 155 | math.ceil((n_frames - 1 - sample_duration) / 156 | (n_samples_for_each_video - 1))) 157 | else: 158 | step = sample_duration 159 | for j in range(begin_t, end_t, step): 160 | sample_j = copy.deepcopy(sample) 161 | frame_indices = list(range(j, j + sample_duration)) 162 | frame_indices = modify_frame_indices( 163 | sample_j['video'], frame_indices) 164 | if len(frame_indices) < 16: 165 | continue 166 | sample_j['frame_indices'] = frame_indices 167 | dataset.append(sample_j) 168 | 169 | return dataset, idx_to_class 170 | 171 | 172 | def get_end_t(video_path): 173 | file_names = os.listdir(video_path) 174 | image_file_names = [x for x in file_names if 'image' in x] 175 | image_file_names.sort(reverse=True) 176 | return int(image_file_names[0][6:11]) 177 | 178 | 179 | def make_untrimmed_dataset(root_path, annotation_path, subset, 180 | n_samples_for_each_video, sample_duration): 181 | data = load_annotation_data(annotation_path) 182 | video_names, _ = get_video_names_and_annotations(data, subset) 183 | class_to_idx = get_class_labels(data) 184 | idx_to_class = {} 185 | for name, label in class_to_idx.items(): 186 | idx_to_class[label] = name 187 | 188 | dataset = [] 189 | for i in range(len(video_names)): 190 | if i % 1000 == 0: 191 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 192 | 193 | video_path = os.path.join(root_path, 
video_names[i]) 194 | if not os.path.exists(video_path): 195 | continue 196 | 197 | fps_file_path = os.path.join(video_path, 'fps') 198 | fps = load_value_file(fps_file_path) 199 | 200 | begin_t = 1 201 | end_t = get_end_t(video_path) 202 | n_frames = end_t - begin_t 203 | 204 | sample = { 205 | 'video': video_path, 206 | 'segment': [begin_t, end_t], 207 | 'fps': fps, 208 | 'video_id': video_names[i][2:] 209 | } 210 | 211 | if n_samples_for_each_video >= 1: 212 | step = max(1, 213 | math.ceil((n_frames - 1 - sample_duration) / 214 | (n_samples_for_each_video - 1))) 215 | else: 216 | step = sample_duration 217 | for j in range(begin_t, end_t, step): 218 | sample_j = copy.deepcopy(sample) 219 | frame_indices = list(range(j, j + sample_duration)) 220 | frame_indices = modify_frame_indices(sample_j['video'], 221 | frame_indices) 222 | if len(frame_indices) < 16: 223 | continue 224 | sample_j['frame_indices'] = frame_indices 225 | dataset.append(sample_j) 226 | 227 | return dataset, idx_to_class 228 | 229 | 230 | class ActivityNet(data.Dataset): 231 | """ 232 | Args: 233 | root (string): Root directory path. 234 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 235 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 236 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 237 | and returns a transformed version 238 | target_transform (callable, optional): A function/transform that takes in the 239 | target and transforms it. 240 | loader (callable, optional): A function to load an video given its path and frame indices. 241 | Attributes: 242 | classes (list): List of the class names. 243 | class_to_idx (dict): Dict with items (class_name, class_index). 244 | imgs (list): List of (image path, class_index) tuples 245 | """ 246 | 247 | def __init__(self, 248 | root_path, 249 | annotation_path, 250 | subset, 251 | is_untrimmed_setting=False, 252 | n_samples_for_each_video=1, 253 | spatial_transform=None, 254 | temporal_transform=None, 255 | target_transform=None, 256 | sample_duration=16, 257 | get_loader=get_default_video_loader): 258 | if is_untrimmed_setting: 259 | self.data, self.class_names = make_untrimmed_dataset( 260 | root_path, annotation_path, subset, n_samples_for_each_video, 261 | sample_duration) 262 | else: 263 | self.data, self.class_names = make_dataset( 264 | root_path, annotation_path, subset, n_samples_for_each_video, 265 | sample_duration) 266 | 267 | self.spatial_transform = spatial_transform 268 | self.temporal_transform = temporal_transform 269 | self.target_transform = target_transform 270 | self.loader = get_loader() 271 | 272 | def __getitem__(self, index): 273 | """ 274 | Args: 275 | index (int): Index 276 | Returns: 277 | tuple: (image, target) where target is class_index of the target class. 
278 | """ 279 | path = self.data[index]['video'] 280 | 281 | frame_indices = self.data[index]['frame_indices'] 282 | if self.temporal_transform is not None: 283 | frame_indices = self.temporal_transform(frame_indices) 284 | clip = self.loader(path, frame_indices) 285 | if self.spatial_transform is not None: 286 | self.spatial_transform.randomize_parameters() 287 | clip = [self.spatial_transform(img) for img in clip] 288 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 289 | 290 | target = self.data[index] 291 | if self.target_transform is not None: 292 | target = self.target_transform(target) 293 | 294 | return clip, target 295 | 296 | def __len__(self): 297 | return len(self.data) 298 | -------------------------------------------------------------------------------- /3D_experiment/datasets/hmdb51.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import json 8 | import copy 9 | 10 | from utils import load_value_file 11 | 12 | 13 | def pil_loader(path): 14 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 15 | with open(path, 'rb') as f: 16 | with Image.open(f) as img: 17 | return img.convert('RGB') 18 | 19 | 20 | def accimage_loader(path): 21 | try: 22 | import accimage 23 | return accimage.Image(path) 24 | except IOError: 25 | # Potentially a decoding problem, fall back to PIL.Image 26 | return pil_loader(path) 27 | 28 | 29 | def get_default_image_loader(): 30 | from torchvision import get_image_backend 31 | if get_image_backend() == 'accimage': 32 | return accimage_loader 33 | else: 34 | return pil_loader 35 | 36 | 37 | def video_loader(video_dir_path, frame_indices, image_loader): 38 | video = [] 39 | for i in frame_indices: 40 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 41 | if os.path.exists(image_path): 42 | video.append(image_loader(image_path)) 43 | else: 44 | return video 45 | 46 | return video 47 | 48 | 49 | def get_default_video_loader(): 50 | image_loader = get_default_image_loader() 51 | return functools.partial(video_loader, image_loader=image_loader) 52 | 53 | 54 | def load_annotation_data(data_file_path): 55 | with open(data_file_path, 'r') as data_file: 56 | return json.load(data_file) 57 | 58 | 59 | def get_class_labels(data): 60 | class_labels_map = {} 61 | index = 0 62 | for class_label in data['labels']: 63 | class_labels_map[class_label] = index 64 | index += 1 65 | return class_labels_map 66 | 67 | 68 | def get_video_names_and_annotations(data, subset): 69 | video_names = [] 70 | annotations = [] 71 | 72 | for key, value in data['database'].items(): 73 | this_subset = value['subset'] 74 | if this_subset == subset: 75 | label = value['annotations']['label'] 76 | video_names.append('{}/{}'.format(label, key)) 77 | annotations.append(value['annotations']) 78 | 79 | return video_names, annotations 80 | 81 | 82 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 83 | sample_duration): 84 | data = load_annotation_data(annotation_path) 85 | video_names, annotations = get_video_names_and_annotations(data, subset) 86 | class_to_idx = get_class_labels(data) 87 | idx_to_class = {} 88 | for name, label in class_to_idx.items(): 89 | idx_to_class[label] = name 90 | 91 | dataset = [] 92 | for i in range(len(video_names)): 93 | if i % 1000 == 0: 94 | print('dataset loading [{}/{}]'.format(i, 
len(video_names))) 95 | 96 | video_path = os.path.join(root_path, video_names[i]) 97 | if not os.path.exists(video_path): 98 | continue 99 | 100 | n_frames_file_path = os.path.join(video_path, 'n_frames') 101 | n_frames = int(load_value_file(n_frames_file_path)) 102 | if n_frames <= 0: 103 | continue 104 | 105 | begin_t = 1 106 | end_t = n_frames 107 | sample = { 108 | 'video': video_path, 109 | 'segment': [begin_t, end_t], 110 | 'n_frames': n_frames, 111 | 'video_id': video_names[i].split('/')[1] 112 | } 113 | if len(annotations) != 0: 114 | sample['label'] = class_to_idx[annotations[i]['label']] 115 | else: 116 | sample['label'] = -1 117 | 118 | if n_samples_for_each_video == 1: 119 | sample['frame_indices'] = list(range(1, n_frames + 1)) 120 | dataset.append(sample) 121 | else: 122 | if n_samples_for_each_video > 1: 123 | step = max(1, 124 | math.ceil((n_frames - 1 - sample_duration) / 125 | (n_samples_for_each_video - 1))) 126 | else: 127 | step = sample_duration 128 | for j in range(1, n_frames, step): 129 | sample_j = copy.deepcopy(sample) 130 | sample_j['frame_indices'] = list( 131 | range(j, min(n_frames + 1, j + sample_duration))) 132 | dataset.append(sample_j) 133 | 134 | return dataset, idx_to_class 135 | 136 | 137 | class HMDB51(data.Dataset): 138 | """ 139 | Args: 140 | root (string): Root directory path. 141 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 142 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 143 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 144 | and returns a transformed version 145 | target_transform (callable, optional): A function/transform that takes in the 146 | target and transforms it. 147 | loader (callable, optional): A function to load an video given its path and frame indices. 148 | Attributes: 149 | classes (list): List of the class names. 150 | class_to_idx (dict): Dict with items (class_name, class_index). 151 | imgs (list): List of (image path, class_index) tuples 152 | """ 153 | 154 | def __init__(self, 155 | root_path, 156 | annotation_path, 157 | subset, 158 | n_samples_for_each_video=1, 159 | spatial_transform=None, 160 | temporal_transform=None, 161 | target_transform=None, 162 | sample_duration=16, 163 | get_loader=get_default_video_loader): 164 | self.data, self.class_names = make_dataset( 165 | root_path, annotation_path, subset, n_samples_for_each_video, 166 | sample_duration) 167 | 168 | self.spatial_transform = spatial_transform 169 | self.temporal_transform = temporal_transform 170 | self.target_transform = target_transform 171 | self.loader = get_loader() 172 | 173 | def __getitem__(self, index): 174 | """ 175 | Args: 176 | index (int): Index 177 | Returns: 178 | tuple: (image, target) where target is class_index of the target class. 
179 | """ 180 | path = self.data[index]['video'] 181 | 182 | frame_indices = self.data[index]['frame_indices'] 183 | if self.temporal_transform is not None: 184 | frame_indices = self.temporal_transform(frame_indices) 185 | clip = self.loader(path, frame_indices) 186 | if self.spatial_transform is not None: 187 | self.spatial_transform.randomize_parameters() 188 | clip = [self.spatial_transform(img) for img in clip] 189 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 190 | 191 | target = self.data[index] 192 | if self.target_transform is not None: 193 | target = self.target_transform(target) 194 | 195 | return clip, target 196 | 197 | def __len__(self): 198 | return len(self.data) 199 | -------------------------------------------------------------------------------- /3D_experiment/datasets/kinetics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import json 8 | import copy 9 | 10 | from utils import load_value_file 11 | 12 | 13 | def pil_loader(path): 14 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 15 | with open(path, 'rb') as f: 16 | with Image.open(f) as img: 17 | return img.convert('RGB') 18 | 19 | 20 | def accimage_loader(path): 21 | try: 22 | import accimage 23 | return accimage.Image(path) 24 | except IOError: 25 | # Potentially a decoding problem, fall back to PIL.Image 26 | return pil_loader(path) 27 | 28 | 29 | def get_default_image_loader(): 30 | from torchvision import get_image_backend 31 | if get_image_backend() == 'accimage': 32 | return accimage_loader 33 | else: 34 | return pil_loader 35 | 36 | 37 | def video_loader(video_dir_path, frame_indices, image_loader): 38 | video = [] 39 | for i in frame_indices: 40 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 41 | if os.path.exists(image_path): 42 | video.append(image_loader(image_path)) 43 | else: 44 | return video 45 | 46 | return video 47 | 48 | 49 | def get_default_video_loader(): 50 | image_loader = get_default_image_loader() 51 | return functools.partial(video_loader, image_loader=image_loader) 52 | 53 | 54 | def load_annotation_data(data_file_path): 55 | with open(data_file_path, 'r') as data_file: 56 | return json.load(data_file) 57 | 58 | 59 | def get_class_labels(data): 60 | class_labels_map = {} 61 | index = 0 62 | for class_label in data['labels']: 63 | class_labels_map[class_label] = index 64 | index += 1 65 | return class_labels_map 66 | 67 | 68 | def get_video_names_and_annotations(data, subset): 69 | video_names = [] 70 | annotations = [] 71 | 72 | for key, value in data['database'].items(): 73 | this_subset = value['subset'] 74 | if this_subset == subset: 75 | if subset == 'testing': 76 | video_names.append('test/{}'.format(key)) 77 | else: 78 | label = value['annotations']['label'] 79 | video_names.append('{}/{}'.format(label, key)) 80 | annotations.append(value['annotations']) 81 | 82 | return video_names, annotations 83 | 84 | 85 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 86 | sample_duration): 87 | data = load_annotation_data(annotation_path) 88 | video_names, annotations = get_video_names_and_annotations(data, subset) 89 | class_to_idx = get_class_labels(data) 90 | idx_to_class = {} 91 | for name, label in class_to_idx.items(): 92 | idx_to_class[label] = name 93 | 94 | dataset = [] 95 | for i in 
range(len(video_names)): 96 | if i % 1000 == 0: 97 | print('dataset loading [{}/{}]'.format(i, len(video_names))) 98 | 99 | video_path = os.path.join(root_path, video_names[i]) 100 | if not os.path.exists(video_path): 101 | continue 102 | 103 | n_frames_file_path = os.path.join(video_path, 'n_frames') 104 | n_frames = int(load_value_file(n_frames_file_path)) 105 | if n_frames <= 0: 106 | continue 107 | 108 | begin_t = 1 109 | end_t = n_frames 110 | sample = { 111 | 'video': video_path, 112 | 'segment': [begin_t, end_t], 113 | 'n_frames': n_frames, 114 | 'video_id': video_names[i][:-14].split('/')[1] 115 | } 116 | if len(annotations) != 0: 117 | sample['label'] = class_to_idx[annotations[i]['label']] 118 | else: 119 | sample['label'] = -1 120 | 121 | if n_samples_for_each_video == 1: 122 | sample['frame_indices'] = list(range(1, n_frames + 1)) 123 | dataset.append(sample) 124 | else: 125 | if n_samples_for_each_video > 1: 126 | step = max(1, 127 | math.ceil((n_frames - 1 - sample_duration) / 128 | (n_samples_for_each_video - 1))) 129 | else: 130 | step = sample_duration 131 | for j in range(1, n_frames, step): 132 | sample_j = copy.deepcopy(sample) 133 | sample_j['frame_indices'] = list( 134 | range(j, min(n_frames + 1, j + sample_duration))) 135 | dataset.append(sample_j) 136 | 137 | return dataset, idx_to_class 138 | 139 | 140 | class Kinetics(data.Dataset): 141 | """ 142 | Args: 143 | root (string): Root directory path. 144 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 145 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 146 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 147 | and returns a transformed version 148 | target_transform (callable, optional): A function/transform that takes in the 149 | target and transforms it. 150 | loader (callable, optional): A function to load an video given its path and frame indices. 151 | Attributes: 152 | classes (list): List of the class names. 153 | class_to_idx (dict): Dict with items (class_name, class_index). 154 | imgs (list): List of (image path, class_index) tuples 155 | """ 156 | 157 | def __init__(self, 158 | root_path, 159 | annotation_path, 160 | subset, 161 | n_samples_for_each_video=1, 162 | spatial_transform=None, 163 | temporal_transform=None, 164 | target_transform=None, 165 | sample_duration=16, 166 | get_loader=get_default_video_loader): 167 | self.data, self.class_names = make_dataset( 168 | root_path, annotation_path, subset, n_samples_for_each_video, 169 | sample_duration) 170 | 171 | self.spatial_transform = spatial_transform 172 | self.temporal_transform = temporal_transform 173 | self.target_transform = target_transform 174 | self.loader = get_loader() 175 | 176 | def __getitem__(self, index): 177 | """ 178 | Args: 179 | index (int): Index 180 | Returns: 181 | tuple: (image, target) where target is class_index of the target class. 
182 | """ 183 | path = self.data[index]['video'] 184 | 185 | frame_indices = self.data[index]['frame_indices'] 186 | if self.temporal_transform is not None: 187 | frame_indices = self.temporal_transform(frame_indices) 188 | clip = self.loader(path, frame_indices) 189 | if self.spatial_transform is not None: 190 | self.spatial_transform.randomize_parameters() 191 | clip = [self.spatial_transform(img) for img in clip] 192 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 193 | 194 | target = self.data[index] 195 | if self.target_transform is not None: 196 | target = self.target_transform(target) 197 | 198 | return clip, target 199 | 200 | def __len__(self): 201 | return len(self.data) 202 | -------------------------------------------------------------------------------- /3D_experiment/datasets/ucf101.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import json 8 | import copy 9 | 10 | from utils import load_value_file 11 | 12 | 13 | def pil_loader(path): 14 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 15 | with open(path, 'rb') as f: 16 | with Image.open(f) as img: 17 | return img.convert('RGB') 18 | 19 | 20 | def accimage_loader(path): 21 | try: 22 | import accimage 23 | return accimage.Image(path) 24 | except IOError: 25 | # Potentially a decoding problem, fall back to PIL.Image 26 | return pil_loader(path) 27 | 28 | 29 | def get_default_image_loader(): 30 | from torchvision import get_image_backend 31 | if get_image_backend() == 'accimage': 32 | return accimage_loader 33 | else: 34 | return pil_loader 35 | 36 | 37 | def video_loader(video_dir_path, frame_indices, image_loader): 38 | video = [] 39 | for i in frame_indices: 40 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 41 | if os.path.exists(image_path): 42 | video.append(image_loader(image_path)) 43 | else: 44 | return video 45 | 46 | return video 47 | 48 | 49 | def get_default_video_loader(): 50 | image_loader = get_default_image_loader() 51 | return functools.partial(video_loader, image_loader=image_loader) 52 | 53 | 54 | def load_annotation_data(data_file_path): 55 | with open(data_file_path, 'r') as data_file: 56 | return json.load(data_file) 57 | 58 | 59 | def get_class_labels(data): 60 | class_labels_map = {} 61 | index = 0 62 | for class_label in data['labels']: 63 | class_labels_map[class_label] = index 64 | index += 1 65 | return class_labels_map 66 | 67 | 68 | def get_video_names_and_annotations(data, subset): 69 | video_names = [] 70 | annotations = [] 71 | 72 | for key, value in data['database'].items(): 73 | this_subset = value['subset'] 74 | if this_subset == subset: 75 | label = value['annotations']['label'] 76 | video_names.append('{}/{}'.format(label, key)) 77 | annotations.append(value['annotations']) 78 | 79 | return video_names, annotations 80 | 81 | 82 | def make_dataset(root_path, annotation_path, subset, n_samples_for_each_video, 83 | sample_duration): 84 | data = load_annotation_data(annotation_path) 85 | video_names, annotations = get_video_names_and_annotations(data, subset) 86 | class_to_idx = get_class_labels(data) 87 | idx_to_class = {} 88 | for name, label in class_to_idx.items(): 89 | idx_to_class[label] = name 90 | 91 | dataset = [] 92 | for i in range(len(video_names)): 93 | if i % 1000 == 0: 94 | print('dataset loading [{}/{}]'.format(i, 
len(video_names))) 95 | 96 | video_path = os.path.join(root_path, video_names[i]) 97 | if not os.path.exists(video_path): 98 | continue 99 | 100 | n_frames_file_path = os.path.join(video_path, 'n_frames') 101 | n_frames = int(load_value_file(n_frames_file_path)) 102 | if n_frames <= 0: 103 | continue 104 | 105 | begin_t = 1 106 | end_t = n_frames 107 | sample = { 108 | 'video': video_path, 109 | 'segment': [begin_t, end_t], 110 | 'n_frames': n_frames, 111 | 'video_id': video_names[i].split('/')[1] 112 | } 113 | if len(annotations) != 0: 114 | sample['label'] = class_to_idx[annotations[i]['label']] 115 | else: 116 | sample['label'] = -1 117 | 118 | if n_samples_for_each_video == 1: 119 | sample['frame_indices'] = list(range(1, n_frames + 1)) 120 | dataset.append(sample) 121 | else: 122 | if n_samples_for_each_video > 1: 123 | step = max(1, 124 | math.ceil((n_frames - 1 - sample_duration) / 125 | (n_samples_for_each_video - 1))) 126 | else: 127 | step = sample_duration 128 | for j in range(1, n_frames, step): 129 | sample_j = copy.deepcopy(sample) 130 | sample_j['frame_indices'] = list( 131 | range(j, min(n_frames + 1, j + sample_duration))) 132 | dataset.append(sample_j) 133 | 134 | return dataset, idx_to_class 135 | 136 | 137 | class UCF101(data.Dataset): 138 | """ 139 | Args: 140 | root (string): Root directory path. 141 | spatial_transform (callable, optional): A function/transform that takes in an PIL image 142 | and returns a transformed version. E.g, ``transforms.RandomCrop`` 143 | temporal_transform (callable, optional): A function/transform that takes in a list of frame indices 144 | and returns a transformed version 145 | target_transform (callable, optional): A function/transform that takes in the 146 | target and transforms it. 147 | loader (callable, optional): A function to load an video given its path and frame indices. 148 | Attributes: 149 | classes (list): List of the class names. 150 | class_to_idx (dict): Dict with items (class_name, class_index). 151 | imgs (list): List of (image path, class_index) tuples 152 | """ 153 | 154 | def __init__(self, 155 | root_path, 156 | annotation_path, 157 | subset, 158 | n_samples_for_each_video=1, 159 | spatial_transform=None, 160 | temporal_transform=None, 161 | target_transform=None, 162 | sample_duration=16, 163 | get_loader=get_default_video_loader): 164 | self.data, self.class_names = make_dataset( 165 | root_path, annotation_path, subset, n_samples_for_each_video, 166 | sample_duration) 167 | 168 | self.spatial_transform = spatial_transform 169 | self.temporal_transform = temporal_transform 170 | self.target_transform = target_transform 171 | self.loader = get_loader() 172 | 173 | def __getitem__(self, index): 174 | """ 175 | Args: 176 | index (int): Index 177 | Returns: 178 | tuple: (image, target) where target is class_index of the target class. 
179 | """ 180 | path = self.data[index]['video'] 181 | 182 | frame_indices = self.data[index]['frame_indices'] 183 | if self.temporal_transform is not None: 184 | frame_indices = self.temporal_transform(frame_indices) 185 | clip = self.loader(path, frame_indices) 186 | if self.spatial_transform is not None: 187 | self.spatial_transform.randomize_parameters() 188 | clip = [self.spatial_transform(img) for img in clip] 189 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 190 | 191 | target = self.data[index] 192 | if self.target_transform is not None: 193 | target = self.target_transform(target) 194 | 195 | return clip, target 196 | 197 | def __len__(self): 198 | return len(self.data) 199 | -------------------------------------------------------------------------------- /3D_experiment/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import numpy as np 5 | import torch 6 | from torch import nn 7 | from torch import optim 8 | from torch.optim import lr_scheduler 9 | 10 | from opts import parse_opts 11 | from mean import get_mean, get_std 12 | from spatial_transforms import ( 13 | Compose, Normalize, Scale, CenterCrop, CornerCrop, MultiScaleCornerCrop, 14 | MultiScaleRandomCrop, RandomHorizontalFlip, ToTensor) 15 | from temporal_transforms import LoopPadding, TemporalRandomCrop 16 | from target_transforms import ClassLabel, VideoID 17 | from target_transforms import Compose as TargetCompose 18 | from dataset import get_training_set, get_validation_set, get_test_set 19 | from utils import Logger 20 | from train import train_epoch 21 | from validation import val_epoch 22 | import test 23 | from models.resnet3D import resnet3D50 24 | 25 | if __name__ == '__main__': 26 | opt = parse_opts() 27 | if opt.root_path != '': 28 | opt.video_path = os.path.join(opt.root_path, opt.video_path) 29 | opt.annotation_path = os.path.join(opt.root_path, opt.annotation_path) 30 | opt.result_path = os.path.join(opt.root_path, opt.result_path) 31 | if opt.resume_path: 32 | opt.resume_path = os.path.join(opt.root_path, opt.resume_path) 33 | if opt.pretrain_path: 34 | opt.pretrain_path = os.path.join(opt.root_path, opt.pretrain_path) 35 | opt.scales = [opt.initial_scale] 36 | for i in range(1, opt.n_scales): 37 | opt.scales.append(opt.scales[-1] * opt.scale_step) 38 | opt.arch = '{}-{}'.format(opt.model, opt.model_depth) 39 | opt.mean = get_mean(opt.norm_value, dataset=opt.mean_dataset) 40 | opt.std = get_std(opt.norm_value) 41 | print(opt) 42 | with open(os.path.join(opt.result_path, 'opts.json'), 'w') as opt_file: 43 | json.dump(vars(opt), opt_file) 44 | 45 | torch.manual_seed(opt.manual_seed) 46 | 47 | model = resnet3D50(non_local=True) 48 | parameters = model.parameters() 49 | 50 | if not opt.no_cuda: 51 | model = model.cuda() 52 | model = nn.DataParallel(model, device_ids=None) 53 | 54 | print(model) 55 | criterion = nn.CrossEntropyLoss() 56 | if not opt.no_cuda: 57 | criterion = criterion.cuda() 58 | 59 | if opt.no_mean_norm and not opt.std_norm: 60 | norm_method = Normalize([0, 0, 0], [1, 1, 1]) 61 | elif not opt.std_norm: 62 | norm_method = Normalize(opt.mean, [1, 1, 1]) 63 | else: 64 | norm_method = Normalize(opt.mean, opt.std) 65 | 66 | if not opt.no_train: 67 | assert opt.train_crop in ['random', 'corner', 'center'] 68 | if opt.train_crop == 'random': 69 | crop_method = MultiScaleRandomCrop(opt.scales, opt.sample_size) 70 | elif opt.train_crop == 'corner': 71 | crop_method = MultiScaleCornerCrop(opt.scales, 
opt.sample_size) 72 | elif opt.train_crop == 'center': 73 | crop_method = MultiScaleCornerCrop( 74 | opt.scales, opt.sample_size, crop_positions=['c']) 75 | spatial_transform = Compose([ 76 | crop_method, 77 | RandomHorizontalFlip(), 78 | ToTensor(opt.norm_value), norm_method 79 | ]) 80 | temporal_transform = TemporalRandomCrop(opt.sample_duration) 81 | target_transform = ClassLabel() 82 | training_data = get_training_set(opt, spatial_transform, 83 | temporal_transform, target_transform) 84 | train_loader = torch.utils.data.DataLoader( 85 | training_data, 86 | batch_size=opt.batch_size, 87 | shuffle=True, 88 | num_workers=opt.n_threads, 89 | pin_memory=True) 90 | train_logger = Logger( 91 | os.path.join(opt.result_path, 'train.log'), 92 | ['epoch', 'loss', 'acc', 'lr']) 93 | train_batch_logger = Logger( 94 | os.path.join(opt.result_path, 'train_batch.log'), 95 | ['epoch', 'batch', 'iter', 'loss', 'acc', 'lr']) 96 | 97 | if opt.nesterov: 98 | dampening = 0 99 | else: 100 | dampening = opt.dampening 101 | optimizer = optim.SGD( 102 | parameters, 103 | lr=opt.learning_rate, 104 | momentum=opt.momentum, 105 | dampening=dampening, 106 | weight_decay=opt.weight_decay, 107 | nesterov=opt.nesterov) 108 | scheduler = lr_scheduler.ReduceLROnPlateau( 109 | optimizer, 'min', patience=opt.lr_patience) 110 | if not opt.no_val: 111 | spatial_transform = Compose([ 112 | Scale(opt.sample_size), 113 | CenterCrop(opt.sample_size), 114 | ToTensor(opt.norm_value), norm_method 115 | ]) 116 | temporal_transform = LoopPadding(opt.sample_duration) 117 | target_transform = ClassLabel() 118 | validation_data = get_validation_set( 119 | opt, spatial_transform, temporal_transform, target_transform) 120 | val_loader = torch.utils.data.DataLoader( 121 | validation_data, 122 | batch_size=opt.batch_size, 123 | shuffle=False, 124 | num_workers=opt.n_threads, 125 | pin_memory=True) 126 | val_logger = Logger( 127 | os.path.join(opt.result_path, 'val.log'), ['epoch', 'loss', 'acc']) 128 | 129 | if opt.resume_path: 130 | print('loading checkpoint {}'.format(opt.resume_path)) 131 | checkpoint = torch.load(opt.resume_path) 132 | assert opt.arch == checkpoint['arch'] 133 | 134 | opt.begin_epoch = checkpoint['epoch'] 135 | model.load_state_dict(checkpoint['state_dict']) 136 | if not opt.no_train: 137 | optimizer.load_state_dict(checkpoint['optimizer']) 138 | 139 | print('run') 140 | for i in range(opt.begin_epoch, opt.n_epochs + 1): 141 | if not opt.no_train: 142 | train_epoch(i, train_loader, model, criterion, optimizer, opt, 143 | train_logger, train_batch_logger) 144 | if not opt.no_val: 145 | validation_loss = val_epoch(i, val_loader, model, criterion, opt, 146 | val_logger) 147 | 148 | if not opt.no_train and not opt.no_val: 149 | scheduler.step(validation_loss) 150 | 151 | if opt.test: 152 | spatial_transform = Compose([ 153 | Scale(int(opt.sample_size / opt.scale_in_test)), 154 | CornerCrop(opt.sample_size, opt.crop_position_in_test), 155 | ToTensor(opt.norm_value), norm_method 156 | ]) 157 | temporal_transform = LoopPadding(opt.sample_duration) 158 | target_transform = VideoID() 159 | 160 | test_data = get_test_set(opt, spatial_transform, temporal_transform, 161 | target_transform) 162 | test_loader = torch.utils.data.DataLoader( 163 | test_data, 164 | batch_size=opt.batch_size, 165 | shuffle=False, 166 | num_workers=opt.n_threads, 167 | pin_memory=True) 168 | test.test(test_loader, model, opt, test_data.class_names) 169 | -------------------------------------------------------------------------------- 
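Before the dump of `mean.py` below, here is a brief usage sketch (not part of the repository) showing how its per-channel statistics are consumed by `main.py`: `get_mean`/`get_std` return ImageNet-style channel statistics already divided by `norm_value`, and `main.py` passes the results to the `Normalize` transform from `spatial_transforms.py`. The `norm_value = 255` below is an assumption taken from `get_mean`'s default; in `main.py` it comes from `opt.norm_value`.

```python
# Minimal sketch (not part of the repo): building the normalization transform
# the same way main.py does, from the statistics defined in mean.py.
from mean import get_mean, get_std
from spatial_transforms import Normalize

norm_value = 255                                 # assumed default; main.py uses opt.norm_value
mean = get_mean(norm_value, dataset='kinetics')  # per-channel means / norm_value
std = get_std(norm_value)                        # per-channel stds / norm_value
norm_method = Normalize(mean, std)               # same call as in main.py when std_norm is enabled
```

Note that `main.py` falls back to `Normalize([0, 0, 0], [1, 1, 1])` or `Normalize(opt.mean, [1, 1, 1])` when `--no_mean_norm` or the std-normalization option is not set.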
/3D_experiment/mean.py: -------------------------------------------------------------------------------- 1 | def get_mean(norm_value=255, dataset='activitynet'): 2 | assert dataset in ['activitynet', 'kinetics'] 3 | 4 | if dataset == 'activitynet': 5 | return [ 6 | 114.7748 / norm_value, 107.7354 / norm_value, 99.4750 / norm_value 7 | ] 8 | elif dataset == 'kinetics': 9 | # Kinetics (10 videos for each class) 10 | return [ 11 | 110.63666788 / norm_value, 103.16065604 / norm_value, 12 | 96.29023126 / norm_value 13 | ] 14 | 15 | 16 | def get_std(norm_value=255): 17 | # Kinetics (10 videos for each class) 18 | return [ 19 | 38.7568578 / norm_value, 37.88248729 / norm_value, 20 | 40.02898126 / norm_value 21 | ] 22 | -------------------------------------------------------------------------------- /3D_experiment/models/non_local.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | 6 | class NLBlockND(nn.Module): 7 | def __init__(self, in_channels, inter_channels=None, mode='embedded', 8 | dimension=3, bn_layer=True): 9 | """Implementation of Non-Local Block with 4 different pairwise functions 10 | args: 11 | in_channels: original channel size (1024 in the paper) 12 | inter_channels: channel size inside the block if not specifed reduced to half (512 in the paper) 13 | mode: supports Gaussian, Embedded Gaussian, Dot Product, and Concatenation 14 | dimension: can be 1 (temporal), 2 (spatial), 3 (spatiotemporal) 15 | bn_layer: whether to add batch norm 16 | """ 17 | super(NLBlockND, self).__init__() 18 | 19 | assert dimension in [1, 2, 3] 20 | 21 | if mode not in ['gaussian', 'embedded', 'dot', 'concatenate']: 22 | raise ValueError('`mode` must be one of `gaussian`, `embedded`, `dot` or `concatenate`') 23 | 24 | self.mode = mode 25 | self.dimension = dimension 26 | 27 | self.in_channels = in_channels 28 | self.inter_channels = inter_channels 29 | 30 | # the channel size is reduced to half inside the block 31 | if self.inter_channels is None: 32 | self.inter_channels = in_channels // 2 33 | if self.inter_channels == 0: 34 | self.inter_channels = 1 35 | 36 | # assign appropriate convolutional, max pool, and batch norm layers for different dimensions 37 | if dimension == 3: 38 | conv_nd = nn.Conv3d 39 | max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2)) 40 | bn = nn.BatchNorm3d 41 | elif dimension == 2: 42 | conv_nd = nn.Conv2d 43 | max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2)) 44 | bn = nn.BatchNorm2d 45 | else: 46 | conv_nd = nn.Conv1d 47 | max_pool_layer = nn.MaxPool1d(kernel_size=(2)) 48 | bn = nn.BatchNorm1d 49 | 50 | # function g in the paper which goes through conv. 
with kernel size 1
51 |         self.g = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, kernel_size=1)
52 | 
53 |         # add BatchNorm layer after the last conv layer
54 |         if bn_layer:
55 |             self.W_z = nn.Sequential(
56 |                 conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, kernel_size=1),
57 |                 bn(self.in_channels)
58 |             )
59 |             nn.init.constant_(self.W_z[1].weight, 0)
60 |             nn.init.constant_(self.W_z[1].bias, 0)
61 |         else:
62 |             self.W_z = conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, kernel_size=1)
63 |             nn.init.constant_(self.W_z.weight, 0)
64 |             nn.init.constant_(self.W_z.bias, 0)
65 | 
66 |         # define theta and phi for all operations except gaussian
67 |         if self.mode == "embedded" or self.mode == "dot" or self.mode == "concatenate":
68 |             self.theta = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, kernel_size=1)
69 |             self.phi = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, kernel_size=1)
70 | 
71 |         if self.mode == "concatenate":
72 |             self.W_f = nn.Sequential(
73 |                 nn.Conv2d(in_channels=self.inter_channels * 2, out_channels=1, kernel_size=1),
74 |                 nn.ReLU()
75 |             )
76 | 
77 |     def forward(self, x):
78 |         """
79 |         args
80 |             x: (N, C, T, H, W) for dimension=3; (N, C, H, W) for dimension 2; (N, C, T) for dimension 1
81 |         """
82 | 
83 |         batch_size = x.size(0)
84 | 
85 |         # (N, C, THW)
86 |         g_x = self.g(x).view(batch_size, self.inter_channels, -1)
87 |         g_x = g_x.permute(0, 2, 1)
88 | 
89 |         if self.mode == "gaussian":
90 |             theta_x = x.view(batch_size, self.in_channels, -1)
91 |             phi_x = x.view(batch_size, self.in_channels, -1)
92 |             theta_x = theta_x.permute(0, 2, 1)
93 |             f = torch.matmul(theta_x, phi_x)
94 | 
95 |         elif self.mode == "embedded" or self.mode == "dot":
96 |             theta_x = self.theta(x).view(batch_size, self.inter_channels, -1)
97 |             phi_x = self.phi(x).view(batch_size, self.inter_channels, -1)
98 |             theta_x = theta_x.permute(0, 2, 1)
99 |             f = torch.matmul(theta_x, phi_x)
100 | 
101 |         elif self.mode == "concatenate":
102 |             theta_x = self.theta(x).view(batch_size, self.inter_channels, -1, 1)
103 |             phi_x = self.phi(x).view(batch_size, self.inter_channels, 1, -1)
104 | 
105 |             h = theta_x.size(2)
106 |             w = phi_x.size(3)
107 |             theta_x = theta_x.repeat(1, 1, 1, w)
108 |             phi_x = phi_x.repeat(1, 1, h, 1)
109 | 
110 |             concat = torch.cat([theta_x, phi_x], dim=1)
111 |             f = self.W_f(concat)
112 |             f = f.view(f.size(0), f.size(2), f.size(3))
113 | 
114 |         if self.mode == "gaussian" or self.mode == "embedded":
115 |             f_div_C = F.softmax(f, dim=-1)
116 |         elif self.mode == "dot" or self.mode == "concatenate":
117 |             N = f.size(-1)  # number of positions in x
118 |             f_div_C = f / N
119 | 
120 |         y = torch.matmul(f_div_C, g_x)
121 | 
122 |         # contiguous here just allocates a contiguous chunk of memory
123 |         y = y.permute(0, 2, 1).contiguous()
124 |         y = y.view(batch_size, self.inter_channels, *x.size()[2:])
125 | 
126 |         W_y = self.W_z(y)
127 |         # residual connection
128 |         z = W_y + x
129 | 
130 |         return z
131 | 
132 | 
133 | if __name__ == '__main__':
134 |     import torch
135 | 
136 |     for bn_layer in [True, False]:
137 |         img = torch.zeros(2, 3, 20)
138 |         net = NLBlockND(in_channels=3, mode='concatenate', dimension=1, bn_layer=bn_layer)
139 |         out = net(img)
140 |         print(out.size())
141 | 
142 |         img = torch.zeros(2, 3, 20, 20)
143 |         net = NLBlockND(in_channels=3, mode='concatenate', dimension=2, bn_layer=bn_layer)
144 |         out = net(img)
145 |         print(out.size())
146 | 
147 |         img = torch.randn(2, 3, 8, 20, 20)
148 |         net = NLBlockND(in_channels=3,
mode='concatenate', dimension=3, bn_layer=bn_layer) 149 | out = net(img) 150 | print(out.size()) 151 | 152 | 153 | -------------------------------------------------------------------------------- /3D_experiment/models/resnet3D.py: -------------------------------------------------------------------------------- 1 | """ 2 | ResNet50 (C2D) for spatiotemporal task. Only ResNet50 backbone structure was implemented here. 3 | """ 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import math 9 | from functools import partial 10 | from models.non_local import NLBlockND 11 | 12 | 13 | class Bottleneck(nn.Module): 14 | """ 15 | Bottleneck block structure used in ResNet 50. 16 | As mentioned in Section 4. 2D ConvNet baseline (C2D), 17 | all convolutions are in essence 2D kernels that prcoess the input frame-by-frame 18 | (implemented as (1 x k x k) kernels). 19 | """ 20 | expansion = 4 21 | 22 | def __init__(self, inplanes, planes, stride=1, padding=(0, 1, 1), downsample=None): 23 | super(Bottleneck, self).__init__() 24 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=(1, 1, 1), bias=False) 25 | self.bn1 = nn.BatchNorm3d(planes) 26 | self.conv2 = nn.Conv3d(planes, planes, kernel_size=(1, 3, 3), stride=stride, padding=padding, bias=False) 27 | self.bn2 = nn.BatchNorm3d(planes) 28 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=(1, 1, 1), bias=False) 29 | self.bn3 = nn.BatchNorm3d(planes * 4) 30 | self.relu = nn.ReLU(inplace=True) 31 | self.downsample = downsample 32 | self.stride = stride 33 | 34 | def forward(self, x): 35 | identity = x 36 | 37 | out = self.conv1(x) 38 | out = self.bn1(out) 39 | out = self.relu(out) 40 | 41 | out = self.conv2(out) 42 | out = self.bn2(out) 43 | out = self.relu(out) 44 | 45 | out = self.conv3(out) 46 | out = self.bn3(out) 47 | 48 | if self.downsample is not None: 49 | identity = self.downsample(x) 50 | 51 | out += identity 52 | out = self.relu(out) 53 | 54 | return out 55 | 56 | 57 | class ResNet3D(nn.Module): 58 | """C2D with ResNet 50 backbone. 59 | The only operation involving the temporal domain are the pooling layer after the second residual block. 60 | For more details of the structure, refer to Table 1 from the paper. 61 | Padding was added accordingly to match the correct dimensionality. 
62 | """ 63 | def __init__(self, block, layers, num_classes=400, non_local=False): 64 | self.inplanes = 64 65 | super(ResNet3D, self).__init__() 66 | 67 | # first convolution operation has essentially 2D kernels 68 | # output: 64 x 16 x 112 x 112 69 | self.conv1 = nn.Conv3d(3, 64, kernel_size=(1, 7, 7), stride=2, padding=(0, 3, 3), bias=False) 70 | self.bn1 = nn.BatchNorm3d(64) 71 | self.relu = nn.ReLU(inplace=True) 72 | 73 | # output: 64 x 8 x 56 x 56 74 | self.pool1 = nn.MaxPool3d(kernel_size=3, stride=2) 75 | 76 | # output: 256 x 8 x 56 x 56 77 | self.layer1 = self._make_layer(block, 64, layers[0], stride=1, d_padding=0) 78 | 79 | # pooling on temporal domain 80 | # output: 256 x 4 x 56 x 56 81 | self.pool_t = nn.MaxPool3d(kernel_size=(3, 1, 1), stride=(2, 1, 1)) 82 | 83 | # output: 512 x 4 x 28 x 28 84 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2, padding=(2, 1, 1)) 85 | 86 | # add one non-local block here 87 | # output: 1024 x 4 x 14 x 14 88 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2, padding=(2, 1, 1), non_local=non_local) 89 | 90 | # output: 2048 x 4 x 7 x 7 91 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2, padding=(2, 1, 1)) 92 | 93 | # output: 2048 x 1 94 | self.avgpool = nn.AvgPool3d(kernel_size=(4, 7, 7)) 95 | self.fc = nn.Linear(512 * block.expansion, num_classes) 96 | 97 | for m in self.modules(): 98 | if isinstance(m, nn.Conv3d): 99 | m.weight = nn.init.kaiming_normal_(m.weight, mode='fan_out') 100 | elif isinstance(m, nn.BatchNorm3d): 101 | m.weight.data.fill_(1) 102 | m.bias.data.zero_() 103 | 104 | def _make_layer(self, block, planes, blocks, stride=1, padding=(0, 1, 1), d_padding=(2, 0, 0), non_local=False): 105 | downsample = nn.Sequential( 106 | nn.Conv3d(self.inplanes, planes * block.expansion, 107 | kernel_size=1, stride=stride, padding=d_padding, bias=False), 108 | nn.BatchNorm3d(planes * block.expansion) 109 | ) 110 | 111 | layers = [] 112 | layers.append(block(self.inplanes, planes, stride, padding, downsample)) 113 | self.inplanes = planes * block.expansion 114 | 115 | last_idx = blocks 116 | if non_local: 117 | last_idx = blocks - 1 118 | 119 | for i in range(1, last_idx): 120 | layers.append(block(self.inplanes, planes)) 121 | 122 | # add non-local block here 123 | if non_local: 124 | layers.append(NLBlockND(in_channels=1024, dimension=3)) 125 | layers.append(block(self.inplanes, planes)) 126 | 127 | return nn.Sequential(*layers) 128 | 129 | def forward(self, x): 130 | x = self.conv1(x) 131 | x = self.bn1(x) 132 | x = self.relu(x) 133 | x = self.pool1(x) 134 | 135 | x = self.layer1(x) 136 | x = self.pool_t(x) 137 | x = self.layer2(x) 138 | x = self.layer3(x) 139 | x = self.layer4(x) 140 | 141 | x = self.avgpool(x) 142 | 143 | x = x.view(x.size(0), -1) 144 | x = self.fc(x) 145 | 146 | return x 147 | 148 | 149 | def resnet3D50(non_local=False, **kwargs): 150 | """Constructs a C2D ResNet-50 model. 
151 | """ 152 | model = ResNet3D(Bottleneck, [3, 4, 6, 3], non_local=non_local, **kwargs) 153 | return model 154 | 155 | 156 | 157 | if __name__=='__main__': 158 | # Test case of 32 frames (224 x 224 x 3) input of batch size 1 159 | img = Variable(torch.randn(1, 3, 32, 224, 224)) 160 | net = resnet3D50(non_local=True) 161 | count = 0 162 | for name, param in net.named_parameters(): 163 | if param.requires_grad: 164 | count += 1 165 | print(name) 166 | print (count) 167 | out = net(img) 168 | print(out.size()) 169 | -------------------------------------------------------------------------------- /3D_experiment/opts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def parse_opts(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument( 7 | '--root_path', 8 | default='/root/data/ActivityNet', 9 | type=str, 10 | help='Root directory path of data') 11 | parser.add_argument( 12 | '--video_path', 13 | default='video_kinetics_jpg', 14 | type=str, 15 | help='Directory path of Videos') 16 | parser.add_argument( 17 | '--annotation_path', 18 | default='kinetics.json', 19 | type=str, 20 | help='Annotation file path') 21 | parser.add_argument( 22 | '--result_path', 23 | default='results', 24 | type=str, 25 | help='Result directory path') 26 | parser.add_argument( 27 | '--dataset', 28 | default='kinetics', 29 | type=str, 30 | help='Used dataset (activitynet | kinetics | ucf101 | hmdb51)') 31 | parser.add_argument( 32 | '--n_classes', 33 | default=400, 34 | type=int, 35 | help= 36 | 'Number of classes (activitynet: 200, kinetics: 400, ucf101: 101, hmdb51: 51)' 37 | ) 38 | parser.add_argument( 39 | '--n_finetune_classes', 40 | default=400, 41 | type=int, 42 | help= 43 | 'Number of classes for fine-tuning. n_classes is set to the number when pretraining.' 44 | ) 45 | parser.add_argument( 46 | '--sample_size', 47 | default=112, 48 | type=int, 49 | help='Height and width of inputs') 50 | parser.add_argument( 51 | '--sample_duration', 52 | default=16, 53 | type=int, 54 | help='Temporal duration of inputs') 55 | parser.add_argument( 56 | '--initial_scale', 57 | default=1.0, 58 | type=float, 59 | help='Initial scale for multiscale cropping') 60 | parser.add_argument( 61 | '--n_scales', 62 | default=5, 63 | type=int, 64 | help='Number of scales for multiscale cropping') 65 | parser.add_argument( 66 | '--scale_step', 67 | default=0.84089641525, 68 | type=float, 69 | help='Scale step for multiscale cropping') 70 | parser.add_argument( 71 | '--train_crop', 72 | default='corner', 73 | type=str, 74 | help= 75 | 'Spatial cropping method in training. random is uniform. corner is selection from 4 corners and 1 center. 
(random | corner | center)' 76 | ) 77 | parser.add_argument( 78 | '--learning_rate', 79 | default=0.1, 80 | type=float, 81 | help= 82 | 'Initial learning rate (divided by 10 while training by lr scheduler)') 83 | parser.add_argument('--momentum', default=0.9, type=float, help='Momentum') 84 | parser.add_argument( 85 | '--dampening', default=0.9, type=float, help='dampening of SGD') 86 | parser.add_argument( 87 | '--weight_decay', default=1e-3, type=float, help='Weight Decay') 88 | parser.add_argument( 89 | '--mean_dataset', 90 | default='activitynet', 91 | type=str, 92 | help= 93 | 'dataset for mean values of mean subtraction (activitynet | kinetics)') 94 | parser.add_argument( 95 | '--no_mean_norm', 96 | action='store_true', 97 | help='If true, inputs are not normalized by mean.') 98 | parser.set_defaults(no_mean_norm=False) 99 | parser.add_argument( 100 | '--std_norm', 101 | action='store_true', 102 | help='If true, inputs are normalized by standard deviation.') 103 | parser.set_defaults(std_norm=False) 104 | parser.add_argument( 105 | '--nesterov', action='store_true', help='Nesterov momentum') 106 | parser.set_defaults(nesterov=False) 107 | parser.add_argument( 108 | '--optimizer', 109 | default='sgd', 110 | type=str, 111 | help='Currently only support SGD') 112 | parser.add_argument( 113 | '--lr_patience', 114 | default=10, 115 | type=int, 116 | help='Patience of LR scheduler. See documentation of ReduceLROnPlateau.' 117 | ) 118 | parser.add_argument( 119 | '--batch_size', default=128, type=int, help='Batch Size') 120 | parser.add_argument( 121 | '--n_epochs', 122 | default=200, 123 | type=int, 124 | help='Number of total epochs to run') 125 | parser.add_argument( 126 | '--begin_epoch', 127 | default=1, 128 | type=int, 129 | help= 130 | 'Training begins at this epoch. Previous trained model indicated by resume_path is loaded.' 
131 | ) 132 | parser.add_argument( 133 | '--n_val_samples', 134 | default=3, 135 | type=int, 136 | help='Number of validation samples for each activity') 137 | parser.add_argument( 138 | '--resume_path', 139 | default='', 140 | type=str, 141 | help='Save data (.pth) of previous training') 142 | parser.add_argument( 143 | '--pretrain_path', default='', type=str, help='Pretrained model (.pth)') 144 | parser.add_argument( 145 | '--ft_begin_index', 146 | default=0, 147 | type=int, 148 | help='Begin block index of fine-tuning') 149 | parser.add_argument( 150 | '--no_train', 151 | action='store_true', 152 | help='If true, training is not performed.') 153 | parser.set_defaults(no_train=False) 154 | parser.add_argument( 155 | '--no_val', 156 | action='store_true', 157 | help='If true, validation is not performed.') 158 | parser.set_defaults(no_val=False) 159 | parser.add_argument( 160 | '--test', action='store_true', help='If true, test is performed.') 161 | parser.set_defaults(test=False) 162 | parser.add_argument( 163 | '--test_subset', 164 | default='val', 165 | type=str, 166 | help='Used subset in test (val | test)') 167 | parser.add_argument( 168 | '--scale_in_test', 169 | default=1.0, 170 | type=float, 171 | help='Spatial scale in test') 172 | parser.add_argument( 173 | '--crop_position_in_test', 174 | default='c', 175 | type=str, 176 | help='Cropping method (c | tl | tr | bl | br) in test') 177 | parser.add_argument( 178 | '--no_softmax_in_test', 179 | action='store_true', 180 | help='If true, output for each clip is not normalized using softmax.') 181 | parser.set_defaults(no_softmax_in_test=False) 182 | parser.add_argument( 183 | '--no_cuda', action='store_true', help='If true, cuda is not used.') 184 | parser.set_defaults(no_cuda=False) 185 | parser.add_argument( 186 | '--n_threads', 187 | default=4, 188 | type=int, 189 | help='Number of threads for multi-thread loading') 190 | parser.add_argument( 191 | '--checkpoint', 192 | default=10, 193 | type=int, 194 | help='Trained model is saved at every this epochs.') 195 | parser.add_argument( 196 | '--no_hflip', 197 | action='store_true', 198 | help='If true holizontal flipping is not performed.') 199 | parser.set_defaults(no_hflip=False) 200 | parser.add_argument( 201 | '--norm_value', 202 | default=1, 203 | type=int, 204 | help= 205 | 'If 1, range of inputs is [0-255]. 
If 255, range of inputs is [0-1].') 206 | parser.add_argument( 207 | '--model', 208 | default='resnet', 209 | type=str, 210 | help='(resnet | resnet_nl | preresnet | wideresnet | resnext | densenet | ') 211 | parser.add_argument( 212 | '--model_depth', 213 | default=18, 214 | type=int, 215 | help='Depth of resnet (10 | 18 | 34 | 50 | 101)') 216 | parser.add_argument( 217 | '--resnet_shortcut', 218 | default='B', 219 | type=str, 220 | help='Shortcut type of resnet (A | B)') 221 | parser.add_argument( 222 | '--wide_resnet_k', default=2, type=int, help='Wide resnet k') 223 | parser.add_argument( 224 | '--resnext_cardinality', 225 | default=32, 226 | type=int, 227 | help='ResNeXt cardinality') 228 | parser.add_argument( 229 | '--manual_seed', default=1, type=int, help='Manually set random seed') 230 | 231 | args = parser.parse_args() 232 | 233 | return args 234 | -------------------------------------------------------------------------------- /3D_experiment/run.sh: -------------------------------------------------------------------------------- 1 | python main.py --sample_size 224 --root_path ./data --video_path hmdb51/jpg --annotation_path hmdb51_1.json --result_path results --dataset hmdb51 --model resnet --model_depth 50 --n_classes 51 --batch_size 32 --n_threads 4 --checkpoint 5 2>&1 | tee output_hmdb.txt 2 | -------------------------------------------------------------------------------- /3D_experiment/spatial_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | import numbers 4 | import collections 5 | import numpy as np 6 | import torch 7 | from PIL import Image, ImageOps 8 | try: 9 | import accimage 10 | except ImportError: 11 | accimage = None 12 | 13 | 14 | class Compose(object): 15 | """Composes several transforms together. 16 | Args: 17 | transforms (list of ``Transform`` objects): list of transforms to compose. 18 | Example: 19 | >>> transforms.Compose([ 20 | >>> transforms.CenterCrop(10), 21 | >>> transforms.ToTensor(), 22 | >>> ]) 23 | """ 24 | 25 | def __init__(self, transforms): 26 | self.transforms = transforms 27 | 28 | def __call__(self, img): 29 | for t in self.transforms: 30 | img = t(img) 31 | return img 32 | 33 | def randomize_parameters(self): 34 | for t in self.transforms: 35 | t.randomize_parameters() 36 | 37 | 38 | class ToTensor(object): 39 | """Convert a ``PIL.Image`` or ``numpy.ndarray`` to tensor. 40 | Converts a PIL.Image or numpy.ndarray (H x W x C) in the range 41 | [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0]. 42 | """ 43 | 44 | def __init__(self, norm_value=255): 45 | self.norm_value = norm_value 46 | 47 | def __call__(self, pic): 48 | """ 49 | Args: 50 | pic (PIL.Image or numpy.ndarray): Image to be converted to tensor. 51 | Returns: 52 | Tensor: Converted image. 
53 | """ 54 | if isinstance(pic, np.ndarray): 55 | # handle numpy array 56 | img = torch.from_numpy(pic.transpose((2, 0, 1))) 57 | # backward compatibility 58 | return img.float().div(self.norm_value) 59 | 60 | if accimage is not None and isinstance(pic, accimage.Image): 61 | nppic = np.zeros( 62 | [pic.channels, pic.height, pic.width], dtype=np.float32) 63 | pic.copyto(nppic) 64 | return torch.from_numpy(nppic) 65 | 66 | # handle PIL Image 67 | if pic.mode == 'I': 68 | img = torch.from_numpy(np.array(pic, np.int32, copy=False)) 69 | elif pic.mode == 'I;16': 70 | img = torch.from_numpy(np.array(pic, np.int16, copy=False)) 71 | else: 72 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes())) 73 | # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK 74 | if pic.mode == 'YCbCr': 75 | nchannel = 3 76 | elif pic.mode == 'I;16': 77 | nchannel = 1 78 | else: 79 | nchannel = len(pic.mode) 80 | img = img.view(pic.size[1], pic.size[0], nchannel) 81 | # put it from HWC to CHW format 82 | # yikes, this transpose takes 80% of the loading time/CPU 83 | img = img.transpose(0, 1).transpose(0, 2).contiguous() 84 | if isinstance(img, torch.ByteTensor): 85 | return img.float().div(self.norm_value) 86 | else: 87 | return img 88 | 89 | def randomize_parameters(self): 90 | pass 91 | 92 | 93 | class Normalize(object): 94 | """Normalize an tensor image with mean and standard deviation. 95 | Given mean: (R, G, B) and std: (R, G, B), 96 | will normalize each channel of the torch.*Tensor, i.e. 97 | channel = (channel - mean) / std 98 | Args: 99 | mean (sequence): Sequence of means for R, G, B channels respecitvely. 100 | std (sequence): Sequence of standard deviations for R, G, B channels 101 | respecitvely. 102 | """ 103 | 104 | def __init__(self, mean, std): 105 | self.mean = mean 106 | self.std = std 107 | 108 | def __call__(self, tensor): 109 | """ 110 | Args: 111 | tensor (Tensor): Tensor image of size (C, H, W) to be normalized. 112 | Returns: 113 | Tensor: Normalized image. 114 | """ 115 | # TODO: make efficient 116 | for t, m, s in zip(tensor, self.mean, self.std): 117 | t.sub_(m).div_(s) 118 | return tensor 119 | 120 | def randomize_parameters(self): 121 | pass 122 | 123 | 124 | class Scale(object): 125 | """Rescale the input PIL.Image to the given size. 126 | Args: 127 | size (sequence or int): Desired output size. If size is a sequence like 128 | (w, h), output size will be matched to this. If size is an int, 129 | smaller edge of the image will be matched to this number. 130 | i.e, if height > width, then image will be rescaled to 131 | (size * height / width, size) 132 | interpolation (int, optional): Desired interpolation. Default is 133 | ``PIL.Image.BILINEAR`` 134 | """ 135 | 136 | def __init__(self, size, interpolation=Image.BILINEAR): 137 | assert isinstance(size, 138 | int) or (isinstance(size, collections.Iterable) and 139 | len(size) == 2) 140 | self.size = size 141 | self.interpolation = interpolation 142 | 143 | def __call__(self, img): 144 | """ 145 | Args: 146 | img (PIL.Image): Image to be scaled. 147 | Returns: 148 | PIL.Image: Rescaled image. 
149 | """ 150 | if isinstance(self.size, int): 151 | w, h = img.size 152 | if (w <= h and w == self.size) or (h <= w and h == self.size): 153 | return img 154 | if w < h: 155 | ow = self.size 156 | oh = int(self.size * h / w) 157 | return img.resize((ow, oh), self.interpolation) 158 | else: 159 | oh = self.size 160 | ow = int(self.size * w / h) 161 | return img.resize((ow, oh), self.interpolation) 162 | else: 163 | return img.resize(self.size, self.interpolation) 164 | 165 | def randomize_parameters(self): 166 | pass 167 | 168 | 169 | class CenterCrop(object): 170 | """Crops the given PIL.Image at the center. 171 | Args: 172 | size (sequence or int): Desired output size of the crop. If size is an 173 | int instead of sequence like (h, w), a square crop (size, size) is 174 | made. 175 | """ 176 | 177 | def __init__(self, size): 178 | if isinstance(size, numbers.Number): 179 | self.size = (int(size), int(size)) 180 | else: 181 | self.size = size 182 | 183 | def __call__(self, img): 184 | """ 185 | Args: 186 | img (PIL.Image): Image to be cropped. 187 | Returns: 188 | PIL.Image: Cropped image. 189 | """ 190 | w, h = img.size 191 | th, tw = self.size 192 | x1 = int(round((w - tw) / 2.)) 193 | y1 = int(round((h - th) / 2.)) 194 | return img.crop((x1, y1, x1 + tw, y1 + th)) 195 | 196 | def randomize_parameters(self): 197 | pass 198 | 199 | 200 | class CornerCrop(object): 201 | 202 | def __init__(self, size, crop_position=None): 203 | self.size = size 204 | if crop_position is None: 205 | self.randomize = True 206 | else: 207 | self.randomize = False 208 | self.crop_position = crop_position 209 | self.crop_positions = ['c', 'tl', 'tr', 'bl', 'br'] 210 | 211 | def __call__(self, img): 212 | image_width = img.size[0] 213 | image_height = img.size[1] 214 | 215 | if self.crop_position == 'c': 216 | th, tw = (self.size, self.size) 217 | x1 = int(round((image_width - tw) / 2.)) 218 | y1 = int(round((image_height - th) / 2.)) 219 | x2 = x1 + tw 220 | y2 = y1 + th 221 | elif self.crop_position == 'tl': 222 | x1 = 0 223 | y1 = 0 224 | x2 = self.size 225 | y2 = self.size 226 | elif self.crop_position == 'tr': 227 | x1 = image_width - self.size 228 | y1 = 0 229 | x2 = image_width 230 | y2 = self.size 231 | elif self.crop_position == 'bl': 232 | x1 = 0 233 | y1 = image_height - self.size 234 | x2 = self.size 235 | y2 = image_height 236 | elif self.crop_position == 'br': 237 | x1 = image_width - self.size 238 | y1 = image_height - self.size 239 | x2 = image_width 240 | y2 = image_height 241 | 242 | img = img.crop((x1, y1, x2, y2)) 243 | 244 | return img 245 | 246 | def randomize_parameters(self): 247 | if self.randomize: 248 | self.crop_position = self.crop_positions[random.randint( 249 | 0, 250 | len(self.crop_positions) - 1)] 251 | 252 | 253 | class RandomHorizontalFlip(object): 254 | """Horizontally flip the given PIL.Image randomly with a probability of 0.5.""" 255 | 256 | def __call__(self, img): 257 | """ 258 | Args: 259 | img (PIL.Image): Image to be flipped. 260 | Returns: 261 | PIL.Image: Randomly flipped image. 262 | """ 263 | if self.p < 0.5: 264 | return img.transpose(Image.FLIP_LEFT_RIGHT) 265 | return img 266 | 267 | def randomize_parameters(self): 268 | self.p = random.random() 269 | 270 | 271 | class MultiScaleCornerCrop(object): 272 | """Crop the given PIL.Image to randomly selected size. 273 | A crop of size is selected from scales of the original size. 274 | A position of cropping is randomly selected from 4 corners and 1 center. 275 | This crop is finally resized to given size. 
276 | Args: 277 | scales: cropping scales of the original size 278 | size: size of the smaller edge 279 | interpolation: Default: PIL.Image.BILINEAR 280 | """ 281 | 282 | def __init__(self, 283 | scales, 284 | size, 285 | interpolation=Image.BILINEAR, 286 | crop_positions=['c', 'tl', 'tr', 'bl', 'br']): 287 | self.scales = scales 288 | self.size = size 289 | self.interpolation = interpolation 290 | 291 | self.crop_positions = crop_positions 292 | 293 | def __call__(self, img): 294 | min_length = min(img.size[0], img.size[1]) 295 | crop_size = int(min_length * self.scale) 296 | 297 | image_width = img.size[0] 298 | image_height = img.size[1] 299 | 300 | if self.crop_position == 'c': 301 | center_x = image_width // 2 302 | center_y = image_height // 2 303 | box_half = crop_size // 2 304 | x1 = center_x - box_half 305 | y1 = center_y - box_half 306 | x2 = center_x + box_half 307 | y2 = center_y + box_half 308 | elif self.crop_position == 'tl': 309 | x1 = 0 310 | y1 = 0 311 | x2 = crop_size 312 | y2 = crop_size 313 | elif self.crop_position == 'tr': 314 | x1 = image_width - crop_size 315 | y1 = 0 316 | x2 = image_width 317 | y2 = crop_size 318 | elif self.crop_position == 'bl': 319 | x1 = 0 320 | y1 = image_height - crop_size 321 | x2 = crop_size 322 | y2 = image_height 323 | elif self.crop_position == 'br': 324 | x1 = image_width - crop_size 325 | y1 = image_height - crop_size 326 | x2 = image_width 327 | y2 = image_height 328 | 329 | img = img.crop((x1, y1, x2, y2)) 330 | 331 | return img.resize((self.size, self.size), self.interpolation) 332 | 333 | def randomize_parameters(self): 334 | self.scale = self.scales[random.randint(0, len(self.scales) - 1)] 335 | self.crop_position = self.crop_positions[random.randint( 336 | 0, 337 | len(self.crop_positions) - 1)] 338 | 339 | 340 | class MultiScaleRandomCrop(object): 341 | 342 | def __init__(self, scales, size, interpolation=Image.BILINEAR): 343 | self.scales = scales 344 | self.size = size 345 | self.interpolation = interpolation 346 | 347 | def __call__(self, img): 348 | min_length = min(img.size[0], img.size[1]) 349 | crop_size = int(min_length * self.scale) 350 | 351 | image_width = img.size[0] 352 | image_height = img.size[1] 353 | 354 | x1 = self.tl_x * (image_width - crop_size) 355 | y1 = self.tl_y * (image_height - crop_size) 356 | x2 = x1 + crop_size 357 | y2 = y1 + crop_size 358 | 359 | img = img.crop((x1, y1, x2, y2)) 360 | 361 | return img.resize((self.size, self.size), self.interpolation) 362 | 363 | def randomize_parameters(self): 364 | self.scale = self.scales[random.randint(0, len(self.scales) - 1)] 365 | self.tl_x = random.random() 366 | self.tl_y = random.random() 367 | -------------------------------------------------------------------------------- /3D_experiment/target_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | 4 | 5 | class Compose(object): 6 | 7 | def __init__(self, transforms): 8 | self.transforms = transforms 9 | 10 | def __call__(self, target): 11 | dst = [] 12 | for t in self.transforms: 13 | dst.append(t(target)) 14 | return dst 15 | 16 | 17 | class ClassLabel(object): 18 | 19 | def __call__(self, target): 20 | return target['label'] 21 | 22 | 23 | class VideoID(object): 24 | 25 | def __call__(self, target): 26 | return target['video_id'] 27 | -------------------------------------------------------------------------------- /3D_experiment/temporal_transforms.py: 
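Before the temporal transforms listed next, here is a minimal sketch of how the spatial, target and temporal transform classes in these three files can be composed for a single clip; the module paths match the file names above, while the scale list, crop size, clip length, normalization statistics and the dummy frame/annotation are illustrative values, not ones taken from this repository.

from PIL import Image
from spatial_transforms import Compose, MultiScaleCornerCrop, RandomHorizontalFlip, ToTensor, Normalize
from temporal_transforms import TemporalRandomCrop
from target_transforms import ClassLabel

spatial_transform = Compose([
    MultiScaleCornerCrop([1.0, 0.84, 0.71], 112),  # illustrative scales and output size
    RandomHorizontalFlip(),
    ToTensor(norm_value=255),
    Normalize(mean=[0.45, 0.42, 0.39], std=[1.0, 1.0, 1.0]),  # illustrative statistics
])
temporal_transform = TemporalRandomCrop(16)  # illustrative sample_duration
target_transform = ClassLabel()

# Parameters are randomized once per clip, so every frame receives the same crop and flip.
spatial_transform.randomize_parameters()
frame_indices = temporal_transform(list(range(1, 41)))  # 16 consecutive indices from a 40-frame video
clip = [spatial_transform(Image.new('RGB', (171, 128))) for _ in frame_indices]  # dummy frames
label = target_transform({'label': 3, 'video_id': 'v_000001'})  # dummy annotation dict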
-------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | 4 | 5 | class LoopPadding(object): 6 | 7 | def __init__(self, size): 8 | self.size = size 9 | 10 | def __call__(self, frame_indices): 11 | out = frame_indices 12 | 13 | for index in out: 14 | if len(out) >= self.size: 15 | break 16 | out.append(index) 17 | 18 | return out 19 | 20 | 21 | class TemporalBeginCrop(object): 22 | """Temporally crop the given frame indices at a beginning. 23 | 24 | If the number of frames is less than the size, 25 | loop the indices as many times as necessary to satisfy the size. 26 | 27 | Args: 28 | size (int): Desired output size of the crop. 29 | """ 30 | 31 | def __init__(self, size): 32 | self.size = size 33 | 34 | def __call__(self, frame_indices): 35 | out = frame_indices[:self.size] 36 | 37 | for index in out: 38 | if len(out) >= self.size: 39 | break 40 | out.append(index) 41 | 42 | return out 43 | 44 | 45 | class TemporalCenterCrop(object): 46 | """Temporally crop the given frame indices at a center. 47 | 48 | If the number of frames is less than the size, 49 | loop the indices as many times as necessary to satisfy the size. 50 | 51 | Args: 52 | size (int): Desired output size of the crop. 53 | """ 54 | 55 | def __init__(self, size): 56 | self.size = size 57 | 58 | def __call__(self, frame_indices): 59 | """ 60 | Args: 61 | frame_indices (list): frame indices to be cropped. 62 | Returns: 63 | list: Cropped frame indices. 64 | """ 65 | 66 | center_index = len(frame_indices) // 2 67 | begin_index = max(0, center_index - (self.size // 2)) 68 | end_index = min(begin_index + self.size, len(frame_indices)) 69 | 70 | out = frame_indices[begin_index:end_index] 71 | 72 | for index in out: 73 | if len(out) >= self.size: 74 | break 75 | out.append(index) 76 | 77 | return out 78 | 79 | 80 | class TemporalRandomCrop(object): 81 | """Temporally crop the given frame indices at a random location. 82 | 83 | If the number of frames is less than the size, 84 | loop the indices as many times as necessary to satisfy the size. 85 | 86 | Args: 87 | size (int): Desired output size of the crop. 88 | """ 89 | 90 | def __init__(self, size): 91 | self.size = size 92 | 93 | def __call__(self, frame_indices): 94 | """ 95 | Args: 96 | frame_indices (list): frame indices to be cropped. 97 | Returns: 98 | list: Cropped frame indices. 
99 | """ 100 | 101 | rand_end = max(0, len(frame_indices) - self.size - 1) 102 | begin_index = random.randint(0, rand_end) 103 | end_index = min(begin_index + self.size, len(frame_indices)) 104 | 105 | out = frame_indices[begin_index:end_index] 106 | 107 | for index in out: 108 | if len(out) >= self.size: 109 | break 110 | out.append(index) 111 | 112 | return out 113 | -------------------------------------------------------------------------------- /3D_experiment/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import torch.nn.functional as F 4 | import time 5 | import os 6 | import sys 7 | import json 8 | 9 | from utils import AverageMeter 10 | 11 | 12 | def calculate_video_results(output_buffer, video_id, test_results, class_names): 13 | video_outputs = torch.stack(output_buffer) 14 | average_scores = torch.mean(video_outputs, dim=0) 15 | sorted_scores, locs = torch.topk(average_scores, k=10) 16 | 17 | video_results = [] 18 | for i in range(sorted_scores.size(0)): 19 | video_results.append({ 20 | 'label': class_names[locs[i]], 21 | 'score': sorted_scores[i] 22 | }) 23 | 24 | test_results['results'][video_id] = video_results 25 | 26 | 27 | def test(data_loader, model, opt, class_names): 28 | print('test') 29 | 30 | model.eval() 31 | 32 | batch_time = AverageMeter() 33 | data_time = AverageMeter() 34 | 35 | end_time = time.time() 36 | output_buffer = [] 37 | previous_video_id = '' 38 | test_results = {'results': {}} 39 | for i, (inputs, targets) in enumerate(data_loader): 40 | data_time.update(time.time() - end_time) 41 | 42 | inputs = Variable(inputs, volatile=True) 43 | outputs = model(inputs) 44 | if not opt.no_softmax_in_test: 45 | outputs = F.softmax(outputs) 46 | 47 | for j in range(outputs.size(0)): 48 | if not (i == 0 and j == 0) and targets[j] != previous_video_id: 49 | calculate_video_results(output_buffer, previous_video_id, 50 | test_results, class_names) 51 | output_buffer = [] 52 | output_buffer.append(outputs[j].data.cpu()) 53 | previous_video_id = targets[j] 54 | 55 | if (i % 100) == 0: 56 | with open( 57 | os.path.join(opt.result_path, '{}.json'.format( 58 | opt.test_subset)), 'w') as f: 59 | json.dump(test_results, f) 60 | 61 | batch_time.update(time.time() - end_time) 62 | end_time = time.time() 63 | 64 | print('[{}/{}]\t' 65 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 66 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'.format( 67 | i + 1, 68 | len(data_loader), 69 | batch_time=batch_time, 70 | data_time=data_time)) 71 | with open( 72 | os.path.join(opt.result_path, '{}.json'.format(opt.test_subset)), 73 | 'w') as f: 74 | json.dump(test_results, f) 75 | -------------------------------------------------------------------------------- /3D_experiment/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import time 4 | import os 5 | import sys 6 | 7 | from utils import AverageMeter, calculate_accuracy 8 | 9 | 10 | def train_epoch(epoch, data_loader, model, criterion, optimizer, opt, 11 | epoch_logger, batch_logger): 12 | print('train at epoch {}'.format(epoch)) 13 | 14 | model.train() 15 | 16 | batch_time = AverageMeter() 17 | data_time = AverageMeter() 18 | losses = AverageMeter() 19 | accuracies = AverageMeter() 20 | 21 | end_time = time.time() 22 | for i, (inputs, targets) in enumerate(data_loader): 23 | data_time.update(time.time() - end_time) 24 | 25 | if 
not opt.no_cuda: 26 | targets = targets.cuda(async=True) 27 | inputs = Variable(inputs) 28 | targets = Variable(targets) 29 | outputs = model(inputs) 30 | loss = criterion(outputs, targets) 31 | acc = calculate_accuracy(outputs, targets) 32 | 33 | losses.update(loss.data[0], inputs.size(0)) 34 | accuracies.update(acc, inputs.size(0)) 35 | 36 | optimizer.zero_grad() 37 | loss.backward() 38 | optimizer.step() 39 | 40 | batch_time.update(time.time() - end_time) 41 | end_time = time.time() 42 | 43 | batch_logger.log({ 44 | 'epoch': epoch, 45 | 'batch': i + 1, 46 | 'iter': (epoch - 1) * len(data_loader) + (i + 1), 47 | 'loss': losses.val, 48 | 'acc': accuracies.val, 49 | 'lr': optimizer.param_groups[0]['lr'] 50 | }) 51 | 52 | print('Epoch: [{0}][{1}/{2}]\t' 53 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 54 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 55 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 56 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format( 57 | epoch, 58 | i + 1, 59 | len(data_loader), 60 | batch_time=batch_time, 61 | data_time=data_time, 62 | loss=losses, 63 | acc=accuracies)) 64 | 65 | epoch_logger.log({ 66 | 'epoch': epoch, 67 | 'loss': losses.avg, 68 | 'acc': accuracies.avg, 69 | 'lr': optimizer.param_groups[0]['lr'] 70 | }) 71 | 72 | if epoch % opt.checkpoint == 0: 73 | save_file_path = os.path.join(opt.result_path, 74 | 'save_{}.pth'.format(epoch)) 75 | states = { 76 | 'epoch': epoch + 1, 77 | 'arch': opt.arch, 78 | 'state_dict': model.state_dict(), 79 | 'optimizer': optimizer.state_dict(), 80 | } 81 | torch.save(states, save_file_path) 82 | -------------------------------------------------------------------------------- /3D_experiment/utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | 4 | class AverageMeter(object): 5 | """Computes and stores the average and current value""" 6 | 7 | def __init__(self): 8 | self.reset() 9 | 10 | def reset(self): 11 | self.val = 0 12 | self.avg = 0 13 | self.sum = 0 14 | self.count = 0 15 | 16 | def update(self, val, n=1): 17 | self.val = val 18 | self.sum += val * n 19 | self.count += n 20 | self.avg = self.sum / self.count 21 | 22 | 23 | class Logger(object): 24 | 25 | def __init__(self, path, header): 26 | self.log_file = open(path, 'w') 27 | self.logger = csv.writer(self.log_file, delimiter='\t') 28 | 29 | self.logger.writerow(header) 30 | self.header = header 31 | 32 | def __del(self): 33 | self.log_file.close() 34 | 35 | def log(self, values): 36 | write_values = [] 37 | for col in self.header: 38 | assert col in values 39 | write_values.append(values[col]) 40 | 41 | self.logger.writerow(write_values) 42 | self.log_file.flush() 43 | 44 | 45 | def load_value_file(file_path): 46 | with open(file_path, 'r') as input_file: 47 | value = float(input_file.read().rstrip('\n\r')) 48 | 49 | return value 50 | 51 | 52 | def calculate_accuracy(outputs, targets): 53 | batch_size = targets.size(0) 54 | 55 | _, pred = outputs.topk(1, 1, True) 56 | pred = pred.t() 57 | correct = pred.eq(targets.view(1, -1)) 58 | n_correct_elems = correct.float().sum().data[0] 59 | 60 | return n_correct_elems / batch_size 61 | -------------------------------------------------------------------------------- /3D_experiment/utils/eval_hmdb51.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | class HMDBclassification(object): 7 | 8 | def __init__(self, ground_truth_filename=None, 
prediction_filename=None, 9 | subset='validation', verbose=False, top_k=1): 10 | if not ground_truth_filename: 11 | raise IOError('Please input a valid ground truth file.') 12 | if not prediction_filename: 13 | raise IOError('Please input a valid prediction file.') 14 | self.subset = subset 15 | self.verbose = verbose 16 | self.top_k = top_k 17 | self.ap = None 18 | self.hit_at_k = None 19 | # Import ground truth and predictions. 20 | self.ground_truth, self.activity_index = self._import_ground_truth( 21 | ground_truth_filename) 22 | self.prediction = self._import_prediction(prediction_filename) 23 | 24 | if self.verbose: 25 | print '[INIT] Loaded annotations from {} subset.'.format(subset) 26 | nr_gt = len(self.ground_truth) 27 | print '\tNumber of ground truth instances: {}'.format(nr_gt) 28 | nr_pred = len(self.prediction) 29 | print '\tNumber of predictions: {}'.format(nr_pred) 30 | 31 | def _import_ground_truth(self, ground_truth_filename): 32 | """Reads ground truth file, checks if it is well formatted, and returns 33 | the ground truth instances and the activity classes. 34 | 35 | Parameters 36 | ---------- 37 | ground_truth_filename : str 38 | Full path to the ground truth json file. 39 | 40 | Outputs 41 | ------- 42 | ground_truth : df 43 | Data frame containing the ground truth instances. 44 | activity_index : dict 45 | Dictionary containing class index. 46 | """ 47 | with open(ground_truth_filename, 'r') as fobj: 48 | data = json.load(fobj) 49 | # Checking format 50 | # if not all([field in data.keys() for field in self.gt_fields]): 51 | # raise IOError('Please input a valid ground truth file.') 52 | 53 | # Initialize data frame 54 | activity_index, cidx = {}, 0 55 | video_lst, label_lst = [], [] 56 | for videoid, v in data['database'].iteritems(): 57 | if self.subset != v['subset']: 58 | continue 59 | this_label = v['annotations']['label'] 60 | if this_label not in activity_index: 61 | activity_index[this_label] = cidx 62 | cidx += 1 63 | video_lst.append(videoid) 64 | label_lst.append(activity_index[this_label]) 65 | ground_truth = pd.DataFrame({'video-id': video_lst, 66 | 'label': label_lst}) 67 | ground_truth = ground_truth.drop_duplicates().reset_index(drop=True) 68 | return ground_truth, activity_index 69 | 70 | def _import_prediction(self, prediction_filename): 71 | """Reads prediction file, checks if it is well formatted, and returns 72 | the prediction instances. 73 | 74 | Parameters 75 | ---------- 76 | prediction_filename : str 77 | Full path to the prediction json file. 78 | 79 | Outputs 80 | ------- 81 | prediction : df 82 | Data frame containing the prediction instances. 83 | """ 84 | with open(prediction_filename, 'r') as fobj: 85 | data = json.load(fobj) 86 | # Checking format... 87 | # if not all([field in data.keys() for field in self.pred_fields]): 88 | # raise IOError('Please input a valid prediction file.') 89 | 90 | # Initialize data frame 91 | video_lst, label_lst, score_lst = [], [], [] 92 | for videoid, v in data['results'].iteritems(): 93 | for result in v: 94 | label = self.activity_index[result['label']] 95 | video_lst.append(videoid) 96 | label_lst.append(label) 97 | score_lst.append(result['score']) 98 | prediction = pd.DataFrame({'video-id': video_lst, 99 | 'label': label_lst, 100 | 'score': score_lst}) 101 | return prediction 102 | 103 | def evaluate(self): 104 | """Evaluates a prediction file. For the detection task we measure the 105 | interpolated mean average precision to measure the performance of a 106 | method. 
107 | """ 108 | hit_at_k = compute_video_hit_at_k(self.ground_truth, 109 | self.prediction, top_k=self.top_k) 110 | if self.verbose: 111 | print ('[RESULTS] Performance on ActivityNet untrimmed video ' 112 | 'classification task.') 113 | print '\tError@{}: {}'.format(self.top_k, 1.0 - hit_at_k) 114 | #print '\tAvg Hit@{}: {}'.format(self.top_k, avg_hit_at_k) 115 | self.hit_at_k = hit_at_k 116 | 117 | ################################################################################ 118 | # Metrics 119 | ################################################################################ 120 | def compute_video_hit_at_k(ground_truth, prediction, top_k=3): 121 | """Compute accuracy at k prediction between ground truth and 122 | predictions data frames. This code is greatly inspired by evaluation 123 | performed in Karpathy et al. CVPR14. 124 | 125 | Parameters 126 | ---------- 127 | ground_truth : df 128 | Data frame containing the ground truth instances. 129 | Required fields: ['video-id', 'label'] 130 | prediction : df 131 | Data frame containing the prediction instances. 132 | Required fields: ['video-id, 'label', 'score'] 133 | 134 | Outputs 135 | ------- 136 | acc : float 137 | Top k accuracy score. 138 | """ 139 | video_ids = np.unique(ground_truth['video-id'].values) 140 | avg_hits_per_vid = np.zeros(video_ids.size) 141 | for i, vid in enumerate(video_ids): 142 | pred_idx = prediction['video-id'] == vid 143 | if not pred_idx.any(): 144 | continue 145 | this_pred = prediction.loc[pred_idx].reset_index(drop=True) 146 | # Get top K predictions sorted by decreasing score. 147 | sort_idx = this_pred['score'].values.argsort()[::-1][:top_k] 148 | this_pred = this_pred.loc[sort_idx].reset_index(drop=True) 149 | # Get labels and compare against ground truth. 
150 | pred_label = this_pred['label'].tolist() 151 | gt_idx = ground_truth['video-id'] == vid 152 | gt_label = ground_truth.loc[gt_idx]['label'].tolist() 153 | avg_hits_per_vid[i] = np.mean([1 if this_label in pred_label else 0 154 | for this_label in gt_label]) 155 | return float(avg_hits_per_vid.mean()) 156 | -------------------------------------------------------------------------------- /3D_experiment/utils/eval_kinetics.py: -------------------------------------------------------------------------------- 1 | import json 2 | import urllib2 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | API = 'http://ec2-52-11-11-89.us-west-2.compute.amazonaws.com/challenge17/api.py' 8 | 9 | def get_blocked_videos(api=API): 10 | api_url = '{}?action=get_blocked'.format(api) 11 | req = urllib2.Request(api_url) 12 | response = urllib2.urlopen(req) 13 | return json.loads(response.read()) 14 | 15 | class KINETICSclassification(object): 16 | GROUND_TRUTH_FIELDS = ['database', 'labels'] 17 | PREDICTION_FIELDS = ['results', 'version', 'external_data'] 18 | 19 | def __init__(self, ground_truth_filename=None, prediction_filename=None, 20 | ground_truth_fields=GROUND_TRUTH_FIELDS, 21 | prediction_fields=PREDICTION_FIELDS, 22 | subset='validation', verbose=False, top_k=1, 23 | check_status=True): 24 | if not ground_truth_filename: 25 | raise IOError('Please input a valid ground truth file.') 26 | if not prediction_filename: 27 | raise IOError('Please input a valid prediction file.') 28 | self.subset = subset 29 | self.verbose = verbose 30 | self.gt_fields = ground_truth_fields 31 | self.pred_fields = prediction_fields 32 | self.top_k = top_k 33 | self.ap = None 34 | self.hit_at_k = None 35 | self.check_status = check_status 36 | # Retrieve blocked videos from server. 37 | if self.check_status: 38 | self.blocked_videos = get_blocked_videos() 39 | else: 40 | self.blocked_videos = list() 41 | # Import ground truth and predictions. 42 | self.ground_truth, self.activity_index = self._import_ground_truth( 43 | ground_truth_filename) 44 | self.prediction = self._import_prediction(prediction_filename) 45 | 46 | if self.verbose: 47 | print '[INIT] Loaded annotations from {} subset.'.format(subset) 48 | nr_gt = len(self.ground_truth) 49 | print '\tNumber of ground truth instances: {}'.format(nr_gt) 50 | nr_pred = len(self.prediction) 51 | print '\tNumber of predictions: {}'.format(nr_pred) 52 | 53 | def _import_ground_truth(self, ground_truth_filename): 54 | """Reads ground truth file, checks if it is well formatted, and returns 55 | the ground truth instances and the activity classes. 56 | 57 | Parameters 58 | ---------- 59 | ground_truth_filename : str 60 | Full path to the ground truth json file. 61 | 62 | Outputs 63 | ------- 64 | ground_truth : df 65 | Data frame containing the ground truth instances. 66 | activity_index : dict 67 | Dictionary containing class index. 
68 | """ 69 | with open(ground_truth_filename, 'r') as fobj: 70 | data = json.load(fobj) 71 | # Checking format 72 | # if not all([field in data.keys() for field in self.gt_fields]): 73 | # raise IOError('Please input a valid ground truth file.') 74 | 75 | # Initialize data frame 76 | activity_index, cidx = {}, 0 77 | video_lst, label_lst = [], [] 78 | for videoid, v in data['database'].iteritems(): 79 | if self.subset != v['subset']: 80 | continue 81 | if videoid in self.blocked_videos: 82 | continue 83 | this_label = v['annotations']['label'] 84 | if this_label not in activity_index: 85 | activity_index[this_label] = cidx 86 | cidx += 1 87 | video_lst.append(videoid[:-14]) 88 | label_lst.append(activity_index[this_label]) 89 | ground_truth = pd.DataFrame({'video-id': video_lst, 90 | 'label': label_lst}) 91 | ground_truth = ground_truth.drop_duplicates().reset_index(drop=True) 92 | return ground_truth, activity_index 93 | 94 | def _import_prediction(self, prediction_filename): 95 | """Reads prediction file, checks if it is well formatted, and returns 96 | the prediction instances. 97 | 98 | Parameters 99 | ---------- 100 | prediction_filename : str 101 | Full path to the prediction json file. 102 | 103 | Outputs 104 | ------- 105 | prediction : df 106 | Data frame containing the prediction instances. 107 | """ 108 | with open(prediction_filename, 'r') as fobj: 109 | data = json.load(fobj) 110 | # Checking format... 111 | # if not all([field in data.keys() for field in self.pred_fields]): 112 | # raise IOError('Please input a valid prediction file.') 113 | 114 | # Initialize data frame 115 | video_lst, label_lst, score_lst = [], [], [] 116 | for videoid, v in data['results'].iteritems(): 117 | if videoid in self.blocked_videos: 118 | continue 119 | for result in v: 120 | label = self.activity_index[result['label']] 121 | video_lst.append(videoid) 122 | label_lst.append(label) 123 | score_lst.append(result['score']) 124 | prediction = pd.DataFrame({'video-id': video_lst, 125 | 'label': label_lst, 126 | 'score': score_lst}) 127 | return prediction 128 | 129 | def evaluate(self): 130 | """Evaluates a prediction file. For the detection task we measure the 131 | interpolated mean average precision to measure the performance of a 132 | method. 133 | """ 134 | hit_at_k = compute_video_hit_at_k(self.ground_truth, 135 | self.prediction, top_k=self.top_k) 136 | # avg_hit_at_k = compute_video_hit_at_k( 137 | # self.ground_truth, self.prediction, top_k=self.top_k, avg=True) 138 | if self.verbose: 139 | print ('[RESULTS] Performance on ActivityNet untrimmed video ' 140 | 'classification task.') 141 | # print '\tMean Average Precision: {}'.format(ap.mean()) 142 | print '\tError@{}: {}'.format(self.top_k, 1.0 - hit_at_k) 143 | #print '\tAvg Hit@{}: {}'.format(self.top_k, avg_hit_at_k) 144 | # self.ap = ap 145 | self.hit_at_k = hit_at_k 146 | # self.avg_hit_at_k = avg_hit_at_k 147 | 148 | ################################################################################ 149 | # Metrics 150 | ################################################################################ 151 | 152 | def compute_video_hit_at_k(ground_truth, prediction, top_k=3, avg=False): 153 | """Compute accuracy at k prediction between ground truth and 154 | predictions data frames. This code is greatly inspired by evaluation 155 | performed in Karpathy et al. CVPR14. 156 | 157 | Parameters 158 | ---------- 159 | ground_truth : df 160 | Data frame containing the ground truth instances. 
161 | Required fields: ['video-id', 'label'] 162 | prediction : df 163 | Data frame containing the prediction instances. 164 | Required fields: ['video-id, 'label', 'score'] 165 | 166 | Outputs 167 | ------- 168 | acc : float 169 | Top k accuracy score. 170 | """ 171 | video_ids = np.unique(ground_truth['video-id'].values) 172 | avg_hits_per_vid = np.zeros(video_ids.size) 173 | for i, vid in enumerate(video_ids): 174 | pred_idx = prediction['video-id'] == vid 175 | if not pred_idx.any(): 176 | continue 177 | this_pred = prediction.loc[pred_idx].reset_index(drop=True) 178 | # Get top K predictions sorted by decreasing score. 179 | sort_idx = this_pred['score'].values.argsort()[::-1][:top_k] 180 | this_pred = this_pred.loc[sort_idx].reset_index(drop=True) 181 | # Get labels and compare against ground truth. 182 | pred_label = this_pred['label'].tolist() 183 | gt_idx = ground_truth['video-id'] == vid 184 | gt_label = ground_truth.loc[gt_idx]['label'].tolist() 185 | avg_hits_per_vid[i] = np.mean([1 if this_label in pred_label else 0 186 | for this_label in gt_label]) 187 | if not avg: 188 | avg_hits_per_vid[i] = np.ceil(avg_hits_per_vid[i]) 189 | return float(avg_hits_per_vid.mean()) 190 | -------------------------------------------------------------------------------- /3D_experiment/utils/eval_ucf101.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | class UCFclassification(object): 7 | 8 | def __init__(self, ground_truth_filename=None, prediction_filename=None, 9 | subset='validation', verbose=False, top_k=1): 10 | if not ground_truth_filename: 11 | raise IOError('Please input a valid ground truth file.') 12 | if not prediction_filename: 13 | raise IOError('Please input a valid prediction file.') 14 | self.subset = subset 15 | self.verbose = verbose 16 | self.top_k = top_k 17 | self.ap = None 18 | self.hit_at_k = None 19 | # Import ground truth and predictions. 20 | self.ground_truth, self.activity_index = self._import_ground_truth( 21 | ground_truth_filename) 22 | self.prediction = self._import_prediction(prediction_filename) 23 | 24 | if self.verbose: 25 | print '[INIT] Loaded annotations from {} subset.'.format(subset) 26 | nr_gt = len(self.ground_truth) 27 | print '\tNumber of ground truth instances: {}'.format(nr_gt) 28 | nr_pred = len(self.prediction) 29 | print '\tNumber of predictions: {}'.format(nr_pred) 30 | 31 | def _import_ground_truth(self, ground_truth_filename): 32 | """Reads ground truth file, checks if it is well formatted, and returns 33 | the ground truth instances and the activity classes. 34 | 35 | Parameters 36 | ---------- 37 | ground_truth_filename : str 38 | Full path to the ground truth json file. 39 | 40 | Outputs 41 | ------- 42 | ground_truth : df 43 | Data frame containing the ground truth instances. 44 | activity_index : dict 45 | Dictionary containing class index. 
46 | """ 47 | with open(ground_truth_filename, 'r') as fobj: 48 | data = json.load(fobj) 49 | # Checking format 50 | # if not all([field in data.keys() for field in self.gt_fields]): 51 | # raise IOError('Please input a valid ground truth file.') 52 | 53 | # Initialize data frame 54 | activity_index, cidx = {}, 0 55 | video_lst, label_lst = [], [] 56 | for videoid, v in data['database'].iteritems(): 57 | if self.subset != v['subset']: 58 | continue 59 | this_label = v['annotations']['label'] 60 | if this_label not in activity_index: 61 | activity_index[this_label] = cidx 62 | cidx += 1 63 | video_lst.append(videoid) 64 | label_lst.append(activity_index[this_label]) 65 | ground_truth = pd.DataFrame({'video-id': video_lst, 66 | 'label': label_lst}) 67 | ground_truth = ground_truth.drop_duplicates().reset_index(drop=True) 68 | return ground_truth, activity_index 69 | 70 | def _import_prediction(self, prediction_filename): 71 | """Reads prediction file, checks if it is well formatted, and returns 72 | the prediction instances. 73 | 74 | Parameters 75 | ---------- 76 | prediction_filename : str 77 | Full path to the prediction json file. 78 | 79 | Outputs 80 | ------- 81 | prediction : df 82 | Data frame containing the prediction instances. 83 | """ 84 | with open(prediction_filename, 'r') as fobj: 85 | data = json.load(fobj) 86 | # Checking format... 87 | # if not all([field in data.keys() for field in self.pred_fields]): 88 | # raise IOError('Please input a valid prediction file.') 89 | 90 | # Initialize data frame 91 | video_lst, label_lst, score_lst = [], [], [] 92 | for videoid, v in data['results'].iteritems(): 93 | for result in v: 94 | label = self.activity_index[result['label']] 95 | video_lst.append(videoid) 96 | label_lst.append(label) 97 | score_lst.append(result['score']) 98 | prediction = pd.DataFrame({'video-id': video_lst, 99 | 'label': label_lst, 100 | 'score': score_lst}) 101 | return prediction 102 | 103 | def evaluate(self): 104 | """Evaluates a prediction file. For the detection task we measure the 105 | interpolated mean average precision to measure the performance of a 106 | method. 107 | """ 108 | hit_at_k = compute_video_hit_at_k(self.ground_truth, 109 | self.prediction, top_k=self.top_k) 110 | if self.verbose: 111 | print ('[RESULTS] Performance on ActivityNet untrimmed video ' 112 | 'classification task.') 113 | print '\tError@{}: {}'.format(self.top_k, 1.0 - hit_at_k) 114 | #print '\tAvg Hit@{}: {}'.format(self.top_k, avg_hit_at_k) 115 | self.hit_at_k = hit_at_k 116 | 117 | ################################################################################ 118 | # Metrics 119 | ################################################################################ 120 | def compute_video_hit_at_k(ground_truth, prediction, top_k=3): 121 | """Compute accuracy at k prediction between ground truth and 122 | predictions data frames. This code is greatly inspired by evaluation 123 | performed in Karpathy et al. CVPR14. 124 | 125 | Parameters 126 | ---------- 127 | ground_truth : df 128 | Data frame containing the ground truth instances. 129 | Required fields: ['video-id', 'label'] 130 | prediction : df 131 | Data frame containing the prediction instances. 132 | Required fields: ['video-id, 'label', 'score'] 133 | 134 | Outputs 135 | ------- 136 | acc : float 137 | Top k accuracy score. 
138 | """ 139 | video_ids = np.unique(ground_truth['video-id'].values) 140 | avg_hits_per_vid = np.zeros(video_ids.size) 141 | for i, vid in enumerate(video_ids): 142 | pred_idx = prediction['video-id'] == vid 143 | if not pred_idx.any(): 144 | continue 145 | this_pred = prediction.loc[pred_idx].reset_index(drop=True) 146 | # Get top K predictions sorted by decreasing score. 147 | sort_idx = this_pred['score'].values.argsort()[::-1][:top_k] 148 | this_pred = this_pred.loc[sort_idx].reset_index(drop=True) 149 | # Get labels and compare against ground truth. 150 | pred_label = this_pred['label'].tolist() 151 | gt_idx = ground_truth['video-id'] == vid 152 | gt_label = ground_truth.loc[gt_idx]['label'].tolist() 153 | avg_hits_per_vid[i] = np.mean([1 if this_label in pred_label else 0 154 | for this_label in gt_label]) 155 | return float(avg_hits_per_vid.mean()) 156 | -------------------------------------------------------------------------------- /3D_experiment/utils/fps.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | 7 | if __name__=="__main__": 8 | dir_path = sys.argv[1] 9 | dst_dir_path = sys.argv[2] 10 | 11 | for file_name in os.listdir(dir_path): 12 | if '.mp4' not in file_name: 13 | continue 14 | name, ext = os.path.splitext(file_name) 15 | dst_directory_path = os.path.join(dst_dir_path, name) 16 | 17 | video_file_path = os.path.join(dir_path, file_name) 18 | p = subprocess.Popen('ffprobe {}'.format(video_file_path), 19 | shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 20 | _, res = p.communicate() 21 | res = res.decode('utf-8') 22 | 23 | duration_index = res.find('Duration:') 24 | duration_str = res[(duration_index + 10):(duration_index + 21)] 25 | hour = float(duration_str[0:2]) 26 | minute = float(duration_str[3:5]) 27 | sec = float(duration_str[6:10]) 28 | total_sec = hour * 3600 + minute * 60 + sec 29 | 30 | n_frames = len(os.listdir(dst_directory_path)) 31 | if os.path.exists(os.path.join(dst_directory_path, 'fps')): 32 | n_frames -= 1 33 | 34 | fps = round(n_frames / total_sec, 2) 35 | 36 | print(video_file_path, os.path.exists(video_file_path), fps) 37 | with open(os.path.join(dst_directory_path, 'fps'), 'w') as fps_file: 38 | fps_file.write('{}\n'.format(fps)) 39 | -------------------------------------------------------------------------------- /3D_experiment/utils/hmdb51_json.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import json 5 | import pandas as pd 6 | 7 | def convert_csv_to_dict(csv_dir_path, split_index): 8 | database = {} 9 | for filename in os.listdir(csv_dir_path): 10 | if 'split{}'.format(split_index) not in filename: 11 | continue 12 | 13 | data = pd.read_csv(os.path.join(csv_dir_path, filename), 14 | delimiter=' ', header=None) 15 | keys = [] 16 | subsets = [] 17 | for i in range(data.shape[0]): 18 | row = data.ix[i, :] 19 | if row[1] == 0: 20 | continue 21 | elif row[1] == 1: 22 | subset = 'training' 23 | elif row[1] == 2: 24 | subset = 'validation' 25 | 26 | keys.append(row[0].split('.')[0]) 27 | subsets.append(subset) 28 | 29 | for i in range(len(keys)): 30 | key = keys[i] 31 | database[key] = {} 32 | database[key]['subset'] = subsets[i] 33 | label = '_'.join(filename.split('_')[:-2]) 34 | database[key]['annotations'] = {'label': label} 35 | 36 | return database 37 | 38 | def 
get_labels(csv_dir_path): 39 | labels = [] 40 | for name in os.listdir(csv_dir_path): 41 | labels.append('_'.join(name.split('_')[:-2])) 42 | return sorted(list(set(labels))) 43 | 44 | def convert_hmdb51_csv_to_activitynet_json(csv_dir_path, split_index, dst_json_path): 45 | labels = get_labels(csv_dir_path) 46 | database = convert_csv_to_dict(csv_dir_path, split_index) 47 | 48 | dst_data = {} 49 | dst_data['labels'] = labels 50 | dst_data['database'] = {} 51 | dst_data['database'].update(database) 52 | 53 | with open(dst_json_path, 'w') as dst_file: 54 | json.dump(dst_data, dst_file) 55 | 56 | if __name__ == '__main__': 57 | csv_dir_path = sys.argv[1] 58 | 59 | for split_index in range(1, 4): 60 | dst_json_path = os.path.join(csv_dir_path, 'hmdb51_{}.json'.format(split_index)) 61 | convert_hmdb51_csv_to_activitynet_json(csv_dir_path, split_index, dst_json_path) -------------------------------------------------------------------------------- /3D_experiment/utils/kinetics_json.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import json 5 | import pandas as pd 6 | 7 | def convert_csv_to_dict(csv_path, subset): 8 | data = pd.read_csv(csv_path) 9 | keys = [] 10 | key_labels = [] 11 | for i in range(data.shape[0]): 12 | row = data.ix[i, :] 13 | basename = '%s_%s_%s' % (row['youtube_id'], 14 | '%06d' % row['time_start'], 15 | '%06d' % row['time_end']) 16 | keys.append(basename) 17 | if subset != 'testing': 18 | key_labels.append(row['label']) 19 | 20 | database = {} 21 | for i in range(len(keys)): 22 | key = keys[i] 23 | database[key] = {} 24 | database[key]['subset'] = subset 25 | if subset != 'testing': 26 | label = key_labels[i] 27 | database[key]['annotations'] = {'label': label} 28 | else: 29 | database[key]['annotations'] = {} 30 | 31 | return database 32 | 33 | def load_labels(train_csv_path): 34 | data = pd.read_csv(train_csv_path) 35 | return data['label'].unique().tolist() 36 | 37 | def convert_kinetics_csv_to_activitynet_json(train_csv_path, val_csv_path, test_csv_path, dst_json_path): 38 | labels = load_labels(train_csv_path) 39 | train_database = convert_csv_to_dict(train_csv_path, 'training') 40 | val_database = convert_csv_to_dict(val_csv_path, 'validation') 41 | test_database = convert_csv_to_dict(test_csv_path, 'testing') 42 | 43 | dst_data = {} 44 | dst_data['labels'] = labels 45 | dst_data['database'] = {} 46 | dst_data['database'].update(train_database) 47 | dst_data['database'].update(val_database) 48 | dst_data['database'].update(test_database) 49 | 50 | with open(dst_json_path, 'w') as dst_file: 51 | json.dump(dst_data, dst_file) 52 | 53 | if __name__=="__main__": 54 | train_csv_path = sys.argv[1] 55 | val_csv_path = sys.argv[2] 56 | test_csv_path = sys.argv[3] 57 | dst_json_path = sys.argv[4] 58 | 59 | convert_kinetics_csv_to_activitynet_json( 60 | train_csv_path, val_csv_path, test_csv_path, dst_json_path) 61 | -------------------------------------------------------------------------------- /3D_experiment/utils/n_frames_kinetics.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | def class_process(dir_path, class_name): 7 | class_path = os.path.join(dir_path, class_name) 8 | if not os.path.isdir(class_path): 9 | return 10 | 11 | for file_name in os.listdir(class_path): 12 | video_dir_path = 
os.path.join(class_path, file_name) 13 | image_indices = [] 14 | for image_file_name in os.listdir(video_dir_path): 15 | if 'image' not in image_file_name: 16 | continue 17 | image_indices.append(int(image_file_name[6:11])) 18 | 19 | if len(image_indices) == 0: 20 | print('no image files', video_dir_path) 21 | n_frames = 0 22 | else: 23 | image_indices.sort(reverse=True) 24 | n_frames = image_indices[0] 25 | print(video_dir_path, n_frames) 26 | with open(os.path.join(video_dir_path, 'n_frames'), 'w') as dst_file: 27 | dst_file.write(str(n_frames)) 28 | 29 | 30 | if __name__=="__main__": 31 | dir_path = sys.argv[1] 32 | for class_name in os.listdir(dir_path): 33 | class_process(dir_path, class_name) 34 | 35 | class_name = 'test' 36 | class_process(dir_path, class_name) 37 | -------------------------------------------------------------------------------- /3D_experiment/utils/n_frames_ucf101_hmdb51.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | def class_process(dir_path, class_name): 7 | class_path = os.path.join(dir_path, class_name) 8 | if not os.path.isdir(class_path): 9 | return 10 | 11 | for file_name in os.listdir(class_path): 12 | video_dir_path = os.path.join(class_path, file_name) 13 | image_indices = [] 14 | for image_file_name in os.listdir(video_dir_path): 15 | if 'image' not in image_file_name: 16 | continue 17 | image_indices.append(int(image_file_name[6:11])) 18 | 19 | if len(image_indices) == 0: 20 | print('no image files', video_dir_path) 21 | n_frames = 0 22 | else: 23 | image_indices.sort(reverse=True) 24 | n_frames = image_indices[0] 25 | print(video_dir_path, n_frames) 26 | with open(os.path.join(video_dir_path, 'n_frames'), 'w') as dst_file: 27 | dst_file.write(str(n_frames)) 28 | 29 | 30 | if __name__=="__main__": 31 | dir_path = sys.argv[1] 32 | for class_name in os.listdir(dir_path): 33 | class_process(dir_path, class_name) 34 | -------------------------------------------------------------------------------- /3D_experiment/utils/ucf101_json.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import json 5 | import pandas as pd 6 | 7 | def convert_csv_to_dict(csv_path, subset): 8 | data = pd.read_csv(csv_path, delimiter=' ', header=None) 9 | keys = [] 10 | key_labels = [] 11 | for i in range(data.shape[0]): 12 | row = data.ix[i, :] 13 | slash_rows = data.ix[i, 0].split('/') 14 | class_name = slash_rows[0] 15 | basename = slash_rows[1].split('.')[0] 16 | 17 | keys.append(basename) 18 | key_labels.append(class_name) 19 | 20 | database = {} 21 | for i in range(len(keys)): 22 | key = keys[i] 23 | database[key] = {} 24 | database[key]['subset'] = subset 25 | label = key_labels[i] 26 | database[key]['annotations'] = {'label': label} 27 | 28 | return database 29 | 30 | def load_labels(label_csv_path): 31 | data = pd.read_csv(label_csv_path, delimiter=' ', header=None) 32 | labels = [] 33 | for i in range(data.shape[0]): 34 | labels.append(data.ix[i, 1]) 35 | return labels 36 | 37 | def convert_ucf101_csv_to_activitynet_json(label_csv_path, train_csv_path, 38 | val_csv_path, dst_json_path): 39 | labels = load_labels(label_csv_path) 40 | train_database = convert_csv_to_dict(train_csv_path, 'training') 41 | val_database = convert_csv_to_dict(val_csv_path, 'validation') 42 | 43 | dst_data = {} 44 | dst_data['labels'] 
= labels 45 | dst_data['database'] = {} 46 | dst_data['database'].update(train_database) 47 | dst_data['database'].update(val_database) 48 | 49 | with open(dst_json_path, 'w') as dst_file: 50 | json.dump(dst_data, dst_file) 51 | 52 | if __name__ == '__main__': 53 | csv_dir_path = sys.argv[1] 54 | 55 | for split_index in range(1, 4): 56 | label_csv_path = os.path.join(csv_dir_path, 'classInd.txt') 57 | train_csv_path = os.path.join(csv_dir_path, 'trainlist0{}.txt'.format(split_index)) 58 | val_csv_path = os.path.join(csv_dir_path, 'testlist0{}.txt'.format(split_index)) 59 | dst_json_path = os.path.join(csv_dir_path, 'ucf101_0{}.json'.format(split_index)) 60 | 61 | convert_ucf101_csv_to_activitynet_json(label_csv_path, train_csv_path, 62 | val_csv_path, dst_json_path) 63 | -------------------------------------------------------------------------------- /3D_experiment/utils/video_jpg.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | 7 | if __name__=="__main__": 8 | dir_path = sys.argv[1] 9 | dst_dir_path = sys.argv[2] 10 | 11 | for file_name in os.listdir(dir_path): 12 | if '.mp4' not in file_name: 13 | continue 14 | name, ext = os.path.splitext(file_name) 15 | dst_directory_path = os.path.join(dst_dir_path, name) 16 | 17 | video_file_path = os.path.join(dir_path, file_name) 18 | try: 19 | if os.path.exists(dst_directory_path): 20 | if not os.path.exists(os.path.join(dst_directory_path, 'image_00001.jpg')): 21 | subprocess.call('rm -r {}'.format(dst_directory_path), shell=True) 22 | print('remove {}'.format(dst_directory_path)) 23 | os.mkdir(dst_directory_path) 24 | else: 25 | continue 26 | else: 27 | os.mkdir(dst_directory_path) 28 | except: 29 | print(dst_directory_path) 30 | continue 31 | cmd = 'ffmpeg -i {} -vf scale=-1:360 {}/image_%05d.jpg'.format(video_file_path, dst_directory_path) 32 | print(cmd) 33 | subprocess.call(cmd, shell=True) 34 | print('\n') 35 | -------------------------------------------------------------------------------- /3D_experiment/utils/video_jpg_kinetics.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | def class_process(dir_path, dst_dir_path, class_name): 7 | class_path = os.path.join(dir_path, class_name) 8 | if not os.path.isdir(class_path): 9 | return 10 | 11 | dst_class_path = os.path.join(dst_dir_path, class_name) 12 | if not os.path.exists(dst_class_path): 13 | os.mkdir(dst_class_path) 14 | 15 | for file_name in os.listdir(class_path): 16 | if '.mp4' not in file_name: 17 | continue 18 | name, ext = os.path.splitext(file_name) 19 | dst_directory_path = os.path.join(dst_class_path, name) 20 | 21 | video_file_path = os.path.join(class_path, file_name) 22 | try: 23 | if os.path.exists(dst_directory_path): 24 | if not os.path.exists(os.path.join(dst_directory_path, 'image_00001.jpg')): 25 | subprocess.call('rm -r \"{}\"'.format(dst_directory_path), shell=True) 26 | print('remove {}'.format(dst_directory_path)) 27 | os.mkdir(dst_directory_path) 28 | else: 29 | continue 30 | else: 31 | os.mkdir(dst_directory_path) 32 | except: 33 | print(dst_directory_path) 34 | continue 35 | cmd = 'ffmpeg -i \"{}\" -vf scale=-1:240 \"{}/image_%05d.jpg\"'.format(video_file_path, dst_directory_path) 36 | print(cmd) 37 | subprocess.call(cmd, shell=True) 38 | print('\n') 39 | 40 | if 
__name__=="__main__": 41 | dir_path = sys.argv[1] 42 | dst_dir_path = sys.argv[2] 43 | 44 | for class_name in os.listdir(dir_path): 45 | class_process(dir_path, dst_dir_path, class_name) 46 | 47 | class_name = 'test' 48 | class_process(dir_path, dst_dir_path, class_name) 49 | -------------------------------------------------------------------------------- /3D_experiment/utils/video_jpg_ucf101_hmdb51.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | def class_process(dir_path, dst_dir_path, class_name): 7 | class_path = os.path.join(dir_path, class_name) 8 | if not os.path.isdir(class_path): 9 | return 10 | 11 | dst_class_path = os.path.join(dst_dir_path, class_name) 12 | if not os.path.exists(dst_class_path): 13 | os.mkdir(dst_class_path) 14 | 15 | for file_name in os.listdir(class_path): 16 | if '.avi' not in file_name: 17 | continue 18 | name, ext = os.path.splitext(file_name) 19 | dst_directory_path = os.path.join(dst_class_path, name) 20 | 21 | video_file_path = os.path.join(class_path, file_name) 22 | try: 23 | if os.path.exists(dst_directory_path): 24 | if not os.path.exists(os.path.join(dst_directory_path, 'image_00001.jpg')): 25 | subprocess.call('rm -r \"{}\"'.format(dst_directory_path), shell=True) 26 | print('remove {}'.format(dst_directory_path)) 27 | os.mkdir(dst_directory_path) 28 | else: 29 | continue 30 | else: 31 | os.mkdir(dst_directory_path) 32 | except: 33 | print(dst_directory_path) 34 | continue 35 | cmd = 'ffmpeg -i \"{}\" -vf scale=-1:240 \"{}/image_%05d.jpg\"'.format(video_file_path, dst_directory_path) 36 | print(cmd) 37 | subprocess.call(cmd, shell=True) 38 | print('\n') 39 | 40 | if __name__=="__main__": 41 | dir_path = sys.argv[1] 42 | dst_dir_path = sys.argv[2] 43 | 44 | for class_name in os.listdir(dir_path): 45 | class_process(dir_path, dst_dir_path, class_name) 46 | -------------------------------------------------------------------------------- /3D_experiment/validation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import time 4 | import sys 5 | 6 | from utils import AverageMeter, calculate_accuracy 7 | 8 | 9 | def val_epoch(epoch, data_loader, model, criterion, opt, logger): 10 | print('validation at epoch {}'.format(epoch)) 11 | 12 | model.eval() 13 | 14 | batch_time = AverageMeter() 15 | data_time = AverageMeter() 16 | losses = AverageMeter() 17 | accuracies = AverageMeter() 18 | 19 | end_time = time.time() 20 | for i, (inputs, targets) in enumerate(data_loader): 21 | data_time.update(time.time() - end_time) 22 | 23 | if not opt.no_cuda: 24 | targets = targets.cuda(async=True) 25 | inputs = Variable(inputs, volatile=True) 26 | targets = Variable(targets, volatile=True) 27 | outputs = model(inputs) 28 | loss = criterion(outputs, targets) 29 | acc = calculate_accuracy(outputs, targets) 30 | 31 | losses.update(loss.data[0], inputs.size(0)) 32 | accuracies.update(acc, inputs.size(0)) 33 | 34 | batch_time.update(time.time() - end_time) 35 | end_time = time.time() 36 | 37 | print('Epoch: [{0}][{1}/{2}]\t' 38 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 39 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 40 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 41 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format( 42 | epoch, 43 | i + 1, 44 | len(data_loader), 45 | batch_time=batch_time, 46 | data_time=data_time, 47 | 
loss=losses, 48 | acc=accuracies)) 49 | 50 | logger.log({'epoch': epoch, 'loss': losses.avg, 'acc': accuracies.avg}) 51 | 52 | return losses.avg 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Seunghwan Cha 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Implementation of Non-Local Neural Network 2 | 3 | This repository contains my implementation of [Non-Local Neural Networks (CVPR 2018)](https://arxiv.org/pdf/1711.07971.pdf). 4 | 5 | To understand more about the structure of this paper, you may refer to this [slide](https://www.youtube.com/redirect?redir_token=4Bf1C-e-Vz_0r5HbPD9meYLcyL58MTU1MTc5MjE0NEAxNTUxNzA1NzQ0&q=https%3A%2F%2Fwww.slideshare.net%2FTaeohKim4%2Fpr083-nonlocal-neural-networks&v=ZM153wo3baA&event=video_description) and [video](https://www.youtube.com/watch?v=ZM153wo3baA), which are in Korean. 6 | 7 | The experiment was run on the CIFAR-10 dataset for the sake of ensuring that the code runs without error. 8 | 9 | ## Implementation Details 10 | The original paper used ResNet-50 as its backbone structure for conducting experiments on video datasets such as Kinetics and Charades. 11 | 12 | As an initial study, I adopted the ResNet-56 structure for the CIFAR-10 dataset, which is a 2D classification task. The architecture is implemented in `models/resnet2D.py`. 13 | 14 | The original baseline model from the paper, called C2D, uses ResNet-50 as its backbone and one non-local block after the 4th residual block. This structure is implemented in `models/resnet3D.py`. The details of the architecture are shown in the figure below: 15 | 16 | 17 | 18 | The four different pairwise functions discussed in the paper are implemented accordingly in `models/non_local.py`. You can simply pass one of the operations as an argument. The details of the non-local block are shown in the figure below: 19 | 20 | 21 | 22 | Finally, the original experiment of activity recognition was similarly replicated in the `3D_experiment` folder. The necessary data preprocessing code was borrowed from https://github.com/kenshohara/3D-ResNets-PyTorch.
The training runs without error, but I didn't have enough time to compare the performance boost from the addition of the non-local block. 23 | 24 | ## Training 25 | 1) To start training for CIFAR-10 with ResNet-56, you can simply execute `run.sh`. 26 | 27 | 2) To start training on the HMDB51 dataset with C2D, you first need to prepare the HMDB51 dataset as instructed in the `3D_experiment` folder. Then, execute `run.sh`. It seems that multiple GPUs may be needed due to memory issues. 28 | 29 | ## Results 30 | Trained on CIFAR-10 for 200 epochs using the command shown in `run.sh`. The training was conducted on a single 1080 Ti GPU. 31 | The result showed that there wasn't a huge performance boost for the image classification task on CIFAR-10. The graph below illustrates the loss curves for the two networks. 32 | 33 | 34 | 35 | The Top-1 validation accuracy for ResNet-56 without the non-local block was *93.97%*, while the one with the non-local block reached *93.98%*. 36 | 37 | This could be due to two reasons: 1) the proposed method was designed mainly for video classification; 2) the input size of CIFAR-10 is too small, so spatial information may not be maintained after the second ResNet block. 38 | 39 | ## TO DO 40 | - [x] Compare the result of the baseline model and that of the non-local model for CIFAR-10 41 | - [x] Prepare video datasets (e.g. UCF-101, HMDB-51) 42 | - [x] Modify the model code to adapt to spatiotemporal settings 43 | - [x] Run test on some video datasets 44 | - [ ] Run test on an image segmentation dataset (e.g. COCO) 45 | 46 | ## Reference 47 | This repo is an adaptation of several other existing works. 48 | - https://github.com/akamaster/pytorch_resnet_cifar10 49 | - https://github.com/kuangliu/pytorch-cifar 50 | - https://github.com/facebookresearch/video-nonlocal-net 51 | - https://github.com/AlexHex7/Non-local_pytorch 52 | - https://github.com/kenshohara/3D-ResNets-PyTorch 53 | 54 | -------------------------------------------------------------------------------- /figure/Figure2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tea1528/Non-Local-NN-Pytorch/64c6047f7d801402d04340695bfbdfeee03e4797/figure/Figure2.jpg -------------------------------------------------------------------------------- /figure/Table1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tea1528/Non-Local-NN-Pytorch/64c6047f7d801402d04340695bfbdfeee03e4797/figure/Table1.jpg -------------------------------------------------------------------------------- /figure/resnet56_cifar.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tea1528/Non-Local-NN-Pytorch/64c6047f7d801402d04340695bfbdfeee03e4797/figure/resnet56_cifar.jpg -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | '''Train CIFAR10 with PyTorch.''' 2 | from __future__ import print_function 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | import torch.nn.functional as F 8 | import torch.backends.cudnn as cudnn 9 | 10 | import torchvision 11 | import torchvision.transforms as transforms 12 | 13 | import os 14 | import argparse 15 | 16 | from models.resnet2D import resnet2D56 17 | 18 | parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training') 19 | parser.add_argument('--lr', 
default=0.1, type=float, help='learning rate') 20 | parser.add_argument('--verbose', '-v', action='store_true', help='display progress bar') 21 | parser.add_argument('--resume', '-r', action='store_true', help='resume from checkpoint') 22 | parser.add_argument('--nl', '-n', action='store_true', help='add non-local block') 23 | args = parser.parse_args() 24 | 25 | if args.verbose: 26 | from utils import progress_bar 27 | 28 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 29 | best_acc = 0 # best test accuracy 30 | start_epoch = 0 # start from epoch 0 or last checkpoint epoch 31 | 32 | # Data 33 | print('==> Preparing data..') 34 | transform_train = transforms.Compose([ 35 | transforms.RandomCrop(32, padding=4), 36 | transforms.RandomHorizontalFlip(), 37 | transforms.ToTensor(), 38 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), 39 | ]) 40 | 41 | transform_test = transforms.Compose([ 42 | transforms.ToTensor(), 43 | transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), 44 | ]) 45 | 46 | trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train) 47 | trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2) 48 | 49 | testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test) 50 | testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=2) 51 | 52 | classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck') 53 | 54 | # Model 55 | print('==> Building model..') 56 | if args.nl: 57 | print("ResNet-56 with non-local block after second residual block..") 58 | net = resnet2D56(non_local=True) 59 | else: 60 | print("ResNet-56 without non-local block..") 61 | net = resnet2D56(non_local=False) 62 | 63 | 64 | 65 | net = net.to(device) 66 | 67 | if device == 'cuda': 68 | net = torch.nn.DataParallel(net) 69 | cudnn.benchmark = True 70 | 71 | if args.resume: 72 | # Load checkpoint. 73 | print('==> Resuming from checkpoint..') 74 | assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!' 
75 | checkpoint = torch.load('./checkpoint/ckpt.t7') 76 | net.load_state_dict(checkpoint['net']) 77 | best_acc = checkpoint['acc'] 78 | start_epoch = checkpoint['epoch'] 79 | 80 | criterion = nn.CrossEntropyLoss() 81 | optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4) 82 | lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[100, 150], last_epoch=start_epoch - 1) 83 | 84 | # Training 85 | def train(epoch): 86 | print('\nEpoch: %d' % epoch) 87 | net.train() 88 | train_loss = 0 89 | correct = 0 90 | total = 0 91 | for batch_idx, (inputs, targets) in enumerate(trainloader): 92 | inputs, targets = inputs.to(device), targets.to(device) 93 | optimizer.zero_grad() 94 | outputs = net(inputs) 95 | loss = criterion(outputs, targets) 96 | loss.backward() 97 | optimizer.step() 98 | 99 | train_loss += loss.item() 100 | _, predicted = outputs.max(1) 101 | total += targets.size(0) 102 | correct += predicted.eq(targets).sum().item() 103 | 104 | if args.verbose: 105 | progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' 106 | % (train_loss/(batch_idx+1), 100.*correct/total, correct, total)) 107 | if not args.verbose: 108 | print('Loss: %.3f' % train_loss) 109 | 110 | return train_loss 111 | 112 | def test(epoch): 113 | global best_acc 114 | net.eval() 115 | test_loss = 0 116 | correct = 0 117 | total = 0 118 | with torch.no_grad(): 119 | for batch_idx, (inputs, targets) in enumerate(testloader): 120 | inputs, targets = inputs.to(device), targets.to(device) 121 | outputs = net(inputs) 122 | loss = criterion(outputs, targets) 123 | 124 | test_loss += loss.item() 125 | _, predicted = outputs.max(1) 126 | total += targets.size(0) 127 | correct += predicted.eq(targets).sum().item() 128 | 129 | if args.verbose: 130 | progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' 131 | % (test_loss/(batch_idx+1), 100.*correct/total, correct, total)) 132 | 133 | if not args.verbose: 134 | print('Loss: %.3f' % test_loss) 135 | 136 | # Save checkpoint. 
137 | acc = 100.*correct/total 138 | if acc > best_acc: 139 | print('Saving..') 140 | state = { 141 | 'net': net.state_dict(), 142 | 'acc': acc, 143 | 'epoch': epoch, 144 | } 145 | if not os.path.isdir('checkpoint'): 146 | os.mkdir('checkpoint') 147 | torch.save(state, './checkpoint/ckpt.t7') 148 | best_acc = acc 149 | return test_loss 150 | 151 | tr_loss_list = [] 152 | tst_loss_list = [] 153 | 154 | for epoch in range(start_epoch, start_epoch+200): 155 | train_l = train(epoch) 156 | lr_scheduler.step() 157 | test_l = test(epoch) 158 | tr_loss_list.append(train_l) 159 | tst_loss_list.append(test_l) 160 | 161 | print("Best Accuracy: ", best_acc) 162 | print("-----------------------------------------------") 163 | 164 | print("train loss") 165 | print(tr_loss_list) 166 | print("test loss") 167 | print(tst_loss_list) 168 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | -------------------------------------------------------------------------------- /models/non_local.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | 6 | class NLBlockND(nn.Module): 7 | def __init__(self, in_channels, inter_channels=None, mode='embedded', 8 | dimension=3, bn_layer=True): 9 | """Implementation of Non-Local Block with 4 different pairwise functions but doesn't include subsampling trick 10 | args: 11 | in_channels: original channel size (1024 in the paper) 12 | inter_channels: channel size inside the block if not specifed reduced to half (512 in the paper) 13 | mode: supports Gaussian, Embedded Gaussian, Dot Product, and Concatenation 14 | dimension: can be 1 (temporal), 2 (spatial), 3 (spatiotemporal) 15 | bn_layer: whether to add batch norm 16 | """ 17 | super(NLBlockND, self).__init__() 18 | 19 | assert dimension in [1, 2, 3] 20 | 21 | if mode not in ['gaussian', 'embedded', 'dot', 'concatenate']: 22 | raise ValueError('`mode` must be one of `gaussian`, `embedded`, `dot` or `concatenate`') 23 | 24 | self.mode = mode 25 | self.dimension = dimension 26 | 27 | self.in_channels = in_channels 28 | self.inter_channels = inter_channels 29 | 30 | # the channel size is reduced to half inside the block 31 | if self.inter_channels is None: 32 | self.inter_channels = in_channels // 2 33 | if self.inter_channels == 0: 34 | self.inter_channels = 1 35 | 36 | # assign appropriate convolutional, max pool, and batch norm layers for different dimensions 37 | if dimension == 3: 38 | conv_nd = nn.Conv3d 39 | max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2)) 40 | bn = nn.BatchNorm3d 41 | elif dimension == 2: 42 | conv_nd = nn.Conv2d 43 | max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2)) 44 | bn = nn.BatchNorm2d 45 | else: 46 | conv_nd = nn.Conv1d 47 | max_pool_layer = nn.MaxPool1d(kernel_size=(2)) 48 | bn = nn.BatchNorm1d 49 | 50 | # function g in the paper which goes through conv. 
with kernel size 1 51 | self.g = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, kernel_size=1) 52 | 53 | # add BatchNorm layer after the last conv layer 54 | if bn_layer: 55 | self.W_z = nn.Sequential( 56 | conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, kernel_size=1), 57 | bn(self.in_channels) 58 | ) 59 | # from section 4.1 of the paper, initializing params of BN ensures that the initial state of non-local block is identity mapping 60 | nn.init.constant_(self.W_z[1].weight, 0) 61 | nn.init.constant_(self.W_z[1].bias, 0) 62 | else: 63 | self.W_z = conv_nd(in_channels=self.inter_channels, out_channels=self.in_channels, kernel_size=1) 64 | 65 | # from section 3.3 of the paper by initializing Wz to 0, this block can be inserted to any existing architecture 66 | nn.init.constant_(self.W_z.weight, 0) 67 | nn.init.constant_(self.W_z.bias, 0) 68 | 69 | # define theta and phi for all operations except gaussian 70 | if self.mode == "embedded" or self.mode == "dot" or self.mode == "concatenate": 71 | self.theta = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, kernel_size=1) 72 | self.phi = conv_nd(in_channels=self.in_channels, out_channels=self.inter_channels, kernel_size=1) 73 | 74 | if self.mode == "concatenate": 75 | self.W_f = nn.Sequential( 76 | nn.Conv2d(in_channels=self.inter_channels * 2, out_channels=1, kernel_size=1), 77 | nn.ReLU() 78 | ) 79 | 80 | def forward(self, x): 81 | """ 82 | args 83 | x: (N, C, T, H, W) for dimension=3; (N, C, H, W) for dimension 2; (N, C, T) for dimension 1 84 | """ 85 | 86 | batch_size = x.size(0) 87 | 88 | # (N, C, THW) 89 | # this reshaping and permutation is from the spacetime_nonlocal function in the original Caffe2 implementation 90 | g_x = self.g(x).view(batch_size, self.inter_channels, -1) 91 | g_x = g_x.permute(0, 2, 1) 92 | 93 | if self.mode == "gaussian": 94 | theta_x = x.view(batch_size, self.in_channels, -1) 95 | phi_x = x.view(batch_size, self.in_channels, -1) 96 | theta_x = theta_x.permute(0, 2, 1) 97 | f = torch.matmul(theta_x, phi_x) 98 | 99 | elif self.mode == "embedded" or self.mode == "dot": 100 | theta_x = self.theta(x).view(batch_size, self.inter_channels, -1) 101 | phi_x = self.phi(x).view(batch_size, self.inter_channels, -1) 102 | theta_x = theta_x.permute(0, 2, 1) 103 | f = torch.matmul(theta_x, phi_x) 104 | 105 | elif self.mode == "concatenate": 106 | theta_x = self.theta(x).view(batch_size, self.inter_channels, -1, 1) 107 | phi_x = self.phi(x).view(batch_size, self.inter_channels, 1, -1) 108 | 109 | h = theta_x.size(2) 110 | w = phi_x.size(3) 111 | theta_x = theta_x.repeat(1, 1, 1, w) 112 | phi_x = phi_x.repeat(1, 1, h, 1) 113 | 114 | concat = torch.cat([theta_x, phi_x], dim=1) 115 | f = self.W_f(concat) 116 | f = f.view(f.size(0), f.size(2), f.size(3)) 117 | 118 | if self.mode == "gaussian" or self.mode == "embedded": 119 | f_div_C = F.softmax(f, dim=-1) 120 | elif self.mode == "dot" or self.mode == "concatenate": 121 | N = f.size(-1) # number of position in x 122 | f_div_C = f / N 123 | 124 | y = torch.matmul(f_div_C, g_x) 125 | 126 | # contiguous here just allocates contiguous chunk of memory 127 | y = y.permute(0, 2, 1).contiguous() 128 | y = y.view(batch_size, self.inter_channels, *x.size()[2:]) 129 | 130 | W_y = self.W_z(y) 131 | # residual connection 132 | z = W_y + x 133 | 134 | return z 135 | 136 | 137 | if __name__ == '__main__': 138 | import torch 139 | 140 | for bn_layer in [True, False]: 141 | img = torch.zeros(2, 3, 20) 142 | net = 
NLBlockND(in_channels=3, mode='concatenate', dimension=1, bn_layer=bn_layer) 143 | out = net(img) 144 | print(out.size()) 145 | 146 | img = torch.zeros(2, 3, 20, 20) 147 | net = NLBlockND(in_channels=3, mode='concatenate', dimension=2, bn_layer=bn_layer) 148 | out = net(img) 149 | print(out.size()) 150 | 151 | img = torch.randn(2, 3, 8, 20, 20) 152 | net = NLBlockND(in_channels=3, mode='concatenate', dimension=3, bn_layer=bn_layer) 153 | out = net(img) 154 | print(out.size()) 155 | 156 | 157 | -------------------------------------------------------------------------------- /models/resnet2D.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Non-Local ResNet2D-50 for CIFAR-10 dataset. 3 | Most of the code is borrowed from https://github.com/akamaster/pytorch_resnet_cifar10 4 | 5 | Properly implemented ResNet-s for CIFAR10 as described in paper [1]. 6 | 7 | The implementation and structure of this file is hugely influenced by [2] 8 | which is implemented for ImageNet and doesn't have option A for identity. 9 | Moreover, most of the implementations on the web is copy-paste from 10 | torchvision's resnet and has wrong number of params. 11 | 12 | Reference: 13 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 14 | Deep Residual Learning for Image Recognition. arXiv:1512.03385 15 | [2] https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py 16 | ''' 17 | import torch 18 | import torch.nn as nn 19 | import torch.nn.functional as F 20 | import torch.nn.init as init 21 | 22 | from torch.autograd import Variable 23 | from models.non_local import NLBlockND 24 | 25 | 26 | def _weights_init(m): 27 | if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d): 28 | init.kaiming_normal_(m.weight) 29 | 30 | class LambdaLayer(nn.Module): 31 | def __init__(self, lambd): 32 | super(LambdaLayer, self).__init__() 33 | self.lambd = lambd 34 | 35 | def forward(self, x): 36 | return self.lambd(x) 37 | 38 | 39 | class BasicBlock(nn.Module): 40 | expansion = 1 41 | 42 | def __init__(self, in_planes, planes, stride=1, option='A'): 43 | super(BasicBlock, self).__init__() 44 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 45 | self.bn1 = nn.BatchNorm2d(planes) 46 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) 47 | self.bn2 = nn.BatchNorm2d(planes) 48 | 49 | self.shortcut = nn.Sequential() 50 | if stride != 1 or in_planes != planes: 51 | if option == 'A': 52 | """ 53 | For CIFAR10 ResNet paper uses option A. 
54 | """ 55 | self.shortcut = LambdaLayer(lambda x: 56 | F.pad(x[:, :, ::2, ::2], (0, 0, 0, 0, planes//4, planes//4), "constant", 0)) 57 | elif option == 'B': 58 | self.shortcut = nn.Sequential( 59 | nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False), 60 | nn.BatchNorm2d(self.expansion * planes) 61 | ) 62 | 63 | def forward(self, x): 64 | out = F.relu(self.bn1(self.conv1(x))) 65 | out = self.bn2(self.conv2(out)) 66 | out += self.shortcut(x) 67 | out = F.relu(out) 68 | return out 69 | 70 | 71 | class ResNet2D(nn.Module): 72 | def __init__(self, block, num_blocks, num_classes=10, non_local=False): 73 | super(ResNet2D, self).__init__() 74 | self.in_planes = 16 75 | 76 | self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False) 77 | self.bn1 = nn.BatchNorm2d(16) 78 | self.layer1 = self._make_layer(block, 16, num_blocks[0], stride=1) 79 | 80 | # add non-local block after layer 2 81 | self.layer2 = self._make_layer(block, 32, num_blocks[1], stride=2, non_local=non_local) 82 | self.layer3 = self._make_layer(block, 64, num_blocks[2], stride=2) 83 | self.linear = nn.Linear(64, num_classes) 84 | 85 | self.apply(_weights_init) 86 | 87 | def _make_layer(self, block, planes, num_blocks, stride, non_local=False): 88 | strides = [stride] + [1]*(num_blocks-1) 89 | layers = [] 90 | 91 | last_idx = len(strides) 92 | if non_local: 93 | last_idx = len(strides) - 1 94 | 95 | for i in range(last_idx): 96 | layers.append(block(self.in_planes, planes, strides[i])) 97 | self.in_planes = planes * block.expansion 98 | 99 | if non_local: 100 | layers.append(NLBlockND(in_channels=planes, dimension=2)) 101 | layers.append(block(self.in_planes, planes, strides[-1])) 102 | 103 | return nn.Sequential(*layers) 104 | 105 | def forward(self, x): 106 | out = F.relu(self.bn1(self.conv1(x))) 107 | out = self.layer1(out) 108 | out = self.layer2(out) 109 | out = self.layer3(out) 110 | out = F.avg_pool2d(out, out.size()[3]) 111 | out = out.view(out.size(0), -1) 112 | out = self.linear(out) 113 | return out 114 | 115 | 116 | def resnet2D56(non_local=False, **kwargs): 117 | """Constructs a ResNet-56 model. 118 | """ 119 | return ResNet2D(BasicBlock, [9, 9, 9], non_local=non_local, **kwargs) 120 | 121 | 122 | if __name__=='__main__': 123 | # Test case for (224 x 224 x 3) input of batch size 1 124 | img = Variable(torch.randn(1, 3, 224, 224)) 125 | net = resnet2D56() 126 | count = 0 127 | for name, param in net.named_parameters(): 128 | if param.requires_grad: 129 | count += 1 130 | print(name) 131 | print (count) 132 | out = net(img) 133 | print(out.size()) 134 | -------------------------------------------------------------------------------- /models/resnet3D.py: -------------------------------------------------------------------------------- 1 | """ 2 | ResNet50 (C2D) for spatiotemporal task. Only ResNet50 backbone structure was implemented here. 3 | """ 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import math 9 | from functools import partial 10 | from models.non_local import NLBlockND 11 | 12 | 13 | class Bottleneck(nn.Module): 14 | """ 15 | Bottleneck block structure used in ResNet 50. 16 | As mentioned in Section 4. 2D ConvNet baseline (C2D), 17 | all convolutions are in essence 2D kernels that prcoess the input frame-by-frame 18 | (implemented as (1 x k x k) kernels). 
19 | """ 20 | expansion = 4 21 | 22 | def __init__(self, inplanes, planes, stride=1, padding=(0, 1, 1), downsample=None): 23 | super(Bottleneck, self).__init__() 24 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=(1, 1, 1), bias=False) 25 | self.bn1 = nn.BatchNorm3d(planes) 26 | self.conv2 = nn.Conv3d(planes, planes, kernel_size=(1, 3, 3), stride=stride, padding=padding, bias=False) 27 | self.bn2 = nn.BatchNorm3d(planes) 28 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=(1, 1, 1), bias=False) 29 | self.bn3 = nn.BatchNorm3d(planes * 4) 30 | self.relu = nn.ReLU(inplace=True) 31 | self.downsample = downsample 32 | self.stride = stride 33 | 34 | def forward(self, x): 35 | identity = x 36 | 37 | out = self.conv1(x) 38 | out = self.bn1(out) 39 | out = self.relu(out) 40 | 41 | out = self.conv2(out) 42 | out = self.bn2(out) 43 | out = self.relu(out) 44 | 45 | out = self.conv3(out) 46 | out = self.bn3(out) 47 | 48 | if self.downsample is not None: 49 | identity = self.downsample(x) 50 | 51 | out += identity 52 | out = self.relu(out) 53 | 54 | return out 55 | 56 | 57 | class ResNet3D(nn.Module): 58 | """C2D with ResNet 50 backbone. 59 | The only operation involving the temporal domain are the pooling layer after the second residual block. 60 | For more details of the structure, refer to Table 1 from the paper. 61 | Padding was added accordingly to match the correct dimensionality. 62 | """ 63 | def __init__(self, block, layers, num_classes=400, non_local=False): 64 | self.inplanes = 64 65 | super(ResNet3D, self).__init__() 66 | 67 | # first convolution operation has essentially 2D kernels 68 | # output: 64 x 16 x 112 x 112 69 | self.conv1 = nn.Conv3d(3, 64, kernel_size=(1, 7, 7), stride=2, padding=(0, 3, 3), bias=False) 70 | self.bn1 = nn.BatchNorm3d(64) 71 | self.relu = nn.ReLU(inplace=True) 72 | 73 | # output: 64 x 8 x 56 x 56 74 | self.pool1 = nn.MaxPool3d(kernel_size=3, stride=2) 75 | 76 | # output: 256 x 8 x 56 x 56 77 | self.layer1 = self._make_layer(block, 64, layers[0], stride=1, d_padding=0) 78 | 79 | # pooling on temporal domain 80 | # output: 256 x 4 x 56 x 56 81 | self.pool_t = nn.MaxPool3d(kernel_size=(3, 1, 1), stride=(2, 1, 1)) 82 | 83 | # output: 512 x 4 x 28 x 28 84 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2, padding=(2, 1, 1)) 85 | 86 | # add one non-local block here 87 | # output: 1024 x 4 x 14 x 14 88 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2, padding=(2, 1, 1), non_local=non_local) 89 | 90 | # output: 2048 x 4 x 7 x 7 91 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2, padding=(2, 1, 1)) 92 | 93 | # output: 2048 x 1 94 | self.avgpool = nn.AvgPool3d(kernel_size=(4, 7, 7)) 95 | self.fc = nn.Linear(512 * block.expansion, num_classes) 96 | 97 | for m in self.modules(): 98 | if isinstance(m, nn.Conv3d): 99 | m.weight = nn.init.kaiming_normal_(m.weight, mode='fan_out') 100 | elif isinstance(m, nn.BatchNorm3d): 101 | m.weight.data.fill_(1) 102 | m.bias.data.zero_() 103 | 104 | def _make_layer(self, block, planes, blocks, stride=1, padding=(0, 1, 1), d_padding=(2, 0, 0), non_local=False): 105 | downsample = nn.Sequential( 106 | nn.Conv3d(self.inplanes, planes * block.expansion, 107 | kernel_size=1, stride=stride, padding=d_padding, bias=False), 108 | nn.BatchNorm3d(planes * block.expansion) 109 | ) 110 | 111 | layers = [] 112 | layers.append(block(self.inplanes, planes, stride, padding, downsample)) 113 | self.inplanes = planes * block.expansion 114 | 115 | last_idx = blocks 116 | if non_local: 117 | last_idx 
= blocks - 1 118 | 119 | for i in range(1, last_idx): 120 | layers.append(block(self.inplanes, planes)) 121 | 122 | # add non-local block here 123 | if non_local: 124 | layers.append(NLBlockND(in_channels=1024, dimension=3)) 125 | layers.append(block(self.inplanes, planes)) 126 | 127 | return nn.Sequential(*layers) 128 | 129 | def forward(self, x): 130 | x = self.conv1(x) 131 | x = self.bn1(x) 132 | x = self.relu(x) 133 | x = self.pool1(x) 134 | 135 | x = self.layer1(x) 136 | x = self.pool_t(x) 137 | x = self.layer2(x) 138 | x = self.layer3(x) 139 | x = self.layer4(x) 140 | 141 | x = self.avgpool(x) 142 | 143 | x = x.view(x.size(0), -1) 144 | x = self.fc(x) 145 | 146 | return x 147 | 148 | 149 | def resnet3D50(non_local=False, **kwargs): 150 | """Constructs a C2D ResNet-50 model. 151 | """ 152 | model = ResNet3D(Bottleneck, [3, 4, 6, 3], non_local=non_local, **kwargs) 153 | return model 154 | 155 | 156 | 157 | if __name__=='__main__': 158 | # Test case of 32 frames (224 x 224 x 3) input of batch size 1 159 | img = Variable(torch.randn(1, 3, 32, 224, 224)) 160 | net = resnet3D50(non_local=True) 161 | count = 0 162 | for name, param in net.named_parameters(): 163 | if param.requires_grad: 164 | count += 1 165 | print(name) 166 | print (count) 167 | out = net(img) 168 | print(out.size()) 169 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | python main.py --verbose 2>&1 | tee regular_output.txt 2 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | '''Some helper functions for PyTorch, including: 2 | - get_mean_and_std: calculate the mean and std value of dataset. 3 | - msr_init: net parameter initialization. 4 | - progress_bar: progress bar mimic xlua.progress. 5 | ''' 6 | import os 7 | import sys 8 | import time 9 | import math 10 | 11 | import torch.nn as nn 12 | import torch.nn.init as init 13 | 14 | 15 | def get_mean_and_std(dataset): 16 | '''Compute the mean and std value of dataset.''' 17 | dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True, num_workers=2) 18 | mean = torch.zeros(3) 19 | std = torch.zeros(3) 20 | print('==> Computing mean and std..') 21 | for inputs, targets in dataloader: 22 | for i in range(3): 23 | mean[i] += inputs[:,i,:,:].mean() 24 | std[i] += inputs[:,i,:,:].std() 25 | mean.div_(len(dataset)) 26 | std.div_(len(dataset)) 27 | return mean, std 28 | 29 | def init_params(net): 30 | '''Init layer parameters.''' 31 | for m in net.modules(): 32 | if isinstance(m, nn.Conv2d): 33 | init.kaiming_normal(m.weight, mode='fan_out') 34 | if m.bias: 35 | init.constant(m.bias, 0) 36 | elif isinstance(m, nn.BatchNorm2d): 37 | init.constant(m.weight, 1) 38 | init.constant(m.bias, 0) 39 | elif isinstance(m, nn.Linear): 40 | init.normal(m.weight, std=1e-3) 41 | if m.bias: 42 | init.constant(m.bias, 0) 43 | 44 | 45 | _, term_width = os.popen('stty size', 'r').read().split() 46 | term_width = int(term_width) 47 | 48 | TOTAL_BAR_LENGTH = 65. 49 | last_time = time.time() 50 | begin_time = last_time 51 | def progress_bar(current, total, msg=None): 52 | global last_time, begin_time 53 | if current == 0: 54 | begin_time = time.time() # Reset for new bar. 
55 | 56 | cur_len = int(TOTAL_BAR_LENGTH*current/total) 57 | rest_len = int(TOTAL_BAR_LENGTH - cur_len) - 1 58 | 59 | sys.stdout.write(' [') 60 | for i in range(cur_len): 61 | sys.stdout.write('=') 62 | sys.stdout.write('>') 63 | for i in range(rest_len): 64 | sys.stdout.write('.') 65 | sys.stdout.write(']') 66 | 67 | cur_time = time.time() 68 | step_time = cur_time - last_time 69 | last_time = cur_time 70 | tot_time = cur_time - begin_time 71 | 72 | L = [] 73 | L.append(' Step: %s' % format_time(step_time)) 74 | L.append(' | Tot: %s' % format_time(tot_time)) 75 | if msg: 76 | L.append(' | ' + msg) 77 | 78 | msg = ''.join(L) 79 | sys.stdout.write(msg) 80 | for i in range(term_width-int(TOTAL_BAR_LENGTH)-len(msg)-3): 81 | sys.stdout.write(' ') 82 | 83 | # Go back to the center of the bar. 84 | for i in range(term_width-int(TOTAL_BAR_LENGTH/2)+2): 85 | sys.stdout.write('\b') 86 | sys.stdout.write(' %d/%d ' % (current+1, total)) 87 | 88 | if current < total-1: 89 | sys.stdout.write('\r') 90 | else: 91 | sys.stdout.write('\n') 92 | sys.stdout.flush() 93 | 94 | def format_time(seconds): 95 | days = int(seconds / 3600/24) 96 | seconds = seconds - days*3600*24 97 | hours = int(seconds / 3600) 98 | seconds = seconds - hours*3600 99 | minutes = int(seconds / 60) 100 | seconds = seconds - minutes*60 101 | secondsf = int(seconds) 102 | seconds = seconds - secondsf 103 | millis = int(seconds*1000) 104 | 105 | f = '' 106 | i = 1 107 | if days > 0: 108 | f += str(days) + 'D' 109 | i += 1 110 | if hours > 0 and i <= 2: 111 | f += str(hours) + 'h' 112 | i += 1 113 | if minutes > 0 and i <= 2: 114 | f += str(minutes) + 'm' 115 | i += 1 116 | if secondsf > 0 and i <= 2: 117 | f += str(secondsf) + 's' 118 | i += 1 119 | if millis > 0 and i <= 2: 120 | f += str(millis) + 'ms' 121 | i += 1 122 | if f == '': 123 | f = '0ms' 124 | return f 125 | --------------------------------------------------------------------------------